xml.c (10835B)
1 #include <sys/types.h> 2 3 #include <ctype.h> 4 #include <errno.h> 5 #include <limits.h> 6 #include <stdio.h> 7 #include <stdlib.h> 8 #include <string.h> 9 10 #include "xml.h" 11 12 static void 13 xml_parseattrs(struct xmlparser *x) 14 { 15 size_t namelen = 0, valuelen; 16 int c, endsep, endname = 0, valuestart = 0; 17 18 while ((c = GETNEXT) != EOF) { 19 if (isspace(c)) { 20 if (namelen) 21 endname = 1; 22 continue; 23 } else if (c == '?') 24 ; /* ignore */ 25 else if (c == '=') { 26 x->name[namelen] = '\0'; 27 valuestart = 1; 28 endname = 1; 29 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) { 30 /* attribute without value */ 31 x->name[namelen] = '\0'; 32 if (x->xmlattrstart) 33 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 34 if (x->xmlattr) 35 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); 36 if (x->xmlattrend) 37 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 38 endname = 0; 39 x->name[0] = c; 40 namelen = 1; 41 } else if (namelen && valuestart) { 42 /* attribute with value */ 43 if (x->xmlattrstart) 44 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); 45 46 valuelen = 0; 47 if (c == '\'' || c == '"') { 48 endsep = c; 49 } else { 50 endsep = ' '; /* isspace() */ 51 goto startvalue; 52 } 53 54 while ((c = GETNEXT) != EOF) { 55 startvalue: 56 if (c == '&') { /* entities */ 57 x->data[valuelen] = '\0'; 58 /* call data function with data before entity if there is data */ 59 if (valuelen && x->xmlattr) 60 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 61 x->data[0] = c; 62 valuelen = 1; 63 while ((c = GETNEXT) != EOF) { 64 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) 65 break; 66 if (valuelen < sizeof(x->data) - 1) 67 x->data[valuelen++] = c; 68 else { 69 /* entity too long for buffer, handle as normal data */ 70 x->data[valuelen] = '\0'; 71 if (x->xmlattr) 72 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 73 x->data[0] = c; 74 valuelen = 1; 75 break; 76 } 77 if (c == ';') { 78 x->data[valuelen] = '\0'; 79 if (x->xmlattrentity) 80 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 81 valuelen = 0; 82 break; 83 } 84 } 85 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) { 86 if (valuelen < sizeof(x->data) - 1) { 87 x->data[valuelen++] = c; 88 } else { 89 x->data[valuelen] = '\0'; 90 if (x->xmlattr) 91 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 92 x->data[0] = c; 93 valuelen = 1; 94 } 95 } 96 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) { 97 x->data[valuelen] = '\0'; 98 if (x->xmlattr) 99 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); 100 if (x->xmlattrend) 101 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); 102 break; 103 } 104 } 105 namelen = endname = valuestart = 0; 106 } else if (namelen < sizeof(x->name) - 1) { 107 x->name[namelen++] = c; 108 } 109 if (c == '>') { 110 break; 111 } else if (c == '/') { 112 x->isshorttag = 1; 113 x->name[0] = '\0'; 114 namelen = 0; 115 } 116 } 117 } 118 119 static void 120 xml_parsecomment(struct xmlparser *x) 121 { 122 size_t datalen = 0, i = 0; 123 int c; 124 125 if (x->xmlcommentstart) 126 x->xmlcommentstart(x); 127 while ((c = GETNEXT) != EOF) { 128 if (c == '-' || c == '>') { 129 if (x->xmlcomment && datalen) { 130 x->data[datalen] = '\0'; 131 x->xmlcomment(x, x->data, datalen); 132 datalen = 0; 133 } 134 } 135 136 if (c == '-') { 137 if (++i > 2) { 138 if (x->xmlcomment) 139 for (; i > 2; i--) 140 x->xmlcomment(x, "-", 1); 141 i = 2; 142 } 143 continue; 144 } else if (c == '>' && i == 2) { 145 if (x->xmlcommentend) 146 x->xmlcommentend(x); 147 return; 148 } else if (i) { 149 if (x->xmlcomment) { 150 for (; i > 0; i--) 151 x->xmlcomment(x, "-", 1); 152 } 153 i = 0; 154 } 155 156 if (datalen < sizeof(x->data) - 1) { 157 x->data[datalen++] = c; 158 } else { 159 x->data[datalen] = '\0'; 160 if (x->xmlcomment) 161 x->xmlcomment(x, x->data, datalen); 162 x->data[0] = c; 163 datalen = 1; 164 } 165 } 166 } 167 168 static void 169 xml_parsecdata(struct xmlparser *x) 170 { 171 size_t datalen = 0, i = 0; 172 int c; 173 174 if (x->xmlcdatastart) 175 x->xmlcdatastart(x); 176 while ((c = GETNEXT) != EOF) { 177 if (c == ']' || c == '>') { 178 if (x->xmlcdata && datalen) { 179 x->data[datalen] = '\0'; 180 x->xmlcdata(x, x->data, datalen); 181 datalen = 0; 182 } 183 } 184 185 if (c == ']') { 186 if (++i > 2) { 187 if (x->xmlcdata) 188 for (; i > 2; i--) 189 x->xmlcdata(x, "]", 1); 190 i = 2; 191 } 192 continue; 193 } else if (c == '>' && i == 2) { 194 if (x->xmlcdataend) 195 x->xmlcdataend(x); 196 return; 197 } else if (i) { 198 if (x->xmlcdata) 199 for (; i > 0; i--) 200 x->xmlcdata(x, "]", 1); 201 i = 0; 202 } 203 204 if (datalen < sizeof(x->data) - 1) { 205 x->data[datalen++] = c; 206 } else { 207 x->data[datalen] = '\0'; 208 if (x->xmlcdata) 209 x->xmlcdata(x, x->data, datalen); 210 x->data[0] = c; 211 datalen = 1; 212 } 213 } 214 } 215 216 static int 217 codepointtoutf8(long r, char *s) 218 { 219 if (r == 0) { 220 return 0; /* NUL byte */ 221 } else if (r <= 0x7F) { 222 /* 1 byte: 0aaaaaaa */ 223 s[0] = r; 224 return 1; 225 } else if (r <= 0x07FF) { 226 /* 2 bytes: 00000aaa aabbbbbb */ 227 s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ 228 s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ 229 return 2; 230 } else if (r <= 0xFFFF) { 231 /* 3 bytes: aaaabbbb bbcccccc */ 232 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ 233 s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ 234 s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ 235 return 3; 236 } else { 237 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */ 238 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ 239 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ 240 s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ 241 s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ 242 return 4; 243 } 244 } 245 246 static int 247 namedentitytostr(const char *e, char *buf, size_t bufsiz) 248 { 249 static const struct { 250 const char *entity; 251 int c; 252 } entities[] = { 253 { "amp;", '&' }, 254 { "lt;", '<' }, 255 { "gt;", '>' }, 256 { "apos;", '\'' }, 257 { "quot;", '"' }, 258 { "AMP;", '&' }, 259 { "LT;", '<' }, 260 { "GT;", '>' }, 261 { "APOS;", '\'' }, 262 { "QUOT;", '"' } 263 }; 264 size_t i; 265 266 /* buffer is too small */ 267 if (bufsiz < 2) 268 return -1; 269 270 for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { 271 if (!strcmp(e, entities[i].entity)) { 272 buf[0] = entities[i].c; 273 buf[1] = '\0'; 274 return 1; 275 } 276 } 277 return 0; 278 } 279 280 static int 281 numericentitytostr(const char *e, char *buf, size_t bufsiz) 282 { 283 long l; 284 int len; 285 char *end; 286 287 /* buffer is too small */ 288 if (bufsiz < 5) 289 return -1; 290 291 errno = 0; 292 /* hex (16) or decimal (10) */ 293 if (*e == 'x') 294 l = strtoul(e + 1, &end, 16); 295 else 296 l = strtoul(e, &end, 10); 297 /* invalid value or not a well-formed entity or too high codepoint */ 298 if (errno || *end != ';' || l > 0x10FFFF) 299 return 0; 300 len = codepointtoutf8(l, buf); 301 buf[len] = '\0'; 302 303 return len; 304 } 305 306 /* convert named- or numeric entity string to buffer string 307 * returns byte-length of string. */ 308 int 309 xml_entitytostr(const char *e, char *buf, size_t bufsiz) 310 { 311 /* doesn't start with & */ 312 if (e[0] != '&') 313 return 0; 314 /* numeric entity */ 315 if (e[1] == '#') 316 return numericentitytostr(e + 2, buf, bufsiz); 317 else /* named entity */ 318 return namedentitytostr(e + 1, buf, bufsiz); 319 } 320 321 void 322 xml_parse(struct xmlparser *x) 323 { 324 size_t datalen, tagdatalen; 325 int c, isend; 326 327 while ((c = GETNEXT) != EOF && c != '<') 328 ; /* skip until < */ 329 330 while (c != EOF) { 331 if (c == '<') { /* parse tag */ 332 if ((c = GETNEXT) == EOF) 333 return; 334 335 if (c == '!') { /* cdata and comments */ 336 for (tagdatalen = 0; (c = GETNEXT) != EOF;) { 337 /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */ 338 if (tagdatalen <= sizeof("[CDATA[") - 1) 339 x->data[tagdatalen++] = c; 340 if (c == '>') 341 break; 342 else if (c == '-' && tagdatalen == sizeof("--") - 1 && 343 (x->data[0] == '-')) { 344 xml_parsecomment(x); 345 break; 346 } else if (c == '[') { 347 if (tagdatalen == sizeof("[CDATA[") - 1 && 348 !strncmp(x->data, "[CDATA[", tagdatalen)) { 349 xml_parsecdata(x); 350 break; 351 } 352 } 353 } 354 } else { 355 /* normal tag (open, short open, close), processing instruction. */ 356 x->tag[0] = c; 357 x->taglen = 1; 358 x->isshorttag = isend = 0; 359 360 /* treat processing instruction as shorttag, don't strip "?" prefix. */ 361 if (c == '?') { 362 x->isshorttag = 1; 363 } else if (c == '/') { 364 if ((c = GETNEXT) == EOF) 365 return; 366 x->tag[0] = c; 367 isend = 1; 368 } 369 370 while ((c = GETNEXT) != EOF) { 371 if (c == '/') 372 x->isshorttag = 1; /* short tag */ 373 else if (c == '>' || isspace(c)) { 374 x->tag[x->taglen] = '\0'; 375 if (isend) { /* end tag, starts with </ */ 376 if (x->xmltagend) 377 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 378 x->tag[0] = '\0'; 379 x->taglen = 0; 380 } else { 381 /* start tag */ 382 if (x->xmltagstart) 383 x->xmltagstart(x, x->tag, x->taglen); 384 if (isspace(c)) 385 xml_parseattrs(x); 386 if (x->xmltagstartparsed) 387 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); 388 } 389 /* call tagend for shortform or processing instruction */ 390 if (x->isshorttag) { 391 if (x->xmltagend) 392 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); 393 x->tag[0] = '\0'; 394 x->taglen = 0; 395 } 396 break; 397 } else if (x->taglen < sizeof(x->tag) - 1) 398 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ 399 } 400 } 401 } else { 402 /* parse tag data */ 403 datalen = 0; 404 if (x->xmldatastart) 405 x->xmldatastart(x); 406 while ((c = GETNEXT) != EOF) { 407 if (c == '&') { 408 if (datalen) { 409 x->data[datalen] = '\0'; 410 if (x->xmldata) 411 x->xmldata(x, x->data, datalen); 412 } 413 x->data[0] = c; 414 datalen = 1; 415 while ((c = GETNEXT) != EOF) { 416 if (c == '<') 417 break; 418 if (datalen < sizeof(x->data) - 1) 419 x->data[datalen++] = c; 420 else { 421 /* entity too long for buffer, handle as normal data */ 422 x->data[datalen] = '\0'; 423 if (x->xmldata) 424 x->xmldata(x, x->data, datalen); 425 x->data[0] = c; 426 datalen = 1; 427 break; 428 } 429 if (c == ';') { 430 x->data[datalen] = '\0'; 431 if (x->xmldataentity) 432 x->xmldataentity(x, x->data, datalen); 433 datalen = 0; 434 break; 435 } 436 } 437 } else if (c != '<') { 438 if (datalen < sizeof(x->data) - 1) { 439 x->data[datalen++] = c; 440 } else { 441 x->data[datalen] = '\0'; 442 if (x->xmldata) 443 x->xmldata(x, x->data, datalen); 444 x->data[0] = c; 445 datalen = 1; 446 } 447 } 448 if (c == '<') { 449 x->data[datalen] = '\0'; 450 if (x->xmldata && datalen) 451 x->xmldata(x, x->data, datalen); 452 if (x->xmldataend) 453 x->xmldataend(x); 454 break; 455 } 456 } 457 } 458 } 459 }