luthor.c (48005B)
1 /* 2 * Designed to be included in other C files which define emitter 3 * operations. The same source may thus be used to parse different 4 * grammars. 5 * 6 * The operators cover the most common operators i the C family. Each 7 * operator does not have a name, it is represent by a long token code 8 * with up to 4 ASCII characters embedded literally. This avoids any 9 * semantic meaning at the lexer level. Emitters macros can redefine 10 * this behavior. 11 * 12 * No real harm is done in accepting a superset, but the source is 13 * intended to be modified, have things flagged or removed, other things 14 * added. The real complicity is in numbers, identifiers, and comments, 15 * which should be fairly complete with flagging as is. 16 * 17 * Keyword handling is done at macroes, and described elsewhere, but for 18 * identifier compatible keywords, this is quite efficient to handle on 19 * a per language basis without modifying this source. 20 * 21 * The Lisp language family is somewhat different and not directly 22 * suited for this lexer, although it can easily be modified to suit. 23 * The main reason is ';' for comments, and operators used as part of 24 * the identifier symbol set, and no need for operator classification, 25 * and different handling of single character symbols. 26 * 27 * So overall, we more or less have one efficient unified lexer that can 28 * manage many languages - this is good, because it is a pain to write a 29 * new lexer by hand, and lexer tools are what they are. 
30 */ 31 32 #include "luthor.h" 33 34 #ifdef LEX_C99_NUMERIC 35 #define LEX_C_NUMERIC 36 #define LEX_HEX_FLOAT_NUMERIC 37 #define LEX_BINARY_NUMERIC 38 #endif 39 40 #ifdef LEX_C_NUMERIC 41 #define LEX_C_OCTAL_NUMERIC 42 #define LEX_HEX_NUMERIC 43 #endif 44 45 #ifdef LEX_JULIA_NUMERIC 46 #ifdef LEX_C_OCTAL_NUMERIC 47 /* 48 * LEX_JULIA_OCTAL_NUMERIC and LEX_C_OCTAL_NUMERIC can technically 49 * coexist, but leading zeroes give C style leading zero numbers 50 * which can lead to incorrect values depending on expectations. 51 * Therefore the full LEX_JULIA_NUMERIC flag is designed to not allow this. 52 */ 53 #error "LEX_C_OCTAL_NUMERIC conflicts with LEX_JULIA_NUMERIC leading zero integers" 54 #endif 55 56 /* 57 * Julia v0.3 insists on lower case, and has a different meaning for 58 * upper case. 59 */ 60 #define LEX_LOWER_CASE_NUMERIC_PREFIX 61 #define LEX_JULIA_OCTAL_NUMERIC 62 #define LEX_HEX_FLOAT_NUMERIC 63 #define LEX_BINARY_NUMERIC 64 65 #endif 66 67 #ifdef LEX_HEX_FLOAT_NUMERIC 68 #define LEX_HEX_NUMERIC 69 #endif 70 71 /* 72 * Numeric and string constants do not accept prefixes such as u, l, L, 73 * U, ll, LL, f, or F in C, or various others in Julia strings. Use the 74 * parser to detect juxtaposition between identifier and constant. In 75 * Julia numeric suffix means multiplication, in C it is a type 76 * qualifier. Sign, such as defined in JSON, are also not accepted - 77 * they must be operators. See source for various flag to enable 78 * different token types. 79 */ 80 81 /* 82 * Includes '_' in identifers by default. Defines follow characters in 83 * identifiers but not the lead character - it must be defined in switch 84 * cases. If the identifier allows for dash '-', it is probably better 85 * to handle it as an operator and flag surrounding space in the parser. 86 */ 87 #ifndef lex_isalnum 88 89 /* 90 * NOTE: isalnum, isalpha, is locale dependent. We only want to 91 * to consider that ASCII-7 subset and treat everything else as utf-8. 
92 * This table is not for leading identifiers, as it contains 0..9. 93 * 94 * For more correct handling of UTF-8, see: 95 * https://theantlrguy.atlassian.net/wiki/display/ANTLR4/Grammar+Lexicon 96 * based on Java Ident = NameStartChar NameChar* 97 * 98 * While the following is UTF-16, it can be adapted to UTF-8 easily. 99 100 101 fragment 102 NameChar 103 : NameStartChar 104 | '0'..'9' 105 | '_' 106 | '\u00B7' 107 | '\u0300'..'\u036F' 108 | '\u203F'..'\u2040' 109 ; 110 fragment 111 NameStartChar 112 : 'A'..'Z' | 'a'..'z' 113 | '\u00C0'..'\u00D6' 114 | '\u00D8'..'\u00F6' 115 | '\u00F8'..'\u02FF' 116 | '\u0370'..'\u037D' 117 | '\u037F'..'\u1FFF' 118 | '\u200C'..'\u200D' 119 | '\u2070'..'\u218F' 120 | '\u2C00'..'\u2FEF' 121 | '\u3001'..'\uD7FF' 122 | '\uF900'..'\uFDCF' 123 | '\uFDF0'..'\uFFFD' 124 ; 125 */ 126 127 static const char lex_alnum[256] = { 128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131 /* 0..9 */ 132 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 133 /* A..O */ 134 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 135 /* P..Z, _ */ 136 #ifdef LEX_ID_WITHOUT_UNDERSCORE 137 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 138 #else 139 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 140 #endif 141 /* a..o */ 142 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 143 /* p..z */ 144 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 145 #ifdef LEX_ID_WITH_UTF8 146 /* utf-8 */ 147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 148 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 151 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 155 #else 156 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 157 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 161 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 162 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 163 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 164 #endif 165 }; 166 167 #define lex_isalnum(c) (lex_alnum[(unsigned char)(c)]) 168 #endif 169 170 #ifndef lex_isbindigit 171 #define lex_isbindigit(c) ((c) == '0' || (c) == '1') 172 #endif 173 174 #ifndef lex_isoctdigit 175 #define lex_isoctdigit(c) ((unsigned)((c) - '0') < 8) 176 #endif 177 178 #ifndef lex_isdigit 179 #define lex_isdigit(c) ((unsigned)(c) >= '0' && (unsigned)(c) <= '9') 180 #endif 181 182 #ifndef lex_ishexdigit 183 #define lex_ishexdigit(c) (((c) >= '0' && ((unsigned)c) <= '9') || ((unsigned)(c | 0x20) >= 'a' && (unsigned)(c | 0x20) <= 'f')) 184 #endif 185 186 #ifndef lex_isctrl 187 #include <ctype.h> 188 #define lex_isctrl(c) (((unsigned)c) < 0x20 || (c) == 0x7f) 189 #endif 190 191 #ifndef lex_isblank 192 #define lex_isblank(c) ((c) == ' ' || (c) == '\t') 193 #endif 194 195 #ifndef lex_iszterm 196 #define lex_iszterm(c) ((c) == '\0') 197 #endif 198 199 /* 200 * If ZTERM is disabled, zero will be a LEX_CTRL token 201 * and allowed to be embedded in comments and strings, or 202 * elsewhere, as long as the parser accepts the token. 203 */ 204 #ifdef LEX_DISABLE_ZTERM 205 #undef lex_iszterm 206 #define lex_iszterm(c) (0) 207 #endif 208 209 /* 210 * The mode is normally LEX_MODE_NORMAL = 0 initially, or the returned 211 * mode from a previous call, unless LEX_MODE_INVALID = 1 was returned. 212 * If a buffer stopped in the middle of a string or a comment, the mode 213 * will reflect that. In all cases some amount of recovery is needed 214 * before starting a new buffer - see detailed comments in header file. 
215 * If only a single buffer is used, special handling is still needed if 216 * the last line contains a single line comment because it will not be 217 * terminated, but it amounts to replace the emitted unterminated 218 * comment token with an end of comment token. 219 * 220 * Instead of 0, the mode can initially also be LEX_MODE_BOM - it will 221 * an strip optional BOM before moving to normal mode. Currently only 222 * UTF-8 BOM is supported, and this is unlikely to change. 223 * 224 * The context variable is user-defined and available to emitter macros. 225 * It may be null if unused. 226 * 227 */ 228 static int lex(const char *buf, size_t len, int mode, void *context) 229 { 230 const char *p, *q, *s, *d; 231 #if 0 232 /* TODO: old, remove this */ 233 , *z, *f; 234 #endif 235 236 p = buf; /* next char */ 237 q = p + len; /* end of buffer */ 238 s = p; /* start of token */ 239 d = p; /* end of integer part */ 240 241 #if 0 242 /* TODO: old, remove this */ 243 244 /* Used for float and leading zero detection in numerics. */ 245 z = p; 246 f = p; 247 #endif 248 249 /* 250 * Handle mid string and mid comment for reentering across 251 * buffer boundaries. Strip embedded counter from mode. 
252 */ 253 switch(mode & (LEX_MODE_COUNT_BASE - 1)) { 254 255 case LEX_MODE_NORMAL: 256 goto lex_mode_normal; 257 258 case LEX_MODE_BOM: 259 goto lex_mode_bom; 260 261 #ifdef LEX_C_STRING 262 case LEX_MODE_C_STRING: 263 goto lex_mode_c_string; 264 #endif 265 #ifdef LEX_PYTHON_BLOCK_STRING 266 case LEX_MODE_PYTHON_BLOCK_STRING: 267 goto lex_mode_python_block_string; 268 #endif 269 #ifdef LEX_C_STRING_SQ 270 case LEX_MODE_C_STRING_SQ: 271 goto lex_mode_c_string_sq; 272 #endif 273 #ifdef LEX_PYTHON_BLOCK_STRING_SQ 274 case LEX_MODE_PYTHON_BLOCK_STRING_SQ: 275 goto lex_mode_python_block_string_sq; 276 #endif 277 #ifdef LEX_C_BLOCK_COMMENT 278 case LEX_MODE_C_BLOCK_COMMENT: 279 goto lex_mode_c_block_comment; 280 #endif 281 #if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) 282 case LEX_MODE_LINE_COMMENT: 283 goto lex_mode_line_comment; 284 #endif 285 #ifdef LEX_JULIA_NESTED_COMMENT 286 case LEX_MODE_JULIA_NESTED_COMMENT: 287 goto lex_mode_julia_nested_comment; 288 #endif 289 290 default: 291 /* 292 * This is mostly to kill unused label warning when comments 293 * are disabled. 294 */ 295 goto lex_mode_exit; 296 } 297 298 lex_mode_bom: 299 300 mode = LEX_MODE_BOM; 301 302 /* 303 * Special entry mode to consume utf-8 bom if present. We don't 304 * support other boms, but we would use the same token if we did. 305 * 306 * We generally expect no bom present, but it is here if needed 307 * without requiring ugly hacks elsewhere. 308 */ 309 if (p + 3 < q && p[0] == '\xef' && p[1] == '\xbb' && p[2] == '\xbf') { 310 p += 3; 311 lex_emit_bom(s, p); 312 } 313 goto lex_mode_normal; 314 315 /* If source is updated, also update LEX_C_STRING_SQ accordingly. */ 316 #ifdef LEX_C_STRING 317 lex_mode_c_string: 318 319 mode = LEX_MODE_C_STRING; 320 321 for (;;) { 322 --p; 323 /* We do not allow blanks that are also control characters, such as \t. 
*/ 324 while (++p != q && *p != '\\' && *p != '\"' && !lex_isctrl(*p)) { 325 } 326 if (s != p) { 327 lex_emit_string_part(s, p); 328 s = p; 329 } 330 if (*p == '\"') { 331 ++p; 332 lex_emit_string_end(s, p); 333 goto lex_mode_normal; 334 } 335 if (p == q || lex_iszterm(*p)) { 336 lex_emit_string_unterminated(p); 337 goto lex_mode_normal; 338 } 339 if (*p == '\\') { 340 ++p; 341 /* Escape is only itself, whatever is escped follows separately. */ 342 lex_emit_string_escape(s, p); 343 s = p; 344 if (p == q || lex_iszterm(*p)) { 345 lex_emit_string_unterminated(p); 346 goto lex_mode_normal; 347 } 348 if (*p == '\\' || *p == '\"') { 349 ++p; 350 continue; 351 } 352 /* 353 * Flag only relevant for single line strings, as it 354 * controls whether we fail on unterminated string at line 355 * ending with '\'. 356 * 357 * Julia does not support line continuation in strings 358 * (or elsewhere). C, Python, and Javascript do. 359 */ 360 #ifndef LEX_DISABLE_STRING_CONT 361 if (*p == '\n') { 362 if (++p != q && *p == '\r') { 363 ++p; 364 } 365 lex_emit_string_newline(s, p); 366 s = p; 367 continue; 368 } 369 if (*p == '\r') { 370 if (++p != q && *p == '\n') { 371 ++p; 372 } 373 lex_emit_string_newline(s, p); 374 s = p; 375 continue; 376 } 377 #endif 378 } 379 if (*p == '\n' || *p == '\r') { 380 lex_emit_string_unterminated(p); 381 goto lex_mode_normal; 382 } 383 ++p; 384 lex_emit_string_ctrl(s); 385 s = p; 386 } 387 #endif 388 389 /* 390 * This is a copy if LEX_C_STRING with single quote. It's not DRY, but 391 * no reason to parameterized inner loops, just because. Recopy of 392 * changes are to the above. 393 * 394 * Even if single quote is only used for CHAR types, it makes sense to 395 * parse as a full string since there can be all sorts of unicocde 396 * escapes and line continuations, newlines to report and unexpected 397 * control characters to deal with. 
398 */ 399 #ifdef LEX_C_STRING_SQ 400 lex_mode_c_string_sq: 401 402 mode = LEX_MODE_C_STRING_SQ; 403 404 for (;;) { 405 --p; 406 while (++p != q && *p != '\\' && *p != '\'' && !lex_isctrl(*p)) { 407 } 408 if (s != p) { 409 lex_emit_string_part(s, p); 410 s = p; 411 } 412 if (*p == '\'') { 413 ++p; 414 lex_emit_string_end(s, p); 415 goto lex_mode_normal; 416 } 417 if (p == q || lex_iszterm(*p)) { 418 lex_emit_string_unterminated(p); 419 goto lex_mode_normal; 420 } 421 if (*p == '\\') { 422 ++p; 423 /* Escape is only itself, whatever is escped follows separately. */ 424 lex_emit_string_escape(s, p); 425 s = p; 426 if (p == q || lex_iszterm(*p)) { 427 lex_emit_string_unterminated(p); 428 goto lex_mode_normal; 429 } 430 if (*p == '\\' || *p == '\'') { 431 ++p; 432 continue; 433 } 434 /* 435 * Flag only relevant for single line strings, as it 436 * controls whether we fail on unterminated string at line 437 * ending with '\'. 438 * 439 * Julia does not support line continuation in strings 440 * (or elsewhere). C, Python, and Javascript do. 441 */ 442 #ifndef LEX_DISABLE_STRING_CONT 443 if (*p == '\n') { 444 if (++p != q && *p == '\r') { 445 ++p; 446 } 447 lex_emit_string_newline(s, p); 448 s = p; 449 continue; 450 } 451 if (*p == '\r') { 452 if (++p != q && *p == '\n') { 453 ++p; 454 } 455 lex_emit_string_newline(s, p); 456 s = p; 457 continue; 458 } 459 #endif 460 } 461 if (*p == '\n' || *p == '\r') { 462 lex_emit_string_unterminated(p); 463 goto lex_mode_normal; 464 } 465 ++p; 466 lex_emit_string_ctrl(s); 467 s = p; 468 } 469 #endif 470 471 /* 472 * """ Triple quoted Python block strings. """ 473 * Single quoted version (''') is a direct copy, update both places 474 * if a changed is needed. 475 * 476 * Note: there is no point in disabling line continuation 477 * for block strings, since it only affects unterminated 478 * string errors at newline. It all comes down to how 479 * escaped newline is interpreted by the parser. 
480 */ 481 #ifdef LEX_PYTHON_BLOCK_STRING 482 lex_mode_python_block_string: 483 484 mode = LEX_MODE_PYTHON_BLOCK_STRING; 485 486 for (;;) { 487 --p; 488 while (++p != q && *p != '\\' && !lex_isctrl(*p)) { 489 if (*p == '\"' && p + 2 < q && p[1] == '\"' && p[2] == '\"') { 490 break; 491 } 492 } 493 if (s != p) { 494 lex_emit_string_part(s, p); 495 s = p; 496 } 497 if (p == q || lex_iszterm(*p)) { 498 lex_emit_string_unterminated(p); 499 goto lex_mode_normal; 500 } 501 if (*p == '\"') { 502 p += 3; 503 lex_emit_string_end(s, p); 504 goto lex_mode_normal; 505 } 506 if (*p == '\\') { 507 /* Escape is only itself, allowing parser to interpret and validate. */ 508 ++p; 509 lex_emit_string_escape(s, p); 510 s = p; 511 if (p + 1 != q && (*p == '\\' || *p == '\"')) { 512 ++p; 513 } 514 continue; 515 } 516 if (*p == '\n') { 517 if (++p != q && *p == '\r') { 518 ++p; 519 } 520 lex_emit_string_newline(s, p); 521 s = p; 522 continue; 523 } 524 if (*p == '\r') { 525 if (++p != q && *p == '\n') { 526 ++p; 527 } 528 lex_emit_string_newline(s, p); 529 s = p; 530 continue; 531 } 532 ++p; 533 lex_emit_string_ctrl(s); 534 s = p; 535 } 536 #endif 537 538 /* 539 * Python ''' style strings. 540 * Direct copy of """ quote version, update both if changed. 541 */ 542 #ifdef LEX_PYTHON_BLOCK_STRING_SQ 543 lex_mode_python_block_string_sq: 544 545 mode = LEX_MODE_PYTHON_BLOCK_STRING_SQ; 546 547 for (;;) { 548 --p; 549 while (++p != q && *p != '\\' && !lex_isctrl(*p)) { 550 if (*p == '\'' && p + 2 < q && p[1] == '\'' && p[2] == '\'') { 551 break; 552 } 553 } 554 if (s != p) { 555 lex_emit_string_part(s, p); 556 s = p; 557 } 558 if (p == q || lex_iszterm(*p)) { 559 lex_emit_string_unterminated(p); 560 goto lex_mode_normal; 561 } 562 if (*p == '\'') { 563 p += 3; 564 lex_emit_string_end(s, p); 565 goto lex_mode_normal; 566 } 567 if (*p == '\\') { 568 /* Escape is only itself, allowing parser to interpret and validate. 
*/ 569 ++p; 570 lex_emit_string_escape(s, p); 571 s = p; 572 if (p + 1 != q && (*p == '\\' || *p == '\'')) { 573 ++p; 574 } 575 continue; 576 } 577 if (*p == '\n') { 578 if (++p != q && *p == '\r') { 579 ++p; 580 } 581 lex_emit_string_newline(s, p); 582 s = p; 583 continue; 584 } 585 if (*p == '\r') { 586 if (++p != q && *p == '\n') { 587 ++p; 588 } 589 lex_emit_string_newline(s, p); 590 s = p; 591 continue; 592 } 593 ++p; 594 lex_emit_string_ctrl(s); 595 s = p; 596 } 597 #endif 598 599 /* 600 * We don't really care if it is a shell style comment or a C99, 601 * or any other line oriented commment, as the termination is 602 * the same. 603 */ 604 #if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) 605 lex_mode_line_comment: 606 607 mode = LEX_MODE_LINE_COMMENT; 608 609 for (;;) { 610 --p; 611 while (++p != q && (!lex_isctrl(*p))) { 612 } 613 if (s != p) { 614 lex_emit_comment_part(s, p); 615 s = p; 616 } 617 if (p == q || lex_iszterm(*p)) { 618 /* 619 * Unterminated comment here is not necessarily true, 620 * not even likely, nor possible, but we do this to 621 * handle buffer switch consistently: any non-normal 622 * mode exit will have an unterminated token to fix up. 623 * Here it would be conversion to end of comment, which 624 * we cannot know yet, since the line might continue in 625 * the next buffer. This is a zero length token. 
626 */ 627 lex_emit_comment_unterminated(p); 628 goto lex_mode_exit; 629 } 630 if (*p == '\n' || *p == '\r') { 631 lex_emit_comment_end(s, p); 632 goto lex_mode_normal; 633 } 634 ++p; 635 lex_emit_comment_ctrl(s); 636 s = p; 637 } 638 #endif 639 640 #ifdef LEX_C_BLOCK_COMMENT 641 lex_mode_c_block_comment: 642 643 mode = LEX_MODE_C_BLOCK_COMMENT; 644 645 for (;;) { 646 --p; 647 while (++p != q && (!lex_isctrl(*p))) { 648 if (*p == '/' && p[-1] == '*') { 649 --p; 650 break; 651 } 652 } 653 if (s != p) { 654 lex_emit_comment_part(s, p); 655 s = p; 656 } 657 if (p == q || lex_iszterm(*p)) { 658 lex_emit_comment_unterminated(p); 659 goto lex_mode_exit; 660 } 661 if (*p == '\n') { 662 if (++p != q && *p == '\r') { 663 ++p; 664 } 665 lex_emit_newline(s, p); 666 s = p; 667 continue; 668 } 669 if (*p == '\r') { 670 if (++p != q && *p == '\n') { 671 ++p; 672 } 673 lex_emit_newline(s, p); 674 s = p; 675 continue; 676 } 677 if (lex_isctrl(*p)) { 678 ++p; 679 lex_emit_comment_ctrl(s); 680 s = p; 681 continue; 682 } 683 p += 2; 684 lex_emit_comment_end(s, p); 685 s = p; 686 goto lex_mode_normal; 687 } 688 #endif 689 690 /* Julia nests block comments as #= ... #= ...=# ... =# across multiple lines. */ 691 #ifdef LEX_JULIA_NESTED_COMMENT 692 lex_mode_julia_nested_comment: 693 694 /* Preserve nesting level on re-entrance. */ 695 if ((mode & (LEX_MODE_COUNT_BASE - 1)) != LEX_MODE_JULIA_NESTED_COMMENT) { 696 mode = LEX_MODE_JULIA_NESTED_COMMENT; 697 } 698 /* We have already entered. 
*/ 699 mode += LEX_MODE_COUNT_BASE; 700 701 for (;;) { 702 --p; 703 while (++p != q && !lex_isctrl(*p)) { 704 if (*p == '#') { 705 if (p[-1] == '=') { 706 --p; 707 break; 708 } 709 if (p + 1 != q && p[1] == '=') { 710 break; 711 } 712 } 713 } 714 if (s != p) { 715 lex_emit_comment_part(s, p); 716 s = p; 717 } 718 if (p == q || lex_iszterm(*p)) { 719 lex_emit_comment_unterminated(p); 720 goto lex_mode_exit; 721 } 722 if (*p == '\n') { 723 if (++p != q && *p == '\r') { 724 ++p; 725 } 726 lex_emit_newline(s, p); 727 s = p; 728 continue; 729 } 730 if (*p == '\r') { 731 if (++p != q && *p == '\n') { 732 ++p; 733 } 734 lex_emit_newline(s, p); 735 s = p; 736 continue; 737 } 738 if (lex_isctrl(*p)) { 739 ++p; 740 lex_emit_comment_ctrl(s); 741 s = p; 742 continue; 743 } 744 if (*p == '=') { 745 p += 2; 746 lex_emit_comment_end(s, p); 747 s = p; 748 mode -= LEX_MODE_COUNT_BASE; 749 if (mode / LEX_MODE_COUNT_BASE > 0) { 750 continue; 751 } 752 goto lex_mode_normal; 753 } 754 /* The upper bits are used as counter. */ 755 mode += LEX_MODE_COUNT_BASE; 756 p += 2; 757 lex_emit_comment_begin(s, p, 0); 758 s = p; 759 if (mode / LEX_MODE_COUNT_BASE > LEX_MAX_NESTING_LEVELS) { 760 /* Prevent malicious input from overflowing counter. */ 761 lex_emit_comment_deeply_nested(p); 762 lex_emit_abort(p); 763 return mode; 764 } 765 } 766 #endif 767 768 /* Unlike other modes, we can always jump here without updating token start `s` first. */ 769 lex_mode_normal: 770 771 mode = LEX_MODE_NORMAL; 772 773 while (p != q) { 774 s = p; 775 776 switch(*p) { 777 778 #ifndef LEX_DISABLE_ZTERM 779 case '\0': 780 lex_emit_eos(s, p); 781 return mode; 782 #endif 783 784 /* \v, \f etc. are covered by the CTRL token, don't put it here. */ 785 case '\t': case ' ': 786 while (++p != q && lex_isblank(*p)) { 787 } 788 lex_emit_blank(s, p); 789 continue; 790 791 /* 792 * Newline should be emitter in all constructs, also comments 793 * and strings which have their own newline handling. 
794 * Only one line is emitted at a time permitting simple line 795 * counting. 796 */ 797 case '\n': 798 if (++p != q && *p == '\r') { 799 ++p; 800 } 801 lex_emit_newline(s, p); 802 continue; 803 804 case '\r': 805 if (++p != q && *p == '\n') { 806 ++p; 807 } 808 lex_emit_newline(s, p); 809 continue; 810 811 /* 812 * C-style string, and Python style triple double quote 813 * delimited multi-line string. Prefix and suffix symbols 814 * should be parsed separately, e.g. L"hello" are two 815 * tokens. 816 */ 817 #if defined(LEX_C_STRING) || defined(LEX_PYTHON_BLOCK_STRING) 818 case '\"': 819 #ifdef LEX_PYTHON_BLOCK_STRING 820 if (p + 2 < q && p[1] == '\"' && p[2] == '\"') { 821 p += 3; 822 lex_emit_string_begin(s, p); 823 s = p; 824 goto lex_mode_python_block_string; 825 } 826 #endif 827 #ifdef LEX_C_STRING 828 ++p; 829 lex_emit_string_begin(s, p); 830 s = p; 831 goto lex_mode_c_string; 832 #endif 833 #endif 834 835 /* 836 * Single quoted version of strings, otherwise identical 837 * behavior. Can also be used for char constants if checked 838 * by parser subsequently. 839 */ 840 #if defined(LEX_C_STRING_SQ) || defined(LEX_PYTHON_BLOCK_STRING_SQ) 841 case '\'': 842 #ifdef LEX_PYTHON_BLOCK_STRING_SQ 843 if (p + 2 < q && p[1] == '\'' && p[2] == '\'') { 844 p += 3; 845 lex_emit_string_begin(s, p); 846 s = p; 847 goto lex_mode_python_block_string_sq; 848 } 849 #endif 850 #ifdef LEX_C_STRING_SQ 851 ++p; 852 lex_emit_string_begin(s, p); 853 s = p; 854 goto lex_mode_c_string_sq; 855 #endif 856 #endif 857 858 #if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_JULIA_NESTED_COMMENT) 859 /* 860 * Line comment excluding terminal line break. 861 * 862 * See also C99 line comment `//`. 863 * 864 * Julia uses `#=` and `=#` for nested block comments. 865 * (According to Julia developers, '#=` is motivated by `=` 866 * not being likely to start anything that you would put a 867 * comment around, unlike `#{`, `}#` or `#(`, `)#`)). 
868 * 869 * Some known doc comment formats are identified and 870 * included in the comment_begin token. 871 */ 872 case '#': 873 ++p; 874 #ifdef LEX_JULIA_NESTED_COMMENT 875 if (p != q && *p == '=') { 876 ++p; 877 lex_emit_comment_begin(s, p, 0); 878 s = p; 879 goto lex_mode_julia_nested_comment; 880 } 881 #endif 882 lex_emit_comment_begin(s, p, 0); 883 s = p; 884 goto lex_mode_line_comment; 885 #endif 886 887 case '/': 888 ++p; 889 if (p != q) { 890 switch (*p) { 891 #ifdef LEX_C99_LINE_COMMENT 892 case '/': 893 ++p; 894 p += p != q && (*p == '/' || *p == '!'); 895 lex_emit_comment_begin(s, p, (p - s == 3)); 896 s = p; 897 goto lex_mode_line_comment; 898 #endif 899 #ifdef LEX_C_BLOCK_COMMENT 900 case '*': 901 ++p; 902 p += p != q && (*p == '*' || *p == '!'); 903 lex_emit_comment_begin(s, p, (p - s == 3)); 904 s = p; 905 goto lex_mode_c_block_comment; 906 #endif 907 case '=': 908 ++p; 909 lex_emit_compound_op('/', '=', s, p); 910 continue; 911 default: 912 break; 913 } 914 } 915 lex_emit_op('/', s, p); 916 continue; 917 918 case '(': case ')': case '[': case ']': case '{': case '}': 919 case ',': case ';': case '\\': case '?': 920 ++p; 921 lex_emit_op(*s, s, p); 922 continue; 923 924 case '%': case '!': case '~': case '^': 925 ++p; 926 if (p != q && *p == '=') { 927 ++p; 928 lex_emit_compound_op(*s, '=', s, p); 929 continue; 930 } 931 lex_emit_op(*s, s, p); 932 continue; 933 934 case '|': 935 ++p; 936 if (p != q) { 937 switch (*p) { 938 case '=': 939 ++p; 940 lex_emit_compound_op('|', '=', s, p); 941 continue; 942 case '|': 943 ++p; 944 lex_emit_compound_op('|', '|', s, p); 945 break; 946 default: 947 break; 948 } 949 } 950 lex_emit_op('|', s, p); 951 continue; 952 953 case '&': 954 ++p; 955 if (p != q) { 956 switch (*p) { 957 case '=': 958 ++p; 959 lex_emit_compound_op('&', '=', s, p); 960 continue; 961 case '&': 962 ++p; 963 lex_emit_compound_op('&', '&', s, p); 964 break; 965 default: 966 break; 967 } 968 } 969 lex_emit_op('&', s, p); 970 continue; 971 972 case 
'=': 973 ++p; 974 if (p != q) { 975 switch (*p) { 976 case '>': 977 ++p; 978 lex_emit_compound_op('=', '>', s, p); 979 continue; 980 case '=': 981 ++p; 982 if (p != q && *p == '=') { 983 ++p; 984 lex_emit_tricompound_op('=', '=', '=', s, p); 985 continue; 986 } 987 lex_emit_compound_op('=', '=', s, p); 988 break; 989 default: 990 break; 991 } 992 } 993 lex_emit_op('=', s, p); 994 continue; 995 996 case ':': 997 ++p; 998 if (p != q) { 999 switch (*p) { 1000 case '=': 1001 ++p; 1002 lex_emit_compound_op(':', '=', s, p); 1003 continue; 1004 case ':': 1005 ++p; 1006 if (p != q && *p == '=') { 1007 ++p; 1008 lex_emit_tricompound_op(':', ':', '=', s, p); 1009 continue; 1010 } 1011 lex_emit_compound_op(':', ':', s, p); 1012 continue; 1013 default: 1014 break; 1015 } 1016 } 1017 lex_emit_op(':', s, p); 1018 continue; 1019 1020 case '*': 1021 ++p; 1022 if (p != q) { 1023 switch (*p) { 1024 case '=': 1025 lex_emit_compound_op('*', '=', s, p); 1026 continue; 1027 case '*': 1028 /* **= hardly used anywhere? 
*/ 1029 lex_emit_compound_op('*', '*', s, p); 1030 continue; 1031 default: 1032 break; 1033 } 1034 } 1035 lex_emit_op('*', s, p); 1036 continue; 1037 1038 case '<': 1039 ++p; 1040 if (p != q) { 1041 switch (*p) { 1042 case '-': 1043 ++p; 1044 lex_emit_compound_op('<', '-', s, p); 1045 continue; 1046 case '=': 1047 ++p; 1048 lex_emit_compound_op('<', '=', s, p); 1049 continue; 1050 case '<': 1051 ++p; 1052 if (p != q) { 1053 switch (*p) { 1054 case '=': 1055 ++p; 1056 lex_emit_tricompound_op('<', '<', '=', s, p); 1057 continue; 1058 case '<': 1059 ++p; 1060 if (p != q && *p == '=') { 1061 ++p; 1062 lex_emit_quadcompound_op('<', '<', '<', '=', s, p); 1063 continue; 1064 } 1065 lex_emit_tricompound_op('<', '<', '<', s, p); 1066 continue; 1067 default: 1068 break; 1069 } 1070 } 1071 lex_emit_compound_op('<', '<', s, p); 1072 continue; 1073 default: 1074 break; 1075 } 1076 } 1077 lex_emit_op('<', s, p); 1078 continue; 1079 1080 case '>': 1081 ++p; 1082 if (p != q) { 1083 switch (*p) { 1084 case '=': 1085 ++p; 1086 lex_emit_compound_op('>', '=', s, p); 1087 continue; 1088 case '>': 1089 ++p; 1090 if (p != q) { 1091 switch (*p) { 1092 case '=': 1093 ++p; 1094 lex_emit_tricompound_op('>', '>', '=', s, p); 1095 continue; 1096 case '>': 1097 ++p; 1098 if (p != q && *p == '=') { 1099 ++p; 1100 lex_emit_quadcompound_op('>', '>', '>', '=', s, p); 1101 continue; 1102 } 1103 lex_emit_tricompound_op('>', '>', '>', s, p); 1104 continue; 1105 default: 1106 break; 1107 } 1108 } 1109 lex_emit_compound_op('>', '>', s, p); 1110 continue; 1111 default: 1112 break; 1113 } 1114 } 1115 lex_emit_op('>', s, p); 1116 continue; 1117 1118 case '-': 1119 ++p; 1120 if (p != q) { 1121 switch (*p) { 1122 case '=': 1123 ++p; 1124 lex_emit_compound_op('-', '=', s, p); 1125 continue; 1126 case '-': 1127 ++p; 1128 lex_emit_compound_op('-', '-', s, p); 1129 continue; 1130 case '>': 1131 ++p; 1132 lex_emit_compound_op('-', '>', s, p); 1133 continue; 1134 default: 1135 break; 1136 } 1137 } 1138 
lex_emit_op('-', s, p); 1139 continue; 1140 1141 case '+': 1142 ++p; 1143 if (p != q) { 1144 switch (*p) { 1145 case '=': 1146 ++p; 1147 lex_emit_compound_op('+', '=', s, p); 1148 continue; 1149 1150 case '+': 1151 ++p; 1152 lex_emit_compound_op('+', '+', s, p); 1153 continue; 1154 default: 1155 break; 1156 } 1157 } 1158 lex_emit_op('+', s, p); 1159 continue; 1160 1161 case '.': 1162 ++p; 1163 if (p != q) { 1164 switch (*p) { 1165 case '0': case '1': case '2': case '3': case '4': 1166 case '5': case '6': case '7': case '8': case '9': 1167 d = s; 1168 goto lex_dot_to_fraction_part; 1169 case '.': 1170 ++p; 1171 if (p != q && *p == '.') { 1172 ++p; 1173 lex_emit_tricompound_op('.', '.', '.', s, p); 1174 continue; 1175 } 1176 lex_emit_compound_op('.', '.', s, p); 1177 continue; 1178 default: 1179 break; 1180 } 1181 } 1182 lex_emit_op('.', s, p); 1183 continue; 1184 1185 case '0': 1186 if (++p != q) { 1187 switch (*p) { 1188 #ifdef LEX_C_OCTAL_NUMERIC 1189 1190 case '0': case '1': case '2': case '3': 1191 case '4': case '5': case '6': case '7': 1192 while (++p != q && lex_isoctdigit(*p)) { 1193 } 1194 d = p; 1195 if (p != q) { 1196 /* 1197 * Leading zeroes like 00.10 are valid C 1198 * floating point constants. 1199 */ 1200 if (*p == '.') { 1201 goto lex_c_octal_to_fraction_part; 1202 } 1203 if (*p == 'e' || *p == 'E') { 1204 goto lex_c_octal_to_exponent_part; 1205 } 1206 } 1207 lex_emit_octal(s, p); 1208 /* 1209 * If we have a number like 0079, it becomes 1210 * 007(octal), 9(decimal). The parser should 1211 * deal with this. 1212 * 1213 * To add to confusion i64 is a C integer suffix 1214 * like in 007i64, but 2+2i is a Go complex 1215 * constant. (Not specific to octals). 1216 * 1217 * This can all be handled by having the parser inspect 1218 * following identifier or numeric, parser 1219 * here meaning a lexer post processing step, not 1220 * necessarily the parser itself. 
1221 */ 1222 1223 continue; 1224 #else 1225 /* 1226 * All integers reach default and enter 1227 * integer part. As a result, leading zeroes are 1228 * mapped to floats and integers which matches 1229 * Julia behavior. Other languages should decide 1230 * if leading zero is valid or not. JSON 1231 * disallows leading zero. 1232 */ 1233 #endif 1234 1235 #ifdef LEX_JULIA_OCTAL_NUMERIC 1236 /* 1237 * This is the style of octal, not 100% Julia 1238 * compatible. Also define Julia numeric to enforce 1239 * lower case. 1240 */ 1241 #ifndef LEX_LOWER_CASE_NUMERIC_PREFIX 1242 /* See also hex 0X. Julia v.0.3 uses lower case only here. */ 1243 case 'O': 1244 #endif 1245 /* 1246 * Julia accepts 0o700 as octal and 0b100 as 1247 * binary, and 0xa00 as hex, and 0100 as 1248 * integer, and 1e2 as 64 bit float and 1f2 as 1249 * 32 bit float. Julia 0.3 does not support 1250 * octal and binary fractions. 1251 */ 1252 case 'o': 1253 while (++p != q && lex_isoctdigit(*p)) { 1254 } 1255 lex_emit_octal(s, p); 1256 /* Avoid hitting int fall through. */ 1257 continue; 1258 #endif 1259 #ifdef LEX_BINARY_NUMERIC 1260 /* Binary in C++14. */ 1261 case 'b': 1262 #ifndef LEX_LOWER_CASE_NUMERIC_PREFIX 1263 /* See also hex 0X. Julia v.0.3 uses lower case only here. */ 1264 case 'B': 1265 #endif 1266 while (++p != q && lex_isbindigit(*p)) { 1267 } 1268 lex_emit_binary(s, p); 1269 /* Avoid hitting int fall through. */ 1270 continue; 1271 #endif 1272 #ifdef LEX_HEX_NUMERIC 1273 case 'x': 1274 #ifndef LEX_LOWER_CASE_NUMERIC_PREFIX 1275 /* 1276 * Julia v0.3 does not allow this, it thinks 0X1 is 1277 * 0 * X1, X1 being an identifier. 1278 * while 0x1 is a hex value due to precedence. 1279 * 1280 * TODO: This might change. 
1281 */ 1282 1283 case 'X': 1284 #endif 1285 while (++p != q && lex_ishexdigit(*p)) { 1286 } 1287 #ifdef LEX_HEX_FLOAT_NUMERIC 1288 /* 1289 * Most hexadecimal floating point conversion 1290 * functions, including Python's 1291 * float.fromhex("0x1.0"), Julia's parse 1292 * function, and C strtod on 1293 * supporting platforms, will parse without 1294 * exponent. The same languages do not support 1295 * literal constants without the p exponent. 1296 * First it is named p because e is a hex digit, 1297 * second, the float suffix f is also a hex 1298 * digit: 0x1.f is ambiguous in C without that 1299 * rule. Conversions have no such ambiguity. 1300 * In Julia, juxtaposition means that 0x1.f 1301 * could mean 0x1p0 * f or 0x1.fp0. 1302 * 1303 * Since we are not doing conversion here but 1304 * lexing a stream, we opt to require the p 1305 * suffix because making it optional could end 1306 * up consuming parts of the next token. 1307 * 1308 * But, we also make a flag to make the exponent 1309 * optional, anyway. It could be used for better 1310 * error reporting than just consuming the hex 1311 * part since we likely should accept the ambiguous 1312 * syntax either way. 1313 */ 1314 d = p; 1315 if (p != q && *p == '.') { 1316 while (++p != q && lex_ishexdigit(*p)) { 1317 } 1318 } 1319 if (p != q && (*p == 'p' || *p == 'P')) { /* Step back unless a sign was consumed. Reaching q also steps back, so the digit loop below stops at q instead of stepping past it. */ 1320 if (++p == q || (*p != '+' && *p != '-')) { 1321 --p; 1322 } 1323 /* The exponent is a decimal power of 2. */ 1324 while (++p != q && lex_isdigit(*p)) { 1325 } 1326 lex_emit_hex_float(s, p); 1327 continue; 1328 } 1329 #ifdef LEX_HEX_FLOAT_OPTIONAL_EXPONENT 1330 if (d != p) { 1331 lex_emit_hex_float(s, p); 1332 continue; 1333 } 1334 #else 1335 /* 1336 * Backtrack to decimal point. We require p to 1337 * be present because we could otherwise consume 1338 * part of the next token. 
1339 */ 1340 p = d; 1341 #endif 1342 #endif /* LEX_HEX_FLOAT_NUMERIC */ 1343 lex_emit_hex(s, p); 1344 continue; 1345 #endif /* LEX_HEX_NUMERIC */ 1346 1347 default: 1348 /* 1349 * This means leading zeroes like 001 or 001.0 are 1350 * treated like int and float respectively, 1351 * iff C octals are flagged out. Otherwise they 1352 * become 001(octal), and 001(octal),.0(float) 1353 * which should be treated as an error because 1354 * future extensions might allow octal floats. 1355 * (Not likely, but interpretation is ambiguous). 1356 */ 1357 break; 1358 } /* Switch under '0' case. */ 1359 1360 /* 1361 * Pure single digit '0' is an octal number in the C 1362 * spec. We have the option to treat it as an integer, 1363 * or as an octal. For strict C behavior, this can be 1364 * flagged in, but is disabled by default. It only 1365 * applies to single digit 0. Thus, with C octal 1366 * enabled, leading zeroes always go octal. 1367 */ 1368 } /* If condition around switch under '0' case. */ 1369 --p; 1370 goto lex_fallthrough_1; /* silence warning */ 1371 1372 lex_fallthrough_1: 1373 /* Leading integer digit in C integers. */ 1374 case '1': case '2': case '3': case '4': case '5': 1375 case '6': case '7': case '8': case '9': /* Bound the scan by q like the other scan loops; a bare ++p is never null and would not stop at the buffer end. */ 1376 while (++p != q && lex_isdigit(*p)) { 1377 } 1378 d = p; /* p can equal q here, so test that before reading *p. */ 1379 if (p != q && *p == '.') { 1380 /* Silence unused label warnings when features are disabled. */ 1381 #ifdef LEX_C_OCTAL_NUMERIC 1382 lex_c_octal_to_fraction_part: 1383 #endif 1384 lex_dot_to_fraction_part: 1385 while (++p != q && lex_isdigit(*p)) { 1386 } 1387 } 1388 if (p != q && (*p == 'e' || *p == 'E')) { 1389 /* Silence unused label warnings when features are disabled. 
*/ 1390 #ifdef LEX_C_OCTAL_NUMERIC 1391 lex_c_octal_to_exponent_part: 1392 #endif /* Step back unless a sign was consumed. Reaching q also steps back, so the digit loop below stops at q instead of stepping past it. */ 1393 if (++p == q || (*p != '+' && *p != '-')) { 1394 --p; 1395 } 1396 while (++p != q && lex_isdigit(*p)) { 1397 } 1398 } 1399 if (d != p) { 1400 lex_emit_float(s, p); 1401 } else { 1402 #ifdef LEX_C_OCTAL_NUMERIC 1403 if (*s == '0') { 1404 lex_emit_octal(s, p); 1405 continue; 1406 } 1407 #endif 1408 lex_emit_int(s, p); 1409 } 1410 continue; 1411 1412 #ifndef LEX_ID_WITHOUT_UNDERSCORE 1413 case '_': 1414 #endif 1415 case 'A': case 'B': case 'C': case 'D': case 'E': 1416 case 'F': case 'G': case 'H': case 'I': case 'J': 1417 case 'K': case 'L': case 'M': case 'N': case 'O': 1418 case 'P': case 'Q': case 'R': case 'S': case 'T': 1419 case 'U': case 'V': case 'W': case 'X': case 'Y': 1420 case 'Z': 1421 case 'a': case 'b': case 'c': case 'd': case 'e': 1422 case 'f': case 'g': case 'h': case 'i': case 'j': 1423 case 'k': case 'l': case 'm': case 'n': case 'o': 1424 case 'p': case 'q': case 'r': case 's': case 't': 1425 case 'u': case 'v': case 'w': case 'x': case 'y': 1426 case 'z': 1427 1428 /* 1429 * We do not try to ensure utf-8 is terminated correctly nor 1430 * that any unicode character above ASCII is a character 1431 * suitable for identifiers. 1432 * 1433 * tag is calculated for keyword lookup, and we assume these 1434 * are always ASCII-7bit. It has the form: length, first 1435 * char, second, char, last char in lsb to msb order. If the 1436 * second char is missing, it becomes '\0'. The tag is not 1437 * entirely unique, but suitable for fast lookup. 1438 * 1439 * If utf-8 appears in tag, the tag is undefined except the 1440 * length is valid or overflows (meaning longer than any 1441 * keyword and thus safe to compare against if tag matches). 1442 * 1443 * If the grammar is case insensitive, the tag can be 1444 * downcased trivially by or'ring with 0x20202000 which 1445 * preserves the length field (clever design by ASCII 1446 * designers). 
After tag matching, a case insensitive 1447 * compare is obviously also needed against the full lexeme. 1448 */ 1449 1450 { 1451 unsigned long tag; 1452 1453 tag = (unsigned long)*p << 8; 1454 if (++p != q && lex_isalnum(*p)) { 1455 tag |= (unsigned long)*p << 16; 1456 while (++p != q && lex_isalnum(*p)) { 1457 } 1458 } 1459 tag |= (unsigned long)p[-1] << 24; 1460 tag |= (unsigned char)(p - s) + (unsigned long)'0'; 1461 lex_emit_id(s, p, tag); 1462 continue; 1463 } 1464 1465 default: 1466 1467 #ifdef LEX_ID_WITH_UTF8 1468 /* 1469 * Identifier again, in case it starts with a utf-8 lead 1470 * character. This time we can ignore the tag, except the 1471 * length char must be valid to avoid buffer overruns 1472 * on potential kw check upstream. 1473 */ 1474 if (*p & '\x80') { 1475 unsigned long tag; 1476 1477 while (++p != q && lex_isalnum(*p)) { 1478 } 1479 tag = (unsigned char)(p - s) + '0'; 1480 lex_emit_id(s, p, tag); 1481 continue; 1482 } 1483 #endif 1484 ++p; 1485 /* normally 0x7f DEL and 0x00..0x1f incl. */ 1486 if (lex_isctrl(*s) && !lex_isblank(*s)) { 1487 lex_emit_ctrl(s); 1488 } else { 1489 lex_emit_symbol(*s, s, p); 1490 } 1491 continue; 1492 } /* Main switch in normal mode. */ 1493 } /* Main while loop in normal mode. */ 1494 1495 lex_mode_exit: 1496 if (mode == LEX_MODE_INVALID) { 1497 return mode; 1498 } 1499 1500 #ifndef LEX_DISABLE_ZTERM 1501 if (p != q && lex_iszterm(*p)) { 1502 lex_emit_eos(s, p); 1503 return mode; 1504 } 1505 #endif 1506 lex_emit_eob(p); 1507 return mode; 1508 } 1509