luthor.c - nostrdb - an unfairly fast embedded nostr database backed by lmdb

luthor.c (48005B)
      1 /*
      2  * Designed to be included in other C files which define emitter
      3  * operations. The same source may thus be used to parse different
      4  * grammars.
      5  *
      6  * The operators cover the most common operators in the C family.  Each
      7  * operator does not have a name, it is represented by a long token code
      8  * with up to 4 ASCII characters embedded literally. This avoids any
      9  * semantic meaning at the lexer level. Emitter macros can redefine
     10  * this behavior.
     11  *
     12  * No real harm is done in accepting a superset, but the source is
     13  * intended to be modified, have things flagged or removed, other things
     14  * added. The real complexity is in numbers, identifiers, and comments,
     15  * which should be fairly complete with flagging as is.
     16  *
     17  * Keyword handling is done in macros, and described elsewhere, but for
     18  * identifier compatible keywords, this is quite efficient to handle on
     19  * a per language basis without modifying this source.
     20  *
     21  * The Lisp language family is somewhat different and not directly
     22  * suited for this lexer, although it can easily be modified to suit.
     23  * The main reason is ';' for comments, and operators used as part of
     24  * the identifier symbol set, and no need for operator classification,
     25  * and different handling of single character symbols.
     26  *
     27  * So overall, we more or less have one efficient unified lexer that can
     28  * manage many languages - this is good, because it is a pain to write a
     29  * new lexer by hand, and lexer tools are what they are.
     30  */
     31 
     32 #include "luthor.h"
     33 
     34 #ifdef LEX_C99_NUMERIC
     35 #define LEX_C_NUMERIC
     36 #define LEX_HEX_FLOAT_NUMERIC
     37 #define LEX_BINARY_NUMERIC
     38 #endif
     39 
     40 #ifdef LEX_C_NUMERIC
     41 #define LEX_C_OCTAL_NUMERIC
     42 #define LEX_HEX_NUMERIC
     43 #endif
     44 
     45 #ifdef LEX_JULIA_NUMERIC
     46 #ifdef LEX_C_OCTAL_NUMERIC
     47 /*
     48  * LEX_JULIA_OCTAL_NUMERIC and LEX_C_OCTAL_NUMERIC can technically
     49  * coexist, but leading zeroes give C style leading zero numbers
     50  * which can lead to incorrect values depending on expectations.
     51  * Therefore the full LEX_JULIA_NUMERIC flag is designed to not allow this.
     52  */
     53 #error "LEX_C_OCTAL_NUMERIC conflicts with LEX_JULIA_NUMERIC leading zero integers"
     54 #endif
     55 
     56 /*
     57  * Julia v0.3 insists on lower case, and has a different meaning for
     58  * upper case.
     59  */
     60 #define LEX_LOWER_CASE_NUMERIC_PREFIX
     61 #define LEX_JULIA_OCTAL_NUMERIC
     62 #define LEX_HEX_FLOAT_NUMERIC
     63 #define LEX_BINARY_NUMERIC
     64 
     65 #endif
     66 
     67 #ifdef LEX_HEX_FLOAT_NUMERIC
     68 #define LEX_HEX_NUMERIC
     69 #endif
     70 
     71 /*
     72  * Numeric and string constants do not accept prefixes such as u, l, L,
     73  * U, ll, LL, f, or F in C, or various others in Julia strings. Use the
     74  * parser to detect juxtaposition between identifier and constant. In
     75  * Julia numeric suffix means multiplication, in C it is a type
     76  * qualifier.  Sign, such as defined in JSON, are also not accepted -
     77  * they must be operators.  See source for various flag to enable
     78  * different token types.
     79  */
     80 
     81 /*
     82  * Includes '_' in identifiers by default. Defines follow characters in
     83  * identifiers but not the lead character - it must be defined in switch
     84  * cases.  If the identifier allows for dash '-', it is probably better
     85  * to handle it as an operator and flag surrounding space in the parser.
     86  */
     87 #ifndef lex_isalnum
     88 
     89 /*
     90  * NOTE: isalnum and isalpha are locale dependent. We only want to
     91  * consider the ASCII-7 subset and treat everything else as utf-8.
     92  * This table is not for leading identifiers, as it contains 0..9.
     93  *
     94  * For more correct handling of UTF-8, see:
     95  * https://theantlrguy.atlassian.net/wiki/display/ANTLR4/Grammar+Lexicon
     96  * based on Java Ident = NameStartChar NameChar*
     97  *
     98  * While the following is UTF-16, it can be adapted to UTF-8 easily.
     99 
    100 
    101     fragment
    102     NameChar
    103        : NameStartChar
    104        | '0'..'9'
    105        | '_'
    106        | '\u00B7'
    107        | '\u0300'..'\u036F'
    108        | '\u203F'..'\u2040'
    109        ;
    110     fragment
    111     NameStartChar
    112        : 'A'..'Z' | 'a'..'z'
    113        | '\u00C0'..'\u00D6'
    114        | '\u00D8'..'\u00F6'
    115        | '\u00F8'..'\u02FF'
    116        | '\u0370'..'\u037D'
    117        | '\u037F'..'\u1FFF'
    118        | '\u200C'..'\u200D'
    119        | '\u2070'..'\u218F'
    120        | '\u2C00'..'\u2FEF'
    121        | '\u3001'..'\uD7FF'
    122        | '\uF900'..'\uFDCF'
    123        | '\uFDF0'..'\uFFFD'
    124        ;
    125  */
    126 
    127 static const char lex_alnum[256] = {
    128     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    129     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    130     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    131     /* 0..9 */
    132     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
    133     /* A..O */
    134     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    135     /* P..Z, _ */
    136 #ifdef LEX_ID_WITHOUT_UNDERSCORE
    137     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    138 #else
    139     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    140 #endif
    141     /* a..o */
    142     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    143     /* p..z */
    144     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    145 #ifdef LEX_ID_WITH_UTF8
    146     /* utf-8 */
    147     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    148     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    149     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    150     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    151     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    152     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    153     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    154     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    155 #else
    156     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    157     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    158     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    159     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    160     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    161     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    162     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    163     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    164 #endif
    165 };
    166 
    167 #define lex_isalnum(c) (lex_alnum[(unsigned char)(c)])
    168 #endif
    169 
    170 #ifndef lex_isbindigit
    171 #define lex_isbindigit(c) ((c) == '0' || (c) == '1')
    172 #endif
    173 
    174 #ifndef lex_isoctdigit
    175 #define lex_isoctdigit(c) ((unsigned)((c) - '0') < 8)
    176 #endif
    177 
    178 #ifndef lex_isdigit
    179 #define lex_isdigit(c) ((unsigned)(c) >= '0' && (unsigned)(c) <= '9')
    180 #endif
    181 
    182 #ifndef lex_ishexdigit
    183 #define lex_ishexdigit(c) (((c) >= '0' && ((unsigned)c) <= '9') || ((unsigned)(c | 0x20) >= 'a' && (unsigned)(c | 0x20) <= 'f'))
    184 #endif
    185 
    186 #ifndef lex_isctrl
    187 #include <ctype.h>
    188 #define lex_isctrl(c) (((unsigned)c) < 0x20 || (c) == 0x7f)
    189 #endif
    190 
    191 #ifndef lex_isblank
    192 #define lex_isblank(c) ((c) == ' ' || (c) == '\t')
    193 #endif
    194 
    195 #ifndef lex_iszterm
    196 #define lex_iszterm(c) ((c) == '\0')
    197 #endif
    198 
    199 /*
    200  * If ZTERM is disabled, zero will be a LEX_CTRL token
    201  * and allowed to be embedded in comments and strings, or
    202  * elsewhere, as long as the parser accepts the token.
    203  */
    204 #ifdef LEX_DISABLE_ZTERM
    205 #undef lex_iszterm
    206 #define lex_iszterm(c) (0)
    207 #endif
    208 
    209 /*
    210  * The mode is normally LEX_MODE_NORMAL = 0 initially, or the returned
    211  * mode from a previous call, unless LEX_MODE_INVALID = 1 was returned.
    212  * If a buffer stopped in the middle of a string or a comment, the mode
    213  * will reflect that. In all cases some amount of recovery is needed
    214  * before starting a new buffer - see detailed comments in header file.
    215  * If only a single buffer is used, special handling is still needed if
    216  * the last line contains a single line comment because it will not be
    217  * terminated, but it amounts to replace the emitted unterminated
    218  * comment token with an end of comment token.
    219  *
    220  * Instead of 0, the mode can initially also be LEX_MODE_BOM - it will
    221  * strip an optional BOM before moving to normal mode. Currently only
    222  * UTF-8 BOM is supported, and this is unlikely to change.
    223  *
    224  * The context variable is user-defined and available to emitter macros.
    225  * It may be null if unused.
    226  *
    227  */
    228 static int lex(const char *buf, size_t len, int mode, void *context)
    229 {
    230     const char *p, *q, *s, *d;
    231 #if 0
    232     /* TODO: old, remove this */
    233     , *z, *f;
    234 #endif
    235 
    236     p = buf;        /* next char */
    237     q = p + len;    /* end of buffer */
    238     s = p;          /* start of token */
    239     d = p;          /* end of integer part */
    240 
    241 #if 0
    242     /* TODO: old, remove this */
    243 
    244     /* Used for float and leading zero detection in numerics. */
    245     z = p;
    246     f = p;
    247 #endif
    248 
    249     /*
    250      * Handle mid string and mid comment for reentering across
    251      * buffer boundaries. Strip embedded counter from mode.
    252      */
    253     switch(mode & (LEX_MODE_COUNT_BASE - 1)) {
    254 
    255     case LEX_MODE_NORMAL:
    256         goto lex_mode_normal;
    257 
    258     case LEX_MODE_BOM:
    259         goto lex_mode_bom;
    260 
    261 #ifdef LEX_C_STRING
    262     case LEX_MODE_C_STRING:
    263         goto lex_mode_c_string;
    264 #endif
    265 #ifdef LEX_PYTHON_BLOCK_STRING
    266     case LEX_MODE_PYTHON_BLOCK_STRING:
    267         goto lex_mode_python_block_string;
    268 #endif
    269 #ifdef LEX_C_STRING_SQ
    270     case LEX_MODE_C_STRING_SQ:
    271         goto lex_mode_c_string_sq;
    272 #endif
    273 #ifdef LEX_PYTHON_BLOCK_STRING_SQ
    274     case LEX_MODE_PYTHON_BLOCK_STRING_SQ:
    275         goto lex_mode_python_block_string_sq;
    276 #endif
    277 #ifdef LEX_C_BLOCK_COMMENT
    278     case LEX_MODE_C_BLOCK_COMMENT:
    279         goto lex_mode_c_block_comment;
    280 #endif
    281 #if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT)
    282     case LEX_MODE_LINE_COMMENT:
    283         goto lex_mode_line_comment;
    284 #endif
    285 #ifdef LEX_JULIA_NESTED_COMMENT
    286     case LEX_MODE_JULIA_NESTED_COMMENT:
    287         goto lex_mode_julia_nested_comment;
    288 #endif
    289 
    290     default:
    291         /*
    292          * This is mostly to kill unused label warning when comments
    293          * are disabled.
    294          */
    295         goto lex_mode_exit;
    296     }
    297 
    298 lex_mode_bom:
    299 
    300     mode = LEX_MODE_BOM;
    301 
    302     /*
    303      * Special entry mode to consume utf-8 bom if present. We don't
    304      * support other boms, but we would use the same token if we did.
    305      *
    306      * We generally expect no bom present, but it is here if needed
    307      * without requiring ugly hacks elsewhere.
    308      */
    309     if (p + 3 < q && p[0] == '\xef' && p[1] == '\xbb' && p[2] == '\xbf') {
    310         p += 3;
    311         lex_emit_bom(s, p);
    312     }
    313     goto lex_mode_normal;
    314 
    315 /* If source is updated, also update LEX_C_STRING_SQ accordingly. */
    316 #ifdef LEX_C_STRING
    317 lex_mode_c_string:
    318 
    319     mode = LEX_MODE_C_STRING;
    320 
    321     for (;;) {
    322         --p;
    323         /* We do not allow blanks that are also control characters, such as \t. */
    324         while (++p != q && *p != '\\' && *p != '\"' && !lex_isctrl(*p)) {
    325         }
    326         if (s != p) {
    327             lex_emit_string_part(s, p);
    328             s = p;
    329         }
    330         if (*p == '\"') {
    331             ++p;
    332             lex_emit_string_end(s, p);
    333             goto lex_mode_normal;
    334         }
    335         if (p == q || lex_iszterm(*p)) {
    336             lex_emit_string_unterminated(p);
    337             goto lex_mode_normal;
    338         }
    339         if (*p == '\\') {
    340             ++p;
    341              /* Escape is only itself, whatever is escaped follows separately. */
    342             lex_emit_string_escape(s, p);
    343             s = p;
    344             if (p == q || lex_iszterm(*p)) {
    345                 lex_emit_string_unterminated(p);
    346                 goto lex_mode_normal;
    347             }
    348             if (*p == '\\' || *p == '\"') {
    349                 ++p;
    350                 continue;
    351             }
    352             /*
    353              * Flag only relevant for single line strings, as it
    354              * controls whether we fail on unterminated string at line
    355              * ending with '\'.
    356              *
    357              * Julia does not support line continuation in strings
    358              * (or elsewhere). C, Python, and Javascript do.
    359              */
    360 #ifndef LEX_DISABLE_STRING_CONT
    361             if (*p == '\n') {
    362                 if (++p != q && *p == '\r') {
    363                     ++p;
    364                 }
    365                 lex_emit_string_newline(s, p);
    366                 s = p;
    367                 continue;
    368             }
    369             if (*p == '\r') {
    370                 if (++p != q && *p == '\n') {
    371                     ++p;
    372                 }
    373                 lex_emit_string_newline(s, p);
    374                 s = p;
    375                 continue;
    376             }
    377 #endif
    378         }
    379         if (*p == '\n' || *p == '\r') {
    380             lex_emit_string_unterminated(p);
    381             goto lex_mode_normal;
    382         }
    383         ++p;
    384         lex_emit_string_ctrl(s);
    385         s = p;
    386     }
    387 #endif
    388 
    389 /*
    390  * This is a copy of LEX_C_STRING with single quote. It's not DRY, but
    391  * there is no reason to parameterize inner loops, just because. Changes
    392  * made to the above must be recopied here.
    393  *
    394  * Even if single quote is only used for CHAR types, it makes sense to
    395  * parse as a full string since there can be all sorts of unicode
    396  * escapes and line continuations, newlines to report and unexpected
    397  * control characters to deal with.
    398  */
    399 #ifdef LEX_C_STRING_SQ
    400 lex_mode_c_string_sq:
    401 
    402     mode = LEX_MODE_C_STRING_SQ;
    403 
    404     for (;;) {
    405         --p;
    406         while (++p != q && *p != '\\' && *p != '\'' && !lex_isctrl(*p)) {
    407         }
    408         if (s != p) {
    409             lex_emit_string_part(s, p);
    410             s = p;
    411         }
    412         if (*p == '\'') {
    413             ++p;
    414             lex_emit_string_end(s, p);
    415             goto lex_mode_normal;
    416         }
    417         if (p == q || lex_iszterm(*p)) {
    418             lex_emit_string_unterminated(p);
    419             goto lex_mode_normal;
    420         }
    421         if (*p == '\\') {
    422             ++p;
    423              /* Escape is only itself, whatever is escaped follows separately. */
    424             lex_emit_string_escape(s, p);
    425             s = p;
    426             if (p == q || lex_iszterm(*p)) {
    427                 lex_emit_string_unterminated(p);
    428                 goto lex_mode_normal;
    429             }
    430             if (*p == '\\' || *p == '\'') {
    431                 ++p;
    432                 continue;
    433             }
    434             /*
    435              * Flag only relevant for single line strings, as it
    436              * controls whether we fail on unterminated string at line
    437              * ending with '\'.
    438              *
    439              * Julia does not support line continuation in strings
    440              * (or elsewhere). C, Python, and Javascript do.
    441              */
    442 #ifndef LEX_DISABLE_STRING_CONT
    443             if (*p == '\n') {
    444                 if (++p != q && *p == '\r') {
    445                     ++p;
    446                 }
    447                 lex_emit_string_newline(s, p);
    448                 s = p;
    449                 continue;
    450             }
    451             if (*p == '\r') {
    452                 if (++p != q && *p == '\n') {
    453                     ++p;
    454                 }
    455                 lex_emit_string_newline(s, p);
    456                 s = p;
    457                 continue;
    458             }
    459 #endif
    460         }
    461         if (*p == '\n' || *p == '\r') {
    462             lex_emit_string_unterminated(p);
    463             goto lex_mode_normal;
    464         }
    465         ++p;
    466         lex_emit_string_ctrl(s);
    467         s = p;
    468     }
    469 #endif
    470 
    471 /*
    472  * """ Triple quoted Python block strings. """
    473  * Single quoted version (''') is a direct copy, update both places
    474  * if a change is needed.
    475  *
    476  * Note: there is no point in disabling line continuation
    477  * for block strings, since it only affects unterminated
    478  * string errors at newline. It all comes down to how
    479  * escaped newline is interpreted by the parser.
    480  */
    481 #ifdef LEX_PYTHON_BLOCK_STRING
    482 lex_mode_python_block_string:
    483 
    484     mode = LEX_MODE_PYTHON_BLOCK_STRING;
    485 
    486     for (;;) {
    487         --p;
    488         while (++p != q && *p != '\\' && !lex_isctrl(*p)) {
    489             if (*p == '\"' && p + 2 < q && p[1] == '\"' && p[2] == '\"') {
    490                 break;
    491             }
    492         }
    493         if (s != p) {
    494             lex_emit_string_part(s, p);
    495             s = p;
    496         }
    497         if (p == q || lex_iszterm(*p)) {
    498             lex_emit_string_unterminated(p);
    499             goto lex_mode_normal;
    500         }
    501         if (*p == '\"') {
    502             p += 3;
    503             lex_emit_string_end(s, p);
    504             goto lex_mode_normal;
    505         }
    506         if (*p == '\\') {
    507              /* Escape is only itself, allowing parser to interpret and validate. */
    508             ++p;
    509             lex_emit_string_escape(s, p);
    510             s = p;
    511             if (p + 1 != q && (*p == '\\' || *p == '\"')) {
    512                 ++p;
    513             }
    514             continue;
    515         }
    516         if (*p == '\n') {
    517             if (++p != q && *p == '\r') {
    518                 ++p;
    519             }
    520             lex_emit_string_newline(s, p);
    521             s = p;
    522             continue;
    523         }
    524         if (*p == '\r') {
    525             if (++p != q && *p == '\n') {
    526                 ++p;
    527             }
    528             lex_emit_string_newline(s, p);
    529             s = p;
    530             continue;
    531         }
    532         ++p;
    533         lex_emit_string_ctrl(s);
    534         s = p;
    535     }
    536 #endif
    537 
    538 /*
    539  * Python ''' style strings.
    540  * Direct copy of """ quote version, update both if changed.
    541  */
    542 #ifdef LEX_PYTHON_BLOCK_STRING_SQ
    543 lex_mode_python_block_string_sq:
    544 
    545     mode = LEX_MODE_PYTHON_BLOCK_STRING_SQ;
    546 
    547     for (;;) {
    548         --p;
    549         while (++p != q && *p != '\\' && !lex_isctrl(*p)) {
    550             if (*p == '\'' && p + 2 < q && p[1] == '\'' && p[2] == '\'') {
    551                 break;
    552             }
    553         }
    554         if (s != p) {
    555             lex_emit_string_part(s, p);
    556             s = p;
    557         }
    558         if (p == q || lex_iszterm(*p)) {
    559             lex_emit_string_unterminated(p);
    560             goto lex_mode_normal;
    561         }
    562         if (*p == '\'') {
    563             p += 3;
    564             lex_emit_string_end(s, p);
    565             goto lex_mode_normal;
    566         }
    567         if (*p == '\\') {
    568              /* Escape is only itself, allowing parser to interpret and validate. */
    569             ++p;
    570             lex_emit_string_escape(s, p);
    571             s = p;
    572             if (p + 1 != q && (*p == '\\' || *p == '\'')) {
    573                 ++p;
    574             }
    575             continue;
    576         }
    577         if (*p == '\n') {
    578             if (++p != q && *p == '\r') {
    579                 ++p;
    580             }
    581             lex_emit_string_newline(s, p);
    582             s = p;
    583             continue;
    584         }
    585         if (*p == '\r') {
    586             if (++p != q && *p == '\n') {
    587                 ++p;
    588             }
    589             lex_emit_string_newline(s, p);
    590             s = p;
    591             continue;
    592         }
    593         ++p;
    594         lex_emit_string_ctrl(s);
    595         s = p;
    596     }
    597 #endif
    598 
    599 /*
    600  * We don't really care if it is a shell style comment or a C99,
    601  * or any other line oriented comment, as the termination is
    602  * the same.
    603  */
    604 #if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT)
    605 lex_mode_line_comment:
    606 
    607     mode = LEX_MODE_LINE_COMMENT;
    608 
    609     for (;;) {
    610         --p;
    611         while (++p != q && (!lex_isctrl(*p))) {
    612         }
    613         if (s != p) {
    614             lex_emit_comment_part(s, p);
    615             s = p;
    616         }
    617         if (p == q || lex_iszterm(*p)) {
    618             /*
    619              * Unterminated comment here is not necessarily true,
    620              * not even likely, nor possible, but we do this to
    621              * handle buffer switch consistently: any non-normal
    622              * mode exit will have an unterminated token to fix up.
    623              * Here it would be conversion to end of comment, which
    624              * we cannot know yet, since the line might continue in
    625              * the next buffer. This is a zero length token.
    626              */
    627             lex_emit_comment_unterminated(p);
    628             goto lex_mode_exit;
    629         }
    630         if (*p == '\n' || *p == '\r') {
    631             lex_emit_comment_end(s, p);
    632             goto lex_mode_normal;
    633         }
    634         ++p;
    635         lex_emit_comment_ctrl(s);
    636         s = p;
    637     }
    638 #endif
    639 
    640 #ifdef LEX_C_BLOCK_COMMENT
    641 lex_mode_c_block_comment:
    642 
    643     mode = LEX_MODE_C_BLOCK_COMMENT;
    644 
    645     for (;;) {
    646         --p;
    647         while (++p != q && (!lex_isctrl(*p))) {
    648             if (*p == '/' && p[-1] == '*') {
    649                 --p;
    650                 break;
    651             }
    652         }
    653         if (s != p) {
    654             lex_emit_comment_part(s, p);
    655             s = p;
    656         }
    657         if (p == q || lex_iszterm(*p)) {
    658             lex_emit_comment_unterminated(p);
    659             goto lex_mode_exit;
    660         }
    661         if (*p == '\n') {
    662             if (++p != q && *p == '\r') {
    663                 ++p;
    664             }
    665             lex_emit_newline(s, p);
    666             s = p;
    667             continue;
    668         }
    669         if (*p == '\r') {
    670             if (++p != q && *p == '\n') {
    671                 ++p;
    672             }
    673             lex_emit_newline(s, p);
    674             s = p;
    675             continue;
    676         }
    677         if (lex_isctrl(*p)) {
    678             ++p;
    679             lex_emit_comment_ctrl(s);
    680             s = p;
    681             continue;
    682         }
    683         p += 2;
    684         lex_emit_comment_end(s, p);
    685         s = p;
    686         goto lex_mode_normal;
    687     }
    688 #endif
    689 
    690     /* Julia nests block comments as #= ... #= ...=# ... =# across multiple lines. */
    691 #ifdef LEX_JULIA_NESTED_COMMENT
    692 lex_mode_julia_nested_comment:
    693 
    694     /* Preserve nesting level on re-entrance. */
    695     if ((mode & (LEX_MODE_COUNT_BASE - 1)) != LEX_MODE_JULIA_NESTED_COMMENT) {
    696         mode = LEX_MODE_JULIA_NESTED_COMMENT;
    697     }
    698     /* We have already entered. */
    699     mode += LEX_MODE_COUNT_BASE;
    700 
    701     for (;;) {
    702         --p;
    703         while (++p != q && !lex_isctrl(*p)) {
    704             if (*p == '#') {
    705                 if (p[-1] == '=') {
    706                     --p;
    707                     break;
    708                 }
    709                 if (p + 1 != q && p[1] == '=') {
    710                     break;
    711                 }
    712             }
    713         }
    714         if (s != p) {
    715             lex_emit_comment_part(s, p);
    716             s = p;
    717         }
    718         if (p == q || lex_iszterm(*p)) {
    719             lex_emit_comment_unterminated(p);
    720             goto lex_mode_exit;
    721         }
    722         if (*p == '\n') {
    723             if (++p != q && *p == '\r') {
    724                 ++p;
    725             }
    726             lex_emit_newline(s, p);
    727             s = p;
    728             continue;
    729         }
    730         if (*p == '\r') {
    731             if (++p != q && *p == '\n') {
    732                 ++p;
    733             }
    734             lex_emit_newline(s, p);
    735             s = p;
    736             continue;
    737         }
    738         if (lex_isctrl(*p)) {
    739             ++p;
    740             lex_emit_comment_ctrl(s);
    741             s = p;
    742             continue;
    743         }
    744         if (*p == '=') {
    745             p += 2;
    746             lex_emit_comment_end(s, p);
    747             s = p;
    748             mode -= LEX_MODE_COUNT_BASE;
    749             if (mode / LEX_MODE_COUNT_BASE > 0) {
    750                 continue;
    751             }
    752             goto lex_mode_normal;
    753         }
    754         /* The upper bits are used as counter. */
    755         mode += LEX_MODE_COUNT_BASE;
    756         p += 2;
    757         lex_emit_comment_begin(s, p, 0);
    758         s = p;
    759         if (mode / LEX_MODE_COUNT_BASE > LEX_MAX_NESTING_LEVELS) {
    760             /* Prevent malicious input from overflowing counter. */
    761             lex_emit_comment_deeply_nested(p);
    762             lex_emit_abort(p);
    763             return mode;
    764         }
    765     }
    766 #endif
    767 
    768 /* Unlike other modes, we can always jump here without updating token start `s` first. */
    769 lex_mode_normal:
    770 
    771     mode = LEX_MODE_NORMAL;
    772 
    773     while (p != q) {
    774         s = p;
    775 
    776         switch(*p) {
    777 
    778 #ifndef LEX_DISABLE_ZTERM
    779         case '\0':
    780             lex_emit_eos(s, p);
    781             return mode;
    782 #endif
    783 
    784         /* \v, \f etc. are covered by the CTRL token, don't put it here. */
    785         case '\t': case ' ':
    786             while (++p != q && lex_isblank(*p)) {
    787             }
    788             lex_emit_blank(s, p);
    789             continue;
    790 
    791         /*
    792          * Newline should be emitted in all constructs, also comments
    793          * and strings which have their own newline handling.
    794          * Only one line is emitted at a time permitting simple line
    795          * counting.
    796          */
    797         case '\n':
    798             if (++p != q && *p == '\r') {
    799                 ++p;
    800             }
    801             lex_emit_newline(s, p);
    802             continue;
    803 
    804         case '\r':
    805             if (++p != q && *p == '\n') {
    806                 ++p;
    807             }
    808             lex_emit_newline(s, p);
    809             continue;
    810 
    811             /*
    812              * C-style string, and Python style triple double quote
    813              * delimited multi-line string. Prefix and suffix symbols
    814              * should be parsed separately, e.g. L"hello" are two
    815              * tokens.
    816              */
    817 #if defined(LEX_C_STRING) || defined(LEX_PYTHON_BLOCK_STRING)
    818         case '\"':
    819 #ifdef LEX_PYTHON_BLOCK_STRING
    820             if (p + 2 < q && p[1] == '\"' && p[2] == '\"') {
    821                 p += 3;
    822                 lex_emit_string_begin(s, p);
    823                 s = p;
    824                 goto lex_mode_python_block_string;
    825             }
    826 #endif
    827 #ifdef LEX_C_STRING
    828             ++p;
    829             lex_emit_string_begin(s, p);
    830             s = p;
    831             goto lex_mode_c_string;
    832 #endif
    833 #endif
    834 
    835             /*
    836              * Single quoted version of strings, otherwise identical
    837              * behavior. Can also be used for char constants if checked
    838              * by parser subsequently.
    839              */
    840 #if defined(LEX_C_STRING_SQ) || defined(LEX_PYTHON_BLOCK_STRING_SQ)
    841         case '\'':
    842 #ifdef LEX_PYTHON_BLOCK_STRING_SQ
    843             if (p + 2 < q && p[1] == '\'' && p[2] == '\'') {
    844                 p += 3;
    845                 lex_emit_string_begin(s, p);
    846                 s = p;
    847                 goto lex_mode_python_block_string_sq;
    848             }
    849 #endif
    850 #ifdef LEX_C_STRING_SQ
    851             ++p;
    852             lex_emit_string_begin(s, p);
    853             s = p;
    854             goto lex_mode_c_string_sq;
    855 #endif
    856 #endif
    857 
    858 #if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_JULIA_NESTED_COMMENT)
    859             /*
    860              * Line comment excluding terminal line break.
    861              *
    862              * See also C99 line comment `//`.
    863              *
    864              * Julia uses `#=` and `=#` for nested block comments.
    865              * (According to Julia developers, `#=` is motivated by `=`
    866              * not being likely to start anything that you would put a
    867              * comment around, unlike `#{`, `}#` or `#(`, `)#`.)
    868              *
    869              * Some known doc comment formats are identified and
    870              * included in the comment_begin token.
    871              */
    872         case '#':
    873             ++p;
    874 #ifdef LEX_JULIA_NESTED_COMMENT
    875             if (p != q && *p == '=') {
    876                 ++p;
    877                 lex_emit_comment_begin(s, p, 0);
    878                 s = p;
    879                 goto lex_mode_julia_nested_comment;
    880             }
    881 #endif
    882             lex_emit_comment_begin(s, p, 0);
    883             s = p;
    884             goto lex_mode_line_comment;
    885 #endif
    886 
    887         case '/':
    888             ++p;
    889             if (p != q) {
    890                 switch (*p) {
    891 #ifdef LEX_C99_LINE_COMMENT
    892                 case '/':
    893                     ++p;
    894                     p += p != q && (*p == '/' || *p == '!');
    895                     lex_emit_comment_begin(s, p, (p - s == 3));
    896                     s = p;
    897                     goto lex_mode_line_comment;
    898 #endif
    899 #ifdef LEX_C_BLOCK_COMMENT
    900                 case '*':
    901                     ++p;
    902                     p += p != q && (*p == '*' || *p == '!');
    903                     lex_emit_comment_begin(s, p, (p - s == 3));
    904                     s = p;
    905                     goto lex_mode_c_block_comment;
    906 #endif
    907                 case '=':
    908                     ++p;
    909                     lex_emit_compound_op('/', '=', s, p);
    910                     continue;
    911                 default:
    912                     break;
    913                 }
    914             }
    915             lex_emit_op('/', s, p);
    916             continue;
    917 
    918         case '(': case ')': case '[': case ']': case '{': case '}':
    919         case ',': case ';': case '\\': case '?':
    920             ++p;
    921             lex_emit_op(*s, s, p);
    922             continue;
    923 
    924         case '%': case '!': case '~': case '^':
    925             ++p;
    926             if (p != q && *p == '=') {
    927                 ++p;
    928                 lex_emit_compound_op(*s, '=', s, p);
    929                 continue;
    930             }
    931             lex_emit_op(*s, s, p);
    932             continue;
    933 
    934         case '|':
    935             ++p;
    936             if (p != q) {
    937                 switch (*p) {
    938                 case '=':
    939                     ++p;
    940                     lex_emit_compound_op('|', '=', s, p);
    941                     continue;
    942                 case '|':
    943                     ++p;
    944                     lex_emit_compound_op('|', '|', s, p);
    945                     break;
    946                 default:
    947                     break;
    948                 }
    949             }
    950             lex_emit_op('|', s, p);
    951             continue;
    952 
    953         case '&':
    954             ++p;
    955             if (p != q) {
    956                 switch (*p) {
    957                 case '=':
    958                     ++p;
    959                     lex_emit_compound_op('&', '=', s, p);
    960                     continue;
    961                 case '&':
    962                     ++p;
    963                     lex_emit_compound_op('&', '&', s, p);
    964                     break;
    965                 default:
    966                     break;
    967                 }
    968             }
    969             lex_emit_op('&', s, p);
    970             continue;
    971 
    972         case '=':
    973             ++p;
    974             if (p != q) {
    975                 switch (*p) {
    976                 case '>':
    977                     ++p;
    978                     lex_emit_compound_op('=', '>', s, p);
    979                     continue;
    980                 case '=':
    981                     ++p;
    982                     if (p != q && *p == '=') {
    983                         ++p;
    984                         lex_emit_tricompound_op('=', '=', '=', s, p);
    985                         continue;
    986                     }
    987                     lex_emit_compound_op('=', '=', s, p);
    988                     break;
    989                 default:
    990                     break;
    991                 }
    992             }
    993             lex_emit_op('=', s, p);
    994             continue;
    995 
    996         case ':':
    997             ++p;
    998             if (p != q) {
    999                 switch (*p) {
   1000                 case '=':
   1001                     ++p;
   1002                     lex_emit_compound_op(':', '=', s, p);
   1003                     continue;
   1004                 case ':':
   1005                     ++p;
   1006                     if (p != q && *p == '=') {
   1007                         ++p;
   1008                         lex_emit_tricompound_op(':', ':', '=', s, p);
   1009                         continue;
   1010                     }
   1011                     lex_emit_compound_op(':', ':', s, p);
   1012                     continue;
   1013                 default:
   1014                     break;
   1015                 }
   1016             }
   1017             lex_emit_op(':', s, p);
   1018             continue;
   1019 
   1020         case '*':
   1021             ++p;
   1022             if (p != q) {
   1023                 switch (*p) {
   1024                 case '=':
   1025                     lex_emit_compound_op('*', '=', s, p);
   1026                     continue;
   1027                 case '*':
   1028                     /* **= hardly used anywhere? */
   1029                     lex_emit_compound_op('*', '*', s, p);
   1030                     continue;
   1031                 default:
   1032                     break;
   1033                 }
   1034             }
   1035             lex_emit_op('*', s, p);
   1036             continue;
   1037 
   1038         case '<':
   1039             ++p;
   1040             if (p != q) {
   1041                 switch (*p) {
   1042                 case '-':
   1043                     ++p;
   1044                     lex_emit_compound_op('<', '-', s, p);
   1045                     continue;
   1046                 case '=':
   1047                     ++p;
   1048                     lex_emit_compound_op('<', '=', s, p);
   1049                     continue;
   1050                 case '<':
   1051                     ++p;
   1052                     if (p != q) {
   1053                         switch (*p) {
   1054                         case '=':
   1055                             ++p;
   1056                             lex_emit_tricompound_op('<', '<', '=', s, p);
   1057                             continue;
   1058                         case '<':
   1059                             ++p;
   1060                             if (p != q && *p == '=') {
   1061                                 ++p;
   1062                                 lex_emit_quadcompound_op('<', '<', '<', '=', s, p);
   1063                                 continue;
   1064                             }
   1065                             lex_emit_tricompound_op('<', '<', '<', s, p);
   1066                             continue;
   1067                         default:
   1068                             break;
   1069                         }
   1070                     }
   1071                     lex_emit_compound_op('<', '<', s, p);
   1072                     continue;
   1073                 default:
   1074                     break;
   1075                 }
   1076             }
   1077             lex_emit_op('<', s, p);
   1078             continue;
   1079 
   1080         case '>':
   1081             ++p;
   1082             if (p != q) {
   1083                 switch (*p) {
   1084                 case '=':
   1085                     ++p;
   1086                     lex_emit_compound_op('>', '=', s, p);
   1087                     continue;
   1088                 case '>':
   1089                     ++p;
   1090                     if (p != q) {
   1091                         switch (*p) {
   1092                         case '=':
   1093                             ++p;
   1094                             lex_emit_tricompound_op('>', '>', '=', s, p);
   1095                             continue;
   1096                         case '>':
   1097                             ++p;
   1098                             if (p != q && *p == '=') {
   1099                                 ++p;
   1100                                 lex_emit_quadcompound_op('>', '>', '>', '=', s, p);
   1101                                 continue;
   1102                             }
   1103                             lex_emit_tricompound_op('>', '>', '>', s, p);
   1104                             continue;
   1105                         default:
   1106                             break;
   1107                         }
   1108                     }
   1109                     lex_emit_compound_op('>', '>', s, p);
   1110                     continue;
   1111                 default:
   1112                     break;
   1113                 }
   1114             }
   1115             lex_emit_op('>', s, p);
   1116             continue;
   1117 
   1118         case '-':
   1119             ++p;
   1120             if (p != q) {
   1121                 switch (*p) {
   1122                 case '=':
   1123                     ++p;
   1124                     lex_emit_compound_op('-', '=', s, p);
   1125                     continue;
   1126                 case '-':
   1127                     ++p;
   1128                     lex_emit_compound_op('-', '-', s, p);
   1129                     continue;
   1130                 case '>':
   1131                     ++p;
   1132                     lex_emit_compound_op('-', '>', s, p);
   1133                     continue;
   1134                 default:
   1135                     break;
   1136                 }
   1137             }
   1138             lex_emit_op('-', s, p);
   1139             continue;
   1140 
   1141         case '+':
   1142             ++p;
   1143             if (p != q) {
   1144                 switch (*p) {
   1145                 case '=':
   1146                     ++p;
   1147                     lex_emit_compound_op('+', '=', s, p);
   1148                     continue;
   1149 
   1150                 case '+':
   1151                     ++p;
   1152                     lex_emit_compound_op('+', '+', s, p);
   1153                     continue;
   1154                 default:
   1155                     break;
   1156                 }
   1157             }
   1158             lex_emit_op('+', s, p);
   1159             continue;
   1160 
   1161         case '.':
   1162             ++p;
   1163             if (p != q) {
   1164                 switch (*p) {
   1165                 case '0': case '1': case '2': case '3': case '4':
   1166                 case '5': case '6': case '7': case '8': case '9':
   1167                     d = s;
   1168                     goto lex_dot_to_fraction_part;
   1169                 case '.':
   1170                     ++p;
   1171                     if (p != q && *p == '.') {
   1172                         ++p;
   1173                         lex_emit_tricompound_op('.', '.', '.', s, p);
   1174                         continue;
   1175                     }
   1176                     lex_emit_compound_op('.', '.', s, p);
   1177                     continue;
   1178                 default:
   1179                     break;
   1180                 }
   1181             }
   1182             lex_emit_op('.', s, p);
   1183             continue;
   1184 
   1185         case '0':
   1186             if (++p != q) {
   1187                 switch (*p) {
   1188 #ifdef LEX_C_OCTAL_NUMERIC
   1189 
   1190                 case '0': case '1': case '2': case '3':
   1191                 case '4': case '5': case '6': case '7':
   1192                     while (++p != q && lex_isoctdigit(*p)) {
   1193                     }
   1194                     d = p;
   1195                     if (p != q) {
   1196                         /*
   1197                          * Leading zeroes like 00.10 are valid C
   1198                          * floating point constants.
   1199                          */
   1200                         if (*p == '.') {
   1201                             goto lex_c_octal_to_fraction_part;
   1202                         }
   1203                         if (*p == 'e' || *p == 'E') {
   1204                             goto lex_c_octal_to_exponent_part;
   1205                         }
   1206                     }
   1207                     lex_emit_octal(s, p);
   1208                     /*
   1209                      * If we have a number like 0079, it becomes
   1210                      * 007(octal), 9(decimal). The parser should
   1211                      * deal with this.
   1212                      *
   1213                      * To add to confusion i64 is a C integer suffix
   1214                      * like in 007i64, but 2+2i is a Go complex
   1215                      * constant. (Not specific to octals).
   1216                      *
   1217                      * This can all be handled by having the parser inspect
   1218                      * following identifier or numeric, parser
   1219                      * here meaning a lexer post processing step, not
   1220                      * necessarily the parser itself.
   1221                      */
   1222 
   1223                     continue;
   1224 #else
   1225                     /*
   1226                      * All integers reach default and enter
   1227                      * integer part. As a result, leading zeroes are
   1228                      * mapped to floats and integers which matches
   1229                      * Julia behavior. Other languages should decide
   1230                      * if leading zero is valid or not. JSON
   1231                      * disallows leading zero.
   1232                      */
   1233 #endif
   1234 
   1235 #ifdef LEX_JULIA_OCTAL_NUMERIC
   1236                     /*
   1237                      * This is the style of octal, not 100% Julia
   1238                      * compatible. Also define Julia numeric to enforce
   1239                      * lower case.
   1240                      */
   1241 #ifndef LEX_LOWER_CASE_NUMERIC_PREFIX
   1242                     /* See also hex 0X. Julia v.0.3 uses lower case only here. */
   1243                 case 'O':
   1244 #endif
   1245                     /*
   1246                      * Julia accepts 0o700 as octal and 0b100 as
   1247                      * binary, and 0xa00 as hex, and 0100 as
   1248                      * integer, and 1e2 as 64 bit float and 1f2 as
   1249                      * 32 bit float. Julia 0.3 does not support
   1250                      * octal and binary fractions.
   1251                      */
   1252                 case 'o':
   1253                     while (++p != q && lex_isoctdigit(*p)) {
   1254                     }
   1255                     lex_emit_octal(s, p);
   1256                     /* Avoid hitting int fall through. */
   1257                     continue;
   1258 #endif
   1259 #ifdef LEX_BINARY_NUMERIC
   1260                     /* Binary in C++14. */
   1261                 case 'b':
   1262 #ifndef LEX_LOWER_CASE_NUMERIC_PREFIX
   1263                     /* See also hex 0X. Julia v.0.3 uses lower case only here. */
   1264                 case 'B':
   1265 #endif
   1266                     while (++p != q && lex_isbindigit(*p)) {
   1267                     }
   1268                     lex_emit_binary(s, p);
   1269                     /* Avoid hitting int fall through. */
   1270                     continue;
   1271 #endif
   1272 #ifdef LEX_HEX_NUMERIC
   1273                 case 'x':
   1274 #ifndef LEX_LOWER_CASE_NUMERIC_PREFIX
   1275                     /*
   1276                      * Julia v0.3 does not allow this, it thinks 0X1 is
   1277                      * 0 * X1, X1 being an identifier.
   1278                      * while 0x1 is a hex value due to precedence.
   1279                      *
   1280                      * TODO: This might change.
   1281                      */
   1282 
   1283                 case 'X':
   1284 #endif
   1285                     while (++p != q && lex_ishexdigit(*p)) {
   1286                     }
   1287 #ifdef LEX_HEX_FLOAT_NUMERIC
   1288                     /*
   1289                      * Most hexadecimal floating point conversion
   1290                      * functions, including Python's
   1291                      * float.fromhex("0x1.0"), Julia's parse
   1292                      * function, and C strtod on
   1293                      * supporting platforms, will parse without
   1294                      * exponent. The same languages do not support
   1295                      * literal constants without the p exponent.
   1296                      * First it is named p because e is a hex digit,
   1297                      * second, the float suffix f is also a hex
   1298                      * digit: 0x1.f is ambiguous in C without that
   1299                      * rule. Conversions have no such ambiguity.
   1300                      * In Julia, juxtaposition means that 0x1.f
   1301                      * could mean 0x1p0 * f or 0x1.fp0.
   1302                      *
   1303                      * Since we are not doing conversion here but
   1304                      * lexing a stream, we opt to require the p
   1305                      * suffix because making it optional could end
   1306                      * up consuming parts of the next token.
   1307                      *
   1308                      * But, we also make a flag to make the exponent
   1309                      * optional, anyway. It could be used for better
   1310                      * error reporting than just consuming the hex
   1311                      * part since we likely should accept the ambiguous
   1312                      * syntax either way.
   1313                      */
   1314                     d = p;
   1315                     if (p != q && *p == '.') {
   1316                         while (++p != q && lex_ishexdigit(*p)) {
   1317                         }
   1318                     }
   1319                     if (p != q && (*p == 'p' || *p == 'P')) {
   1320                         if (++p != q && *p != '+' && *p != '-') {
   1321                             --p;
   1322                         }
   1323                         /* The exponent is a decimal power of 2. */
   1324                         while (++p != q && lex_isdigit(*p)) {
   1325                         }
   1326                         lex_emit_hex_float(s, p);
   1327                         continue;
   1328                     }
   1329 #ifdef LEX_HEX_FLOAT_OPTIONAL_EXPONENT
   1330                     if (d != p) {
   1331                         lex_emit_hex_float(s, p);
   1332                         continue;
   1333                     }
   1334 #else
   1335                     /*
   1336                      * Backtrack to decimal point. We require p to
   1337                      * be present because we could otherwise consume
   1338                      * part of the next token.
   1339                      */
   1340                     p = d;
   1341 #endif
   1342 #endif /* LEX_HEX_FLOAT_NUMERIC */
   1343                     lex_emit_hex(s, p);
   1344                     continue;
   1345 #endif /* LEX_HEX_NUMERIC */
   1346 
   1347                 default:
   1348                     /*
   1349                      * This means leading zeroes like 001 or 001.0 are
   1350                      * treated like int and float respectively,
   1351                      * iff C octals are flagged out. Otherwise they
   1352                      * become 001(octal), and 001(octal),.0(float)
   1353                      * which should be treated as an error because
   1354                      * future extensions might allow octal floats.
   1355                      * (Not likely, but interpretation is ambiguous).
   1356                      */
   1357                     break;
   1358                 } /* Switch under '0' case. */
   1359 
   1360                 /*
   1361                  * Pure single digit '0' is an octal number in the C
   1362                  * spec. We have the option to treat it as an integer,
   1363                  * or as an octal. For strict C behavior, this can be
   1364                  * flagged in, but is disabled by default. It only
   1365                  * applies to single digit 0. Thus, with C octal
   1366                  * enabled, leading zeroes always go octal.
   1367                  */
   1368             } /* If condition around switch under '0' case. */
   1369             --p;
   1370             goto lex_fallthrough_1; /* silence warning */
   1371 
   1372         lex_fallthrough_1:
   1373             /* Leading integer digit in C integers. */
   1374         case '1': case '2': case '3': case '4': case '5':
   1375         case '6': case '7': case '8': case '9':
   1376             while (++p && lex_isdigit(*p)) {
   1377             }
   1378             d = p;
   1379             if (*p == '.') {
   1380 /* Silence unused label warnings when features are disabled. */
   1381 #ifdef LEX_C_OCTAL_NUMERIC
   1382 lex_c_octal_to_fraction_part:
   1383 #endif
   1384 lex_dot_to_fraction_part:
   1385                 while (++p != q && lex_isdigit(*p)) {
   1386                 }
   1387             }
   1388             if (p != q && (*p == 'e' || *p == 'E')) {
   1389 /* Silence unused label warnings when features are disabled. */
   1390 #ifdef LEX_C_OCTAL_NUMERIC
   1391 lex_c_octal_to_exponent_part:
   1392 #endif
   1393                 if (++p != q && *p != '+' && *p != '-') {
   1394                     --p;
   1395                 }
   1396                 while (++p != q && lex_isdigit(*p)) {
   1397                 }
   1398             }
   1399             if (d != p) {
   1400                 lex_emit_float(s, p);
   1401             } else {
   1402 #ifdef LEX_C_OCTAL_NUMERIC
   1403                 if (*s == '0') {
   1404                     lex_emit_octal(s, p);
   1405                     continue;
   1406                 }
   1407 #endif
   1408                 lex_emit_int(s, p);
   1409             }
   1410             continue;
   1411 
   1412 #ifndef LEX_ID_WITHOUT_UNDERSCORE
   1413             case '_':
   1414 #endif
   1415             case 'A': case 'B': case 'C': case 'D': case 'E':
   1416             case 'F': case 'G': case 'H': case 'I': case 'J':
   1417             case 'K': case 'L': case 'M': case 'N': case 'O':
   1418             case 'P': case 'Q': case 'R': case 'S': case 'T':
   1419             case 'U': case 'V': case 'W': case 'X': case 'Y':
   1420             case 'Z':
   1421             case 'a': case 'b': case 'c': case 'd': case 'e':
   1422             case 'f': case 'g': case 'h': case 'i': case 'j':
   1423             case 'k': case 'l': case 'm': case 'n': case 'o':
   1424             case 'p': case 'q': case 'r': case 's': case 't':
   1425             case 'u': case 'v': case 'w': case 'x': case 'y':
   1426             case 'z':
   1427 
   1428                 /*
   1429                  * We do not try to ensure utf-8 is terminated correctly nor
   1430                  * that any unicode character above ASCII is a character
   1431                  * suitable for identifiers.
   1432                  *
   1433                  * tag is calculated for keyword lookup, and we assume these
   1434                  * are always ASCII-7bit.  It has the form: length, first
   1435                  * char, second char, last char in lsb to msb order. If the
   1436                  * second char is missing, it becomes '\0'. The tag is not
   1437                  * entirely unique, but suitable for fast lookup.
   1438                  *
   1439                  * If utf-8 appears in tag, the tag is undefined except the
   1440                  * length is valid or overflows (meaning longer than any
   1441                  * keyword and thus safe to compare against if tag matches).
   1442                  *
   1443                  * If the grammar is case insensitive, the tag can be
   1444                  * downcased trivially by or'ring with 0x20202000 which
   1445                  * preserves the length field (clever design by ASCII
   1446                  * designers). After tag matching, a case insensitive
   1447                  * compare is obviously also needed against the full lexeme.
   1448                  */
   1449 
   1450                 {
   1451                     unsigned long tag;
   1452 
   1453                     tag = (unsigned long)*p << 8;
   1454                     if (++p != q && lex_isalnum(*p)) {
   1455                         tag |= (unsigned long)*p << 16;
   1456                         while (++p != q && lex_isalnum(*p)) {
   1457                         }
   1458                     }
   1459                     tag |= (unsigned long)p[-1] << 24;
   1460                     tag |= (unsigned char)(p - s) + (unsigned long)'0';
   1461                     lex_emit_id(s, p, tag);
   1462                     continue;
   1463                 }
   1464 
   1465             default:
   1466 
   1467 #ifdef LEX_ID_WITH_UTF8
   1468                 /*
   1469                  * Identifier again, in case it starts with a utf-8 lead
   1470                  * character. This time we can ignore the tag, except the
   1471                  * length char must be valid to avoid buffer overruns
   1472                  * on potential kw check upstream.
   1473                  */
   1474                 if (*p & '\x80') {
   1475                     unsigned long tag;
   1476 
   1477                     while (++p != q && lex_isalnum(*p)) {
   1478                     }
   1479                     tag = (unsigned char)(p - s) + '0';
   1480                     lex_emit_id(s, p, tag);
   1481                     continue;
   1482                 }
   1483 #endif
   1484                 ++p;
   1485                 /* normally 0x7f DEL and 0x00..0x1f incl. */
   1486                 if (lex_isctrl(*s) && !lex_isblank(*s)) {
   1487                     lex_emit_ctrl(s);
   1488                 } else {
   1489                     lex_emit_symbol(*s, s, p);
   1490                 }
   1491                 continue;
   1492         } /* Main switch in normal mode. */
   1493     } /* Main while loop in normal mode. */
   1494 
   1495 lex_mode_exit:
   1496     if (mode == LEX_MODE_INVALID) {
   1497         return mode;
   1498     }
   1499 
   1500 #ifndef LEX_DISABLE_ZTERM
   1501     if (p != q && lex_iszterm(*p)) {
   1502         lex_emit_eos(s, p);
   1503         return mode;
   1504     }
   1505 #endif
   1506     lex_emit_eob(p);
   1507     return mode;
   1508 }
   1509
	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log \| Files \| Refs \| Submodules \| README \| LICENSE