nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

tokens.h (21687B)


      1 #ifndef LEX_TOKENS_H
      2 #define LEX_TOKENS_H
      3 
      4 /* Define LEX_DEBUG to enable token printing and describing functions. */
      5 
      6 
enum {

    /*
     * EOF is not emitted by lexer, but may be used by driver after
     * last buffer is processed.
     */
    LEX_TOK_EOF = 0,

    /*
     * Either EOB or EOS is emitted as the last token before exit,
     * or also ABORT in some lexers. An unterminated string or comment
     * will be emitted immediately before one of these when relevant.
     *
     * It may be useful to redefine lex_emit_eos and lex_emit_eob to
     * produce LEX_TOK_EOF or error directly for simple string lexing.
     */
    LEX_TOK_EOB = 1,
    LEX_TOK_EOS = 2,

    /*
     * ABORT can be used for early exit by some lexers while other
     * lexers may choose to run to buffer end regardless of input (with
     * the exception of deeply nested comments).
     */
    LEX_TOK_ABORT = 3,

    /*
     * Byte order marker. Only happens if the lexer was started in bom
     * mode and the input stream contains a leading bom marker.
     * The token can only be the first token in the stream. Utf-8 is the
     * only supported bom, but the lexeme may be checked in case other
     * boms are added later. Normally it is routed to lex_emit_other
     * along with comments so it just ignores the bom if present. It is
     * generally recommended to consume utf-8 bom for interoperability,
     * but also to not store it for the same reason.
     */
    LEX_TOK_BOM,

    /*
     * Any control character that is not newline or blank will be
     * emitted as a single character token here. This token is discussed
     * in several comments below. For strings and comments, blank
     * control characters will also be emitted since they are usually
     * not desired there unexpectedly.
     */
    LEX_TOK_CTRL,
    LEX_TOK_STRING_CTRL,
    LEX_TOK_COMMENT_CTRL,

    /*
     * Any printable ASCII character that is not otherwise consumed will
     * be issued as a single length symbol token. Further discussion
     * below. The symbol and CTRL tokens ensure that the entire input
     * stream is covered by tokens. If utf-8 identifiers have not been
     * flagged, utf-8 leading characters may also end up here, and so
     * may utf-8 characters in general that are not viewed as valid
     * identifiers (depending on configuration).
     */
    LEX_TOK_SYMBOL,

    /*
     * Variable length identifier starting with (_A-Za-z) by default and
     * followed by zero or more (_A-Za-z0-9) characters. (_) can be
     * flagged out. utf-8 can be flagged in. By default any non-ASCII
     * character (0x80 and above) is treated as part of an identifier
     * for simplicity and speed, but this may be redefined. Any broken
     * utf-8 is not sanitized, thus 0x80 would be a valid identifier
     * token with utf-8 identifiers enabled, and otherwise it would be a
     * symbol token.
     *
     * The ID does a magic trick: It maps the lexeme to a very simple
     * and fast 32 bit hash code called a tag. The tag is emitted with
     * the id token and can be used for fast keyword lookup. The
     * hash tag is:
     *
     *     (length)(first char)(second char)(last char)
     *
     * where length is ASCII '0' .. '9' where any length overflow is an
     * arbitrary value, but such that the length is never longer than
     * the lexeme. The last char is the last char regardless of length.
     * For short identifiers, the second char may be the first char
     * duplicated, and the last char may be the first char.
     *
     * This code is very simple to write by hand: "5whe" means while,
     * and can be used in a case switch before a strcmp with "while".
     * Conflicts are possible, but then several keywords are tested like
     * any other hash conflict. This keyword lookup is user driven, but
     * can follow example code quite straightforwardly.
     *
     * The lex_emit_id macro can be implemented to provide the above
     * lookup and inject a keyword token instead. By convention such
     * tokens have negative values to avoid conflicts with lexer
     * generated tokens.
     *
     * The ID also has a special role in prefixes and suffixes: C string
     * literals like (L"hello") and numeric literals like (42f) are
     * lexed as two tokens, one of which is an ID. The parser must
     * process this and observe absence of whitespace where such syntax
     * is relevant.
     *
     * While not specific to ID, the emitter macros can be designed to
     * keep track of start of lines and end of whitespace and attach
     * state flags to each token (at line start, after whitespace). The
     * whitespace tokens can then be dropped. This might help parsing
     * things like suffixes efficiently.
     */
    LEX_TOK_ID,

    /*
     * C-int ::= pos-dec-digit dec-digit *
     * Julia-int ::= dec-digit+
     *
     *    pos-dec-digit ::= '1'..'9'
     *    dec-digit ::= '0'..'9'
     *
     * Floating point numbers take precedence when possible so 00.10 is
     * always a decimal floating point value when decimal floats are
     * enabled.
     *
     * The C-int is automatically enabled if C-octals are enabled, and
     * disabled otherwise. There is no specific Julia-int type - we just
     * use the terminology to represent integers with leading zeroes.
     *
     * Julia style integers accept leading zeroes. C style integers with
     * leading zeroes are consumed as C style octal numbers, so 0019 is
     * parsed as either 0019(Julia-int), or 001(C-octal), 9(C-int).
     *
     * Single digit '0' maps to octal when C-octals are enabled and to
     * Julia-int otherwise. (Yes, integers are not that simple, it
     * seems).
     *
     * Both C and Julia octal numbers (see octal token) can be active
     * simultaneously. This can be used to control leading zero
     * behavior, even if C-octal numbers are not part of the grammar
     * being parsed. For example, a language might use 0o777 octal
     * numbers and disallow 0777 integers. Enabling C-octals makes this
     * easy to detect (but should accept octal 0).
     *
     * There is no distinction between the styles in the int token, but
     * leading zeroes are easily detected in the lexeme.
     *
     * Constant suffixes like 1L are treated as 1(INT), and L(ID). The
     * same goes for other numeric values.
     *
     * Parser should check for leading zeroes and decide if it is valid,
     * a warning, or an error (it is in JSON). This also goes for float.
     *
     * Numericals, not limited to INT, may appear shorter than they are
     * due to buffer splits. Special recovery is required, but will only
     * happen just before EOS or EOB tokens (i.e. buffer split events).
     */
    LEX_TOK_INT,

    /*
     * float ::= (int ['.' dec-digits*] dec-exponent)
     *          | ([int] '.' dec-digits* [dec-exponent])
     *      dec-exponent ::= ('e' | 'E') ['+' | '-'] dec-digits*
     *      dec-digits ::= '0'..'9'
     *      int ::= dec-digits*
     *
     * Consumes a superset of C float representation without suffix.
     * Some invalid tokens such as 0.E are accepted. Valid tokens such
     * as 00.10 take precedence over octal numbers even if it is a
     * prefix, and the same is obviously true with respect to decimal
     * integers.
     *
     * JSON does not allow leading zeroes, and also not leading '.'.
     * This can easily be checked in the lexeme.
     *
     * The octal notation affecting integer leading zeroes is not
     * relevant to floats because floats take precedence over octal and
     * decimal int when containing '.', 'e' or 'E'.
     */
    LEX_TOK_FLOAT,

    /*
     * binary ::= (0b | 0B) ('0' | '1')*
     *
     * 0b100 or just 0b, parser must check that digits are present,
     * otherwise it may be interpreted as zero, just like octal zero
     * in C.
     *
     * Like 0X hex, 0B can be flagged out because Julia v0.3 does not
     * support uppercase 0B.
     */
    LEX_TOK_BINARY,

    /*
     * C-octal ::= 0 octal-digits*
     *     octal-digits ::= '0'..'7'
     *
     * Julia-octal ::= 0o octal-digits*
     *     octal-digits ::= '0'..'7'
     *
     * 0777 for C style octal numbers, or 0o777 for Julia syntax. Julia
     * v0.3 does not allow uppercase 0O777, it would mean 0 * O777.
     *
     * When enabled, decimal floating points take precedence: 00.10 is
     * parsed as 00.10(decimal float), as per C standard.
     *
     * NOTE: It is possible for both styles to be active simultaneously.
     * This may be relevant in order to control handling of leading
     * zeroes in decimal integers.
     *
     * If C-octal numbers are flagged out, leading zeroes are mapped to
     * integers and the numerical value may change.  Julia behaves this
     * way. Nothing prevents support of both C and Julia octal numbers,
     * but leading zeroes will then be interpreted the C way - it is not
     * recommended to do this.
     */
    LEX_TOK_OCTAL,

    /*
     * hex ::= hex-int
     *     hex-digits ::= 'a'..'f'| 'A'..'F' | '0'..'9'
     *     hex-int ::= (0x | 0X) hex-digits*
     *
     * where hex-digits are customizable (e.g. all lower case), and hex
     * prefix 0x can be flagged to be lower case only (as in Julia).
     *
     * If hex floats are enabled, they take precedence:
     * 0x1.0(hex-float), if not, 0x1.0 will parse as: 0x1(hex) followed
     * by .0(decimal float).
     *
     * The lead prefix 0x may be flagged to be lower case only because
     * this is required by Julia v0.3 where 0X means 0 * X. Julia
     * accepts uppercase in the remaining hex digits (and exponent for
     * floats). This could possibly change in future versions.
     *
     * The zero length sequence (0x | 0X) is accepted and left to the
     * parser since the lexer emits a token for everything it sees.
     * Conceptually it may be interpreted as zero, equivalent to 0 being
     * both octal prefix and numeric 0 in C style octal representation.
     * Or it may be an error.
     */
    LEX_TOK_HEX,

    /*
     * hex-float ::= hex-int ['.' hex-digit*] hex-exponent
     *     hex-exponent ::= ('p' | 'P') ['+' | '-'] decimal-digit*
     *     decimal-digit ::= '0'..'9'
     *
     * A superset of IEEE-754-2008 Hexadecimal Floating Point notation.
     *
     * We require the exponent to be present, but do not ensure the
     * value is otherwise complete, e.g. 0x1p+ would be accepted. The p
     * is needed because otherwise 0x1.f could be accepted, and f is a
     * float suffix in C, and juxtaposition factor (0x1. * f) in Julia,
     * at least, that is one possible interpretation.
     *
     * The exponent can be flagged optional in which case 0x1.f will be
     * consumed as a single hex float token. This may either simply be
     * accepted in some grammars, or used to provide an error message.
     * If the exponent is required, 0x1.f will be lexed as three tokens:
     *
     *     <'0x1'(hex int), '.'(op), 'f'(id)>.
     *
     * Thus it may be a good idea to allow the exponent to be optional
     * anyway and issue an error message or warning if the p is absent
     * later in the parsing stage.
     *
     * Note that, as per IEEE-754, the exponent is a decimal power of
     * two. In other words, the number of bits to shift the
     * (hexa)decimal point. Also note that it is p and not e because e
     * is a hex digit.
     */
    LEX_TOK_HEX_FLOAT,

    /*
     * blank ::= ('\t' | '\x20')+
     *
     * Longest run in buffer holding only '\t' and '\x20' (space).
     *
     * buffer splits may generate adjacent blanks depending on recovery
     * processing. (The same goes for other line oriented runs such as
     * string parts and comment parts).
     */
    LEX_TOK_BLANK,

    /* newline ::= '\r' | '\n' | '\r\n' | '\n\r'
     *
     * Will always appear, also inside strings and comments. Can be used
     * to track line starts and counts reliably as only one newline is
     * issued at a time, and it is issued everywhere, also in strings
     * and comments.
     *
     * May be preceded by string escape token inside strings. This can
     * be interpreted as line continuation within strings specifically,
     * as is the case in Python and Javascript (and in C via
     * pre-processor).
     *
     * The LEX_TOK_STRING_NEWLINE is emitted inside strings so the ordinary
     * newline may be ignored in comments and other non-string content.
     */
    LEX_TOK_NEWLINE,
    LEX_TOK_STRING_NEWLINE,

    /*
     * string ::= string_start
     *            (string_part | string_escape |
     *                 string_ctrl | string_newline)*
     *            (string_end | string_unterminated)
     *
     * There are several optional string styles. They all start with
     * this token. The length and content provide the details. Python
     * may start with """ or ''' and this token will then have length
     * 3 and three quotes as lexeme content. If the lexer exits before
     * string end token, the returned lexer mode will remember the
     * state and can be used for reentry - this also goes for comments.
     *
     * Strings can only contain part, escape, newline, and control
     * tokens, and either string unterminated or string end token
     * at last.
     */
    LEX_TOK_STRING_BEGIN,

    /* Longest run without control characters, without (\), without
     * newline, and without the relevant end delimiter. The run may be
     * shortened due to buffer splits. The part may, as an exception,
     * begin with an end delimiter character or a (\) if it was
     * preceded by a string escape token. The escape character is
     * always (\). Strings that use "" or '' as escape will be treated
     * as start and end of separate strings. Strings that do not support
     * (\) should just treat escape as a part of the string.
     */
    LEX_TOK_STRING_PART,

    /*
     * This is always a single character token (\) and only happens
     * inside strings. See also string part token.
     */
    LEX_TOK_STRING_ESCAPE,

    /* This token is similar to string start. It may be absent at buffer
     * splits, but then an unterminated string token will be used
     * just before the split event token.
     */
    LEX_TOK_STRING_END,

    /*
     * This is emitted before the buffer ends, or before unescaped
     * newlines for line oriented string types (the usual strings).
     * At buffer splits, recovery should clean it up. The returned
     * mode allows parsing to continue in a new buffer with a slight
     * content overlap.
     *
     * If a string like ("hello, world!") in C reaches end of line, it
     * may be continued: ("hello, \)newline(world!"). If this line
     * continuation is flagged out, this will lead to string
     * unterminated, even if not at end of buffer. For block strings
     * like """hello""", this only happens at end of buffer.
     */
    LEX_TOK_STRING_UNTERMINATED,

    /*
     *
     *     comment ::= comment_start
     *     (comment_part | ctrl | newline)*
     *     (comment_end | comment_unterminated)
     *
     *
     * Comments work like strings in most respects. They emit parts, and
     * control characters, but not escape characters, and cannot be
     * continued at end of line. Block comments are like python block
     * strings (''').
     *
     * Julia supports nested comments (#= ... #= =# =#). In this case
     * a new start token can be emitted before an end token. If the
     * parser exits due to buffer split, the mode has the nesting level
     * encoded so it can be resumed in a new buffer.
     *
     * Line comments will have their end token just before newline, or
     * unterminated comment just before buffer split token (EOB or EOS).
     * (\) characters are consumed by the comment part tokens and do not
     * affect the end of any comment.
     *
     * Comment begin may include extra characters when a doc comment is
     * recognized. The emitter flags this. End comments are unaffected.
     */
    LEX_TOK_COMMENT_BEGIN,
    LEX_TOK_COMMENT_PART,
    LEX_TOK_COMMENT_END,
    LEX_TOK_COMMENT_UNTERMINATED,

    /*
     * Issued before ABORT token if nesting level is above a predefined
     * level. This is to protect against malicious and misguided
     * content, otherwise the nesting level counter could wrap and
     * generate a different interpretation, which could be bad. The
     * parser would probably do similar things with nested tokens.
     */
    LEX_TOK_COMMENT_DEEPLY_NESTED,


    /* Operators are all recognized single character symbols, or up to
     * four characters. The token value is the ASCII codes shifted 8
     * bits per extra character, by default, but the emitter macros
     * can redefine this. Values below 32 are reserved token types as
     * discussed above.
     *
     * What exactly represents an operator depends on what the lexer has
     * enabled.
     *
     * Printable ASCII symbols that are NOT recognized, are emitted as
     * the SYMBOL token and is always length 1. The value can be derived
     * from the lexeme, but not the token itself. This may be perfectly
     * fine for the parser, or it may be used to indicate an error.
     * There are no illegal characters per se.
     *
     * Non-printable ASCII characters that are not covered by newline or
     * blank, are emitted as CTRL tokens. These act the same as the
     * symbol token and may be used to indicate error, or to handle form
     * feed and other whitespace not handled by default. Unlike symbol,
     * however, CTRL also appear in strings and comments since they are
     * generally not allowed and this makes it easy to capture (there is
     * virtually no performance overhead in providing this service
     * unless attempting to parse a binary format).
     */

    /* Don't bleed into this range. */
    LEX_TOK_OPERATOR_BASE = 32,


    /*
     * Operators use ASCII range.
     * Compound operators use range 0x80 to 0x7fff
     * and possibly above for triple sequences.
     * Custom keywords are normally negative but can be mapped
     * to any other.
     *
     * The layout is designed for efficient table lookup.
     * Compound operators might benefit from remapping down to a smaller
     * range for compact lookup tables, but it depends on the parser.
     */
};
    444 
/*
 * Custom keyword token range is negative, and well below -99..0 where
 * special codes are reserved.
 */
#ifndef LEX_TOK_KW_BASE
#define LEX_TOK_KW_BASE -1000
#endif

/*
 * Token returned when a user-driven keyword lookup does not match;
 * defaults to treating the lexeme as a plain identifier.
 */
#ifndef LEX_TOK_KW_NOT_FOUND
#define LEX_TOK_KW_NOT_FOUND LEX_TOK_ID
#endif
    456 
    457 
    458 #ifdef LEX_DEBUG
    459 
    460 #include <stdio.h>
    461 #include <string.h>
    462 
    463 static const char *lex_describe_token(long token)
    464 {
    465     switch(token) {
    466     case LEX_TOK_BOM: return "BOM marker";
    467     case LEX_TOK_EOF: return "EOF";
    468     case LEX_TOK_EOS: return "buffer zero terminated";
    469     case LEX_TOK_EOB: return "buffer exhausted";
    470     case LEX_TOK_ABORT: return "abort";
    471     case LEX_TOK_CTRL: return "control";
    472     case LEX_TOK_STRING_CTRL: return "string control";
    473     case LEX_TOK_COMMENT_CTRL: return "comment control";
    474     case LEX_TOK_SYMBOL: return "symbol";
    475     case LEX_TOK_ID: return "identifier";
    476     case LEX_TOK_INT: return "integer";
    477     case LEX_TOK_FLOAT: return "float";
    478     case LEX_TOK_BINARY: return "binary";
    479     case LEX_TOK_OCTAL: return "octal";
    480     case LEX_TOK_HEX: return "hex";
    481     case LEX_TOK_HEX_FLOAT: return "hex float";
    482     case LEX_TOK_BLANK: return "blank";
    483     case LEX_TOK_NEWLINE: return "newline";
    484     case LEX_TOK_STRING_NEWLINE: return "string newline";
    485     case LEX_TOK_STRING_BEGIN: return "string begin";
    486     case LEX_TOK_STRING_PART: return "string part";
    487     case LEX_TOK_STRING_END: return "string end";
    488     case LEX_TOK_STRING_ESCAPE: return "string escape";
    489     case LEX_TOK_STRING_UNTERMINATED: return "unterminated string";
    490     case LEX_TOK_COMMENT_BEGIN: return "comment begin";
    491     case LEX_TOK_COMMENT_PART: return "comment part";
    492     case LEX_TOK_COMMENT_END: return "comment end";
    493     case LEX_TOK_COMMENT_UNTERMINATED: return "unterminated comment";
    494     case LEX_TOK_COMMENT_DEEPLY_NESTED: return "deeply nested comment";
    495 
    496     default:
    497         if (token < LEX_TOK_EOF) {
    498             return "keyword";
    499         }
    500         if (token < 32) {
    501             return "undefined";
    502         }
    503         if (token < 0x100L) {
    504             return "operator";
    505         }
    506         if (token < 0x10000L) {
    507             return "compound operator";
    508         }
    509         if (token < 0x1000000L) {
    510             return "tricompound operator";
    511         }
    512         if (token < 0x7f0000000L) {
    513             return "quadcompound operator";
    514         }
    515         return "reserved";
    516     }
    517 }
    518 
    519 static void lex_fprint_token(FILE *fp,
    520         long token,
    521         const char *first, const char *last,
    522         int line, int pos)
    523 {
    524     char buf[10];
    525     const char *lexeme = first;
    526     int len = (int)(last - first);
    527     switch (token) {
    528     case LEX_TOK_EOS:
    529     case LEX_TOK_CTRL:
    530         sprintf(buf, "^%02x", (int)*first);
    531         lexeme = buf;
    532         len = strlen(buf);
    533         break;
    534     default:
    535         break;
    536     }
    537     fprintf(fp, "%04d:%03d %s (0x%lx): `%.*s`\n",
    538             line, pos, lex_describe_token(token), token, len, lexeme);
    539 }
    540 
/* Convenience wrapper: print a token to stdout. */
#define lex_print_token(token, first, last, line, pos)                  \
        lex_fprint_token(stdout, token, first, last, line, pos)

#else /* LEX_DEBUG */

/* Without LEX_DEBUG the debug helpers compile away to no-ops. */
#define lex_describe_token(token) "debug not available"
#define lex_fprint_token(fp, token, first, last, line, pos) ((void)0)
#define lex_print_token(token, first, last, line, pos) ((void)0)

#endif /* LEX_DEBUG */
    551 
    552 
    553 #endif /* LEX_TOKENS_H */
    554