nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

luthor.h (18021B)


      1 /*
      2  * Mostly generic lexer that can be hacked to suit specific syntax. See
      3  * more detailed comments further down in this file.
      4  *
      5  * Normally include luthor.c instead of luthor.h so emitter functions
      6  * can be custom defined, and optionally also fast keyword definitions.
      7  *
      8  * At the very minimum, define lex_emit which other emitters default to.
      9  *
     10  * Create a wrapper function to drive the lex function in said file.
     11  *
     12  * Use this header in separate parser logic to access the token values
     13  * if relevant.
     14  */
     15 
#ifndef LUTHOR_H
#define LUTHOR_H

#ifdef LEX_KEYWORDS
#include <string.h> /* memcmp for kw match */
#endif

#include "tokens.h"

/*
 * Primary emitter; all other emitters default to forwarding here.
 * The default discards the token so the lexer compiles stand-alone.
 */
#ifndef lex_emit
#define lex_emit(token, first, last) ((void)0)
#endif
     28 
/*
 * Default for comments, bom, and other things that are not necessarily
 * of interest to the parser, but may be to buffer wrap handling,
 * debugging, and pretty printers.
 */
#ifndef lex_emit_other
#define lex_emit_other(token, first, last) ((void)0)
#endif

/* End of input; zero-length token at `pos`. */
#ifndef lex_emit_eof
#define lex_emit_eof(pos) lex_emit(LEX_TOK_EOF, pos, pos)
#endif

/* Lexer gave up (e.g. too deeply nested comments); zero-length token. */
#ifndef lex_emit_abort
#define lex_emit_abort(pos) lex_emit(LEX_TOK_ABORT, pos, pos)
#endif

/* End of current buffer; lexing may resume in a following buffer. */
#ifndef lex_emit_eob
#define lex_emit_eob(pos) lex_emit(LEX_TOK_EOB, pos, pos)
#endif

/* End-of-statement token; exact semantics are syntax specific. */
#ifndef lex_emit_eos
#define lex_emit_eos(first, last) lex_emit(LEX_TOK_EOS, first, last)
#endif

/* utf-8 byte order mark, consumed when starting in LEX_MODE_BOM. */
#ifndef lex_emit_bom
#define lex_emit_bom(first, last) lex_emit_other(LEX_TOK_BOM, first, last)
#endif

#ifndef lex_emit_id
#ifdef LEX_KEYWORDS
/* LEX_KW_TABLE_BEGIN .. LEX_KW_TABLE_END defines lex_match_kw. */
#define lex_emit_id(first, last, tag) lex_emit(lex_match_kw(tag, first), first, last)
#else
#define lex_emit_id(first, last, tag) lex_emit(LEX_TOK_ID, first, last)
#endif
#endif
     66 
/*
 * This is a default for unknown symbols. It may be treated as an error,
 * or it can be processed further by the parser instead of customizing
 * the lexer. It ensures that there is always a token for every part of
 * the input stream.
 *
 * NOTE: the default ignores the `token` argument and always emits
 * LEX_TOK_SYMBOL; a custom emitter may inspect `token` instead.
 */
#ifndef lex_emit_symbol
#define lex_emit_symbol(token, first, last) lex_emit(LEX_TOK_SYMBOL, first, last)
#endif

/*
 * Control characters 0x01 .. 0x1f, 0x7f(DEL), excluding \0\r\n\t which have
 * separate tokens.
 *
 * Control characters in strings and comments are passed on as body
 * elements, except \0\r\n which breaks the string up.
 */
#ifndef lex_emit_ctrl
#define lex_emit_ctrl(pos) lex_emit(LEX_TOK_CTRL, pos, pos + 1)
#endif

/* Control character seen inside a string body. */
#ifndef lex_emit_string_ctrl
#define lex_emit_string_ctrl(pos) lex_emit(LEX_TOK_STRING_CTRL, pos, pos + 1)
#endif

/* Control character seen inside a comment body. */
#ifndef lex_emit_comment_ctrl
#define lex_emit_comment_ctrl(pos) lex_emit_other(LEX_TOK_COMMENT_CTRL, pos, pos + 1)
#endif
     95 
/*
 * This enables user to both count lines, and to calculate character
 * offset for subsequent lexemes. New line starts a lexeme, line break
 * symbol is located at lexeme - skipped and will have length 2 if \r\n
 * or \n\r break, and 1 otherwise.
 */
#ifndef lex_emit_newline
#define lex_emit_newline(first, last) lex_emit(LEX_TOK_NEWLINE, first, last)
#endif

/* Line break occurring inside a string body. */
#ifndef lex_emit_string_newline
#define lex_emit_string_newline(first, last) lex_emit(LEX_TOK_STRING_NEWLINE, first, last)
#endif

#ifndef lex_emit_int
#define lex_emit_int(first, last) lex_emit(LEX_TOK_INT, first, last)
#endif

#ifndef lex_emit_float
#define lex_emit_float(first, last) lex_emit(LEX_TOK_FLOAT, first, last)
#endif

/* Suffix characters trailing an integer literal (e.g. type markers). */
#ifndef lex_emit_int_suffix
#define lex_emit_int_suffix(first, last) lex_emit(LEX_TOK_INT_SUFFIX, first, last)
#endif
    121 
    122 #ifndef lex_emit_float_suffix
    123 #define lex_emit_floatint_suffix(first, last) lex_emit(LEX_TOK_FLOAT_SUFFIX, first, last)
    124 #endif
    125 
/*
 * Numeric literals in non-decimal radixes; the exact prefix syntax
 * recognized is defined by the lexer implementation.
 */
#ifndef lex_emit_binary
#define lex_emit_binary(first, last) lex_emit(LEX_TOK_BINARY, first, last)
#endif

#ifndef lex_emit_octal
#define lex_emit_octal(first, last) lex_emit(LEX_TOK_OCTAL, first, last)
#endif

#ifndef lex_emit_hex
#define lex_emit_hex(first, last) lex_emit(LEX_TOK_HEX, first, last)
#endif

#ifndef lex_emit_hex_float
#define lex_emit_hex_float(first, last) lex_emit(LEX_TOK_HEX_FLOAT, first, last)
#endif
    141 
/*
 * The comment token can be used to aid backtracking during buffer
 * switch.
 *
 * NOTE: the default ignores the `is_doc` argument; a custom emitter
 * may use it to distinguish documentation comments.
 */
#ifndef lex_emit_comment_begin
#define lex_emit_comment_begin(first, last, is_doc)                         \
    lex_emit_other(LEX_TOK_COMMENT_BEGIN, first, last)
#endif

#ifndef lex_emit_comment_part
#define lex_emit_comment_part(first, last) lex_emit_other(LEX_TOK_COMMENT_PART, first, last)
#endif

#ifndef lex_emit_comment_end
#define lex_emit_comment_end(first, last) lex_emit_other(LEX_TOK_COMMENT_END, first, last)
#endif

/* Zero-length token: comment still open when the buffer ended. */
#ifndef lex_emit_comment_unterminated
#define lex_emit_comment_unterminated(pos)                                  \
        lex_emit_other(LEX_TOK_COMMENT_UNTERMINATED, pos, pos)
#endif

/* Zero-length token: nesting exceeded LEX_MAX_NESTING_LEVELS. */
#ifndef lex_emit_comment_deeply_nested
#define lex_emit_comment_deeply_nested(pos)                                 \
        lex_emit_other(LEX_TOK_COMMENT_DEEPLY_NESTED, pos, pos)
#endif
    168 
#ifndef lex_emit_string_begin
#define lex_emit_string_begin(first, last) lex_emit(LEX_TOK_STRING_BEGIN, first, last)
#endif

/* String body segment; never empty (parsers may rely on this). */
#ifndef lex_emit_string_part
#define lex_emit_string_part(first, last) lex_emit(LEX_TOK_STRING_PART, first, last)
#endif

#ifndef lex_emit_string_end
#define lex_emit_string_end(first, last) lex_emit(LEX_TOK_STRING_END, first, last)
#endif

/* The escape is the single '\' character; the parser consumes what follows. */
#ifndef lex_emit_string_escape
#define lex_emit_string_escape(first, last) lex_emit(LEX_TOK_STRING_ESCAPE, first, last)
#endif

/* Zero-length token: string still open when the buffer (or line) ended. */
#ifndef lex_emit_string_unterminated
#define lex_emit_string_unterminated(pos)                                   \
        lex_emit(LEX_TOK_STRING_UNTERMINATED, pos, pos)
#endif
    189 
/* Run of whitespace (blank) characters. */
#ifndef lex_emit_blank
#define lex_emit_blank(first, last)                                         \
        lex_emit_other(LEX_TOK_BLANK, first, last)
#endif

/* Single-character operator: the token value is the character itself. */
#ifndef lex_emit_op
#define lex_emit_op(op, first, last)  lex_emit((long)(op), first, last)
#endif

/*
 * Compound operators pack 2..4 character values into one long token
 * value, first character in the least significant byte.
 */
#ifndef lex_emit_compound_op
#define lex_emit_compound_op(op1, op2, first, last)                         \
        lex_emit(((long)(op1) | ((long)(op2) << 8)), first, last)
#endif

#ifndef lex_emit_tricompound_op
#define lex_emit_tricompound_op(op1, op2, op3, first, last)                 \
        lex_emit(((long)(op1) | ((long)(op2) << 8)) |                       \
                ((long)(op3)<<16), first, last)
#endif

#ifndef lex_emit_quadcompound_op
#define lex_emit_quadcompound_op(op1, op2, op3, op4, first, last)           \
        lex_emit(((long)(op1) | ((long)(op2) << 8)) |                       \
                ((long)(op3) << 16) | ((long)(op4) << 24), first, last)
#endif

/* Used to limit number of nested comment levels. */
#ifndef LEX_MAX_NESTING_LEVELS
#define LEX_MAX_NESTING_LEVELS 100
#endif
    220 
    221 
    222 /* Keyword handling macros, see `keywords.c` for an example usage. */
    223 #ifdef LEX_KEYWORDS
    224 
    225 /*
    226  * This implements a switch statement branching on the 4 character
    227  * keyword tag (unsigned long value) which is produced by the lexers id
    228  * recognizer. A final check is needed with to ensure an exact
    229  * match with a given id. Two keywords rarely conflicts, but it is
    230  * possible, and therefore kw_begin kw_match kw_match ... kw_end is used
    231  * to cover this.
    232  *
    233  * See example usage elsewhere for details.
    234  *
    235  * The first element x0 is length '0'..'9' and ensure comparisons will
    236  * not overrun the buffer where the lexeme is stored during string
    237  * comparison, iff the keywords report the length correctly.
    238  *
    239  * The next elements in the tag are the first, second, and last
    240  * character of lexeme / keyword, replacing second character with '\0'
    241  * on single length keywords, so keyword 'e' is tagged '1', 'e', '\0', 'e',
    242  * and 'while' is tagged '5' 'w', 'h', 'e', where the length is lsb
    243  * and last chararacter is msb.
    244  *
    245  * An enum with tok_kw_<name> elements is expected to provide return
    246  * values on match. These should start at LEX_TOK_KW_BASE and are
    247  * negative.
    248  *
    249  */
    250 #define lex_kw_begin(x0, x1, x2, x3)                                    \
    251         case                                                            \
    252             ((unsigned long)(x0) |                                      \
    253             ((unsigned long)(x1) << 8) |                                \
    254             ((unsigned long)(x2) << 16) |                               \
    255             ((unsigned long)(x3) << 24)) :
    256 
    257 #define lex_kw_match(kw)                                                \
    258         if (memcmp(#kw, lexeme, sizeof(#kw) - 1) == 0)                  \
    259                 return tok_kw_##kw;
    260 
    261 #define lex_kw_end()                                                    \
    262         break;
    263 
    264 #define lex_kw(kw, x0, x1, x2, x3)                                      \
    265         lex_kw_begin(x0, x1, x2, x3)                                    \
    266             lex_kw_match(kw)                                            \
    267         lex_kw_end()
    268 
    269 static long lex_match_kw(unsigned long tag, const char *lexeme);
    270 
    271 /* Static so multiple grammers are possible in a single program. */
    272 #define LEX_KW_TABLE_BEGIN                                              \
    273 static long lex_match_kw(unsigned long tag, const char *lexeme)         \
    274 {                                                                       \
    275     switch (tag) {                                                      \
    276 
    277 #define LEX_KW_TABLE_END                                                \
    278     default:                                                            \
    279         break;                                                          \
    280     }                                                                   \
    281     return LEX_TOK_KW_NOT_FOUND;                                        \
    282 }
    283 
    284 #else
    285 
    286 /* Allow flagging in and out without unused warning or missing macros */
    287 #define lex_kw_begin(x0, x1, x2, x3)
    288 #define lex_kw_match(kw)
    289 #define lex_kw_end()
    290 #define lex_kw(kw, x0, x1, x2, x3)
    291 #define LEX_KEYWORD_TABLE_BEGIN
    292 #define LEX_KEYWORD_TABLE_END
    293 
    294 #endif /* LEX_KEYWORDS */
    295 
    296 
    297 
/*
 * Modes used for recovery when switching to a new buffer and handling
 * internal state changes for strings and comments.
 */
enum {
    /* Always 0, is initial lexer state. */
    LEX_MODE_NORMAL = 0,

    /* Returned if lex is given unsupported mode. */
    LEX_MODE_INVALID = 1,

    /*
     * Can be used in place of normal mode to consume optional bom
     * marker at buffer start. Only utf-8 bom is supported.
     */
    LEX_MODE_BOM,

    /*
     * Returned at end of buffer if mid string or mid comment, may also
     * be larger for nested comments as nesting level is encoded.
     */
    LEX_MODE_C_STRING,
    LEX_MODE_C_STRING_SQ,
    LEX_MODE_PYTHON_BLOCK_STRING,
    LEX_MODE_PYTHON_BLOCK_STRING_SQ,
    LEX_MODE_C_BLOCK_COMMENT,
    LEX_MODE_LINE_COMMENT,
    LEX_MODE_JULIA_NESTED_COMMENT,


    /*
     * Counter embedded in mode. Mode values at or above this base
     * presumably encode extra state such as comment nesting depth --
     * confirm against the lexer implementation in luthor.c.
     */
    LEX_MODE_COUNT_BASE = 16,
};
    331 
    332 
    333 
    334 /* ON CALLING AND USING LEX FUNCTION
    335  *
    336  * If utf-8 BOM possible, detect this before calling the lexer and
     337  * advance the buffer. JSON explicitly disallows BOM, but recommends
    338  * consuming it if present. If some other Unicode BOM is found, convert
    339  * the buffer first. The lexer assumes ALL non-ascii characters are
    340  * valid trailing identifiers which mostly works well. Strings with
    341  * broken utf-8 are passed on as is. utf-8 identifiers must be enabled
    342  * with #define LEX_ENABLE_UTF8_ID
    343  *
    344  * If required, postprocess identifiers and strings for valid utf-8.  It
    345  * is assumed that all keywords are at most 9 characters long and always
    346  * ASCII. Otherwise post process them in a hash table on identifier
    347  * event. This enables a fast compiled trie lookup of keywords.
    348  *
    349  * Newline and control characters are always emitted, also inside
    350  * strings and comments. The exception is \r, \n, \t, \0 which are
    351  * handled specially, or if the lexer is adapted to handle certain
    352  * control characters specially.
    353  *
    354  * Each token is not guaranteed correct, only to be delimited correct,
    355  * if it is indeed correct. Only very few tokens can be zero length, for
    356  * example, the parser can rely on string part token not being empty
    357  * which is important in dealing with line continuation. The end of
     358  * buffer token is empty, and so is the unterminated string token, and
    359  * also the comment end token for single line tokens, but not the
    360  * multi-line version. There is a token for every part of the input
    361  * stream, but the parser can easily define some to be ignored and have
    362  * them optimized out.
    363  *
    364  * Strings have start token, and optionally sequences of control,
    365  * escape, and newline tokens, followed by either string end token or
    366  * string unterminated token. Strings delimiters can be one
     367  * (single-line) or three double quotes (multi-line, like python, but
     368  * cannot be single quotes, unlike Python). Python, C and Javascript
    369  * string continuation is handled by having the parser observing string
    370  * escape followed by newline token. Escape is always a single
    371  * character '\' token, and the parser is responsible for consuming the
    372  * following content. If string syntax with double delimiter is used to
    373  * define escaped delimiter, this will occur as two separate strings
    374  * with no space between. The parser can handle this on its own; if, in
    375  * such strings, '\"' does not mean escaped delimiter, the string will
     376  * not terminate correctly, and the lexer must be adapted. Unterminated
    377  * string may happen at end of buffer, also for single line comments.
    378  * This is because the string might continue in a new buffer. The parser
    379  * should deal with this.
    380  *
    381  * Comments always start with a start token, followed by zero or more
    382  * comment part tokens interleaved with control and newline tokens,
    383  * terminated by either comment end token, or unterminated comment
    384  * token. If the comment is single, the unterminated comment token may
    385  * appear at the last line instead of the expected end of comment token
    386  * because the comment might continue in a new buffer. The parser
    387  * should deal with this. Escapes and line continuations have no effects
    388  * in comments, unlike strings.
    389  *
    390  * The lexer only carries one state variable: the mode. The mode can be
    391  * normal (default and equals zero), or single or multi string or
     392  * comment modes.  These modes are used to recover after switching
    393  * buffers as discussed below.
    394  *
    395  * The lexer can run to completion without involving the parser and
    396  * could be used to pipeline tokens into another thread for concurrent
    397  * parsing which is safe since the input buffer is considered read-only.
    398  *
    399  *
    400  * KEYWORDS
    401  *
    402  * Keywords are treated as identifiers by default. By including a
    403  * keyword table the `lex_emit_id` macro will check if the id is a
    404  * keyword and translate the token if it is. Using the provided keyword
    405  * table macros is just one way to do it. This is better explained by
    406  * looking at an example. Keyword lookup based on the precomputed keyword
    407  * tag provided to the lookup function are limited to 9 characters, but a
    408  * custom lookup function need not use it and then the tag precomputation
    409  * will be optimized out.
    410  *
    411  * Keywords are defined by the lookup function and should be negative
    412  * starting at LEX_TOK_KW_BASE to avoid conflicts with other token types.
    413  *
    414  *
    415  * WRAPPING MULTIPLE BUFFERS
    416  *
    417  * The user may need to deal with multiple buffers because data may
    418  * arrive asynchronously over the network, and may have many concurrent
    419  * lexing jobs. The emitter part is not difficult since a ring buffer
    420  * can grow, or the parser can be called directly (except queuing a few
    421  * tokens for backtracking as we shall see).
    422  *
    423  * If the lexer were an explicit statemachine as in Flex, we could get
    424  * an yywrap event to fill buffers, but our state is on the stack and in
    425  * registers for optimization. We may use co-routines, but it doesn't
    426  * cover all issues, and, as it turns out is not necessary with the
    427  * following restrictions on syntax:
    428  *
    429  * All variable length tokens such as numerics and identifiers are
    430  * limited in length. Strings and comments are not, but are broken into
    431  * zero, one, or several body tokens per line. ANSI-C limits line length
    432  * to 509 characters (allowing for continuation and two byte linebreaks
    433  * in a 512 byte buffer). But JSON has no line continuation for strings
    434  * and may (and often do) store everything on a single line. Whitespace
    435  * can also extend beyond given limit.
    436  *
    437  * If we ignore whitespace, strings and comments, we can discard the
    438  * last token (or last two in case there are paired tokens, such as
     439  * leading zero followed by numeric). Parsing can then resume in a new
    440  * buffer where the first 512 bytes (or similar) are duplicated from the
    441  * previous buffer. The lexer is then restarted at the last token (pair)
    442  * start which may turn out to change the length or even introduce a
     443  * different result such as introducing leading zero. The lexer needs no
    444  * specific state to do this.
    445  *
    446  * For strings and comments, we need a flag to allow entering the lexer
    447  * mid string or mid comment. The newline and line continuation tokens
    448  * need to be dropped, and the last body may need to be truncated as it
    449  * can embed a partial delimiter. The simplest way to deal with this is
    450  * to backtrack tokens until the last token begins at a safe position,
     451  * about 3-6 characters earlier, and truncating body segments that span
    452  * this barrier. Whitespace can also be truncated.
    453  *
    454  * We can generalize this further by going at least K bytes back in an N
    455  * overlap buffer region and require non-strings (and non-comments) to
    456  * not exceed N-K bytes, where K and N are specific to the syntax and
    457  * the I/O topology.
    458  *
    459  * We can add flags to tokens that can help decide how to enter
    460  * backtracking mode without covering every possible scanner loop - i.e.
    461  * are we mid string, mid comment, single-line or multi-line.
    462  *
    463  * All the lexer needs to do then, is to receive the backtracking mode
    464  * flags. A wrapping driver can deal with backtrack logic, which is
    465  * specific to how tokens are emitted. Whitespace need no recovery mode
    466  * but perhaps new whitespace should extend existing to simplify
    467  * parsing.
    468  */
    469 
    470 
    471 #endif /* LUTHOR_H */
    472