luthor.h (18021B)
/*
 * Mostly generic lexer that can be hacked to suit specific syntax. See
 * more detailed comments further down in this file.
 *
 * Normally include luthor.c instead of luthor.h so emitter functions
 * can be custom defined, and optionally also fast keyword definitions.
 *
 * At the very minimum, define lex_emit which other emitters default to.
 *
 * Create a wrapper function to drive the lex function in said file.
 *
 * Use this header in separate parser logic to access the token values
 * if relevant.
 */

#ifndef LUTHOR_H
#define LUTHOR_H

#ifdef LEX_KEYWORDS
#include <string.h> /* memcmp for kw match */
#endif

#include "tokens.h"

#ifndef lex_emit
#define lex_emit(token, first, last) ((void)0)
#endif

/*
 * Default for comments, bom, and other things that are not necessarily
 * of interest to the parser, but may be to buffer wrap handling,
 * debugging, and pretty printers.
 */
#ifndef lex_emit_other
#define lex_emit_other(token, first, last) ((void)0)
#endif

#ifndef lex_emit_eof
#define lex_emit_eof(pos) lex_emit(LEX_TOK_EOF, pos, pos)
#endif

#ifndef lex_emit_abort
#define lex_emit_abort(pos) lex_emit(LEX_TOK_ABORT, pos, pos)
#endif

#ifndef lex_emit_eob
#define lex_emit_eob(pos) lex_emit(LEX_TOK_EOB, pos, pos)
#endif

#ifndef lex_emit_eos
#define lex_emit_eos(first, last) lex_emit(LEX_TOK_EOS, first, last)
#endif

#ifndef lex_emit_bom
#define lex_emit_bom(first, last) lex_emit_other(LEX_TOK_BOM, first, last)
#endif

#ifndef lex_emit_id
#ifdef LEX_KEYWORDS
/* LEX_KW_TABLE_BEGIN .. LEX_KW_TABLE_END defines lex_match_kw. */
#define lex_emit_id(first, last, tag) lex_emit(lex_match_kw(tag, first), first, last)
#else
#define lex_emit_id(first, last, tag) lex_emit(LEX_TOK_ID, first, last)
#endif
#endif

/*
 * This is a default for unknown symbols. It may be treated as an error,
 * or it can be processed further by the parser instead of customizing
 * the lexer. It ensures that there is always a token for every part of
 * the input stream.
 */
#ifndef lex_emit_symbol
#define lex_emit_symbol(token, first, last) lex_emit(LEX_TOK_SYMBOL, first, last)
#endif

/*
 * Control characters 0x01 .. 0x1f, 0x7f(DEL), excluding \0\r\n\t which have
 * separate tokens.
 *
 * Control characters in strings and comments are passed on as body
 * elements, except \0\r\n which break the string up.
 */
#ifndef lex_emit_ctrl
#define lex_emit_ctrl(pos) lex_emit(LEX_TOK_CTRL, pos, pos + 1)
#endif

#ifndef lex_emit_string_ctrl
#define lex_emit_string_ctrl(pos) lex_emit(LEX_TOK_STRING_CTRL, pos, pos + 1)
#endif

#ifndef lex_emit_comment_ctrl
#define lex_emit_comment_ctrl(pos) lex_emit_other(LEX_TOK_COMMENT_CTRL, pos, pos + 1)
#endif

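/*
 * For example, a translation unit that provides its own emitter could
 * look roughly like this (a hedged sketch: my_emit is a hypothetical
 * name, and the wrapper that drives the lex function in luthor.c is
 * omitted):
 *
 *     static void my_emit(long token, const char *first, const char *last);
 *
 *     #define lex_emit(token, first, last) my_emit((long)(token), first, last)
 *     #include "luthor.c"
 *
 *     static void my_emit(long token, const char *first, const char *last)
 *     {
 *         // first/last delimit the lexeme in the read-only input buffer
 *     }
 */
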
/*
 * This enables the user to both count lines and to calculate the
 * character offset of subsequent lexemes. A new line starts a lexeme;
 * the line break symbol is located just before it (at lexeme minus the
 * skipped break) and has length 2 if it is a \r\n or \n\r break, and 1
 * otherwise.
 */
#ifndef lex_emit_newline
#define lex_emit_newline(first, last) lex_emit(LEX_TOK_NEWLINE, first, last)
#endif

#ifndef lex_emit_string_newline
#define lex_emit_string_newline(first, last) lex_emit(LEX_TOK_STRING_NEWLINE, first, last)
#endif

#ifndef lex_emit_int
#define lex_emit_int(first, last) lex_emit(LEX_TOK_INT, first, last)
#endif

#ifndef lex_emit_float
#define lex_emit_float(first, last) lex_emit(LEX_TOK_FLOAT, first, last)
#endif

#ifndef lex_emit_int_suffix
#define lex_emit_int_suffix(first, last) lex_emit(LEX_TOK_INT_SUFFIX, first, last)
#endif

#ifndef lex_emit_float_suffix
#define lex_emit_float_suffix(first, last) lex_emit(LEX_TOK_FLOAT_SUFFIX, first, last)
#endif

#ifndef lex_emit_binary
#define lex_emit_binary(first, last) lex_emit(LEX_TOK_BINARY, first, last)
#endif

#ifndef lex_emit_octal
#define lex_emit_octal(first, last) lex_emit(LEX_TOK_OCTAL, first, last)
#endif

#ifndef lex_emit_hex
#define lex_emit_hex(first, last) lex_emit(LEX_TOK_HEX, first, last)
#endif

#ifndef lex_emit_hex_float
#define lex_emit_hex_float(first, last) lex_emit(LEX_TOK_HEX_FLOAT, first, last)
#endif

/*
 * The comment token can be used to aid backtracking during buffer
 * switch.
 */
#ifndef lex_emit_comment_begin
#define lex_emit_comment_begin(first, last, is_doc) \
    lex_emit_other(LEX_TOK_COMMENT_BEGIN, first, last)
#endif

#ifndef lex_emit_comment_part
#define lex_emit_comment_part(first, last) lex_emit_other(LEX_TOK_COMMENT_PART, first, last)
#endif

#ifndef lex_emit_comment_end
#define lex_emit_comment_end(first, last) lex_emit_other(LEX_TOK_COMMENT_END, first, last)
#endif

#ifndef lex_emit_comment_unterminated
#define lex_emit_comment_unterminated(pos) \
    lex_emit_other(LEX_TOK_COMMENT_UNTERMINATED, pos, pos)
#endif

#ifndef lex_emit_comment_deeply_nested
#define lex_emit_comment_deeply_nested(pos) \
    lex_emit_other(LEX_TOK_COMMENT_DEEPLY_NESTED, pos, pos)
#endif

#ifndef lex_emit_string_begin
#define lex_emit_string_begin(first, last) lex_emit(LEX_TOK_STRING_BEGIN, first, last)
#endif

#ifndef lex_emit_string_part
#define lex_emit_string_part(first, last) lex_emit(LEX_TOK_STRING_PART, first, last)
#endif

#ifndef lex_emit_string_end
#define lex_emit_string_end(first, last) lex_emit(LEX_TOK_STRING_END, first, last)
#endif

#ifndef lex_emit_string_escape
#define lex_emit_string_escape(first, last) lex_emit(LEX_TOK_STRING_ESCAPE, first, last)
#endif

#ifndef lex_emit_string_unterminated
#define lex_emit_string_unterminated(pos) \
    lex_emit(LEX_TOK_STRING_UNTERMINATED, pos, pos)
#endif

#ifndef lex_emit_blank
#define lex_emit_blank(first, last) \
    lex_emit_other(LEX_TOK_BLANK, first, last)
#endif

#ifndef lex_emit_op
#define lex_emit_op(op, first, last) lex_emit((long)(op), first, last)
#endif

#ifndef lex_emit_compound_op
#define lex_emit_compound_op(op1, op2, first, last) \
    lex_emit(((long)(op1) | ((long)(op2) << 8)), first, last)
#endif

#ifndef lex_emit_tricompound_op
#define lex_emit_tricompound_op(op1, op2, op3, first, last) \
    lex_emit(((long)(op1) | ((long)(op2) << 8)) | \
        ((long)(op3) << 16), first, last)
#endif

#ifndef lex_emit_quadcompound_op
#define lex_emit_quadcompound_op(op1, op2, op3, op4, first, last) \
    lex_emit(((long)(op1) | ((long)(op2) << 8)) | \
        ((long)(op3) << 16) | ((long)(op4) << 24), first, last)
#endif

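/*
 * With the default op emitters above, an operator token is simply its
 * characters packed into a long, least significant byte first, so a
 * parser can match compound operators by value. A small illustrative
 * sketch (MY_OP2 and my_handle_op are hypothetical names, not part of
 * this header):
 *
 *     #define MY_OP2(a, b) ((long)(a) | ((long)(b) << 8))
 *
 *     static void my_handle_op(long token)
 *     {
 *         if (token == MY_OP2('=', '=')) {
 *             // compound operator "=="
 *         } else if (token == '=') {
 *             // single character operator emitted by lex_emit_op
 *         }
 *     }
 */
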
/* Used to limit the number of nested comment levels. */
#ifndef LEX_MAX_NESTING_LEVELS
#define LEX_MAX_NESTING_LEVELS 100
#endif


/* Keyword handling macros, see `keywords.c` for an example usage. */
#ifdef LEX_KEYWORDS

/*
 * This implements a switch statement branching on the 4 character
 * keyword tag (unsigned long value) which is produced by the lexer's id
 * recognizer. A final check is needed to ensure an exact match with a
 * given id. Two keywords rarely conflict, but it is possible, and
 * therefore kw_begin kw_match kw_match ... kw_end is used to cover
 * this.
 *
 * See example usage elsewhere for details.
 *
 * The first element x0 is the length '0'..'9' and ensures comparisons
 * will not overrun the buffer where the lexeme is stored during string
 * comparison, iff the keywords report the length correctly.
 *
 * The next elements in the tag are the first, second, and last
 * character of the lexeme / keyword, replacing the second character
 * with '\0' on single length keywords, so keyword 'e' is tagged
 * '1', 'e', '\0', 'e', and 'while' is tagged '5', 'w', 'h', 'e',
 * where the length is lsb and the last character is msb.
 *
 * An enum with tok_kw_<name> elements is expected to provide return
 * values on match. These should start at LEX_TOK_KW_BASE and are
 * negative.
 */
#define lex_kw_begin(x0, x1, x2, x3) \
    case \
        ((unsigned long)(x0) | \
        ((unsigned long)(x1) << 8) | \
        ((unsigned long)(x2) << 16) | \
        ((unsigned long)(x3) << 24)) :

#define lex_kw_match(kw) \
    if (memcmp(#kw, lexeme, sizeof(#kw) - 1) == 0) \
        return tok_kw_##kw;

#define lex_kw_end() \
    break;

#define lex_kw(kw, x0, x1, x2, x3) \
    lex_kw_begin(x0, x1, x2, x3) \
    lex_kw_match(kw) \
    lex_kw_end()

static long lex_match_kw(unsigned long tag, const char *lexeme);

/* Static so multiple grammars are possible in a single program. */
#define LEX_KW_TABLE_BEGIN \
static long lex_match_kw(unsigned long tag, const char *lexeme) \
{ \
    switch (tag) {

#define LEX_KW_TABLE_END \
    default: \
        break; \
    } \
    return LEX_TOK_KW_NOT_FOUND; \
}

#else

/* Allow flagging in and out without unused warning or missing macros */
#define lex_kw_begin(x0, x1, x2, x3)
#define lex_kw_match(kw)
#define lex_kw_end()
#define lex_kw(kw, x0, x1, x2, x3)
#define LEX_KW_TABLE_BEGIN
#define LEX_KW_TABLE_END

#endif /* LEX_KEYWORDS */

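/*
 * In the spirit of the keywords.c example referenced above, a keyword
 * table could look roughly like this (a hedged sketch; emitter setup as
 * in the earlier sketch is omitted, and the exact placement of the
 * tok_kw_* values relative to LEX_TOK_KW_BASE should follow tokens.h
 * and keywords.c):
 *
 *     #define LEX_KEYWORDS
 *     #include "luthor.c"
 *
 *     enum {
 *         tok_kw_if = LEX_TOK_KW_BASE,
 *         tok_kw_while,
 *     };
 *
 *     // tag: length, first, second, last character as described above
 *     LEX_KW_TABLE_BEGIN
 *         lex_kw(if, '2', 'i', 'f', 'f')
 *         lex_kw(while, '5', 'w', 'h', 'e')
 *     LEX_KW_TABLE_END
 */
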
/*
 * Modes used for recovery when switching to a new buffer and handling
 * internal state changes for strings and comments.
 */
enum {
    /* Always 0, is initial lexer state. */
    LEX_MODE_NORMAL = 0,

    /* Returned if lex is given unsupported mode. */
    LEX_MODE_INVALID = 1,

    /*
     * Can be used in place of normal mode to consume optional bom
     * marker at buffer start. Only utf-8 bom is supported.
     */
    LEX_MODE_BOM,

    /*
     * Returned at end of buffer if mid string or mid comment, may also
     * be larger for nested comments as nesting level is encoded.
     */
    LEX_MODE_C_STRING,
    LEX_MODE_C_STRING_SQ,
    LEX_MODE_PYTHON_BLOCK_STRING,
    LEX_MODE_PYTHON_BLOCK_STRING_SQ,
    LEX_MODE_C_BLOCK_COMMENT,
    LEX_MODE_LINE_COMMENT,
    LEX_MODE_JULIA_NESTED_COMMENT,


    /* Counter embedded in mode. */
    LEX_MODE_COUNT_BASE = 16,
};



/* ON CALLING AND USING LEX FUNCTION
 *
 * If a utf-8 BOM is possible, detect this before calling the lexer and
 * advance the buffer. JSON explicitly disallows BOM, but recommends
 * consuming it if present. If some other Unicode BOM is found, convert
 * the buffer first. The lexer assumes ALL non-ascii characters are
 * valid trailing identifiers, which mostly works well. Strings with
 * broken utf-8 are passed on as is. utf-8 identifiers must be enabled
 * with #define LEX_ENABLE_UTF8_ID
 *
 * If required, postprocess identifiers and strings for valid utf-8. It
 * is assumed that all keywords are at most 9 characters long and always
 * ASCII. Otherwise post process them in a hash table on identifier
 * event. This enables a fast compiled trie lookup of keywords.
 *
 * Newline and control characters are always emitted, also inside
 * strings and comments. The exception is \r, \n, \t, \0 which are
 * handled specially, or if the lexer is adapted to handle certain
 * control characters specially.
 *
 * Each token is not guaranteed correct, only to be delimited correctly,
 * if it is indeed correct. Only very few tokens can be zero length; for
 * example, the parser can rely on the string part token not being
 * empty, which is important in dealing with line continuation. The end
 * of buffer token is empty, and so is the unterminated string token,
 * and also the comment end token for single-line comments, but not the
 * multi-line version. There is a token for every part of the input
 * stream, but the parser can easily define some to be ignored and have
 * them optimized out.
 *
 * Strings have a start token, and optionally sequences of control,
 * escape, and newline tokens, followed by either a string end token or
 * a string unterminated token. String delimiters can be one double
 * quote (single-line) or three double quotes (multi-line, like Python,
 * but unlike Python they cannot be single quotes). Python, C and
 * Javascript string continuation is handled by having the parser
 * observe a string escape followed by a newline token. Escape is always
 * a single character '\' token, and the parser is responsible for
 * consuming the following content. If string syntax with a doubled
 * delimiter is used to define an escaped delimiter, this will occur as
 * two separate strings with no space between. The parser can handle
 * this on its own; if, in such strings, '\"' does not mean escaped
 * delimiter, the string will not terminate correctly, and the lexer
 * must be adapted. An unterminated string may happen at the end of a
 * buffer, also for single-line comments. This is because the string
 * might continue in a new buffer. The parser should deal with this.
 *
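 * For illustration, a parser receiving these tokens could assemble a
 * string value roughly as follows (a hedged sketch; my_buf_* and
 * my_on_string are hypothetical helpers, token constants are from
 * tokens.h):
 *
 *     switch (token) {
 *     case LEX_TOK_STRING_BEGIN:
 *         my_buf_reset(&buf);
 *         break;
 *     case LEX_TOK_STRING_PART:
 *         my_buf_append(&buf, first, (size_t)(last - first));
 *         break;
 *     case LEX_TOK_STRING_ESCAPE:
 *         // single '\' token: decode and consume the following content
 *         break;
 *     case LEX_TOK_STRING_END:
 *         my_on_string(&buf);
 *         break;
 *     case LEX_TOK_STRING_UNTERMINATED:
 *         // possibly just the end of this buffer, see buffer wrapping below
 *         break;
 *     }
 *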
 * Comments always start with a start token, followed by zero or more
 * comment part tokens interleaved with control and newline tokens,
 * terminated by either a comment end token, or an unterminated comment
 * token. If the comment is single-line, the unterminated comment token
 * may appear at the last line instead of the expected end of comment
 * token because the comment might continue in a new buffer. The parser
 * should deal with this. Escapes and line continuations have no effect
 * in comments, unlike strings.
 *
 * The lexer only carries one state variable: the mode. The mode can be
 * normal (default and equals zero), or single or multi string or
 * comment modes. These modes are used to recover after switching
 * buffers as discussed below.
 *
 * The lexer can run to completion without involving the parser and
 * could be used to pipeline tokens into another thread for concurrent
 * parsing, which is safe since the input buffer is considered
 * read-only.
 *
 *
 * KEYWORDS
 *
 * Keywords are treated as identifiers by default. By including a
 * keyword table the `lex_emit_id` macro will check if the id is a
 * keyword and translate the token if it is. Using the provided keyword
 * table macros is just one way to do it. This is better explained by
 * looking at an example. Keyword lookup based on the precomputed
 * keyword tag provided to the lookup function is limited to 9
 * characters, but a custom lookup function need not use it and then the
 * tag precomputation will be optimized out.
 *
 * Keyword tokens are defined by the lookup function and should be
 * negative, starting at LEX_TOK_KW_BASE, to avoid conflicts with other
 * token types.
 *
 *
 * WRAPPING MULTIPLE BUFFERS
 *
 * The user may need to deal with multiple buffers because data may
 * arrive asynchronously over the network, and there may be many
 * concurrent lexing jobs. The emitter part is not difficult since a
 * ring buffer can grow, or the parser can be called directly (except
 * for queuing a few tokens for backtracking, as we shall see).
 *
 * If the lexer were an explicit state machine as in Flex, we could get
 * a yywrap event to fill buffers, but our state is on the stack and in
 * registers for optimization. We could use co-routines, but they do not
 * cover all issues, and, as it turns out, are not necessary with the
 * following restrictions on syntax:
 *
 * All variable length tokens such as numerics and identifiers are
 * limited in length. Strings and comments are not, but are broken into
 * zero, one, or several body tokens per line. ANSI-C limits line length
 * to 509 characters (allowing for continuation and two byte linebreaks
 * in a 512 byte buffer). But JSON has no line continuation for strings
 * and may (and often does) store everything on a single line.
 * Whitespace can also extend beyond the given limit.
 *
 * If we ignore whitespace, strings and comments, we can discard the
 * last token (or the last two in case there are paired tokens, such as
 * a leading zero followed by a numeric). Parsing can then resume in a
 * new buffer where the first 512 bytes (or similar) are duplicated from
 * the previous buffer. The lexer is then restarted at the last token
 * (pair) start, which may turn out to change the length or even
 * introduce a different result, such as introducing a leading zero.
 * The lexer needs no specific state to do this.
 *
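 * For illustration, the non-string part of such a buffer switch could
 * look roughly like this in a wrapping driver (a hedged sketch;
 * old_buf, new_buf, last_token_start, my_read_more and
 * my_tokens_drop_from are hypothetical names owned by the driver):
 *
 *     size_t tail = (size_t)(last_token_start - old_buf);
 *     size_t keep = old_len - tail;
 *
 *     // duplicate the possibly cut-off tail at the front of the new
 *     // buffer, append fresh input, drop the tokens that will be
 *     // re-lexed, and restart the lexer at the start of new_buf.
 *     memcpy(new_buf, old_buf + tail, keep);
 *     new_len = keep + my_read_more(new_buf + keep, new_cap - keep);
 *     my_tokens_drop_from(last_token_start);
 *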
 * For strings and comments, we need a flag to allow entering the lexer
 * mid string or mid comment. The newline and line continuation tokens
 * need to be dropped, and the last body may need to be truncated as it
 * can embed a partial delimiter. The simplest way to deal with this is
 * to backtrack tokens until the last token begins at a safe position,
 * about 3-6 characters earlier, and truncate body segments that span
 * this barrier. Whitespace can also be truncated.
 *
 * We can generalize this further by going at least K bytes back in an
 * overlap buffer region of N bytes and requiring non-strings (and
 * non-comments) to not exceed N-K bytes, where K and N are specific to
 * the syntax and the I/O topology (for example, K = 64 and N = 512
 * would allow such tokens up to 448 bytes).
 *
 * We can add flags to tokens that can help decide how to enter
 * backtracking mode without covering every possible scanner loop - i.e.
 * are we mid string, mid comment, single-line or multi-line.
 *
 * All the lexer needs to do then, is to receive the backtracking mode
 * flags. A wrapping driver can deal with the backtrack logic, which is
 * specific to how tokens are emitted. Whitespace needs no recovery
 * mode, but perhaps new whitespace should extend existing whitespace to
 * simplify parsing.
 */


#endif /* LUTHOR_H */