#ifndef LEX_TOKENS_H
#define LEX_TOKENS_H

/* Define LEX_DEBUG to enable token printing and describing functions. */


enum {

    /*
     * EOF is not emitted by lexer, but may be used by driver after
     * last buffer is processed.
     */
    LEX_TOK_EOF = 0,

    /*
     * Either EOB or EOS is emitted as the last token before exit,
     * or also ABORT in some lexers. Unterminated string or comment
     * will be emitted immediately before one of these when relevant.
     *
     * It may be useful to redefine lex_emit_eos and lex_emit_eob to
     * produce LEX_TOK_EOF or error directly for simple string lexing.
     */
    LEX_TOK_EOB = 1,
    LEX_TOK_EOS = 2,

    /*
     * ABORT can be used for early exit by some lexers while other
     * lexers may choose to run to buffer end regardless of input (with
     * the exception of deeply nested comments).
     */
    LEX_TOK_ABORT = 3,

    /*
     * Byte order marker. Only happens if the lexer was started in bom
     * mode and the input stream contains a leading bom marker.
     * The token can only be the first token in the stream. Utf-8 is the
     * only supported bom, but the lexeme may be checked in case other
     * boms are added later. Normally it is routed to lex_emit_other
     * along with comments so it just ignores the bom if present. It is
     * generally recommended to consume utf-8 bom for interoperability,
     * but also to not store it for the same reason.
     */
    LEX_TOK_BOM,

    /*
     * Any control character that is not newline or blank will be
     * emitted as a single character token here. This token is discussed
     * in several comments below. For strings and comments, also
     * blank control characters will be emitted since they are usually
     * not desired there.
     */
    LEX_TOK_CTRL,
    LEX_TOK_STRING_CTRL,
    LEX_TOK_COMMENT_CTRL,

    /*
     * Any printable ASCII character that is not otherwise consumed will
     * be issued as a single length symbol token. Further discussion
     * below. The symbol and CTRL tokens ensure that the entire input
     * stream is covered by tokens. If utf-8 identifiers have not been
     * flagged, utf-8 leading characters may also end up here, and so
     * may utf-8 characters in general, that are not viewed as valid
     * identifiers (depending on configuration).
     */
    LEX_TOK_SYMBOL,

    /*
     * Variable length identifier starting with (_A-Za-z) by default and
     * followed by zero or more (_A-Za-z0-9) characters. (_) can be
     * flagged out. utf-8 can be flagged in. By default any non-ASCII
     * character (0x80 and above), is treated as part of an identifier
     * for simplicity and speed, but this may be redefined. Any broken
     * utf-8 is not sanitized, thus 0x80 would be a valid identifier
     * token with utf-8 identifiers enabled, and otherwise it would be a
     * symbol token.
     *
     * The ID does a magic trick: It maps the lexeme to a very simple
     * and fast 32 bit hash code called a tag. The tag is emitted with
     * the id token and can be used for fast keyword lookup. The
     * hash tag is:
     *
     *     (length)(first char)(second char)(last char)
     *
     * where length is ASCII '0' .. '9' where any length overflow is an
     * arbitrary value, but such that the length is never longer than
     * the lexeme. The last char is the last char regardless of length.
     * For short identifiers, the second char may be the first char
     * duplicated, and the last char may be first char.
     *
     * This code is very simple to write by hand: "5whe" means while,
     * and can be used in a case switch before a strcmp with "while".
     * Conflicts are possible, but then several keywords are tested like
     * any other hash conflict. This keyword lookup is user driven, but
     * can follow example code quite straightforward.
     *
     * The lex_emit_id macro can be implemented to provide the above
     * lookup and inject a keyword token instead. By convention such
     * tokens have negative values to avoid conflicts with lexer
     * generated tokens.
     *
     * The ID also has a special role in prefixes and suffixes: C string
     * literals like (L"hello") and numeric literals like (42f) are
     * lexed as two tokens, one of which is an ID. The parser must
     * process this and observe absence of whitespace where such syntax
     * is relevant.
     *
     * While not specific to ID, the emitter macros can be designed to
     * keep track of start of lines and end of whitespace and attach
     * state flags to each token (at line start, after whitespace). The
     * whitespace tokens can then be dropped. This might help parsing
     * things like suffixes efficiently.
     */
    LEX_TOK_ID,

    /*
     * C-int     ::= pos-dec-digit dec-digit *
     * Julia-int ::= dec-digit+
     *
     * pos-dec-digit ::= '1'..'9'
     * dec-digit     ::= '0'..'9'
     *
     * Floating point numbers take precedence when possible so 00.10 is
     * always a decimal floating point value when decimal floats are
     * enabled.
     *
     * The C-int is automatically enabled if C-octals are enabled, and
     * disabled otherwise. There is no specific Julia-int type - we just
     * use the terminology to represent integers with leading zeroes.
     *
     * Julia style integers accept leading zeroes. C style integers with
     * leading zeroes are consumed as C style octal numbers, so 0019 is
     * parsed as either 0019(Julia-int), or 001(C-octal), 9(C-int).
     *
     * Single digit '0' maps to octal when C-octals are enabled and to
     * Julia-int otherwise. (Yes, integers are not that simple, it
     * seems).
     *
     * Both C and Julia octal numbers (see octal token) can be active
     * simultaneously. This can be used to control leading zero
     * behavior, even if C-octal numbers are not part of the grammar
     * being parsed. For example, a language might use 0o777 octal
     * numbers and disallow 0777 integers. Enabling C-octals makes this
     * easy to detect (but should accept octal 0).
     *
     * There is no distinction between the styles in the int token, but
     * leading zeroes are easily detected in the lexeme.
     *
     * Constant suffixes like 1L are treated as 1(INT), and L(ID). The
     * same goes for other numeric values.
     *
     * Parser should check for leading zeroes and decide if it is valid,
     * a warning, or an error (it is in JSON). This also goes for float.
     *
     * Numericals, not limited to INT, may appear shorter than they are
     * due to buffer splits. Special recovery is required, but will only
     * happen just before EOS or EOB tokens (i.e. buffer split events).
     */
    LEX_TOK_INT,

    /*
     * float ::= (int ['.' dec-digits*] dec-exponent)
     *         | ([int] '.' dec-digits* [dec-exponent])
     * dec-exponent ::= ('e' | 'E') ['+' | '-'] dec-digits*
     * dec-digits   ::= '0'..'9'
     * int          ::= dec-digits*
     *
     * Consumes a superset of C float representation without suffix.
     * Some invalid tokens such as 0.E are accepted. Valid tokens such
     * as 00.10 take precedence over octal numbers even if it is a
     * prefix, and the same is obviously true with respect to decimal
     * integers.
     *
     * JSON does not allow leading zeroes, and also not leading '.'.
     * This can easily be checked in the lexeme.
     *
     * The octal notation affecting integer leading zeroes is not
     * relevant to floats because floats take precedence over octal and
     * decimal int when containing '.', 'e' or 'E'.
     */
    LEX_TOK_FLOAT,

    /*
     * binary ::= (0b | 0B) ('0' | '1')*
     *
     * 0b100 or just 0b, parser must check that digits are present,
     * otherwise it may be interpreted as zero, just like octal zero
     * in C.
     *
     * Like 0X hex, 0B can be flagged out because Julia v0.3 does not
     * support uppercase 0B.
     */
    LEX_TOK_BINARY,

    /*
     * C-octal      ::= 0 octal-digit*
     * octal-digits ::= '0'..'7'
     *
     * Julia-octal  ::= 0o octal-digits*
     * octal-digits ::= '0'..'7'
     *
     * 0777 for C style octal numbers, or 0o777 for Julia syntax. Julia
     * v.0.3 does not allow uppercase 0O777, it would mean 0 * O777.
     *
     * When enabled, decimal floating points take precedence: 00.10 is
     * parsed as 00.10(decimal float), as per C standard.
     *
     * NOTE: It is possible for both styles to be active simultaneously.
     * This may be relevant in order to control handling of leading
     * zeroes in decimal integers.
     *
     * If C-octal numbers are flagged out, leading zeroes are mapped to
     * integers and the numerical value may change. Julia behaves this
     * way. Nothing prevents support of both C and Julia octal numbers,
     * but leading zeroes will then be interpreted the C way - it is not
     * recommended to do this.
     */
    LEX_TOK_OCTAL,

    /*
     * hex        ::= hex-int
     * hex-digits ::= 'a'..'f' | 'A'..'F' | '0'..'9'
     * hex-int    ::= (0x | 0X) hex-digits*
     *
     * where hex-digits are customizable (e.g. all lower case), and hex
     * prefix 0x can be flagged to be lower case only (as in Julia).
     *
     * If hex floats are enabled, they take precedence:
     * 0x1.0(hex-float), if not, 0x1.0 will parse as: 0x1(hex) followed
     * by .0(decimal float).
     *
     * The lead prefix 0x may be flagged to be lower case only because
     * this is required by Julia v0.3 where 0X means 0 * X. Julia
     * accepts uppercase in the remaining hex digits (and exponent for
     * floats). This could possibly change in future versions.
     *
     * The zero length sequence (0x | 0X) is accepted and left to the
     * parser since the lexer emits a token for everything it sees.
     * Conceptually it may be interpreted as zero, equivalent to 0 being
     * both octal prefix and numeric 0 in C style octal representation.
     * Or it may be an error.
     */
    LEX_TOK_HEX,

    /*
     * hex-float     ::= hex-int ['.' hex-digit*] hex-exponent
     * hex-exponent  ::= ('p' | 'P') ['+' | '-'] decimal-digit*
     * decimal-digit ::= '0'..'9'
     *
     * A superset of IEEE-754-2008 Hexadecimal Floating Point notation.
     *
     * We require the exponent to be present, but does not ensure the
     * value is otherwise complete, e.g. 0x1p+ would be accepted. The p
     * is needed because otherwise 0x1.f could be accepted, and f is a
     * float suffix in C, and juxtaposition factor (0x1. * f) in Julia,
     * at least, that is one possible interpretation.
     *
     * The exponent can be flagged optional in which case 0x1.f will be
     * consumed as a single hex float token.
     * This may either simply be accepted in some grammars, or used to
     * provide an error message. If the exponent is required, 0x1.f will
     * be lexed as three tokens:
     *
     *     <'0x1'(hex int), '.'(op), 'f'(id)>.
     *
     * Thus it may be a good idea to allow the exponent to be optional
     * anyway and issue an error message or warning if the p is absent
     * later in the parsing stage.
     *
     * Note that, as per IEEE-754, the exponent is a decimal power of
     * two. In other words, the number of bits to shift the
     * (hexa)decimal point. Also note that it is p and not e because e
     * is a hex digit.
     */
    LEX_TOK_HEX_FLOAT,

    /*
     * blank ::= ('\t' | '\x20')+
     *
     * Longest run in buffer holding only '\t' and '\x20' (space).
     *
     * buffer splits may generate adjacent blanks depending on recovery
     * processing. (The same goes for other line oriented runs such as
     * string parts and comment parts).
     */
    LEX_TOK_BLANK,

    /* newline ::= '\r' | '\n' | '\r\n' | '\n\r'
     *
     * Will always appear, also inside strings and comments. Can be used
     * to track line starts and counts reliably as only one newline is
     * issued at a time, and it is issued everywhere, also in strings
     * and comments.
     *
     * May be preceded by string escape token inside strings. This can
     * be interpreted as line continuation within strings specifically,
     * as is the case in Python and Javascript (and in C via
     * pre-processor).
     *
     * The LEX_TOK_STRING_NEWLINE is emitted inside strings so the ordinary
     * newline may be ignored in comments and other non-string content.
     */
    LEX_TOK_NEWLINE,
    LEX_TOK_STRING_NEWLINE,

    /*
     * string ::= string_start
     *            (string_part | string_escape |
     *             string_ctrl | string_newline)*
     *            (string_end | string_unterminated)
     *
     * There are several optional string styles. They all start with
     * this token. The length and content provide details. Python
     * may start with """ or ''' and this token will then have length
     * 3 and three quotes as lexeme content. If the lexer exits before
     * string end token, the returned lexer mode will remember the
     * state and can be used for reentry - this also goes for comments.
     *
     * Strings can only contain part, escape, newline, and control
     * tokens, and either string unterminated or string end token
     * at last.
     */
    LEX_TOK_STRING_BEGIN,

    /* Longest run without control characters, without (\), without
     * newline, and without the relevant end delimiter. The run may be
     * shortened due to buffer splits. The part may, as an exception,
     * begin with an end delimiter character or a (\) if it was
     * preceded by a string escape token. The escape character is
     * always (\). Strings that use "" or '' as escape will be treated
     * as start and end of separate strings. Strings that do not support
     * (\) should just treat escape as a part of the string.
     */
    LEX_TOK_STRING_PART,

    /*
     * This is always a single character token (\) and only happens
     * inside strings. See also string part token.
     */
    LEX_TOK_STRING_ESCAPE,

    /* This token is similar to string start. It may be absent at buffer
     * splits, but then an unterminated string token will be used
     * just before the split event token.
     */
    LEX_TOK_STRING_END,

    /*
     * This is emitted before the buffer ends, or before unescaped
     * newlines for line oriented string types (the usual strings).
     * At buffer splits, recovery should clean it up. The returned
     * mode allow parsing to continue in a new buffer with a slight
     * content overlap.
     *
     * If string like ("hello, world!") in C, reaches end of line, it
     * may be continued" ("hello, \)newline(world!"). If this line
     * continuation is flagged out, this will lead to string
     * unterminated, even if not at end of buffer. For block strings
     * like """hello""", this only happens at end of buffer.
     */
    LEX_TOK_STRING_UNTERMINATED,

    /*
     *
     * comment ::= comment_start
     *             (comment_part | ctrl | newline)*
     *             (comment_end | comment_unterminated)
     *
     *
     * Comments work like strings in most respects. They emit parts, and
     * control characters, but not escape characters, and cannot be
     * continued at end of line. Block comments are like python block
     * strings (''').
     *
     * Julia supports nested comments (#= ... #= =# =#). In this case
     * a new start token can be emitted before an end token. If the
     * parser exits due to buffer split, the mode has the nesting level
     * encoded so it can be resumed in a new buffer.
     *
     * Line comments will have their end token just before newline, or
     * unterminated comment just before buffer split token (EOB or EOS).
     * (\) characters are consumed by the comment part tokens and do not
     * affect the end of any comment.
     *
     * Comment begin may include extra characters when a doc comment is
     * recognized. The emitter flags this. End comments are unaffected.
     */
    LEX_TOK_COMMENT_BEGIN,
    LEX_TOK_COMMENT_PART,
    LEX_TOK_COMMENT_END,
    LEX_TOK_COMMENT_UNTERMINATED,

    /*
     * Issued before ABORT token if nesting level is above a predefined
     * level. This is to protect against malicious and misguided
     * content, otherwise the nesting level counter could wrap and
     * generate a different interpretation, which could be bad. The
     * parser would probably do similar things with nested tokens.
     */
    LEX_TOK_COMMENT_DEEPLY_NESTED,


    /* Operators are all recognized single character symbols, or up to
     * four characters. The token value is the ASCII codes shifted 8
     * bits per extra character, by default, but the emitter macros
     * can redefine this. Values below 32 are reserved token types as
     * discussed above.
     *
     * What exactly represents an operator depends on what the lexer has
     * enabled.
     *
     * Printable ASCII symbols that are NOT recognized, are emitted as
     * the SYMBOL token and is always length 1. The value can be derived
     * from the lexeme, but not the token itself. This may be perfectly
     * fine for the parser, or it may be used to indicate an error.
     * There are no illegal characters per se.
     *
     * Non-printable ASCII characters that are not covered by newline or
     * blank, are emitted as CTRL tokens. These act the same as the
     * symbol token and may be used to indicate error, or to handle form
     * feed and other whitespace not handled by default. Unlike symbol,
     * however, CTRL also appear in strings and comments since they are
     * generally not allowed and this makes it easy to capture (there is
     * virtually no performance overhead in providing this service
     * unless attempting to parse a binary format).
     */

    /* Don't bleed into this range. */
    LEX_TOK_OPERATOR_BASE = 32,


    /*
     * Operators use ASCII range.
     * Compound operators use range 0x80 to 0x7fff
     * and possibly above for triple sequences.
     * Custom keywords are normally negative but can be mapped
     * to any other.
     *
     * The layout is designed for efficient table lookup.
     * Compound operators might benefit from remapping down to a smaller
     * range for compact lookup tables, but it depends on the parser.
     */
};

/*
 * Custom keyword token range is negative, and well below -99..0 where
 * special codes are reserved.
 */
#ifndef LEX_TOK_KW_BASE
#define LEX_TOK_KW_BASE -1000
#endif

#ifndef LEX_TOK_KW_NOT_FOUND
#define LEX_TOK_KW_NOT_FOUND LEX_TOK_ID
#endif


#ifdef LEX_DEBUG

#include <stdio.h>
#include <string.h>

/*
 * Maps a token value to a short human readable description for
 * debugging. Named tokens map to fixed strings; other values are
 * classified by range (keyword, operator, compound operator, ...).
 */
static const char *lex_describe_token(long token)
{
    switch(token) {
    case LEX_TOK_BOM: return "BOM marker";
    case LEX_TOK_EOF: return "EOF";
    case LEX_TOK_EOS: return "buffer zero terminated";
    case LEX_TOK_EOB: return "buffer exhausted";
    case LEX_TOK_ABORT: return "abort";
    case LEX_TOK_CTRL: return "control";
    case LEX_TOK_STRING_CTRL: return "string control";
    case LEX_TOK_COMMENT_CTRL: return "comment control";
    case LEX_TOK_SYMBOL: return "symbol";
    case LEX_TOK_ID: return "identifier";
    case LEX_TOK_INT: return "integer";
    case LEX_TOK_FLOAT: return "float";
    case LEX_TOK_BINARY: return "binary";
    case LEX_TOK_OCTAL: return "octal";
    case LEX_TOK_HEX: return "hex";
    case LEX_TOK_HEX_FLOAT: return "hex float";
    case LEX_TOK_BLANK: return "blank";
    case LEX_TOK_NEWLINE: return "newline";
    case LEX_TOK_STRING_NEWLINE: return "string newline";
    case LEX_TOK_STRING_BEGIN: return "string begin";
    case LEX_TOK_STRING_PART: return "string part";
    case LEX_TOK_STRING_END: return "string end";
    case LEX_TOK_STRING_ESCAPE: return "string escape";
    case LEX_TOK_STRING_UNTERMINATED: return "unterminated string";
    case LEX_TOK_COMMENT_BEGIN: return "comment begin";
    case LEX_TOK_COMMENT_PART: return "comment part";
    case LEX_TOK_COMMENT_END: return "comment end";
    case LEX_TOK_COMMENT_UNTERMINATED: return "unterminated comment";
    case LEX_TOK_COMMENT_DEEPLY_NESTED: return "deeply nested comment";

    default:
        if (token < LEX_TOK_EOF) {
            return "keyword";
        }
        if (token < 32) {
            return "undefined";
        }
        if (token < 0x100L) {
            return "operator";
        }
        if (token < 0x10000L) {
            return "compound operator";
        }
        if (token < 0x1000000L) {
            return "tricompound operator";
        }
        /*
         * NOTE(review): this bound has 9 hex digits; it does cover the
         * maximum 4-char ASCII compound (0x7f7f7f7f) but breaks the
         * progression of the other bounds - confirm intended value.
         */
        if (token < 0x7f0000000L) {
            return "quadcompound operator";
        }
        return "reserved";
    }
}

/*
 * Prints one token as `line:pos description (0xvalue): `lexeme``.
 * EOS and CTRL lexemes are shown as ^XX hex escapes since they are
 * not printable.
 */
static void lex_fprint_token(FILE *fp,
        long token,
        const char *first, const char *last,
        int line, int pos)
{
    char buf[10];
    const char *lexeme = first;
    int len = (int)(last - first);
    switch (token) {
    case LEX_TOK_EOS:
    case LEX_TOK_CTRL:
        /*
         * Convert through unsigned char: if plain char is signed, a
         * byte like 0x9c would otherwise sign-extend and print as
         * "^ffffff9c" (wrong, and exactly filling the buffer).
         */
        sprintf(buf, "^%02x", (unsigned)(unsigned char)*first);
        lexeme = buf;
        len = (int)strlen(buf);
        break;
    default:
        break;
    }
    fprintf(fp, "%04d:%03d %s (0x%lx): `%.*s`\n",
            line, pos, lex_describe_token(token), token, len, lexeme);
}

#define lex_print_token(token, first, last, line, pos)                      \
        lex_fprint_token(stdout, token, first, last, line, pos)

#else /* LEX_DEBUG */

#define lex_describe_token(token) "debug not available"
#define lex_fprint_token(fp, token, first, last, line, pos) ((void)0)
#define lex_print_token(token, first, last, line, pos) ((void)0)

#endif /* LEX_DEBUG */


#endif /* LEX_TOKENS_H */