protoverse

A metaverse protocol
git clone git://jb55.com/protoverse
Log | Files | Refs | README | LICENSE

commit 4df9d18ed9e1b11b609ef859afbd2076c67f1840
parent 0efd3a0318c06340896e5ff9a79df7454ea3313f
Author: William Casarin <jb55@jb55.com>
Date:   Sun, 16 Apr 2023 15:35:15 -0700

tweak tokenizer to support wast

A little hacky but now I can build a wast parser

Diffstat:
M src/cursor.h | 2 +-
M src/parse.c | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/parse.h | 28 ++++++++++++++++++++++++----
M src/parser.h | 2 +-
M src/protoverse.c | 5 +++--
5 files changed, 222 insertions(+), 44 deletions(-)

diff --git a/src/cursor.h b/src/cursor.h @@ -97,7 +97,7 @@ static inline void copy_cursor(struct cursor *src, struct cursor *dest) static inline int pull_byte(struct cursor *cursor, u8 *c) { - if (unlikely(cursor->p + 1 > cursor->end)) + if (unlikely(cursor->p >= cursor->end)) return 0; *c = *cursor->p; diff --git a/src/parse.c b/src/parse.c @@ -9,6 +9,7 @@ #include <ctype.h> #include <stdlib.h> #include <assert.h> +#include <tgmath.h> #ifdef DEBUG #define tokdebug printf @@ -73,6 +74,7 @@ static const char *token_error_string(enum token_error err) case TE_STR_START_CHAR: return "string didn't start with \""; case TE_SYM_START_CHAR: return "symbol didn't start with a-z"; case TE_NUM_START_CHAR: return "number didn't start with 0-9 or -"; + case TE_NAN_START_CHAR: return "nan number didn't start with nan:"; case TE_SYM_CHAR: return "invalid symbol character"; case TE_NUM_CHAR: return "invalid number character"; case TE_SYM_OVERFLOW: return "symbol push overflow"; @@ -207,7 +209,7 @@ void print_token_error(struct token_cursor *cursor) cursor->err == TE_NUM_CHAR || cursor->err == TE_SYM_CHAR; - printf("\nerror: %s %.*s\n", token_error_string(cursor->err), + printf("\nerror: %s: %.*s\n", token_error_string(cursor->err), is_chr_data?1:0, (char*)&cursor->err_data.c); } @@ -299,12 +301,12 @@ static int push_number(struct token_cursor *tokens, struct tok_str str) static int is_start_symbol_char(char c) { - return c >= 'a' && c <= 'z'; + return (c >= 'a' && c <= 'z') || c == '$' || c == '='; } static int is_symbol_char(char c) { - return is_start_symbol_char(c) || c == '-' || c == '_' || + return is_start_symbol_char(c) || c == '.' 
|| c == '-' || c == '_' || (c >= '0' && c <= '9'); } @@ -330,53 +332,125 @@ static int pull_escaped_char(struct cursor *cursor, u8 *c) return 2; } -static int pull_number(struct token_cursor *cursor, u8 **start) +static int pull_str(struct cursor *cur, const char *str) +{ + int len = strlen(str); + + if (cur->p + len >= cur->end) + return 0; + + if (memcmp(cur->p, str, len) != 0) + return 0; + + cur->p += len; + return 1; +} + +static int pull_nan(struct cursor *cur, enum nantype *type) +{ + u8 *start = cur->p; + + if (!consume_byte(cur, 'n')) goto fail; + if (!consume_byte(cur, 'a')) goto fail; + if (!consume_byte(cur, 'n')) goto fail; + if (!consume_byte(cur, ':')) goto fail; + + if (pull_str(cur, "arithmetic")) { + if (type) *type = NAN_ARITHMETIC; + return 1; + } + + if (pull_str(cur, "canonical")) { + if (type) *type = NAN_CANONICAL; + return 1; + } + + while (cur->p < cur->end) { + if (*cur->p == ')' || isspace(*cur->p)) + return 1; + cur->p++; + } + +fail: + cur->p = start; + return 0; +} + +static int pull_inf(struct cursor *cur) { + u8 *start = cur->p; + u8 c; + consume_byte(cur, '-'); + consume_byte(cur, '+'); + if (!consume_byte(cur, 'i')) goto fail; + if (!consume_byte(cur, 'n')) goto fail; + if (!consume_byte(cur, 'f')) goto fail; + if (pull_byte(cur, &c)) { + if (isspace(c) || c == ')') { + cur->p--; + return 1; + } + goto fail; + } + + return 1; +fail: + cur->p = start; + return 0; +} + +static int pull_number(struct token_cursor *cursor) { int ok = 1; int chars = 0; u8 c; + u8 *start = cursor->c.p; - struct cursor temp; + if (pull_nan(&cursor->c, NULL)) + return 1; - *start = temp.p = cursor->c.p; - temp.end = cursor->c.end; + if (pull_inf(&cursor->c)) + return 1; while (1) { - ok = pull_byte(&temp, &c); - if (!ok) return 0; + if (!pull_byte(&cursor->c, &c)) + goto fail; - /* first char should start with a letter */ + /* first char shouldn't start with a letter */ if (chars == 0 && !isdigit(c) && c != '-') { cursor->err = TE_NUM_START_CHAR; 
cursor->err_data.c = c; - return 0; + goto fail; } else if (chars > 0 && (isspace(c) || c == ')')) { /* we got a number */ break; - } else if (chars > 0 && !isdigit(c) && c != '.') { + } else if (chars > 0 && !isxdigit(c) && c != ':' && c != '.' && c != 'n' && c != 'a' && c != 'x' && c != 'p' && c != '-' && c != '+') { cursor->err = TE_NUM_CHAR; cursor->err_data.c = c; - return 0; + goto fail; } chars++; if (!ok) { cursor->err = TE_SYM_OVERFLOW; - return 0; + goto fail; } } if (!ok) { cursor->err = TE_SYM_OVERFLOW; - return 0; + goto fail; } - cursor->c.p = temp.p-1; + cursor->c.p--; cursor->err = TE_OK; /* remove the first counted quote since this was not pushed */ return chars; + +fail: + cursor->c.p = start; + return 0; } static int pull_string(struct token_cursor *cursor, u8 **start, int *len) @@ -515,12 +589,10 @@ static int read_and_push_atom(struct token_cursor *cursor, struct token_cursor * } start = cursor->c.p; - ok = pull_number(cursor, &start); - if (ok) { - str.len = ok; + if (pull_number(cursor)) { + str.len = cursor->c.p - start; str.data = start; - ok = push_number(tokens, str); - if (!ok) { + if (!push_number(tokens, str)) { printf("read_and_push_atom number push overflow\n"); return 0; } @@ -555,13 +627,86 @@ static void consume_line(struct cursor *cur) { static int consume_comment(struct cursor *cur) { u8* start = cur->p; if (!consume_byte(cur, ';')) - return 0; - if (!consume_byte(cur, ';')) { - cur->p = start; - return 0; - } + goto fail; + if (!consume_byte(cur, ';')) + goto fail; consume_line(cur); return 1; + +fail: + cur->p = start; + return 0; +} + +static int parse_hex_float(const char *str, int len, double *result) { + struct cursor _cur = {0}, *cur = NULL; + double significand = 0.0; + char *endptr; + int significand_digits = 0; + int fractional_digits = 0; + int found_period = 0; + int exponent = 0; + int sign = 1; + u8 c; + u8 *start = cur->p; + + cur = &_cur; + make_cursor((u8*)str, (u8*)str + len, cur); + + if (!pull_byte(cur, &c)) 
+ goto fail; + + if (c == '-') { + sign = -1; + if (!pull_byte(cur, &c)) goto fail; + } else if (c == '+') { + if (!pull_byte(cur, &c)) goto fail; + } + + if (c != '0') + goto fail; + + if (!pull_byte(cur, &c)) + goto fail; + + if (tolower(c) != 'x') + goto fail; + + while (pull_byte(cur, &c) && (isxdigit(c) || c == '.')) { + if (c == '.') { + if (found_period) goto fail; + found_period = 1; + } else { + int digit_value = isdigit(c) ? c - '0' : 10 + tolower(c) - 'a'; + significand = significand * 16 + digit_value; + if (found_period) + fractional_digits++; + significand_digits++; + } + } + + if (significand_digits == 0) + goto fail; + + if (tolower(c) == 'p') { + if (!pull_byte(cur, &c)) + goto fail; + exponent = strtol((char *)cur->p - 1, &endptr, 10); + if ((char *)cur->p - 1 == endptr) + goto fail; + cur->p = (u8 *)endptr; + } else { + cur->p--; // Unpull the byte that is not 'p' + } + + exponent -= 4 * fractional_digits; + *result = sign * ldexp(significand, exponent); + + return 1; + +fail: + cur->p = start; + return 0; } int tokenize_cells(u8 *buf, int buf_size, struct token_cursor *tokens) @@ -785,7 +930,7 @@ static int pull_number_token(struct token_cursor *tokens, struct tok_str *str) return parse_stringy_token(tokens, str, T_NUMBER); } -static int parse_number(struct token_cursor *tokens, union number *number) +static int parse_number(struct token_cursor *tokens, struct number *number) { int ok; struct tok_str str; @@ -794,12 +939,16 @@ static int parse_number(struct token_cursor *tokens, union number *number) ok = pull_number_token(tokens, &str); if (!ok) return 0; - /* TODO: float numbers */ - number->integer = strtol((char*)str.data, &end, 10); + if (parse_hex_float((const char*)str.data, str.len, &number->value.fdouble)) { + number->type = NUM_FLOAT; + } else { + number->type = NUM_INTEGER; + number->value.integer = strtol((char*)str.data, &end, 10); - if ((u8*)end != (str.data + str.len)) { - printf("parse_number failed\n"); - return 0; + if 
((u8*)end != (str.data + str.len)) { + printf("parse_number failed\n"); + return 0; + } } return 1; @@ -939,9 +1088,17 @@ static int parse_size(struct token_cursor *tokens, struct attribute *attr) ok = parse_number(&temp, &attr->data.number); if (!ok) return 0; - tokdebug("attribute %s %d\n", - attr_type_str(attr->type), - attr->data.number.integer); +#ifdef DEBUG + if (attr->data.number.type == NUM_INTEGER) { + tokdebug("attribute %s %d\n", + attr_type_str(attr->type), + attr->data.number.value.integer); + } else if (attr->data.number.type == NUM_FLOAT) { + tokdebug("attribute %s %f\n", + attr_type_str(attr->type), + attr->data.number.value.fdouble); + } +#endif copy_token_cursor(&temp, tokens); @@ -1407,7 +1564,7 @@ int init_parser(struct parser *parser) int ok; int attrs_size = sizeof(struct attribute) * 1024; - int tokens_size = 2048*32; + int tokens_size = 2048*256; int cells_size = sizeof(struct cell) * 1024; int memsize = attrs_size + tokens_size + cells_size; diff --git a/src/parse.h b/src/parse.h @@ -19,6 +19,7 @@ enum token_error { TE_UNEXPECTED_TOKEN, TE_UNEXPECTED_SYMBOL, TE_SYM_OVERFLOW, + TE_NAN_START_CHAR, }; enum cell_type { @@ -78,9 +79,28 @@ struct tok_str { int len; }; -union number { - int integer; - double fdouble; +enum numtype { + NUM_INTEGER, + NUM_FLOAT +}; + +enum nantype { + NAN_ARITHMETIC, + NAN_CANONICAL, + NAN_LITERAL +}; + +struct number { + enum numtype type; + union { + int integer; + double fdouble; + } value; +}; + +struct nan { + enum nantype type; + struct number number; }; struct bufstr { @@ -97,7 +117,7 @@ union attr_data { struct bufstr str; struct data_attr data_attr; enum shape shape; - union number number; + struct number number; }; struct attribute { diff --git a/src/parser.h b/src/parser.h @@ -25,7 +25,7 @@ static int consume_bytes(struct cursor *cursor, const unsigned char *match, int static inline int consume_byte(struct cursor *cursor, unsigned char match) { - if (unlikely(cursor->p + 1 > cursor->end)) + if 
(unlikely(cursor->p >= cursor->end)) return 0; if (*cursor->p != match) return 0; diff --git a/src/protoverse.c b/src/protoverse.c @@ -139,7 +139,8 @@ extern char **environ; int main(int argc, const char *argv[]) { - static u8 buf[4096*256]; + const int buflen = 10000000; + u8 *buf = malloc(buflen); char **env = environ; const char *space, *code_file; const char *cmd; @@ -165,7 +166,7 @@ int main(int argc, const char *argv[]) return 1; } space = argv[2]; - ok = parse_file(&parser, space, &root, buf, sizeof(buf)); + ok = parse_file(&parser, space, &root, buf, buflen); if (!ok) { printf("failed to parse file\n"); return 1;