commit 4df9d18ed9e1b11b609ef859afbd2076c67f1840
parent 0efd3a0318c06340896e5ff9a79df7454ea3313f
Author: William Casarin <jb55@jb55.com>
Date: Sun, 16 Apr 2023 15:35:15 -0700
tweak tokenizer to support wast
A little hacky but now I can build a wast parser
Diffstat:
5 files changed, 222 insertions(+), 44 deletions(-)
diff --git a/src/cursor.h b/src/cursor.h
@@ -97,7 +97,7 @@ static inline void copy_cursor(struct cursor *src, struct cursor *dest)
static inline int pull_byte(struct cursor *cursor, u8 *c)
{
- if (unlikely(cursor->p + 1 > cursor->end))
+ if (unlikely(cursor->p >= cursor->end))
return 0;
*c = *cursor->p;
diff --git a/src/parse.c b/src/parse.c
@@ -9,6 +9,7 @@
#include <ctype.h>
#include <stdlib.h>
#include <assert.h>
+#include <tgmath.h>
#ifdef DEBUG
#define tokdebug printf
@@ -73,6 +74,7 @@ static const char *token_error_string(enum token_error err)
case TE_STR_START_CHAR: return "string didn't start with \"";
case TE_SYM_START_CHAR: return "symbol didn't start with a-z";
case TE_NUM_START_CHAR: return "number didn't start with 0-9 or -";
+ case TE_NAN_START_CHAR: return "nan number didn't start with nan:";
case TE_SYM_CHAR: return "invalid symbol character";
case TE_NUM_CHAR: return "invalid number character";
case TE_SYM_OVERFLOW: return "symbol push overflow";
@@ -207,7 +209,7 @@ void print_token_error(struct token_cursor *cursor)
cursor->err == TE_NUM_CHAR ||
cursor->err == TE_SYM_CHAR;
- printf("\nerror: %s %.*s\n", token_error_string(cursor->err),
+ printf("\nerror: %s: %.*s\n", token_error_string(cursor->err),
is_chr_data?1:0, (char*)&cursor->err_data.c);
}
@@ -299,12 +301,12 @@ static int push_number(struct token_cursor *tokens, struct tok_str str)
static int is_start_symbol_char(char c)
{
- return c >= 'a' && c <= 'z';
+ return (c >= 'a' && c <= 'z') || c == '$' || c == '=';
}
static int is_symbol_char(char c)
{
- return is_start_symbol_char(c) || c == '-' || c == '_' ||
+ return is_start_symbol_char(c) || c == '.' || c == '-' || c == '_' ||
(c >= '0' && c <= '9');
}
@@ -330,53 +332,125 @@ static int pull_escaped_char(struct cursor *cursor, u8 *c)
return 2;
}
-static int pull_number(struct token_cursor *cursor, u8 **start)
+/*
+ * Consume the literal `str` at the current cursor position.
+ *
+ * Returns 1 and advances cur->p past the match when the next strlen(str)
+ * bytes are equal to `str`. Returns 0 and leaves the cursor untouched when
+ * there are not enough bytes left or the bytes differ.
+ */
+static int pull_str(struct cursor *cur, const char *str)
+{
+	size_t len = strlen(str);
+
+	/* cur->end is exclusive, so a literal that ends exactly at the end
+	 * of the buffer still fits. Compare remaining bytes instead of
+	 * forming a pointer that may lie past the buffer. */
+	if (cur->end < cur->p || (size_t)(cur->end - cur->p) < len)
+		return 0;
+
+	if (memcmp(cur->p, str, len) != 0)
+		return 0;
+
+	cur->p += len;
+	return 1;
+}
+
+/*
+ * Consume a nan literal that starts with "nan:".
+ *
+ * Three forms are recognized after the prefix:
+ *   nan:arithmetic  -> NAN_ARITHMETIC
+ *   nan:canonical   -> NAN_CANONICAL
+ *   nan:<payload>   -> NAN_LITERAL, where the payload is every byte up to
+ *                      (not including) the next ')' or whitespace, or up
+ *                      to the end of input
+ *
+ * `type` may be NULL when the caller only wants the cursor advanced.
+ *
+ * Returns 1 with the cursor just past the literal, or 0 with the cursor
+ * restored when the input doesn't start with "nan:".
+ */
+static int pull_nan(struct cursor *cur, enum nantype *type)
+{
+	u8 *start = cur->p;
+
+	if (!consume_byte(cur, 'n') || !consume_byte(cur, 'a') ||
+	    !consume_byte(cur, 'n') || !consume_byte(cur, ':')) {
+		cur->p = start;
+		return 0;
+	}
+
+	if (pull_str(cur, "arithmetic")) {
+		if (type)
+			*type = NAN_ARITHMETIC;
+		return 1;
+	}
+
+	if (pull_str(cur, "canonical")) {
+		if (type)
+			*type = NAN_CANONICAL;
+		return 1;
+	}
+
+	/* payload form: running into the end of input terminates the
+	 * literal just like a delimiter does */
+	while (cur->p < cur->end && *cur->p != ')' && !isspace(*cur->p))
+		cur->p++;
+
+	if (type)
+		*type = NAN_LITERAL;
+	return 1;
+}
+
+/*
+ * Consume an infinity literal: "inf" with at most one leading '-' or '+'.
+ *
+ * The literal must be followed by whitespace, ')' or the end of input; the
+ * delimiter itself is not consumed.
+ *
+ * Returns 1 with the cursor just past "inf", or 0 with the cursor restored.
+ */
+static int pull_inf(struct cursor *cur) {
+	u8 *start = cur->p;
+
+	/* a literal carries a single optional sign, so "-+inf" is rejected */
+	if (!consume_byte(cur, '-'))
+		consume_byte(cur, '+');
+
+	if (!consume_byte(cur, 'i')) goto fail;
+	if (!consume_byte(cur, 'n')) goto fail;
+	if (!consume_byte(cur, 'f')) goto fail;
+
+	/* peek at the delimiter without consuming it */
+	if (cur->p < cur->end && !isspace(*cur->p) && *cur->p != ')')
+		goto fail;
+
+	return 1;
+fail:
+	cur->p = start;
+	return 0;
+}
+
+static int pull_number(struct token_cursor *cursor)
{
int ok = 1;
int chars = 0;
u8 c;
+ u8 *start = cursor->c.p;
- struct cursor temp;
+ if (pull_nan(&cursor->c, NULL))
+ return 1;
- *start = temp.p = cursor->c.p;
- temp.end = cursor->c.end;
+ if (pull_inf(&cursor->c))
+ return 1;
while (1) {
- ok = pull_byte(&temp, &c);
- if (!ok) return 0;
+ if (!pull_byte(&cursor->c, &c))
+ goto fail;
- /* first char should start with a letter */
+ /* first char must be a digit or '-' */
if (chars == 0 && !isdigit(c) && c != '-') {
cursor->err = TE_NUM_START_CHAR;
cursor->err_data.c = c;
- return 0;
+ goto fail;
} else if (chars > 0 && (isspace(c) || c == ')')) {
/* we got a number */
break;
- } else if (chars > 0 && !isdigit(c) && c != '.') {
+ } else if (chars > 0 && !isxdigit(c) && c != ':' && c != '.' && c != 'n' && c != 'a' && c != 'x' && c != 'p' && c != '-' && c != '+') {
cursor->err = TE_NUM_CHAR;
cursor->err_data.c = c;
- return 0;
+ goto fail;
}
chars++;
if (!ok) {
cursor->err = TE_SYM_OVERFLOW;
- return 0;
+ goto fail;
}
}
if (!ok) {
cursor->err = TE_SYM_OVERFLOW;
- return 0;
+ goto fail;
}
- cursor->c.p = temp.p-1;
+ cursor->c.p--;
cursor->err = TE_OK;
/* remove the first counted quote since this was not pushed */
return chars;
+
+fail:
+ cursor->c.p = start;
+ return 0;
}
static int pull_string(struct token_cursor *cursor, u8 **start, int *len)
@@ -515,12 +589,10 @@ static int read_and_push_atom(struct token_cursor *cursor, struct token_cursor *
}
start = cursor->c.p;
- ok = pull_number(cursor, &start);
- if (ok) {
- str.len = ok;
+ if (pull_number(cursor)) {
+ str.len = cursor->c.p - start;
str.data = start;
- ok = push_number(tokens, str);
- if (!ok) {
+ if (!push_number(tokens, str)) {
printf("read_and_push_atom number push overflow\n");
return 0;
}
@@ -555,13 +627,86 @@ static void consume_line(struct cursor *cur) {
static int consume_comment(struct cursor *cur) {
u8* start = cur->p;
if (!consume_byte(cur, ';'))
- return 0;
- if (!consume_byte(cur, ';')) {
- cur->p = start;
- return 0;
- }
+ goto fail;
+ if (!consume_byte(cur, ';'))
+ goto fail;
consume_line(cur);
return 1;
+
+fail:
+ cur->p = start;
+ return 0;
+}
+
+/*
+ * Parse a hexadecimal floating point literal from the first `len` bytes of
+ * `str`: an optional '+' or '-', "0x", hex digits containing at most one
+ * '.', and an optional 'p' followed by a signed decimal exponent, e.g.
+ * "-0x1.8p+3".
+ *
+ * Returns 1 and stores the value in *result on success. Returns 0 and
+ * leaves *result untouched when the input doesn't start with "0x" (after
+ * the sign), has no significand digits, has a second '.', or has a 'p' that
+ * isn't followed by at least one exponent digit. Bytes following the
+ * literal are ignored.
+ *
+ * Never reads beyond str + len, so `str` doesn't need to be NUL-terminated.
+ */
+static int parse_hex_float(const char *str, int len, double *result) {
+	/* exponents past this saturate; ldexp already yields inf/0 there */
+	enum { MAX_EXPONENT = 1000000 };
+	struct cursor cur = {0};
+	double significand = 0.0;
+	int significand_digits = 0;
+	int fractional_digits = 0;
+	int found_period = 0;
+	int exponent = 0;
+	int exponent_digits = 0;
+	int exponent_sign = 1;
+	int sign = 1;
+	int have_c;
+	u8 c;
+
+	make_cursor((u8*)str, (u8*)str + len, &cur);
+
+	if (!pull_byte(&cur, &c))
+		return 0;
+
+	if (c == '-' || c == '+') {
+		if (c == '-')
+			sign = -1;
+		if (!pull_byte(&cur, &c))
+			return 0;
+	}
+
+	if (c != '0')
+		return 0;
+
+	if (!pull_byte(&cur, &c) || tolower(c) != 'x')
+		return 0;
+
+	/* have_c tracks whether `c` holds a freshly pulled byte, so a stale
+	 * value is never inspected once the input is exhausted */
+	have_c = pull_byte(&cur, &c);
+	while (have_c && (isxdigit(c) || c == '.')) {
+		if (c == '.') {
+			if (found_period)
+				return 0;
+			found_period = 1;
+		} else {
+			int digit_value = isdigit(c) ? c - '0' : 10 + tolower(c) - 'a';
+			significand = significand * 16 + digit_value;
+			if (found_period)
+				fractional_digits++;
+			significand_digits++;
+		}
+		have_c = pull_byte(&cur, &c);
+	}
+
+	if (significand_digits == 0)
+		return 0;
+
+	if (have_c && tolower(c) == 'p') {
+		if (!pull_byte(&cur, &c))
+			return 0;
+
+		if (c == '-' || c == '+') {
+			if (c == '-')
+				exponent_sign = -1;
+			if (!pull_byte(&cur, &c))
+				return 0;
+		}
+
+		have_c = 1;
+		while (have_c && isdigit(c)) {
+			if (exponent < MAX_EXPONENT)
+				exponent = exponent * 10 + (c - '0');
+			exponent_digits++;
+			have_c = pull_byte(&cur, &c);
+		}
+
+		if (exponent_digits == 0)
+			return 0;
+
+		exponent *= exponent_sign;
+	}
+
+	/* every hex digit after the '.' scaled the significand by 16 = 2^4 */
+	exponent -= 4 * fractional_digits;
+	*result = sign * ldexp(significand, exponent);
+
+	return 1;
+}
int tokenize_cells(u8 *buf, int buf_size, struct token_cursor *tokens)
@@ -785,7 +930,7 @@ static int pull_number_token(struct token_cursor *tokens, struct tok_str *str)
return parse_stringy_token(tokens, str, T_NUMBER);
}
-static int parse_number(struct token_cursor *tokens, union number *number)
+static int parse_number(struct token_cursor *tokens, struct number *number)
{
int ok;
struct tok_str str;
@@ -794,12 +939,16 @@ static int parse_number(struct token_cursor *tokens, union number *number)
ok = pull_number_token(tokens, &str);
if (!ok) return 0;
- /* TODO: float numbers */
- number->integer = strtol((char*)str.data, &end, 10);
+ if (parse_hex_float((const char*)str.data, str.len, &number->value.fdouble)) {
+ number->type = NUM_FLOAT;
+ } else {
+ number->type = NUM_INTEGER;
+ number->value.integer = strtol((char*)str.data, &end, 10);
- if ((u8*)end != (str.data + str.len)) {
- printf("parse_number failed\n");
- return 0;
+ if ((u8*)end != (str.data + str.len)) {
+ printf("parse_number failed\n");
+ return 0;
+ }
}
return 1;
@@ -939,9 +1088,17 @@ static int parse_size(struct token_cursor *tokens, struct attribute *attr)
ok = parse_number(&temp, &attr->data.number);
if (!ok) return 0;
- tokdebug("attribute %s %d\n",
- attr_type_str(attr->type),
- attr->data.number.integer);
+#ifdef DEBUG
+ if (attr->data.number.type == NUM_INTEGER) {
+ tokdebug("attribute %s %d\n",
+ attr_type_str(attr->type),
+ attr->data.number.value.integer);
+ } else if (attr->data.number.type == NUM_FLOAT) {
+ tokdebug("attribute %s %f\n",
+ attr_type_str(attr->type),
+ attr->data.number.value.fdouble);
+ }
+#endif
copy_token_cursor(&temp, tokens);
@@ -1407,7 +1564,7 @@ int init_parser(struct parser *parser)
int ok;
int attrs_size = sizeof(struct attribute) * 1024;
- int tokens_size = 2048*32;
+ int tokens_size = 2048*256;
int cells_size = sizeof(struct cell) * 1024;
int memsize = attrs_size + tokens_size + cells_size;
diff --git a/src/parse.h b/src/parse.h
@@ -19,6 +19,7 @@ enum token_error {
TE_UNEXPECTED_TOKEN,
TE_UNEXPECTED_SYMBOL,
TE_SYM_OVERFLOW,
+ TE_NAN_START_CHAR,
};
enum cell_type {
@@ -78,9 +79,28 @@ struct tok_str {
int len;
};
-union number {
- int integer;
- double fdouble;
+enum numtype {
+ NUM_INTEGER,
+ NUM_FLOAT
+};
+
+enum nantype {
+ NAN_ARITHMETIC,
+ NAN_CANONICAL,
+ NAN_LITERAL
+};
+
+struct number {
+ enum numtype type;
+ union {
+ int integer;
+ double fdouble;
+ } value;
+};
+
+struct nan {
+ enum nantype type;
+ struct number number;
};
struct bufstr {
@@ -97,7 +117,7 @@ union attr_data {
struct bufstr str;
struct data_attr data_attr;
enum shape shape;
- union number number;
+ struct number number;
};
struct attribute {
diff --git a/src/parser.h b/src/parser.h
@@ -25,7 +25,7 @@ static int consume_bytes(struct cursor *cursor, const unsigned char *match, int
static inline int consume_byte(struct cursor *cursor, unsigned char match)
{
- if (unlikely(cursor->p + 1 > cursor->end))
+ if (unlikely(cursor->p >= cursor->end))
return 0;
if (*cursor->p != match)
return 0;
diff --git a/src/protoverse.c b/src/protoverse.c
@@ -139,7 +139,8 @@ extern char **environ;
int main(int argc, const char *argv[])
{
- static u8 buf[4096*256];
+ const int buflen = 10000000;
+ u8 *buf = malloc(buflen);
char **env = environ;
const char *space, *code_file;
const char *cmd;
@@ -165,7 +166,7 @@ int main(int argc, const char *argv[])
return 1;
}
space = argv[2];
- ok = parse_file(&parser, space, &root, buf, sizeof(buf));
+ ok = parse_file(&parser, space, &root, buf, buflen);
if (!ok) {
printf("failed to parse file\n");
return 1;