tweak tokenizer to support wast - protoverse

commit 4df9d18ed9e1b11b609ef859afbd2076c67f1840
parent 0efd3a0318c06340896e5ff9a79df7454ea3313f
Author: William Casarin <jb55@jb55.com>
Date:   Sun, 16 Apr 2023 15:35:15 -0700

tweak tokenizer to support wast

A little hacky, but now I can build a wast parser.

Diffstat:
M src/cursor.h  | 2 +-
M src/parse.c  | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/parse.h  | 28 ++++++++++++++++++++++++----
M src/parser.h  | 2 +-
M src/protoverse.c  | 5 +++--

5 files changed, 222 insertions(+), 44 deletions(-)
diff --git a/src/cursor.h b/src/cursor.h
@@ -97,7 +97,7 @@ static inline void copy_cursor(struct cursor *src, struct cursor *dest)
 
 static inline int pull_byte(struct cursor *cursor, u8 *c)
 {
-	if (unlikely(cursor->p + 1 > cursor->end))
+	if (unlikely(cursor->p >= cursor->end))
 		return 0;
 
 	*c = *cursor->p;
diff --git a/src/parse.c b/src/parse.c
@@ -9,6 +9,7 @@
 #include <ctype.h>
 #include <stdlib.h>
 #include <assert.h>
+#include <tgmath.h>
 
 #ifdef DEBUG
 #define tokdebug printf
@@ -73,6 +74,7 @@ static const char *token_error_string(enum token_error err)
 	case TE_STR_START_CHAR: return "string didn't start with \"";
 	case TE_SYM_START_CHAR: return "symbol didn't start with a-z";
 	case TE_NUM_START_CHAR: return "number didn't start with 0-9 or -";
+	case TE_NAN_START_CHAR: return "nan number didn't start with nan:";
 	case TE_SYM_CHAR: return "invalid symbol character";
 	case TE_NUM_CHAR: return "invalid number character";
 	case TE_SYM_OVERFLOW: return "symbol push overflow";
@@ -207,7 +209,7 @@ void print_token_error(struct token_cursor *cursor)
 			cursor->err == TE_NUM_CHAR ||
 			cursor->err == TE_SYM_CHAR;
 
-		printf("\nerror: %s %.*s\n", token_error_string(cursor->err),
+		printf("\nerror: %s: %.*s\n", token_error_string(cursor->err),
 		       is_chr_data?1:0, (char*)&cursor->err_data.c);
 	}
 
@@ -299,12 +301,12 @@ static int push_number(struct token_cursor *tokens, struct tok_str str)
 
 static int is_start_symbol_char(char c)
 {
-	return c >= 'a' && c <= 'z';
+	return (c >= 'a' && c <= 'z') || c == '$' || c == '=';
 }
 
 static int is_symbol_char(char c)
 {
-	return is_start_symbol_char(c) || c == '-' || c == '_' ||
+	return is_start_symbol_char(c) || c == '.' || c == '-' || c == '_' ||
 		(c >= '0' && c <= '9');
 }
 
@@ -330,53 +332,125 @@ static int pull_escaped_char(struct cursor *cursor, u8 *c)
 	return 2;
 }
 
-static int pull_number(struct token_cursor *cursor, u8 **start)
+static int pull_str(struct cursor *cur, const char *str)
+{
+	int len = strlen(str);
+
+	if (cur->p + len >= cur->end)
+		return 0;
+
+	if (memcmp(cur->p, str, len) != 0)
+		return 0;
+
+	cur->p += len;
+	return 1;
+}
+
+static int pull_nan(struct cursor *cur, enum nantype *type)
+{
+	u8 *start = cur->p;
+
+	if (!consume_byte(cur, 'n')) goto fail;
+	if (!consume_byte(cur, 'a')) goto fail;
+	if (!consume_byte(cur, 'n')) goto fail;
+	if (!consume_byte(cur, ':')) goto fail;
+
+	if (pull_str(cur, "arithmetic"))  {
+		if (type) *type = NAN_ARITHMETIC;
+		return 1;
+	}
+
+	if (pull_str(cur, "canonical")) {
+		if (type) *type = NAN_CANONICAL;
+		return 1;
+	}
+
+	while (cur->p < cur->end) {
+		if (*cur->p == ')' || isspace(*cur->p))
+			return 1;
+		cur->p++;
+	}
+
+fail:
+	cur->p = start;
+	return 0;
+}
+
+static int pull_inf(struct cursor *cur) {
+	u8 *start = cur->p;
+	u8 c;
+	consume_byte(cur, '-');
+	consume_byte(cur, '+');
+	if (!consume_byte(cur, 'i')) goto fail;
+	if (!consume_byte(cur, 'n')) goto fail;
+	if (!consume_byte(cur, 'f')) goto fail;
+	if (pull_byte(cur, &c))  {
+		if (isspace(c) || c == ')') {
+			cur->p--;
+			return 1;
+		}
+		goto fail;
+	}
+
+	return 1;
+fail:
+	cur->p = start;
+	return 0;
+}
+
+static int pull_number(struct token_cursor *cursor)
 {
 	int ok = 1;
 	int chars = 0;
 	u8 c;
+	u8 *start = cursor->c.p;
 
-	struct cursor temp;
+	if (pull_nan(&cursor->c, NULL))
+		return 1;
 
-	*start = temp.p = cursor->c.p;
-	temp.end = cursor->c.end;
+	if (pull_inf(&cursor->c))
+		return 1;
 
 	while (1) {
-		ok = pull_byte(&temp, &c);
-		if (!ok) return 0;
+		if (!pull_byte(&cursor->c, &c))
+			goto fail;
 
-		/* first char should start with a letter */
+		/* first char shouldn't start with a letter */
 		if (chars == 0 && !isdigit(c) && c != '-') {
 			cursor->err = TE_NUM_START_CHAR;
 			cursor->err_data.c = c;
-			return 0;
+			goto fail;
 		} else if (chars > 0 && (isspace(c) || c == ')')) {
 			/* we got a number */
 			break;
-		} else if (chars > 0 && !isdigit(c) && c != '.') {
+		} else if (chars > 0 && !isxdigit(c) && c != ':' && c != '.' && c != 'n' && c != 'a' && c != 'x' && c != 'p' && c != '-' && c != '+') {
 			cursor->err = TE_NUM_CHAR;
 			cursor->err_data.c = c;
-			return 0;
+			goto fail;
 		}
 
 		chars++;
 
 		if (!ok) {
 			cursor->err = TE_SYM_OVERFLOW;
-			return 0;
+			goto fail;
 		}
 	}
 
 	if (!ok) {
 		cursor->err = TE_SYM_OVERFLOW;
-		return 0;
+		goto fail;
 	}
 
-	cursor->c.p = temp.p-1;
+	cursor->c.p--;
 	cursor->err = TE_OK;
 
 	/* remove the first counted quote since this was not pushed */
 	return chars;
+
+fail:
+	cursor->c.p = start;
+	return 0;
 }
 
 static int pull_string(struct token_cursor *cursor, u8 **start, int *len)
@@ -515,12 +589,10 @@ static int read_and_push_atom(struct token_cursor *cursor, struct token_cursor *
 	}
 
 	start = cursor->c.p;
-	ok = pull_number(cursor, &start);
-	if (ok) {
-		str.len  = ok;
+	if (pull_number(cursor)) {
+		str.len  = cursor->c.p - start;
 		str.data = start;
-		ok = push_number(tokens, str);
-		if (!ok) {
+		if (!push_number(tokens, str)) {
 			printf("read_and_push_atom number push overflow\n");
 			return 0;
 		}
@@ -555,13 +627,86 @@ static void consume_line(struct cursor *cur) {
 static int consume_comment(struct cursor *cur) {
 	u8* start = cur->p;
 	if (!consume_byte(cur, ';'))
-		return 0;
-	if (!consume_byte(cur, ';')) {
-		cur->p = start;
-		return 0;
-	}
+		goto fail;
+	if (!consume_byte(cur, ';'))
+		goto fail;
 	consume_line(cur);
 	return 1;
+
+fail:
+	cur->p = start;
+	return 0;
+}
+
+static int parse_hex_float(const char *str, int len, double *result) {
+	struct cursor _cur = {0}, *cur = NULL;
+	double significand = 0.0;
+	char *endptr;
+	int significand_digits = 0;
+	int fractional_digits = 0;
+	int found_period = 0;
+	int exponent = 0;
+	int sign = 1;
+	u8 c;
+	u8 *start = cur->p;
+
+	cur = &_cur;
+	make_cursor((u8*)str, (u8*)str + len, cur);
+
+	if (!pull_byte(cur, &c))
+		goto fail;
+
+	if (c == '-') {
+		sign = -1;
+		if (!pull_byte(cur, &c)) goto fail;
+	} else if (c == '+') {
+		if (!pull_byte(cur, &c)) goto fail;
+	}
+
+	if (c != '0')
+		goto fail;
+
+	if (!pull_byte(cur, &c))
+		goto fail;
+
+	if (tolower(c) != 'x')
+		goto fail;
+
+	while (pull_byte(cur, &c) && (isxdigit(c) || c == '.')) {
+		if (c == '.') {
+			if (found_period) goto fail;
+			found_period = 1;
+		} else {
+			int digit_value = isdigit(c) ? c - '0' : 10 + tolower(c) - 'a';
+			significand = significand * 16 + digit_value;
+			if (found_period)
+				fractional_digits++;
+			significand_digits++;
+		}
+	}
+
+	if (significand_digits == 0)
+		goto fail;
+
+	if (tolower(c) == 'p') {
+		if (!pull_byte(cur, &c))
+			goto fail;
+		exponent = strtol((char *)cur->p - 1, &endptr, 10);
+		if ((char *)cur->p - 1 == endptr)
+			goto fail;
+		cur->p = (u8 *)endptr;
+	} else {
+		cur->p--; // Unpull the byte that is not 'p'
+	}
+
+	exponent -= 4 * fractional_digits;
+	*result = sign * ldexp(significand, exponent);
+
+	return 1;
+
+fail:
+	cur->p = start;
+	return 0;
 }
 
 int tokenize_cells(u8 *buf, int buf_size, struct token_cursor *tokens)
@@ -785,7 +930,7 @@ static int pull_number_token(struct token_cursor *tokens, struct tok_str *str)
 	return parse_stringy_token(tokens, str, T_NUMBER);
 }
 
-static int parse_number(struct token_cursor *tokens, union number *number)
+static int parse_number(struct token_cursor *tokens, struct number *number)
 {
 	int ok;
 	struct tok_str str;
@@ -794,12 +939,16 @@ static int parse_number(struct token_cursor *tokens, union number *number)
 	ok = pull_number_token(tokens, &str);
 	if (!ok) return 0;
 
-	/* TODO: float numbers */
-	number->integer = strtol((char*)str.data, &end, 10);
+	if (parse_hex_float((const char*)str.data, str.len, &number->value.fdouble)) {
+		number->type = NUM_FLOAT;
+	} else {
+		number->type = NUM_INTEGER;
+		number->value.integer = strtol((char*)str.data, &end, 10);
 
-	if ((u8*)end != (str.data + str.len)) {
-		printf("parse_number failed\n");
-		return 0;
+		if ((u8*)end != (str.data + str.len)) {
+			printf("parse_number failed\n");
+			return 0;
+		}
 	}
 
 	return 1;
@@ -939,9 +1088,17 @@ static int parse_size(struct token_cursor *tokens, struct attribute *attr)
 	ok = parse_number(&temp, &attr->data.number);
 	if (!ok) return 0;
 
-	tokdebug("attribute %s %d\n",
-		 attr_type_str(attr->type),
-		 attr->data.number.integer);
+#ifdef DEBUG
+	if (attr->data.number.type == NUM_INTEGER) {
+		tokdebug("attribute %s %d\n",
+			 attr_type_str(attr->type),
+			 attr->data.number.value.integer);
+	} else if (attr->data.number.type == NUM_FLOAT) {
+		tokdebug("attribute %s %f\n",
+			 attr_type_str(attr->type),
+			 attr->data.number.value.fdouble);
+	}
+#endif
 
 	copy_token_cursor(&temp, tokens);
 
@@ -1407,7 +1564,7 @@ int init_parser(struct parser *parser)
 	int ok;
 
 	int attrs_size = sizeof(struct attribute) * 1024;
-	int tokens_size = 2048*32;
+	int tokens_size = 2048*256;
 	int cells_size = sizeof(struct cell) * 1024;
 	int memsize = attrs_size + tokens_size + cells_size;
 
diff --git a/src/parse.h b/src/parse.h
@@ -19,6 +19,7 @@ enum token_error {
 	TE_UNEXPECTED_TOKEN,
 	TE_UNEXPECTED_SYMBOL,
 	TE_SYM_OVERFLOW,
+	TE_NAN_START_CHAR,
 };
 
 enum cell_type {
@@ -78,9 +79,28 @@ struct tok_str {
 	int len;
 };
 
-union number {
-	int integer;
-	double fdouble;
+enum numtype {
+	NUM_INTEGER,
+	NUM_FLOAT
+};
+
+enum nantype {
+	NAN_ARITHMETIC,
+	NAN_CANONICAL,
+	NAN_LITERAL
+};
+
+struct number {
+	enum numtype type;
+	union {
+		int integer;
+		double fdouble;
+	} value;
+};
+
+struct nan {
+	enum nantype type;
+	struct number number;
 };
 
 struct bufstr {
@@ -97,7 +117,7 @@ union attr_data {
 	struct bufstr str;
 	struct data_attr data_attr;
 	enum shape shape;
-	union number number;
+	struct number number;
 };
 
 struct attribute {
diff --git a/src/parser.h b/src/parser.h
@@ -25,7 +25,7 @@ static int consume_bytes(struct cursor *cursor, const unsigned char *match, int 
 
 static inline int consume_byte(struct cursor *cursor, unsigned char match)
 {
-	if (unlikely(cursor->p + 1 > cursor->end))
+	if (unlikely(cursor->p >= cursor->end))
 		return 0;
 	if (*cursor->p != match)
 		return 0;
diff --git a/src/protoverse.c b/src/protoverse.c
@@ -139,7 +139,8 @@ extern char **environ;
 
 int main(int argc, const char *argv[])
 {
-	static u8 buf[4096*256];
+	const int buflen = 10000000;
+	u8 *buf = malloc(buflen);
 	char **env = environ;
 	const char *space, *code_file;
 	const char *cmd;
@@ -165,7 +166,7 @@ int main(int argc, const char *argv[])
 			return 1;
 		}
 		space = argv[2];
-		ok = parse_file(&parser, space, &root, buf, sizeof(buf));
+		ok = parse_file(&parser, space, &root, buf, buflen);
 		if (!ok) {
 			printf("failed to parse file\n");
 			return 1;

	protoverse A metaverse protocol
	git clone git://jb55.com/protoverse
	Log | Files | Refs | README | LICENSE

M	src/cursor.h	|	2	+-
M	src/parse.c	|	229	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M	src/parse.h	|	28	++++++++++++++++++++++++----
M	src/parser.h	|	2	+-
M	src/protoverse.c	|	5	+++--