cursor: sync with damus' cursor - nostrdb - an unfairly fast embedded nostr database backed by lmdb

commit d7cac21e2884370c2a87c5533636e2caf7e1ef1c
parent 539cb0651e197a5f0670762413c087b5e121df69
Author: William Casarin <jb55@jb55.com>
Date:   Mon,  4 Dec 2023 14:50:53 -0800

cursor: sync with damus' cursor

Diffstat:
M cursor.h  | 670 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M nostrdb.c  | 176 -------------------------------------------------------------------------------
A typedefs.h  | 14 ++++++++++++++

3 files changed, 587 insertions(+), 273 deletions(-)
diff --git a/cursor.h b/cursor.h
@@ -2,11 +2,12 @@
 #ifndef JB55_CURSOR_H
 #define JB55_CURSOR_H
 
-//#include <ctype.h>
-//#include <assert.h>
+#include "typedefs.h"
 
-#include <string.h>
+#include <stdio.h>
 #include <ctype.h>
+#include <assert.h>
+#include <string.h>
 
 #define unlikely(x) __builtin_expect((x),0)
 #define likely(x)   __builtin_expect((x),1)
@@ -17,170 +18,276 @@ struct cursor {
 	unsigned char *end;
 };
 
-static inline void make_cursor(unsigned char *start, unsigned char *end, struct cursor *cursor)
+struct array {
+	struct cursor cur;
+	unsigned int elem_size;
+};
+
+static inline void reset_cursor(struct cursor *cursor)
+{
+	cursor->p = cursor->start;
+}
+
+static inline void wipe_cursor(struct cursor *cursor)
+{
+	reset_cursor(cursor);
+	memset(cursor->start, 0, cursor->end - cursor->start);
+}
+
+static inline void make_cursor(u8 *start, u8 *end, struct cursor *cursor)
 {
 	cursor->start = start;
 	cursor->p = start;
 	cursor->end = end;
 }
 
-static inline int cursor_push_byte(struct cursor *cursor, unsigned char c)
+static inline void make_array(struct array *a, u8* start, u8 *end, unsigned int elem_size)
 {
-	if (unlikely(cursor->p + 1 > cursor->end)) {
-		return 0;
+	make_cursor(start, end, &a->cur);
+	a->elem_size = elem_size;
+}
+
+static inline int cursor_eof(struct cursor *c)
+{
+	return c->p == c->end;
+}
+
+static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
+{
+	void *ret;
+
+	if (mem->p + size > mem->end) {
+		return NULL;
 	}
 
-	*cursor->p = c;
-	cursor->p++;
+	ret = mem->p;
+	mem->p += size;
 
-	return 1;
+	return ret;
 }
 
-static inline int cursor_push(struct cursor *cursor, unsigned char *data, int len)
+static inline void *cursor_alloc(struct cursor *mem, unsigned long size)
 {
-	if (unlikely(cursor->p + len >= cursor->end)) {
+	void *ret;
+	if (!(ret = cursor_malloc(mem, size))) {
 		return 0;
 	}
 
-	if (cursor->p != data)
-		memcpy(cursor->p, data, len);
-
-	cursor->p += len;
+	memset(ret, 0, size);
+	return ret;
+}
 
+static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
+{
+	u8 *p;
+	if (!(p = cursor_alloc(mem, size))) {
+		return 0;
+	}
+	make_cursor(p, mem->p, slice);
 	return 1;
 }
 
-static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len)
+
+static inline void copy_cursor(struct cursor *src, struct cursor *dest)
 {
-	int i;
+	dest->start = src->start;
+	dest->p = src->p;
+	dest->end = src->end;
+}
 
-	if (unlikely(cur->p + len >= cur->end))
+static inline int cursor_skip(struct cursor *cursor, int n)
+{
+    if (cursor->p + n >= cursor->end)
+        return 0;
+
+    cursor->p += n;
+
+    return 1;
+}
+
+static inline int pull_byte(struct cursor *cursor, u8 *c)
+{
+	if (unlikely(cursor->p >= cursor->end))
 		return 0;
 
-	for (i = 0; i < len; i++)
-		cur->p[i] = tolower(str[i]);
+	*c = *cursor->p;
+	cursor->p++;
 
-	cur->p += len;
 	return 1;
 }
 
-static inline int cursor_push_str(struct cursor *cursor, const char *str)
+static inline int parse_byte(struct cursor *cursor, u8 *c)
 {
-	return cursor_push(cursor, (unsigned char*)str, (int)strlen(str));
+    if (unlikely(cursor->p >= cursor->end))
+        return 0;
+
+    *c = *cursor->p;
+    //cursor->p++;
+
+    return 1;
 }
 
-static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
-{
-	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
+static inline int parse_char(struct cursor *cur, char c) {
+    if (cur->p >= cur->end)
+        return 0;
+        
+    if (*cur->p == c) {
+        cur->p++;
+        return 1;
+    }
+    
+    return 0;
 }
 
-static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
+static inline int peek_char(struct cursor *cur, int ind) {
+    if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end))
+        return -1;
+    
+    return *(cur->p + ind);
+}
+
+static inline int cursor_pull_c_str(struct cursor *cursor, const char **str)
 {
-	void *ret;
+	*str = (const char*)cursor->p;
 
-	if (mem->p + size > mem->end) {
+	for (; cursor->p < cursor->end; cursor->p++) {
+		if (*cursor->p == 0) {
+			cursor->p++;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+
+static inline int cursor_push_byte(struct cursor *cursor, u8 c)
+{
+	if (unlikely(cursor->p + 1 > cursor->end)) {
 		return 0;
 	}
 
-	ret = mem->p;
-	mem->p += size;
+	*cursor->p = c;
+	cursor->p++;
 
-	return ret;
+	return 1;
 }
 
-static inline int cursor_skip(struct cursor *cursor, int n)
+static inline int cursor_pull(struct cursor *cursor, u8 *data, int len)
 {
-    if (cursor->p + n >= cursor->end)
-        return 0;
+	if (unlikely(cursor->p + len > cursor->end)) {
+		return 0;
+	}
 
-    cursor->p += n;
+	memcpy(data, cursor->p, len);
+	cursor->p += len;
 
-    return 1;
+	return 1;
 }
 
-static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
+static inline int pull_data_into_cursor(struct cursor *cursor,
+			  struct cursor *dest,
+			  unsigned char **data,
+			  int len)
 {
-	unsigned char *p;
-	if (!(p = cursor_malloc(mem, size))) {
+	int ok;
+
+	if (unlikely(dest->p + len > dest->end)) {
+		printf("not enough room in dest buffer\n");
 		return 0;
 	}
-	make_cursor(p, mem->p, slice);
+
+	ok = cursor_pull(cursor, dest->p, len);
+	if (!ok) return 0;
+
+	*data = dest->p;
+	dest->p += len;
+
 	return 1;
 }
 
-static inline size_t cursor_count(struct cursor *cursor, size_t elem_size) {
-	return (cursor->p - cursor->start)/elem_size;
-}
+static inline int cursor_dropn(struct cursor *cur, int size, int n)
+{
+	if (n == 0)
+		return 1;
 
-static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) {
-	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
-}
+	if (unlikely(cur->p - size*n < cur->start)) {
+		return 0;
+	}
 
-static inline int cursor_push_u16(struct cursor *cursor, uint16_t i) {
-	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
+	cur->p -= size*n;
+	return 1;
 }
 
-#define max(a,b) ((a) > (b) ? (a) : (b))
-#include <stdio.h>
-static inline void cursor_print_around(struct cursor *cur, int range)
+static inline int cursor_drop(struct cursor *cur, int size)
 {
-	unsigned char *c;
+	return cursor_dropn(cur, size, 1);
+}
 
-	printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start);
+static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n)
+{
+	n += 1;
+	if (unlikely(cur->p - len*n < cur->start)) {
+		return NULL;
+	}
+	return cur->p - len*n;
+}
 
-	c = max(cur->p - range, cur->start);
-	for (; c < cur->end && c < (cur->p + range); c++) {
-		printf("%02x", *c);
+static inline unsigned char *cursor_top(struct cursor *cur, int len)
+{
+	if (unlikely(cur->p - len < cur->start)) {
+		return NULL;
 	}
-	printf("\n");
+	return cur->p - len;
+}
 
-	c = max(cur->p - range, cur->start);
-	for (; c < cur->end && c < (cur->p + range); c++) {
-		if (c == cur->p) {
-			printf("^");
-			continue;
-		}
-		printf("  ");
+static inline int cursor_top_int(struct cursor *cur, int *i)
+{
+	u8 *p;
+	if (unlikely(!(p = cursor_top(cur, sizeof(*i))))) {
+		return 0;
 	}
-	printf("\n");
+	*i = *((int*)p);
+	return 1;
 }
-#undef max
 
-static inline int pull_byte(struct cursor *cursor, unsigned char *c)
+static inline int cursor_pop(struct cursor *cur, u8 *data, int len)
 {
-	if (unlikely(cursor->p + 1 > cursor->end))
+	if (unlikely(cur->p - len < cur->start)) {
 		return 0;
+	}
 
-	*c = *cursor->p;
-	cursor->p++;
+	cur->p -= len;
+	memcpy(data, cur->p, len);
 
 	return 1;
 }
 
-
-static inline int pull_varint(struct cursor *cursor, int *n)
+static inline int cursor_push(struct cursor *cursor, u8 *data, int len)
 {
-	int ok, i;
-	unsigned char b;
-	*n = 0;
+	if (unlikely(cursor->p + len >= cursor->end)) {
+		return 0;
+	}
 
-	for (i = 0;; i++) {
-		ok = pull_byte(cursor, &b);
-		if (!ok) return 0;
+	if (cursor->p != data)
+		memcpy(cursor->p, data, len);
 
-		*n |= ((int)b & 0x7F) << (i * 7);
+	cursor->p += len;
 
-		/* is_last */
-		if ((b & 0x80) == 0) {
-			return i+1;
-		}
+	return 1;
+}
 
-		if (i == 4) return 0;
-	}
+static inline int cursor_push_int(struct cursor *cursor, int i)
+{
+	return cursor_push(cursor, (u8*)&i, sizeof(i));
+}
 
-	return 0;
+static inline size_t cursor_count(struct cursor *cursor, size_t elem_size)
+{
+	return (cursor->p - cursor->start)/elem_size;
 }
 
+/* TODO: push_varint */
 static inline int push_varint(struct cursor *cursor, int n)
 {
 	int ok, len;
@@ -206,29 +313,398 @@ static inline int push_varint(struct cursor *cursor, int n)
 	return len;
 }
 
-static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n)
+/* TODO: pull_varint */
+static inline int pull_varint(struct cursor *cursor, int *n)
+{
+	int ok, i;
+	unsigned char b;
+	*n = 0;
+
+	for (i = 0;; i++) {
+		ok = pull_byte(cursor, &b);
+		if (!ok) return 0;
+
+		*n |= ((int)b & 0x7F) << (i * 7);
+
+		/* is_last */
+		if ((b & 0x80) == 0) {
+			return i+1;
+		}
+
+		if (i == 4) return 0;
+	}
+
+	return 0;
+}
+
+static inline int cursor_pull_int(struct cursor *cursor, int *i)
+{
+	return cursor_pull(cursor, (u8*)i, sizeof(*i));
+}
+
+static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) {
+    return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
+}
+
+static inline int cursor_push_u16(struct cursor *cursor, u16 i)
 {
-	if (cursor->p + n >= cursor->end)
+	return cursor_push(cursor, (u8*)&i, sizeof(i));
+}
+
+static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size)
+{
+	u8 *p;
+	p = &cursor->start[elem_size * index];
+
+	if (unlikely(p >= cursor->end))
+		return NULL;
+
+	return (void*)p;
+}
+
+
+static inline int push_sized_str(struct cursor *cursor, const char *str, int len)
+{
+	return cursor_push(cursor, (u8*)str, len);
+}
+
+static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len)
+{
+	int i;
+
+	if (unlikely(cur->p + len >= cur->end))
 		return 0;
 
-	memset(cursor->p, c, n);
-	cursor->p += n;
+	for (i = 0; i < len; i++)
+		cur->p[i] = tolower(str[i]);
 
+	cur->p += len;
 	return 1;
 }
 
-static inline int cursor_pull(struct cursor *cursor, unsigned char *data,
-			      int len)
+static inline int cursor_push_str(struct cursor *cursor, const char *str)
 {
-	if (unlikely(cursor->p + len > cursor->end)) {
+	return cursor_push(cursor, (u8*)str, (int)strlen(str));
+}
+
+static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
+{
+	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
+}
+
+/* TODO: push varint size */
+static inline int push_prefixed_str(struct cursor *cursor, const char *str)
+{
+	int ok, len;
+	len = (int)strlen(str);
+	ok = push_varint(cursor, len);
+	if (!ok) return 0;
+	return push_sized_str(cursor, str, len);
+}
+
+static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str)
+{
+	int len, ok;
+
+	ok = pull_varint(cursor, &len);
+	if (!ok) return 0;
+
+	if (unlikely(dest_buf->p + len > dest_buf->end)) {
 		return 0;
 	}
 
-	memcpy(data, cursor->p, len);
-	cursor->p += len;
+	ok = pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, len);
+	if (!ok) return 0;
+
+	ok = cursor_push_byte(dest_buf, 0);
 
 	return 1;
 }
 
+static inline int cursor_remaining_capacity(struct cursor *cursor)
+{
+	return (int)(cursor->end - cursor->p);
+}
+
+
+#define max(a,b) ((a) > (b) ? (a) : (b))
+static inline void cursor_print_around(struct cursor *cur, int range)
+{
+	unsigned char *c;
+
+	printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start);
+
+	c = max(cur->p - range, cur->start);
+	for (; c < cur->end && c < (cur->p + range); c++) {
+		printf("%02x", *c);
+	}
+	printf("\n");
+
+	c = max(cur->p - range, cur->start);
+	for (; c < cur->end && c < (cur->p + range); c++) {
+		if (c == cur->p) {
+			printf("^");
+			continue;
+		}
+		printf("  ");
+	}
+	printf("\n");
+}
+#undef max
+
+static inline int pull_bytes(struct cursor *cur, int count, const u8 **bytes) {
+    if (cur->p + count > cur->end)
+        return 0;
+    
+    *bytes = cur->p;
+    cur->p += count;
+    return 1;
+}
+
+static inline int parse_str(struct cursor *cur, const char *str) {
+    int i;
+    char c, cs;
+    unsigned long len;
+    
+    len = strlen(str);
+    
+    if (cur->p + len >= cur->end)
+        return 0;
+    
+    for (i = 0; i < len; i++) {
+        c = tolower(cur->p[i]);
+        cs = tolower(str[i]);
+        
+        if (c != cs)
+            return 0;
+    }
+    
+    cur->p += len;
+    
+    return 1;
+}
+
+static inline int is_whitespace(char c) {
+    return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
+}
+
+static inline int is_underscore(char c) {
+    return c == '_';
+}
+
+static inline int is_utf8_byte(u8 c) {
+    return c & 0x80;
+}
+
+static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length)
+{
+    u8 first_byte;
+    if (!parse_byte(cursor, &first_byte))
+        return 0; // Not enough data
+
+    // Determine the number of bytes in this UTF-8 character
+    int remaining_bytes = 0;
+    if (first_byte < 0x80) {
+        *code_point = first_byte;
+        return 1;
+    } else if ((first_byte & 0xE0) == 0xC0) {
+        remaining_bytes = 1;
+        *utf8_length = remaining_bytes + 1;
+        *code_point = first_byte & 0x1F;
+    } else if ((first_byte & 0xF0) == 0xE0) {
+        remaining_bytes = 2;
+        *utf8_length = remaining_bytes + 1;
+        *code_point = first_byte & 0x0F;
+    } else if ((first_byte & 0xF8) == 0xF0) {
+        remaining_bytes = 3;
+        *utf8_length = remaining_bytes + 1;
+        *code_point = first_byte & 0x07;
+    } else {
+        remaining_bytes = 0;
+        *utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters
+        // TODO: We need to gracefully handle unrecognized UTF-8 characters
+        printf("Invalid UTF-8 byte: %x\n", *code_point);
+        *code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation
+        return 0; // Invalid first byte
+    }
+
+    // Peek at remaining bytes
+    for (int i = 0; i < remaining_bytes; ++i) {
+        signed char next_byte;
+        if ((next_byte = peek_char(cursor, i+1)) == -1) {
+            *utf8_length = 1;
+            return 0; // Not enough data
+        }
+        
+        // Debugging lines
+        //printf("Cursor: %s\n", cursor->p);
+        //printf("Codepoint: %x\n", *code_point);
+        //printf("Codepoint <<6: %x\n", ((*code_point << 6) | (next_byte & 0x3F)));
+        //printf("Remaining bytes: %x\n", remaining_bytes);
+        //printf("First byte: %x\n", first_byte);
+        //printf("Next byte: %x\n", next_byte);
+        //printf("Bitwise AND result: %x\n", (next_byte & 0xC0));
+        
+        if ((next_byte & 0xC0) != 0x80) {
+            *utf8_length = 1;
+            return 0; // Invalid byte in sequence
+        }
+
+        *code_point = (*code_point << 6) | (next_byte & 0x3F);
+    }
+
+    return 1;
+}
+
+/**
+  * Checks if a given Unicode code point is a punctuation character
+  *
+  * @param codepoint The Unicode code point to check. @return true if the
+  * code point is a punctuation character, false otherwise.
+  */
+static inline int is_punctuation(unsigned int codepoint) {
+
+    // Check for underscore (underscore is not treated as punctuation)
+    if (is_underscore(codepoint))
+        return 0;
+    
+    // Check for ASCII punctuation
+    if (ispunct(codepoint))
+        return 1;
+
+    // Check for Unicode punctuation exceptions (punctuation allowed in hashtags)
+    if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
+        return 0;
+    
+    // Check for Unicode punctuation
+    // NOTE: We may need to adjust the codepoint ranges in the future,
+    // to include/exclude certain types of Unicode characters in hashtags.
+    // Unicode Blocks Reference: https://www.compart.com/en/unicode/block
+    return (
+        // Latin-1 Supplement No-Break Space (NBSP): U+00A0
+        (codepoint == 0x00A0) ||
+        
+        // Latin-1 Supplement Punctuation: U+00A1 to U+00BF
+        (codepoint >= 0x00A1 && codepoint <= 0x00BF) ||
+
+        // General Punctuation: U+2000 to U+206F
+        (codepoint >= 0x2000 && codepoint <= 0x206F) ||
+
+        // Currency Symbols: U+20A0 to U+20CF
+        (codepoint >= 0x20A0 && codepoint <= 0x20CF) ||
+
+        // Supplemental Punctuation: U+2E00 to U+2E7F
+        (codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||
+
+        // CJK Symbols and Punctuation: U+3000 to U+303F
+        (codepoint >= 0x3000 && codepoint <= 0x303F) ||
+
+        // Ideographic Description Characters: U+2FF0 to U+2FFF
+        (codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
+    );
+}
+
+static inline int is_right_boundary(int c) {
+    return is_whitespace(c) || is_punctuation(c);
+}
+
+static inline int is_left_boundary(char c) {
+    return is_right_boundary(c) || is_utf8_byte(c);
+}
+
+static inline int is_alphanumeric(char c) {
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
+}
+
+static inline int consume_until_boundary(struct cursor *cur) {
+    unsigned int c;
+    unsigned int char_length = 1;
+    unsigned int *utf8_char_length = &char_length;
+    
+    while (cur->p < cur->end) {
+        c = *cur->p;
+        
+        *utf8_char_length = 1;
+        
+        if (is_whitespace(c))
+            return 1;
+        
+        // Need to check for UTF-8 characters, which can be multiple bytes long
+        if (is_utf8_byte(c)) {
+            if (!parse_utf8_char(cur, &c, utf8_char_length)) {
+                if (!is_right_boundary(c)){
+                    // TODO: We should work towards handling all UTF-8 characters.
+                    printf("Invalid UTF-8 code point: %x\n", c);
+                }
+            }
+        }
+        
+        if (is_right_boundary(c))
+            return 1;
+        
+        // Need to use a variable character byte length for UTF-8 (2-4 bytes)
+        if (cur->p + *utf8_char_length <= cur->end)
+            cur->p += *utf8_char_length;
+        else
+            cur->p++;
+    }
+    
+    return 1;
+}
+
+static inline int consume_until_whitespace(struct cursor *cur, int or_end) {
+    char c;
+    int consumedAtLeastOne = 0;
+    
+    while (cur->p < cur->end) {
+        c = *cur->p;
+        
+        if (is_whitespace(c))
+            return consumedAtLeastOne;
+        
+        cur->p++;
+        consumedAtLeastOne = 1;
+    }
+    
+    return or_end;
+}
+
+static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) {
+    char c;
+    int consumedAtLeastOne = 0;
+
+    while (cur->p < cur->end) {
+        c = *cur->p;
+
+        if (!is_alphanumeric(c))
+            return consumedAtLeastOne;
+
+        cur->p++;
+        consumedAtLeastOne = 1;
+    }
+
+    return or_end;
+}
+
+
+static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n)
+{
+    if (cursor->p + n >= cursor->end)
+        return 0;
+
+    memset(cursor->p, c, n);
+    cursor->p += n;
+
+    return 1;
+}
+
+static void consume_whitespace_or_punctuation(struct cursor *cur)
+{
+	while (cur->p < cur->end) {
+		if (!is_right_boundary(*cur->p))
+			return;
+		cur->p++;
+	}
+}
 
 #endif
diff --git a/nostrdb.c b/nostrdb.c
@@ -2114,182 +2114,6 @@ static int ndb_write_note_kind_index(struct ndb_txn *txn, struct ndb_note *note,
 	return 1;
 }
 
-/**
-  * Checks if a given Unicode code point is a punctuation character
-  *
-  * @param codepoint The Unicode code point to check. @return true if the
-  * code point is a punctuation character, false otherwise.
-  */
-static inline int is_punctuation(unsigned int codepoint) {
-	// Check for underscore (underscore is not treated as punctuation)
-	if (codepoint == '_')
-		return 0;
-
-	// Check for ASCII punctuation
-	if (codepoint <= 128 && ispunct(codepoint))
-		return 1;
-
-	// Check for Unicode punctuation exceptions (punctuation allowed in hashtags)
-	if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
-		return 0;
-
-	// Check for Unicode punctuation
-	// NOTE: We may need to adjust the codepoint ranges in the future,
-	// to include/exclude certain types of Unicode characters in hashtags.
-	// Unicode Blocks Reference: https://www.compart.com/en/unicode/block
-	return (
-		// Latin-1 Supplement No-Break Space (NBSP): U+00A0
-		(codepoint == 0x00A0) ||
-
-		// Latin-1 Supplement Punctuation: U+00A1 to U+00BF
-		(codepoint >= 0x00A1 && codepoint <= 0x00BF) ||
-
-		// General Punctuation: U+2000 to U+206F
-		(codepoint >= 0x2000 && codepoint <= 0x206F) ||
-
-		// Currency Symbols: U+20A0 to U+20CF
-		(codepoint >= 0x20A0 && codepoint <= 0x20CF) ||
-
-		// Supplemental Punctuation: U+2E00 to U+2E7F
-		(codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||
-
-		// CJK Symbols and Punctuation: U+3000 to U+303F
-		(codepoint >= 0x3000 && codepoint <= 0x303F) ||
-
-		// Ideographic Description Characters: U+2FF0 to U+2FFF
-		(codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
-	);
-}
-
-static inline int is_whitespace(char c) {
-    return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
-}
-
-static inline int is_right_boundary(int c) {
-	return is_whitespace(c) || is_punctuation(c);
-}
-
-static inline int parse_byte(struct cursor *cursor, unsigned char *c)
-{
-	if (unlikely(cursor->p >= cursor->end))
-		return 0;
-
-	*c = *cursor->p;
-
-	return 1;
-}
-
-static inline int peek_char(struct cursor *cur, int ind) {
-	if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end))
-		return -1;
-
-	return *(cur->p + ind);
-}
-
-static int parse_utf8_char(struct cursor *cursor, unsigned int *code_point,
-			   unsigned int *utf8_length)
-{
-	unsigned char first_byte;
-	if (!parse_byte(cursor, &first_byte))
-		return 0; // Not enough data
-
-	// Determine the number of bytes in this UTF-8 character
-	int remaining_bytes = 0;
-	if (first_byte < 0x80) {
-		*code_point = first_byte;
-		return 1;
-	} else if ((first_byte & 0xE0) == 0xC0) {
-		remaining_bytes = 1;
-		*utf8_length = remaining_bytes + 1;
-		*code_point = first_byte & 0x1F;
-	} else if ((first_byte & 0xF0) == 0xE0) {
-		remaining_bytes = 2;
-		*utf8_length = remaining_bytes + 1;
-		*code_point = first_byte & 0x0F;
-	} else if ((first_byte & 0xF8) == 0xF0) {
-		remaining_bytes = 3;
-		*utf8_length = remaining_bytes + 1;
-		*code_point = first_byte & 0x07;
-	} else {
-		remaining_bytes = 0;
-		*utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters
-		// TODO: We need to gracefully handle unrecognized UTF-8 characters
-		//printf("Invalid UTF-8 byte: %x\n", *code_point);
-		*code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation
-		return 0; // Invalid first byte
-	}
-
-	// Peek at remaining bytes
-	for (int i = 0; i < remaining_bytes; ++i) {
-		signed char next_byte;
-		if ((next_byte = peek_char(cursor, i+1)) == -1) {
-			*utf8_length = 1;
-			return 0; // Not enough data
-		}
-
-		if ((next_byte & 0xC0) != 0x80) {
-			*utf8_length = 1;
-			return 0; // Invalid byte in sequence
-		}
-
-		*code_point = (*code_point << 6) | (next_byte & 0x3F);
-	}
-
-	return 1;
-}
-
-
-static inline int is_utf8_byte(unsigned char c) {
-    return c & 0x80;
-}
-
-static inline int consume_until_boundary(struct cursor *cur) {
-	unsigned int c;
-	unsigned int char_length = 1;
-	unsigned int *utf8_char_length = &char_length;
-
-	while (cur->p < cur->end) {
-		c = *cur->p;
-		*utf8_char_length = 1;
-
-		if (is_whitespace(c))
-			return 1;
-
-		// Need to check for UTF-8 characters, which can be multiple
-		// bytes long
-		if (is_utf8_byte(c)) {
-			if (!parse_utf8_char(cur, &c, utf8_char_length)) {
-				if (!is_right_boundary(c)){
-					// TODO: We should work towards
-					// handling all UTF-8 characters.
-					//printf("Invalid UTF-8 code point: %x\n", c);
-					return 0;
-				}
-			}
-		}
-
-		if (is_right_boundary(c))
-			return 1;
-
-		// Need to use a variable character byte length for UTF-8 (2-4 bytes)
-		if (cur->p + *utf8_char_length <= cur->end)
-			cur->p += *utf8_char_length;
-		else
-			cur->p++;
-	}
-
-	return 1;
-}
-
-static void consume_whitespace_or_punctuation(struct cursor *cur)
-{
-	while (cur->p < cur->end) {
-		if (!is_right_boundary(*cur->p))
-			return;
-		cur->p++;
-	}
-}
-
 static int ndb_write_word_to_index(struct ndb_txn *txn, const char *word,
 				   int word_len, int word_index,
 				   uint64_t timestamp, uint64_t note_id)
diff --git a/typedefs.h b/typedefs.h
@@ -0,0 +1,14 @@
+
+#ifndef PROTOVERSE_TYPEDEFS_H
+#define PROTOVERSE_TYPEDEFS_H
+
+#include <stdint.h>
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef uint64_t u64;
+typedef int64_t s64;
+
+
+#endif /* PROTOVERSE_TYPEDEFS_H */

	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log | Files | Refs | Submodules | README | LICENSE

M	cursor.h	|	670	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M	nostrdb.c	|	176	-------------------------------------------------------------------------------
A	typedefs.h	|	14	++++++++++++++