nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

cursor.h (16151B)


      1 
      2 #ifndef JB55_CURSOR_H
      3 #define JB55_CURSOR_H
      4 
      5 #include "ccan/likely/likely.h"
      6 
      7 #include <stdio.h>
      8 #include <inttypes.h>
      9 #include <ctype.h>
     10 #include <assert.h>
     11 #include <string.h>
     12 
     13 struct cursor {
     14 	unsigned char *start;
     15 	unsigned char *p;
     16 	unsigned char *end;
     17 };
     18 
     19 static inline void reset_cursor(struct cursor *cursor)
     20 {
     21 	cursor->p = cursor->start;
     22 }
     23 
     24 static inline void wipe_cursor(struct cursor *cursor)
     25 {
     26 	reset_cursor(cursor);
     27 	memset(cursor->start, 0, cursor->end - cursor->start);
     28 }
     29 
     30 static inline void make_cursor(unsigned char *start, unsigned char *end, struct cursor *cursor)
     31 {
     32 	cursor->start = start;
     33 	cursor->p = start;
     34 	cursor->end = end;
     35 }
     36 
     37 static inline int cursor_eof(struct cursor *c)
     38 {
     39 	return c->p == c->end;
     40 }
     41 
     42 static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
     43 {
     44 	void *ret;
     45 
     46 	if (mem->p + size > mem->end) {
     47 		return NULL;
     48 	}
     49 
     50 	ret = mem->p;
     51 	mem->p += size;
     52 
     53 	return ret;
     54 }
     55 
     56 static inline void *cursor_alloc(struct cursor *mem, unsigned long size)
     57 {
     58 	void *ret;
     59 	if (!(ret = cursor_malloc(mem, size))) {
     60 		return 0;
     61 	}
     62 
     63 	memset(ret, 0, size);
     64 	return ret;
     65 }
     66 
     67 static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
     68 {
     69 	unsigned char *p;
     70 	if (!(p = cursor_alloc(mem, size))) {
     71 		return 0;
     72 	}
     73 	make_cursor(p, mem->p, slice);
     74 	return 1;
     75 }
     76 
     77 static inline int cursor_malloc_slice(struct cursor *mem, struct cursor *slice, size_t size)
     78 {
     79 	unsigned char *p;
     80 	if (!(p = cursor_malloc(mem, size))) {
     81 		return 0;
     82 	}
     83 	make_cursor(p, mem->p, slice);
     84 	return 1;
     85 }
     86 
     87 
     88 static inline void copy_cursor(struct cursor *src, struct cursor *dest)
     89 {
     90 	dest->start = src->start;
     91 	dest->p = src->p;
     92 	dest->end = src->end;
     93 }
     94 
     95 static inline int cursor_skip(struct cursor *cursor, int n)
     96 {
     97     if (cursor->p + n >= cursor->end)
     98         return 0;
     99 
    100     cursor->p += n;
    101 
    102     return 1;
    103 }
    104 
    105 static inline int cursor_pull_byte(struct cursor *cursor, unsigned char *c)
    106 {
    107 	if (unlikely(cursor->p >= cursor->end))
    108 		return 0;
    109 
    110 	*c = *cursor->p;
    111 	cursor->p++;
    112 
    113 	return 1;
    114 }
    115 
    116 static inline int parse_byte(struct cursor *cursor, unsigned char *c)
    117 {
    118     if (unlikely(cursor->p >= cursor->end))
    119         return 0;
    120 
    121     *c = *cursor->p;
    122     //cursor->p++;
    123 
    124     return 1;
    125 }
    126 
    127 static inline int parse_char(struct cursor *cur, char c) {
    128     if (cur->p >= cur->end)
    129         return 0;
    130         
    131     if (*cur->p == c) {
    132         cur->p++;
    133         return 1;
    134     }
    135     
    136     return 0;
    137 }
    138 
    139 static inline int peek_char(struct cursor *cur, int ind) {
    140     if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end))
    141         return -1;
    142     
    143     return *(cur->p + ind);
    144 }
    145 
    146 static inline int cursor_pull_c_str(struct cursor *cursor, const char **str)
    147 {
    148 	*str = (const char*)cursor->p;
    149 
    150 	for (; cursor->p < cursor->end; cursor->p++) {
    151 		if (*cursor->p == 0) {
    152 			cursor->p++;
    153 			return 1;
    154 		}
    155 	}
    156 
    157 	return 0;
    158 }
    159 
    160 
    161 static inline int cursor_push_byte(struct cursor *cursor, unsigned char c)
    162 {
    163 	if (unlikely(cursor->p + 1 > cursor->end)) {
    164 		return 0;
    165 	}
    166 
    167 	*cursor->p = c;
    168 	cursor->p++;
    169 
    170 	return 1;
    171 }
    172 
    173 static inline int cursor_pull(struct cursor *cursor, unsigned char *data, int len)
    174 {
    175 	if (unlikely(cursor->p + len > cursor->end)) {
    176 		return 0;
    177 	}
    178 
    179 	memcpy(data, cursor->p, len);
    180 	cursor->p += len;
    181 
    182 	return 1;
    183 }
    184 
    185 static inline int pull_data_into_cursor(struct cursor *cursor,
    186 			  struct cursor *dest,
    187 			  unsigned char **data,
    188 			  int len)
    189 {
    190 	int ok;
    191 
    192 	if (unlikely(dest->p + len > dest->end)) {
    193 		printf("not enough room in dest buffer\n");
    194 		return 0;
    195 	}
    196 
    197 	ok = cursor_pull(cursor, dest->p, len);
    198 	if (!ok) return 0;
    199 
    200 	*data = dest->p;
    201 	dest->p += len;
    202 
    203 	return 1;
    204 }
    205 
    206 static inline int cursor_dropn(struct cursor *cur, int size, int n)
    207 {
    208 	if (n == 0)
    209 		return 1;
    210 
    211 	if (unlikely(cur->p - size*n < cur->start)) {
    212 		return 0;
    213 	}
    214 
    215 	cur->p -= size*n;
    216 	return 1;
    217 }
    218 
    219 static inline int cursor_drop(struct cursor *cur, int size)
    220 {
    221 	return cursor_dropn(cur, size, 1);
    222 }
    223 
    224 static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n)
    225 {
    226 	n += 1;
    227 	if (unlikely(cur->p - len*n < cur->start)) {
    228 		return NULL;
    229 	}
    230 	return cur->p - len*n;
    231 }
    232 
    233 static inline unsigned char *cursor_top(struct cursor *cur, int len)
    234 {
    235 	if (unlikely(cur->p - len < cur->start)) {
    236 		return NULL;
    237 	}
    238 	return cur->p - len;
    239 }
    240 
    241 static inline int cursor_top_int(struct cursor *cur, int *i)
    242 {
    243 	unsigned char *p;
    244 	if (unlikely(!(p = cursor_top(cur, sizeof(*i))))) {
    245 		return 0;
    246 	}
    247 	*i = *((int*)p);
    248 	return 1;
    249 }
    250 
    251 static inline int cursor_pop(struct cursor *cur, unsigned char *data, int len)
    252 {
    253 	if (unlikely(cur->p - len < cur->start)) {
    254 		return 0;
    255 	}
    256 
    257 	cur->p -= len;
    258 	memcpy(data, cur->p, len);
    259 
    260 	return 1;
    261 }
    262 
    263 static inline int cursor_push(struct cursor *cursor, unsigned char *data, int len)
    264 {
    265 	if (unlikely(cursor->p + len > cursor->end)) {
    266 		return 0;
    267 	}
    268 
    269 	if (cursor->p != data)
    270 		memcpy(cursor->p, data, len);
    271 
    272 	cursor->p += len;
    273 
    274 	return 1;
    275 }
    276 
    277 static inline int cursor_push_int(struct cursor *cursor, int i)
    278 {
    279 	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
    280 }
    281 
    282 static inline size_t cursor_count(struct cursor *cursor, size_t elem_size)
    283 {
    284 	return (cursor->p - cursor->start)/elem_size;
    285 }
    286 
    287 /* Encodes a 64-bit integer into a variable-length format and pushes it into a cursor.
    288  * Returns the number of bytes used or -1 in case of an error. */
    289 static inline int cursor_push_varint(struct cursor *cursor, uint64_t n)
    290 {
    291 	int len = 0;
    292 	do {
    293 		unsigned char b = (n & 0x7F) | (n > 0x7F ? 0x80 : 0);
    294 		n >>= 7;
    295 		if (!cursor_push_byte(cursor, b))
    296 			return -1; // Error handling
    297 		len++;
    298 	} while (n != 0);
    299 
    300 	return len;
    301 }
    302 
    303 static inline int cursor_pull_varint(struct cursor *cursor, uint64_t *n)
    304 {
    305 	int ok, i;
    306 	unsigned char b;
    307 
    308 	*n = 0;
    309 
    310 	for (i = 0; i < 10; i++) { // Loop up to 10 bytes for 64-bit
    311 		ok = cursor_pull_byte(cursor, &b);
    312 		if (!ok) return 0;
    313 
    314 		*n |= ((int64_t)b & 0x7F) << (i * 7);
    315 
    316 		if ((b & 0x80) == 0) {
    317 			return i + 1; // Successfully read i+1 bytes
    318 		}
    319 	}
    320 
    321 	return 10; // Successfully read 10 bytes for a full 64-bit integer
    322 }
    323 
    324 static int cursor_pull_varint_u32(struct cursor *cursor, uint32_t *v)
    325 {
    326 	uint64_t bigval;
    327 
    328 	if (!cursor_pull_varint(cursor, &bigval))
    329 		return 0;
    330 
    331 	if (bigval > UINT32_MAX)
    332 		return 0;
    333 
    334 	*v = (uint32_t) bigval;
    335 	return 1;
    336 }
    337 
    338 static inline int cursor_pull_int(struct cursor *cursor, int *i)
    339 {
    340 	return cursor_pull(cursor, (unsigned char*)i, sizeof(*i));
    341 }
    342 
    343 static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) {
    344     return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
    345 }
    346 
    347 static inline int cursor_push_u16(struct cursor *cursor, unsigned short i)
    348 {
    349 	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
    350 }
    351 
    352 static inline int cursor_pull_u16(struct cursor *cursor, uint16_t *i)
    353 {
    354 	return cursor_pull(cursor, (unsigned char*)i, sizeof(*i));
    355 }
    356 
    357 static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size)
    358 {
    359 	unsigned char *p;
    360 	p = &cursor->start[elem_size * index];
    361 
    362 	if (unlikely(p >= cursor->end))
    363 		return NULL;
    364 
    365 	return (void*)p;
    366 }
    367 
    368 
    369 static inline int push_sized_str(struct cursor *cursor, const char *str, int len)
    370 {
    371 	return cursor_push(cursor, (unsigned char*)str, len);
    372 }
    373 
    374 static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len)
    375 {
    376 	int i;
    377 
    378 	if (unlikely(cur->p + len >= cur->end))
    379 		return 0;
    380 
    381 	for (i = 0; i < len; i++)
    382 		cur->p[i] = tolower(str[i]);
    383 
    384 	cur->p += len;
    385 	return 1;
    386 }
    387 
    388 static inline int cursor_push_str(struct cursor *cursor, const char *str)
    389 {
    390 	return cursor_push(cursor, (unsigned char*)str, (int)strlen(str));
    391 }
    392 
    393 static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
    394 {
    395 	if (str == NULL)
    396 		return cursor_push_byte(cursor, 0);
    397 	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
    398 }
    399 
    400 /* TODO: push varint size */
    401 static inline int push_prefixed_str(struct cursor *cursor, const char *str)
    402 {
    403 	uint64_t len;
    404 	len = strlen(str);
    405 	if (!cursor_push_varint(cursor, len))
    406 		return 0;
    407 	return push_sized_str(cursor, str, len);
    408 }
    409 
    410 static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str)
    411 {
    412 	uint64_t len;
    413 
    414 	if (!cursor_pull_varint(cursor, &len))
    415 		return 0;
    416 
    417 	if (unlikely(dest_buf->p + len > dest_buf->end))
    418 		return 0;
    419 
    420 	if (!pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, len))
    421 		return 0;
    422 
    423 	return cursor_push_byte(dest_buf, 0);
    424 }
    425 
    426 static inline int cursor_remaining_capacity(struct cursor *cursor)
    427 {
    428 	return (int)(cursor->end - cursor->p);
    429 }
    430 
    431 
    432 #define max(a,b) ((a) > (b) ? (a) : (b))
    433 static inline void cursor_print_around(struct cursor *cur, int range)
    434 {
    435 	unsigned char *c;
    436 
    437 	printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start);
    438 
    439 	c = max(cur->p - range, cur->start);
    440 	for (; c < cur->end && c < (cur->p + range); c++) {
    441 		printf("%02x", *c);
    442 	}
    443 	printf("\n");
    444 
    445 	c = max(cur->p - range, cur->start);
    446 	for (; c < cur->end && c < (cur->p + range); c++) {
    447 		if (c == cur->p) {
    448 			printf("^");
    449 			continue;
    450 		}
    451 		printf("  ");
    452 	}
    453 	printf("\n");
    454 }
    455 #undef max
    456 
    457 static inline int pull_bytes(struct cursor *cur, int count, const unsigned char **bytes) {
    458     if (cur->p + count > cur->end)
    459         return 0;
    460     
    461     *bytes = cur->p;
    462     cur->p += count;
    463     return 1;
    464 }
    465 
    466 static inline int parse_str(struct cursor *cur, const char *str) {
    467     unsigned int i;
    468     char c, cs;
    469     unsigned long len;
    470     
    471     len = strlen(str);
    472     
    473     if (cur->p + len >= cur->end)
    474         return 0;
    475     
    476     for (i = 0; i < len; i++) {
    477         c = tolower(cur->p[i]);
    478         cs = tolower(str[i]);
    479         
    480         if (c != cs)
    481             return 0;
    482     }
    483     
    484     cur->p += len;
    485     
    486     return 1;
    487 }
    488 
    489 static inline int is_whitespace(char c) {
    490     return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
    491 }
    492 
    493 static inline int is_underscore(char c) {
    494     return c == '_';
    495 }
    496 
    497 static inline int is_utf8_byte(unsigned char c) {
    498     return c & 0x80;
    499 }
    500 
    501 static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length)
    502 {
    503     unsigned char first_byte;
    504     if (!parse_byte(cursor, &first_byte))
    505         return 0; // Not enough data
    506 
    507     // Determine the number of bytes in this UTF-8 character
    508     int remaining_bytes = 0;
    509     if (first_byte < 0x80) {
    510         *code_point = first_byte;
    511         return 1;
    512     } else if ((first_byte & 0xE0) == 0xC0) {
    513         remaining_bytes = 1;
    514         *utf8_length = remaining_bytes + 1;
    515         *code_point = first_byte & 0x1F;
    516     } else if ((first_byte & 0xF0) == 0xE0) {
    517         remaining_bytes = 2;
    518         *utf8_length = remaining_bytes + 1;
    519         *code_point = first_byte & 0x0F;
    520     } else if ((first_byte & 0xF8) == 0xF0) {
    521         remaining_bytes = 3;
    522         *utf8_length = remaining_bytes + 1;
    523         *code_point = first_byte & 0x07;
    524     } else {
    525         remaining_bytes = 0;
    526         *utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters
    527         // TODO: We need to gracefully handle unrecognized UTF-8 characters
    528         //printf("Invalid UTF-8 byte: %x\n", *code_point);
    529         *code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation
    530         return 0; // Invalid first byte
    531     }
    532 
    533     // Peek at remaining bytes
    534     for (int i = 0; i < remaining_bytes; ++i) {
    535         signed char next_byte;
    536         if ((next_byte = peek_char(cursor, i+1)) == -1) {
    537             *utf8_length = 1;
    538             return 0; // Not enough data
    539         }
    540         
    541         // Debugging lines
    542         //printf("Cursor: %s\n", cursor->p);
    543         //printf("Codepoint: %x\n", *code_point);
    544         //printf("Codepoint <<6: %x\n", ((*code_point << 6) | (next_byte & 0x3F)));
    545         //printf("Remaining bytes: %x\n", remaining_bytes);
    546         //printf("First byte: %x\n", first_byte);
    547         //printf("Next byte: %x\n", next_byte);
    548         //printf("Bitwise AND result: %x\n", (next_byte & 0xC0));
    549         
    550         if ((next_byte & 0xC0) != 0x80) {
    551             *utf8_length = 1;
    552             return 0; // Invalid byte in sequence
    553         }
    554 
    555         *code_point = (*code_point << 6) | (next_byte & 0x3F);
    556     }
    557 
    558     return 1;
    559 }
    560 
    561 /**
    562   * Checks if a given Unicode code point is a punctuation character
    563   *
    564   * @param codepoint The Unicode code point to check. @return true if the
    565   * code point is a punctuation character, false otherwise.
    566   */
    567 static inline int is_punctuation(unsigned int codepoint) {
    568 
    569     // Check for underscore (underscore is not treated as punctuation)
    570     if (is_underscore(codepoint))
    571         return 0;
    572     
    573     // Check for ASCII punctuation
    574     if (codepoint <= 128 && ispunct(codepoint))
    575         return 1;
    576 
    577     // Check for Unicode punctuation exceptions (punctuation allowed in hashtags)
    578     if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
    579         return 0;
    580     
    581     // Check for Unicode punctuation
    582     // NOTE: We may need to adjust the codepoint ranges in the future,
    583     // to include/exclude certain types of Unicode characters in hashtags.
    584     // Unicode Blocks Reference: https://www.compart.com/en/unicode/block
    585     return (
    586         // Latin-1 Supplement No-Break Space (NBSP): U+00A0
    587         (codepoint == 0x00A0) ||
    588         
    589         // Latin-1 Supplement Punctuation: U+00A1 to U+00BF
    590         (codepoint >= 0x00A1 && codepoint <= 0x00BF) ||
    591 
    592         // General Punctuation: U+2000 to U+206F
    593         (codepoint >= 0x2000 && codepoint <= 0x206F) ||
    594 
    595         // Currency Symbols: U+20A0 to U+20CF
    596         (codepoint >= 0x20A0 && codepoint <= 0x20CF) ||
    597 
    598         // Supplemental Punctuation: U+2E00 to U+2E7F
    599         (codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||
    600 
    601         // CJK Symbols and Punctuation: U+3000 to U+303F
    602         (codepoint >= 0x3000 && codepoint <= 0x303F) ||
    603 
    604         // Ideographic Description Characters: U+2FF0 to U+2FFF
    605         (codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
    606     );
    607 }
    608 
    609 static inline int is_right_boundary(int c) {
    610     return is_whitespace(c) || is_punctuation(c);
    611 }
    612 
    613 static inline int is_left_boundary(char c) {
    614     return is_right_boundary(c) || is_utf8_byte(c);
    615 }
    616 
    617 static inline int is_alphanumeric(char c) {
    618     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    619 }
    620 
    621 static inline int consume_until_boundary(struct cursor *cur) {
    622 	unsigned int c;
    623 	unsigned int char_length = 1;
    624 	unsigned int *utf8_char_length = &char_length;
    625 	
    626 	while (cur->p < cur->end) {
    627 		c = *cur->p;
    628 		
    629 		*utf8_char_length = 1;
    630 		
    631 		if (is_whitespace(c))
    632 			return 1;
    633 		
    634 		// Need to check for UTF-8 characters, which can be multiple bytes long
    635 		if (is_utf8_byte(c)) {
    636 			if (!parse_utf8_char(cur, &c, utf8_char_length)) {
    637 				if (!is_right_boundary(c)){
    638 					return 0;
    639 				}
    640 			}
    641 		}
    642 		
    643 		if (is_right_boundary(c))
    644 			return 1;
    645 		
    646 		// Need to use a variable character byte length for UTF-8 (2-4 bytes)
    647 		if (cur->p + *utf8_char_length <= cur->end)
    648 			cur->p += *utf8_char_length;
    649 		else
    650 			cur->p++;
    651 	}
    652 	
    653 	return 1;
    654 }
    655 
    656 static inline int consume_until_whitespace(struct cursor *cur, int or_end) {
    657     char c;
    658     int consumedAtLeastOne = 0;
    659     
    660     while (cur->p < cur->end) {
    661         c = *cur->p;
    662         
    663         if (is_whitespace(c))
    664             return consumedAtLeastOne;
    665         
    666         cur->p++;
    667         consumedAtLeastOne = 1;
    668     }
    669     
    670     return or_end;
    671 }
    672 
    673 static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) {
    674     char c;
    675     int consumedAtLeastOne = 0;
    676 
    677     while (cur->p < cur->end) {
    678         c = *cur->p;
    679 
    680         if (!is_alphanumeric(c))
    681             return consumedAtLeastOne;
    682 
    683         cur->p++;
    684         consumedAtLeastOne = 1;
    685     }
    686 
    687     return or_end;
    688 }
    689 
    690 
    691 static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n)
    692 {
    693     if (cursor->p + n >= cursor->end)
    694         return 0;
    695 
    696     memset(cursor->p, c, n);
    697     cursor->p += n;
    698 
    699     return 1;
    700 }
    701 
    702 static void consume_whitespace_or_punctuation(struct cursor *cur)
    703 {
    704 	while (cur->p < cur->end) {
    705 		if (!is_right_boundary(*cur->p))
    706 			return;
    707 		cur->p++;
    708 	}
    709 }
    710 
    711 // pad cursor buffer to n-byte alignment
    712 static inline int cursor_align(struct cursor *cur, int bytes) {
    713 	size_t size = cur->p - cur->start;
    714 	int pad;
    715 
    716 	// pad to n-byte alignment
    717 	pad = ((size + (bytes-1)) & ~(bytes-1)) - size;
    718 
    719 	if (pad > 0 && !cursor_memset(cur, 0, pad))
    720 		return 0;
    721 
    722 	return 1;
    723 }
    724 
    725 
    726 #endif