nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

cursor.h (15449B)


      1 
      2 #ifndef JB55_CURSOR_H
      3 #define JB55_CURSOR_H
      4 
      5 #include "typedefs.h"
      6 
      7 #include <stdio.h>
      8 #include <ctype.h>
      9 #include <assert.h>
     10 #include <string.h>
     11 
/* Branch-prediction hints built on the compiler builtin __builtin_expect:
 * unlikely(x) tells the compiler x is expected to be 0,
 * likely(x) tells it x is expected to be 1.
 * Both evaluate to the value of x. */
#define unlikely(x) __builtin_expect((x),0)
#define likely(x)   __builtin_expect((x),1)
     14 
     15 struct cursor {
     16 	unsigned char *start;
     17 	unsigned char *p;
     18 	unsigned char *end;
     19 };
     20 
     21 struct array {
     22 	struct cursor cur;
     23 	unsigned int elem_size;
     24 };
     25 
     26 static inline void reset_cursor(struct cursor *cursor)
     27 {
     28 	cursor->p = cursor->start;
     29 }
     30 
     31 static inline void wipe_cursor(struct cursor *cursor)
     32 {
     33 	reset_cursor(cursor);
     34 	memset(cursor->start, 0, cursor->end - cursor->start);
     35 }
     36 
     37 static inline void make_cursor(u8 *start, u8 *end, struct cursor *cursor)
     38 {
     39 	cursor->start = start;
     40 	cursor->p = start;
     41 	cursor->end = end;
     42 }
     43 
     44 static inline void make_array(struct array *a, u8* start, u8 *end, unsigned int elem_size)
     45 {
     46 	make_cursor(start, end, &a->cur);
     47 	a->elem_size = elem_size;
     48 }
     49 
     50 static inline int cursor_eof(struct cursor *c)
     51 {
     52 	return c->p == c->end;
     53 }
     54 
     55 static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
     56 {
     57 	void *ret;
     58 
     59 	if (mem->p + size > mem->end) {
     60 		return NULL;
     61 	}
     62 
     63 	ret = mem->p;
     64 	mem->p += size;
     65 
     66 	return ret;
     67 }
     68 
     69 static inline void *cursor_alloc(struct cursor *mem, unsigned long size)
     70 {
     71 	void *ret;
     72 	if (!(ret = cursor_malloc(mem, size))) {
     73 		return 0;
     74 	}
     75 
     76 	memset(ret, 0, size);
     77 	return ret;
     78 }
     79 
     80 static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
     81 {
     82 	u8 *p;
     83 	if (!(p = cursor_alloc(mem, size))) {
     84 		return 0;
     85 	}
     86 	make_cursor(p, mem->p, slice);
     87 	return 1;
     88 }
     89 
     90 
     91 static inline void copy_cursor(struct cursor *src, struct cursor *dest)
     92 {
     93 	dest->start = src->start;
     94 	dest->p = src->p;
     95 	dest->end = src->end;
     96 }
     97 
     98 static inline int cursor_skip(struct cursor *cursor, int n)
     99 {
    100     if (cursor->p + n >= cursor->end)
    101         return 0;
    102 
    103     cursor->p += n;
    104 
    105     return 1;
    106 }
    107 
    108 static inline int pull_byte(struct cursor *cursor, u8 *c)
    109 {
    110 	if (unlikely(cursor->p >= cursor->end))
    111 		return 0;
    112 
    113 	*c = *cursor->p;
    114 	cursor->p++;
    115 
    116 	return 1;
    117 }
    118 
    119 static inline int parse_byte(struct cursor *cursor, u8 *c)
    120 {
    121     if (unlikely(cursor->p >= cursor->end))
    122         return 0;
    123 
    124     *c = *cursor->p;
    125     //cursor->p++;
    126 
    127     return 1;
    128 }
    129 
    130 static inline int parse_char(struct cursor *cur, char c) {
    131     if (cur->p >= cur->end)
    132         return 0;
    133         
    134     if (*cur->p == c) {
    135         cur->p++;
    136         return 1;
    137     }
    138     
    139     return 0;
    140 }
    141 
    142 static inline int peek_char(struct cursor *cur, int ind) {
    143     if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end))
    144         return -1;
    145     
    146     return *(cur->p + ind);
    147 }
    148 
    149 static inline int cursor_pull_c_str(struct cursor *cursor, const char **str)
    150 {
    151 	*str = (const char*)cursor->p;
    152 
    153 	for (; cursor->p < cursor->end; cursor->p++) {
    154 		if (*cursor->p == 0) {
    155 			cursor->p++;
    156 			return 1;
    157 		}
    158 	}
    159 
    160 	return 0;
    161 }
    162 
    163 
    164 static inline int cursor_push_byte(struct cursor *cursor, u8 c)
    165 {
    166 	if (unlikely(cursor->p + 1 > cursor->end)) {
    167 		return 0;
    168 	}
    169 
    170 	*cursor->p = c;
    171 	cursor->p++;
    172 
    173 	return 1;
    174 }
    175 
    176 static inline int cursor_pull(struct cursor *cursor, u8 *data, int len)
    177 {
    178 	if (unlikely(cursor->p + len > cursor->end)) {
    179 		return 0;
    180 	}
    181 
    182 	memcpy(data, cursor->p, len);
    183 	cursor->p += len;
    184 
    185 	return 1;
    186 }
    187 
    188 static inline int pull_data_into_cursor(struct cursor *cursor,
    189 			  struct cursor *dest,
    190 			  unsigned char **data,
    191 			  int len)
    192 {
    193 	int ok;
    194 
    195 	if (unlikely(dest->p + len > dest->end)) {
    196 		printf("not enough room in dest buffer\n");
    197 		return 0;
    198 	}
    199 
    200 	ok = cursor_pull(cursor, dest->p, len);
    201 	if (!ok) return 0;
    202 
    203 	*data = dest->p;
    204 	dest->p += len;
    205 
    206 	return 1;
    207 }
    208 
    209 static inline int cursor_dropn(struct cursor *cur, int size, int n)
    210 {
    211 	if (n == 0)
    212 		return 1;
    213 
    214 	if (unlikely(cur->p - size*n < cur->start)) {
    215 		return 0;
    216 	}
    217 
    218 	cur->p -= size*n;
    219 	return 1;
    220 }
    221 
    222 static inline int cursor_drop(struct cursor *cur, int size)
    223 {
    224 	return cursor_dropn(cur, size, 1);
    225 }
    226 
    227 static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n)
    228 {
    229 	n += 1;
    230 	if (unlikely(cur->p - len*n < cur->start)) {
    231 		return NULL;
    232 	}
    233 	return cur->p - len*n;
    234 }
    235 
    236 static inline unsigned char *cursor_top(struct cursor *cur, int len)
    237 {
    238 	if (unlikely(cur->p - len < cur->start)) {
    239 		return NULL;
    240 	}
    241 	return cur->p - len;
    242 }
    243 
    244 static inline int cursor_top_int(struct cursor *cur, int *i)
    245 {
    246 	u8 *p;
    247 	if (unlikely(!(p = cursor_top(cur, sizeof(*i))))) {
    248 		return 0;
    249 	}
    250 	*i = *((int*)p);
    251 	return 1;
    252 }
    253 
    254 static inline int cursor_pop(struct cursor *cur, u8 *data, int len)
    255 {
    256 	if (unlikely(cur->p - len < cur->start)) {
    257 		return 0;
    258 	}
    259 
    260 	cur->p -= len;
    261 	memcpy(data, cur->p, len);
    262 
    263 	return 1;
    264 }
    265 
    266 static inline int cursor_push(struct cursor *cursor, u8 *data, int len)
    267 {
    268 	if (unlikely(cursor->p + len >= cursor->end)) {
    269 		return 0;
    270 	}
    271 
    272 	if (cursor->p != data)
    273 		memcpy(cursor->p, data, len);
    274 
    275 	cursor->p += len;
    276 
    277 	return 1;
    278 }
    279 
    280 static inline int cursor_push_int(struct cursor *cursor, int i)
    281 {
    282 	return cursor_push(cursor, (u8*)&i, sizeof(i));
    283 }
    284 
    285 static inline size_t cursor_count(struct cursor *cursor, size_t elem_size)
    286 {
    287 	return (cursor->p - cursor->start)/elem_size;
    288 }
    289 
    290 /* TODO: push_varint */
    291 static inline int push_varint(struct cursor *cursor, int n)
    292 {
    293 	int ok, len;
    294 	unsigned char b;
    295 	len = 0;
    296 
    297 	while (1) {
    298 		b = (n & 0xFF) | 0x80;
    299 		n >>= 7;
    300 		if (n == 0) {
    301 			b &= 0x7F;
    302 			ok = cursor_push_byte(cursor, b);
    303 			len++;
    304 			if (!ok) return 0;
    305 			break;
    306 		}
    307 
    308 		ok = cursor_push_byte(cursor, b);
    309 		len++;
    310 		if (!ok) return 0;
    311 	}
    312 
    313 	return len;
    314 }
    315 
    316 /* TODO: pull_varint */
    317 static inline int pull_varint(struct cursor *cursor, int *n)
    318 {
    319 	int ok, i;
    320 	unsigned char b;
    321 	*n = 0;
    322 
    323 	for (i = 0;; i++) {
    324 		ok = pull_byte(cursor, &b);
    325 		if (!ok) return 0;
    326 
    327 		*n |= ((int)b & 0x7F) << (i * 7);
    328 
    329 		/* is_last */
    330 		if ((b & 0x80) == 0) {
    331 			return i+1;
    332 		}
    333 
    334 		if (i == 4) return 0;
    335 	}
    336 
    337 	return 0;
    338 }
    339 
    340 static inline int cursor_pull_int(struct cursor *cursor, int *i)
    341 {
    342 	return cursor_pull(cursor, (u8*)i, sizeof(*i));
    343 }
    344 
    345 static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) {
    346     return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
    347 }
    348 
    349 static inline int cursor_push_u16(struct cursor *cursor, u16 i)
    350 {
    351 	return cursor_push(cursor, (u8*)&i, sizeof(i));
    352 }
    353 
    354 static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size)
    355 {
    356 	u8 *p;
    357 	p = &cursor->start[elem_size * index];
    358 
    359 	if (unlikely(p >= cursor->end))
    360 		return NULL;
    361 
    362 	return (void*)p;
    363 }
    364 
    365 
    366 static inline int push_sized_str(struct cursor *cursor, const char *str, int len)
    367 {
    368 	return cursor_push(cursor, (u8*)str, len);
    369 }
    370 
    371 static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len)
    372 {
    373 	int i;
    374 
    375 	if (unlikely(cur->p + len >= cur->end))
    376 		return 0;
    377 
    378 	for (i = 0; i < len; i++)
    379 		cur->p[i] = tolower(str[i]);
    380 
    381 	cur->p += len;
    382 	return 1;
    383 }
    384 
    385 static inline int cursor_push_str(struct cursor *cursor, const char *str)
    386 {
    387 	return cursor_push(cursor, (u8*)str, (int)strlen(str));
    388 }
    389 
    390 static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
    391 {
    392 	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
    393 }
    394 
    395 /* TODO: push varint size */
    396 static inline int push_prefixed_str(struct cursor *cursor, const char *str)
    397 {
    398 	int ok, len;
    399 	len = (int)strlen(str);
    400 	ok = push_varint(cursor, len);
    401 	if (!ok) return 0;
    402 	return push_sized_str(cursor, str, len);
    403 }
    404 
    405 static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str)
    406 {
    407 	int len, ok;
    408 
    409 	ok = pull_varint(cursor, &len);
    410 	if (!ok) return 0;
    411 
    412 	if (unlikely(dest_buf->p + len > dest_buf->end)) {
    413 		return 0;
    414 	}
    415 
    416 	ok = pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, len);
    417 	if (!ok) return 0;
    418 
    419 	ok = cursor_push_byte(dest_buf, 0);
    420 
    421 	return 1;
    422 }
    423 
    424 static inline int cursor_remaining_capacity(struct cursor *cursor)
    425 {
    426 	return (int)(cursor->end - cursor->p);
    427 }
    428 
    429 
    430 #define max(a,b) ((a) > (b) ? (a) : (b))
    431 static inline void cursor_print_around(struct cursor *cur, int range)
    432 {
    433 	unsigned char *c;
    434 
    435 	printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start);
    436 
    437 	c = max(cur->p - range, cur->start);
    438 	for (; c < cur->end && c < (cur->p + range); c++) {
    439 		printf("%02x", *c);
    440 	}
    441 	printf("\n");
    442 
    443 	c = max(cur->p - range, cur->start);
    444 	for (; c < cur->end && c < (cur->p + range); c++) {
    445 		if (c == cur->p) {
    446 			printf("^");
    447 			continue;
    448 		}
    449 		printf("  ");
    450 	}
    451 	printf("\n");
    452 }
    453 #undef max
    454 
    455 static inline int pull_bytes(struct cursor *cur, int count, const u8 **bytes) {
    456     if (cur->p + count > cur->end)
    457         return 0;
    458     
    459     *bytes = cur->p;
    460     cur->p += count;
    461     return 1;
    462 }
    463 
    464 static inline int parse_str(struct cursor *cur, const char *str) {
    465     char c, cs;
    466     unsigned long i, len;
    467     
    468     len = strlen(str);
    469     
    470     if (cur->p + len >= cur->end)
    471         return 0;
    472     
    473     for (i = 0; i < len; i++) {
    474         c = tolower(cur->p[i]);
    475         cs = tolower(str[i]);
    476         
    477         if (c != cs)
    478             return 0;
    479     }
    480     
    481     cur->p += len;
    482     
    483     return 1;
    484 }
    485 
    486 static inline int is_whitespace(char c) {
    487     return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
    488 }
    489 
    490 static inline int is_underscore(char c) {
    491     return c == '_';
    492 }
    493 
    494 static inline int is_utf8_byte(u8 c) {
    495     return c & 0x80;
    496 }
    497 
    498 static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length)
    499 {
    500     u8 first_byte;
    501     if (!parse_byte(cursor, &first_byte))
    502         return 0; // Not enough data
    503 
    504     // Determine the number of bytes in this UTF-8 character
    505     int remaining_bytes = 0;
    506     if (first_byte < 0x80) {
    507         *code_point = first_byte;
    508         return 1;
    509     } else if ((first_byte & 0xE0) == 0xC0) {
    510         remaining_bytes = 1;
    511         *utf8_length = remaining_bytes + 1;
    512         *code_point = first_byte & 0x1F;
    513     } else if ((first_byte & 0xF0) == 0xE0) {
    514         remaining_bytes = 2;
    515         *utf8_length = remaining_bytes + 1;
    516         *code_point = first_byte & 0x0F;
    517     } else if ((first_byte & 0xF8) == 0xF0) {
    518         remaining_bytes = 3;
    519         *utf8_length = remaining_bytes + 1;
    520         *code_point = first_byte & 0x07;
    521     } else {
    522         remaining_bytes = 0;
    523         *utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters
    524         // TODO: We need to gracefully handle unrecognized UTF-8 characters
    525         //printf("Invalid UTF-8 byte: %x\n", *code_point);
    526         *code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation
    527         return 0; // Invalid first byte
    528     }
    529 
    530     // Peek at remaining bytes
    531     for (int i = 0; i < remaining_bytes; ++i) {
    532         signed char next_byte;
    533         if ((next_byte = peek_char(cursor, i+1)) == -1) {
    534             *utf8_length = 1;
    535             return 0; // Not enough data
    536         }
    537         
    538         // Debugging lines
    539         //printf("Cursor: %s\n", cursor->p);
    540         //printf("Codepoint: %x\n", *code_point);
    541         //printf("Codepoint <<6: %x\n", ((*code_point << 6) | (next_byte & 0x3F)));
    542         //printf("Remaining bytes: %x\n", remaining_bytes);
    543         //printf("First byte: %x\n", first_byte);
    544         //printf("Next byte: %x\n", next_byte);
    545         //printf("Bitwise AND result: %x\n", (next_byte & 0xC0));
    546         
    547         if ((next_byte & 0xC0) != 0x80) {
    548             *utf8_length = 1;
    549             return 0; // Invalid byte in sequence
    550         }
    551 
    552         *code_point = (*code_point << 6) | (next_byte & 0x3F);
    553     }
    554 
    555     return 1;
    556 }
    557 
    558 /**
    559   * Checks if a given Unicode code point is a punctuation character
    560   *
    561   * @param codepoint The Unicode code point to check. @return true if the
    562   * code point is a punctuation character, false otherwise.
    563   */
    564 static inline int is_punctuation(unsigned int codepoint) {
    565 
    566     // Check for underscore (underscore is not treated as punctuation)
    567     if (is_underscore(codepoint))
    568         return 0;
    569     
    570     // Check for ASCII punctuation
    571     if (ispunct(codepoint))
    572         return 1;
    573 
    574     // Check for Unicode punctuation exceptions (punctuation allowed in hashtags)
    575     if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
    576         return 0;
    577     
    578     // Check for Unicode punctuation
    579     // NOTE: We may need to adjust the codepoint ranges in the future,
    580     // to include/exclude certain types of Unicode characters in hashtags.
    581     // Unicode Blocks Reference: https://www.compart.com/en/unicode/block
    582     return (
    583         // Latin-1 Supplement No-Break Space (NBSP): U+00A0
    584         (codepoint == 0x00A0) ||
    585         
    586         // Latin-1 Supplement Punctuation: U+00A1 to U+00BF
    587         (codepoint >= 0x00A1 && codepoint <= 0x00BF) ||
    588 
    589         // General Punctuation: U+2000 to U+206F
    590         (codepoint >= 0x2000 && codepoint <= 0x206F) ||
    591 
    592         // Currency Symbols: U+20A0 to U+20CF
    593         (codepoint >= 0x20A0 && codepoint <= 0x20CF) ||
    594 
    595         // Supplemental Punctuation: U+2E00 to U+2E7F
    596         (codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||
    597 
    598         // CJK Symbols and Punctuation: U+3000 to U+303F
    599         (codepoint >= 0x3000 && codepoint <= 0x303F) ||
    600 
    601         // Ideographic Description Characters: U+2FF0 to U+2FFF
    602         (codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
    603     );
    604 }
    605 
    606 static inline int is_right_boundary(int c) {
    607     return is_whitespace(c) || is_punctuation(c);
    608 }
    609 
    610 static inline int is_left_boundary(char c) {
    611     return is_right_boundary(c) || is_utf8_byte(c);
    612 }
    613 
    614 static inline int is_alphanumeric(char c) {
    615     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    616 }
    617 
    618 static inline int consume_until_boundary(struct cursor *cur) {
    619     unsigned int c;
    620     unsigned int char_length = 1;
    621     unsigned int *utf8_char_length = &char_length;
    622     
    623     while (cur->p < cur->end) {
    624         c = *cur->p;
    625         
    626         *utf8_char_length = 1;
    627         
    628         if (is_whitespace(c))
    629             return 1;
    630         
    631         // Need to check for UTF-8 characters, which can be multiple bytes long
    632         if (is_utf8_byte(c)) {
    633             if (!parse_utf8_char(cur, &c, utf8_char_length)) {
    634                 if (!is_right_boundary(c)){
    635                     // TODO: We should work towards handling all UTF-8 characters.
    636                     //printf("Invalid UTF-8 code point: %x\n", c);
    637                 }
    638             }
    639         }
    640         
    641         if (is_right_boundary(c))
    642             return 1;
    643         
    644         // Need to use a variable character byte length for UTF-8 (2-4 bytes)
    645         if (cur->p + *utf8_char_length <= cur->end)
    646             cur->p += *utf8_char_length;
    647         else
    648             cur->p++;
    649     }
    650     
    651     return 1;
    652 }
    653 
    654 static inline int consume_until_whitespace(struct cursor *cur, int or_end) {
    655     char c;
    656     int consumedAtLeastOne = 0;
    657     
    658     while (cur->p < cur->end) {
    659         c = *cur->p;
    660         
    661         if (is_whitespace(c))
    662             return consumedAtLeastOne;
    663         
    664         cur->p++;
    665         consumedAtLeastOne = 1;
    666     }
    667     
    668     return or_end;
    669 }
    670 
    671 static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) {
    672     char c;
    673     int consumedAtLeastOne = 0;
    674 
    675     while (cur->p < cur->end) {
    676         c = *cur->p;
    677 
    678         if (!is_alphanumeric(c))
    679             return consumedAtLeastOne;
    680 
    681         cur->p++;
    682         consumedAtLeastOne = 1;
    683     }
    684 
    685     return or_end;
    686 }
    687 
    688 
    689 static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n)
    690 {
    691     if (cursor->p + n >= cursor->end)
    692         return 0;
    693 
    694     memset(cursor->p, c, n);
    695     cursor->p += n;
    696 
    697     return 1;
    698 }
    699 
    700 static void consume_whitespace_or_punctuation(struct cursor *cur)
    701 {
    702 	while (cur->p < cur->end) {
    703 		if (!is_right_boundary(*cur->p))
    704 			return;
    705 		cur->p++;
    706 	}
    707 }
    708 
    709 #endif