damus

nostr ios client
git clone git://jb55.com/damus
Log | Files | Refs | README | LICENSE

cursor.h (16444B)


      1 
      2 #ifndef JB55_CURSOR_H
      3 #define JB55_CURSOR_H
      4 
      5 #include "typedefs.h"
      6 #include "varint.h"
      7 
      8 #include <stdio.h>
      9 #include <ctype.h>
     10 #include <assert.h>
     11 #include <string.h>
     12 
/* Branch hints built on the compiler builtin __builtin_expect:
 * unlikely(x) marks x as expected to evaluate to 0, likely(x) as expected
 * to evaluate to 1. Both yield the value of x. */
#define unlikely(x) __builtin_expect((x),0)
#define likely(x)   __builtin_expect((x),1)
     15 
/* A position inside a byte buffer. The helpers in this header treat
 * [start, end) as the buffer and p as the current read/write position. */
struct cursor {
	unsigned char *start; /* first byte of the buffer */
	unsigned char *p;     /* current position; reset_cursor() sets it back to start */
	unsigned char *end;   /* one past the last byte; cursor_eof() is p == end */
};
     21 
/* A cursor paired with an element size (see make_array()). */
struct array {
	struct cursor cur;      /* backing buffer and position */
	unsigned int elem_size; /* presumably the size in bytes of one element -- confirm with callers */
};
     26 
     27 static inline void reset_cursor(struct cursor *cursor)
     28 {
     29 	cursor->p = cursor->start;
     30 }
     31 
     32 static inline void wipe_cursor(struct cursor *cursor)
     33 {
     34 	reset_cursor(cursor);
     35 	memset(cursor->start, 0, cursor->end - cursor->start);
     36 }
     37 
     38 static inline void make_cursor(u8 *start, u8 *end, struct cursor *cursor)
     39 {
     40 	cursor->start = start;
     41 	cursor->p = start;
     42 	cursor->end = end;
     43 }
     44 
     45 static inline void make_array(struct array *a, u8* start, u8 *end, unsigned int elem_size)
     46 {
     47 	make_cursor(start, end, &a->cur);
     48 	a->elem_size = elem_size;
     49 }
     50 
     51 static inline int cursor_eof(struct cursor *c)
     52 {
     53 	return c->p == c->end;
     54 }
     55 
     56 static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
     57 {
     58 	void *ret;
     59 
     60 	if (mem->p + size > mem->end) {
     61 		return NULL;
     62 	}
     63 
     64 	ret = mem->p;
     65 	mem->p += size;
     66 
     67 	return ret;
     68 }
     69 
     70 static inline void *cursor_alloc(struct cursor *mem, unsigned long size)
     71 {
     72 	void *ret;
     73 	if (!(ret = cursor_malloc(mem, size))) {
     74 		return 0;
     75 	}
     76 
     77 	memset(ret, 0, size);
     78 	return ret;
     79 }
     80 
     81 static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
     82 {
     83 	u8 *p;
     84 	if (!(p = cursor_alloc(mem, size))) {
     85 		return 0;
     86 	}
     87 	make_cursor(p, mem->p, slice);
     88 	return 1;
     89 }
     90 
     91 
     92 static inline void copy_cursor(struct cursor *src, struct cursor *dest)
     93 {
     94 	dest->start = src->start;
     95 	dest->p = src->p;
     96 	dest->end = src->end;
     97 }
     98 
     99 static inline int cursor_skip(struct cursor *cursor, int n)
    100 {
    101     if (cursor->p + n >= cursor->end)
    102         return 0;
    103 
    104     cursor->p += n;
    105 
    106     return 1;
    107 }
    108 
    109 static inline int pull_byte(struct cursor *cursor, u8 *c)
    110 {
    111 	if (unlikely(cursor->p >= cursor->end))
    112 		return 0;
    113 
    114 	*c = *cursor->p;
    115 	cursor->p++;
    116 
    117 	return 1;
    118 }
    119 
    120 static inline int parse_byte(struct cursor *cursor, u8 *c)
    121 {
    122     if (unlikely(cursor->p >= cursor->end))
    123         return 0;
    124 
    125     *c = *cursor->p;
    126     //cursor->p++;
    127 
    128     return 1;
    129 }
    130 
    131 static inline int parse_char(struct cursor *cur, char c) {
    132     if (cur->p >= cur->end)
    133         return 0;
    134         
    135     if (*cur->p == c) {
    136         cur->p++;
    137         return 1;
    138     }
    139     
    140     return 0;
    141 }
    142 
    143 static inline int peek_char(struct cursor *cur, int ind) {
    144     if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end))
    145         return -1;
    146     
    147     return *(cur->p + ind);
    148 }
    149 
    150 static inline int cursor_pull_c_str(struct cursor *cursor, const char **str)
    151 {
    152 	*str = (const char*)cursor->p;
    153 
    154 	for (; cursor->p < cursor->end; cursor->p++) {
    155 		if (*cursor->p == 0) {
    156 			cursor->p++;
    157 			return 1;
    158 		}
    159 	}
    160 
    161 	return 0;
    162 }
    163 
    164 
    165 static inline int cursor_push_byte(struct cursor *cursor, u8 c)
    166 {
    167 	if (unlikely(cursor->p + 1 > cursor->end)) {
    168 		return 0;
    169 	}
    170 
    171 	*cursor->p = c;
    172 	cursor->p++;
    173 
    174 	return 1;
    175 }
    176 
    177 static inline int cursor_pull(struct cursor *cursor, u8 *data, int len)
    178 {
    179 	if (unlikely(cursor->p + len > cursor->end)) {
    180 		return 0;
    181 	}
    182 
    183 	memcpy(data, cursor->p, len);
    184 	cursor->p += len;
    185 
    186 	return 1;
    187 }
    188 
    189 static inline int pull_data_into_cursor(struct cursor *cursor,
    190 			  struct cursor *dest,
    191 			  unsigned char **data,
    192 			  int len)
    193 {
    194 	int ok;
    195 
    196 	if (unlikely(dest->p + len > dest->end)) {
    197 		printf("not enough room in dest buffer\n");
    198 		return 0;
    199 	}
    200 
    201 	ok = cursor_pull(cursor, dest->p, len);
    202 	if (!ok) return 0;
    203 
    204 	*data = dest->p;
    205 	dest->p += len;
    206 
    207 	return 1;
    208 }
    209 
    210 static inline int cursor_dropn(struct cursor *cur, int size, int n)
    211 {
    212 	if (n == 0)
    213 		return 1;
    214 
    215 	if (unlikely(cur->p - size*n < cur->start)) {
    216 		return 0;
    217 	}
    218 
    219 	cur->p -= size*n;
    220 	return 1;
    221 }
    222 
    223 static inline int cursor_drop(struct cursor *cur, int size)
    224 {
    225 	return cursor_dropn(cur, size, 1);
    226 }
    227 
    228 static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n)
    229 {
    230 	n += 1;
    231 	if (unlikely(cur->p - len*n < cur->start)) {
    232 		return NULL;
    233 	}
    234 	return cur->p - len*n;
    235 }
    236 
    237 static inline unsigned char *cursor_top(struct cursor *cur, int len)
    238 {
    239 	if (unlikely(cur->p - len < cur->start)) {
    240 		return NULL;
    241 	}
    242 	return cur->p - len;
    243 }
    244 
    245 static inline int cursor_top_int(struct cursor *cur, int *i)
    246 {
    247 	u8 *p;
    248 	if (unlikely(!(p = cursor_top(cur, sizeof(*i))))) {
    249 		return 0;
    250 	}
    251 	*i = *((int*)p);
    252 	return 1;
    253 }
    254 
    255 static inline int cursor_pop(struct cursor *cur, u8 *data, int len)
    256 {
    257 	if (unlikely(cur->p - len < cur->start)) {
    258 		return 0;
    259 	}
    260 
    261 	cur->p -= len;
    262 	memcpy(data, cur->p, len);
    263 
    264 	return 1;
    265 }
    266 
    267 static inline int cursor_push(struct cursor *cursor, u8 *data, int len)
    268 {
    269 	if (unlikely(cursor->p + len >= cursor->end)) {
    270 		return 0;
    271 	}
    272 
    273 	if (cursor->p != data)
    274 		memcpy(cursor->p, data, len);
    275 
    276 	cursor->p += len;
    277 
    278 	return 1;
    279 }
    280 
    281 static inline int cursor_push_int(struct cursor *cursor, int i)
    282 {
    283 	return cursor_push(cursor, (u8*)&i, sizeof(i));
    284 }
    285 
    286 static inline size_t cursor_count(struct cursor *cursor, size_t elem_size)
    287 {
    288 	return (cursor->p - cursor->start)/elem_size;
    289 }
    290 
    291 /* TODO: push_varint */
    292 static inline int push_varint(struct cursor *cursor, int n)
    293 {
    294 	int ok, len;
    295 	unsigned char b;
    296 	len = 0;
    297 
    298 	while (1) {
    299 		b = (n & 0xFF) | 0x80;
    300 		n >>= 7;
    301 		if (n == 0) {
    302 			b &= 0x7F;
    303 			ok = cursor_push_byte(cursor, b);
    304 			len++;
    305 			if (!ok) return 0;
    306 			break;
    307 		}
    308 
    309 		ok = cursor_push_byte(cursor, b);
    310 		len++;
    311 		if (!ok) return 0;
    312 	}
    313 
    314 	return len;
    315 }
    316 
    317 /* TODO: pull_varint */
    318 static inline int pull_varint(struct cursor *cursor, int *n)
    319 {
    320 	int ok, i;
    321 	unsigned char b;
    322 	*n = 0;
    323 
    324 	for (i = 0;; i++) {
    325 		ok = pull_byte(cursor, &b);
    326 		if (!ok) return 0;
    327 
    328 		*n |= ((int)b & 0x7F) << (i * 7);
    329 
    330 		/* is_last */
    331 		if ((b & 0x80) == 0) {
    332 			return i+1;
    333 		}
    334 
    335 		if (i == 4) return 0;
    336 	}
    337 
    338 	return 0;
    339 }
    340 
    341 static inline int cursor_pull_int(struct cursor *cursor, int *i)
    342 {
    343 	return cursor_pull(cursor, (u8*)i, sizeof(*i));
    344 }
    345 
    346 static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) {
    347     return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
    348 }
    349 
    350 static inline int cursor_push_u16(struct cursor *cursor, u16 i)
    351 {
    352 	return cursor_push(cursor, (u8*)&i, sizeof(i));
    353 }
    354 
    355 static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size)
    356 {
    357 	u8 *p;
    358 	p = &cursor->start[elem_size * index];
    359 
    360 	if (unlikely(p >= cursor->end))
    361 		return NULL;
    362 
    363 	return (void*)p;
    364 }
    365 
    366 
    367 static inline int push_sized_str(struct cursor *cursor, const char *str, int len)
    368 {
    369 	return cursor_push(cursor, (u8*)str, len);
    370 }
    371 
    372 static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len)
    373 {
    374 	int i;
    375 
    376 	if (unlikely(cur->p + len >= cur->end))
    377 		return 0;
    378 
    379 	for (i = 0; i < len; i++)
    380 		cur->p[i] = tolower(str[i]);
    381 
    382 	cur->p += len;
    383 	return 1;
    384 }
    385 
    386 static inline int cursor_push_str(struct cursor *cursor, const char *str)
    387 {
    388 	return cursor_push(cursor, (u8*)str, (int)strlen(str));
    389 }
    390 
    391 static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
    392 {
    393 	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
    394 }
    395 
    396 /* TODO: push varint size */
    397 static inline int push_prefixed_str(struct cursor *cursor, const char *str)
    398 {
    399 	int ok, len;
    400 	len = (int)strlen(str);
    401 	ok = push_varint(cursor, len);
    402 	if (!ok) return 0;
    403 	return push_sized_str(cursor, str, len);
    404 }
    405 
    406 static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str)
    407 {
    408 	int len, ok;
    409 
    410 	ok = pull_varint(cursor, &len);
    411 	if (!ok) return 0;
    412 
    413 	if (unlikely(dest_buf->p + len > dest_buf->end)) {
    414 		return 0;
    415 	}
    416 
    417 	ok = pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, len);
    418 	if (!ok) return 0;
    419 
    420 	ok = cursor_push_byte(dest_buf, 0);
    421 
    422 	return 1;
    423 }
    424 
    425 static inline int cursor_remaining_capacity(struct cursor *cursor)
    426 {
    427 	return (int)(cursor->end - cursor->p);
    428 }
    429 
    430 
    431 #define max(a,b) ((a) > (b) ? (a) : (b))
    432 static inline void cursor_print_around(struct cursor *cur, int range)
    433 {
    434 	unsigned char *c;
    435 
    436 	printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start);
    437 
    438 	c = max(cur->p - range, cur->start);
    439 	for (; c < cur->end && c < (cur->p + range); c++) {
    440 		printf("%02x", *c);
    441 	}
    442 	printf("\n");
    443 
    444 	c = max(cur->p - range, cur->start);
    445 	for (; c < cur->end && c < (cur->p + range); c++) {
    446 		if (c == cur->p) {
    447 			printf("^");
    448 			continue;
    449 		}
    450 		printf("  ");
    451 	}
    452 	printf("\n");
    453 }
    454 #undef max
    455 
    456 static inline int pull_bytes(struct cursor *cur, int count, const u8 **bytes) {
    457     if (cur->p + count > cur->end)
    458         return 0;
    459     
    460     *bytes = cur->p;
    461     cur->p += count;
    462     return 1;
    463 }
    464 
    465 static inline int parse_str(struct cursor *cur, const char *str) {
    466     int i;
    467     char c, cs;
    468     unsigned long len;
    469     
    470     len = strlen(str);
    471     
    472     if (cur->p + len >= cur->end)
    473         return 0;
    474     
    475     for (i = 0; i < len; i++) {
    476         c = tolower(cur->p[i]);
    477         cs = tolower(str[i]);
    478         
    479         if (c != cs)
    480             return 0;
    481     }
    482     
    483     cur->p += len;
    484     
    485     return 1;
    486 }
    487 
    488 static inline int is_whitespace(int c) {
    489     return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
    490 }
    491 
    492 
    493 static inline int next_char_is_whitespace(unsigned char *curChar, unsigned char *endChar) {
    494     unsigned char * next = curChar + 1;
    495     if(next > endChar) return 0;
    496     else if(next == endChar) return 1;
    497     return is_whitespace(*next);
    498 }
    499 
    500 static int char_disallowed_at_end_url(char c){
    501     return c == '.' || c == ',';
    502 }
    503 
    504 static inline int is_final_url_char(unsigned char *curChar, unsigned char *endChar){
    505     if(is_whitespace(*curChar)){
    506         return 1;
    507     }
    508     else if(next_char_is_whitespace(curChar, endChar)) {
    509         // next char is whitespace so this char could be the final char in the url
    510         return char_disallowed_at_end_url(*curChar);
    511     }
    512     else{
    513         // next char isn't whitespace so it can't be a final char
    514         return 0;
    515     }
    516 }
    517 
    518 static inline int is_underscore(int c) {
    519     return c == '_';
    520 }
    521 
    522 static inline int is_utf8_byte(u8 c) {
    523     return c & 0x80;
    524 }
    525 
    526 static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length)
    527 {
    528     u8 first_byte;
    529     if (!parse_byte(cursor, &first_byte))
    530         return 0; // Not enough data
    531 
    532     // Determine the number of bytes in this UTF-8 character
    533     int remaining_bytes = 0;
    534     if (first_byte < 0x80) {
    535         *code_point = first_byte;
    536         return 1;
    537     } else if ((first_byte & 0xE0) == 0xC0) {
    538         remaining_bytes = 1;
    539         *utf8_length = remaining_bytes + 1;
    540         *code_point = first_byte & 0x1F;
    541     } else if ((first_byte & 0xF0) == 0xE0) {
    542         remaining_bytes = 2;
    543         *utf8_length = remaining_bytes + 1;
    544         *code_point = first_byte & 0x0F;
    545     } else if ((first_byte & 0xF8) == 0xF0) {
    546         remaining_bytes = 3;
    547         *utf8_length = remaining_bytes + 1;
    548         *code_point = first_byte & 0x07;
    549     } else {
    550         remaining_bytes = 0;
    551         *utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters
    552         // TODO: We need to gracefully handle unrecognized UTF-8 characters
    553         printf("Invalid UTF-8 byte: %x\n", *code_point);
    554         *code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation
    555         return 0; // Invalid first byte
    556     }
    557 
    558     // Peek at remaining bytes
    559     for (int i = 0; i < remaining_bytes; ++i) {
    560         signed char next_byte;
    561         if ((next_byte = peek_char(cursor, i+1)) == -1) {
    562             *utf8_length = 1;
    563             return 0; // Not enough data
    564         }
    565         
    566         // Debugging lines
    567         //printf("Cursor: %s\n", cursor->p);
    568         //printf("Codepoint: %x\n", *code_point);
    569         //printf("Codepoint <<6: %x\n", ((*code_point << 6) | (next_byte & 0x3F)));
    570         //printf("Remaining bytes: %x\n", remaining_bytes);
    571         //printf("First byte: %x\n", first_byte);
    572         //printf("Next byte: %x\n", next_byte);
    573         //printf("Bitwise AND result: %x\n", (next_byte & 0xC0));
    574         
    575         if ((next_byte & 0xC0) != 0x80) {
    576             *utf8_length = 1;
    577             return 0; // Invalid byte in sequence
    578         }
    579 
    580         *code_point = (*code_point << 6) | (next_byte & 0x3F);
    581     }
    582 
    583     return 1;
    584 }
    585 
    586 /**
    587   * Checks if a given Unicode code point is a punctuation character
    588   *
    589   * @param codepoint The Unicode code point to check. @return true if the
    590   * code point is a punctuation character, false otherwise.
    591   */
    592 static inline int is_punctuation(unsigned int codepoint) {
    593 
    594     // Check for underscore (underscore is not treated as punctuation)
    595     if (is_underscore(codepoint))
    596         return 0;
    597     
    598     // Check for ASCII punctuation
    599     if (ispunct(codepoint))
    600         return 1;
    601 
    602     // Check for Unicode punctuation exceptions (punctuation allowed in hashtags)
    603     if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
    604         return 0;
    605     
    606     // Check for Unicode punctuation
    607     // NOTE: We may need to adjust the codepoint ranges in the future,
    608     // to include/exclude certain types of Unicode characters in hashtags.
    609     // Unicode Blocks Reference: https://www.compart.com/en/unicode/block
    610     return (
    611         // Latin-1 Supplement No-Break Space (NBSP): U+00A0
    612         (codepoint == 0x00A0) ||
    613         
    614         // Latin-1 Supplement Punctuation: U+00A1 to U+00BF
    615         (codepoint >= 0x00A1 && codepoint <= 0x00BF) ||
    616 
    617         // General Punctuation: U+2000 to U+206F
    618         (codepoint >= 0x2000 && codepoint <= 0x206F) ||
    619 
    620         // Currency Symbols: U+20A0 to U+20CF
    621         (codepoint >= 0x20A0 && codepoint <= 0x20CF) ||
    622 
    623         // Supplemental Punctuation: U+2E00 to U+2E7F
    624         (codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||
    625 
    626         // CJK Symbols and Punctuation: U+3000 to U+303F
    627         (codepoint >= 0x3000 && codepoint <= 0x303F) ||
    628 
    629         // Ideographic Description Characters: U+2FF0 to U+2FFF
    630         (codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
    631     );
    632 }
    633 
    634 static inline int is_right_boundary(int c) {
    635     return is_whitespace(c) || is_punctuation(c);
    636 }
    637 
    638 static inline int is_left_boundary(char c) {
    639     return is_right_boundary(c) || is_utf8_byte(c);
    640 }
    641 
    642 static inline int is_alphanumeric(char c) {
    643     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    644 }
    645 
    646 static inline int consume_until_boundary(struct cursor *cur) {
    647     unsigned int c;
    648     unsigned int char_length = 1;
    649     unsigned int *utf8_char_length = &char_length;
    650     
    651     while (cur->p < cur->end) {
    652         c = *cur->p;
    653         
    654         *utf8_char_length = 1;
    655         
    656         if (is_whitespace(c))
    657             return 1;
    658         
    659         // Need to check for UTF-8 characters, which can be multiple bytes long
    660         if (is_utf8_byte(c)) {
    661             if (!parse_utf8_char(cur, &c, utf8_char_length)) {
    662                 if (!is_right_boundary(c)){
    663                     // TODO: We should work towards handling all UTF-8 characters.
    664                     printf("Invalid UTF-8 code point: %x\n", c);
    665                 }
    666             }
    667         }
    668         
    669         if (is_right_boundary(c))
    670             return 1;
    671         
    672         // Need to use a variable character byte length for UTF-8 (2-4 bytes)
    673         if (cur->p + *utf8_char_length <= cur->end)
    674             cur->p += *utf8_char_length;
    675         else
    676             cur->p++;
    677     }
    678     
    679     return 1;
    680 }
    681 
    682 static inline int consume_until_whitespace(struct cursor *cur, int or_end) {
    683     char c;
    684     int consumedAtLeastOne = 0;
    685     
    686     while (cur->p < cur->end) {
    687         c = *cur->p;
    688         
    689         if (is_whitespace(c))
    690             return consumedAtLeastOne;
    691         
    692         cur->p++;
    693         consumedAtLeastOne = 1;
    694     }
    695     
    696     return or_end;
    697 }
    698 
    699 static inline int consume_until_end_url(struct cursor *cur, int or_end) {
    700     char c;
    701     int consumedAtLeastOne = 0;
    702     
    703     while (cur->p < cur->end) {
    704         c = *cur->p;
    705         
    706         if (is_final_url_char(cur->p, cur->end))
    707             return consumedAtLeastOne;
    708         
    709         cur->p++;
    710         consumedAtLeastOne = 1;
    711     }
    712     
    713     return or_end;
    714 }
    715 
    716 static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) {
    717     char c;
    718     int consumedAtLeastOne = 0;
    719 
    720     while (cur->p < cur->end) {
    721         c = *cur->p;
    722 
    723         if (!is_alphanumeric(c))
    724             return consumedAtLeastOne;
    725 
    726         cur->p++;
    727         consumedAtLeastOne = 1;
    728     }
    729 
    730     return or_end;
    731 }
    732 
    733 
    734 static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n)
    735 {
    736     if (cursor->p + n >= cursor->end)
    737         return 0;
    738 
    739     memset(cursor->p, c, n);
    740     cursor->p += n;
    741 
    742     return 1;
    743 }
    744 
    745 
    746 #endif