cursor.h (16151B)
#ifndef JB55_CURSOR_H
#define JB55_CURSOR_H

/*
 * cursor.h - bounds-checked read/write cursor over a caller-owned byte
 * buffer, plus small text-parsing helpers built on top of it.
 *
 * Unless stated otherwise, functions returning int return 1 on success and
 * 0 on failure, and leave the cursor position untouched on failure.
 */

/*
 * Prefer the project's branch-prediction hints. When the ccan header is not
 * on the include path, fall back to no-op definitions so this header still
 * compiles on its own.
 */
#if defined(__has_include)
#  if __has_include("ccan/likely/likely.h")
#    include "ccan/likely/likely.h"
#  endif
#else
#  include "ccan/likely/likely.h"
#endif

#ifndef likely
#define likely(cond) (!!(cond))
#endif
#ifndef unlikely
#define unlikely(cond) (!!(cond))
#endif

#include <stdio.h>
#include <inttypes.h>
#include <limits.h>
#include <stddef.h>
#include <ctype.h>
#include <assert.h>
#include <string.h>

/*
 * A window over a byte buffer: [start, end) is the buffer (end is one past
 * the last byte) and p is the current read/write position.
 */
struct cursor {
	unsigned char *start;
	unsigned char *p;
	unsigned char *end;
};

/* Number of bytes between the position and the end (0 if p is past end). */
static inline size_t cursor_bytes_left_(const struct cursor *c)
{
	return c->p < c->end ? (size_t)(c->end - c->p) : 0;
}

/* Number of bytes between start and the position (0 if p is before start). */
static inline size_t cursor_bytes_used_(const struct cursor *c)
{
	return c->p > c->start ? (size_t)(c->p - c->start) : 0;
}

/* Move the position back to the start of the buffer. */
static inline void reset_cursor(struct cursor *cursor)
{
	cursor->p = cursor->start;
}

/* Move the position back to the start and zero the whole buffer. */
static inline void wipe_cursor(struct cursor *cursor)
{
	reset_cursor(cursor);
	memset(cursor->start, 0, cursor->end - cursor->start);
}

/* Initialise `cursor` to cover [start, end) with the position at start. */
static inline void make_cursor(unsigned char *start, unsigned char *end, struct cursor *cursor)
{
	cursor->start = start;
	cursor->p = start;
	cursor->end = end;
}

/* Returns non-zero when the position is exactly at the end of the buffer. */
static inline int cursor_eof(struct cursor *c)
{
	return c->p == c->end;
}

/*
 * Reserve `size` bytes at the position and advance past them.
 * Returns a pointer to the (uninitialised) bytes, or NULL if they don't fit.
 */
static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
{
	void *ret;

	/* compare sizes, not pointers, so a huge size cannot wrap around */
	if (size > cursor_bytes_left_(mem))
		return NULL;

	ret = mem->p;
	mem->p += size;

	return ret;
}

/* Like cursor_malloc(), but the reserved bytes are zeroed. */
static inline void *cursor_alloc(struct cursor *mem, unsigned long size)
{
	void *ret = cursor_malloc(mem, size);

	if (ret == NULL)
		return NULL;

	memset(ret, 0, size);
	return ret;
}

/* Carve a zeroed sub-cursor of `size` bytes out of `mem`. */
static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
{
	unsigned char *p = cursor_alloc(mem, size);

	if (p == NULL)
		return 0;

	/* mem->p now points one past the reserved region */
	make_cursor(p, mem->p, slice);
	return 1;
}

/* Carve an uninitialised sub-cursor of `size` bytes out of `mem`. */
static inline int cursor_malloc_slice(struct cursor *mem, struct cursor *slice, size_t size)
{
	unsigned char *p = cursor_malloc(mem, size);

	if (p == NULL)
		return 0;

	make_cursor(p, mem->p, slice);
	return 1;
}

/* Copy all three pointers of `src` into `dest` (the buffer is shared). */
static inline void copy_cursor(struct cursor *src, struct cursor *dest)
{
	dest->start = src->start;
	dest->p = src->p;
	dest->end = src->end;
}

/*
 * Move the position by `n` bytes.
 *
 * NOTE: a forward skip must leave at least one byte before the end
 * (landing exactly on the end is refused). That matches the historical
 * behaviour of this function and is kept because callers may read the byte
 * at the new position. A backward skip may not go before start.
 */
static inline int cursor_skip(struct cursor *cursor, int n)
{
	if (n >= 0) {
		if ((size_t)n >= cursor_bytes_left_(cursor))
			return 0;
	} else if ((unsigned long long)(-(long long)n) > cursor_bytes_used_(cursor)) {
		return 0;
	}

	cursor->p += n;

	return 1;
}

/* Read one byte into *c and advance. */
static inline int cursor_pull_byte(struct cursor *cursor, unsigned char *c)
{
	if (unlikely(cursor->p >= cursor->end))
		return 0;

	*c = *cursor->p;
	cursor->p++;

	return 1;
}

/* Read one byte into *c WITHOUT advancing (a peek at the position). */
static inline int parse_byte(struct cursor *cursor, unsigned char *c)
{
	if (unlikely(cursor->p >= cursor->end))
		return 0;

	*c = *cursor->p;

	return 1;
}

/* Consume one byte if it equals `c`; otherwise leave the cursor alone. */
static inline int parse_char(struct cursor *cur, char c)
{
	if (cur->p >= cur->end)
		return 0;

	/* compare as bytes so the result doesn't depend on char signedness */
	if (*cur->p != (unsigned char)c)
		return 0;

	cur->p++;
	return 1;
}

/*
 * Return the byte at offset `ind` (may be negative) from the position, or
 * -1 when that offset lies outside [start, end).
 */
static inline int peek_char(struct cursor *cur, int ind)
{
	ptrdiff_t off = (cur->p - cur->start) + (ptrdiff_t)ind;

	if (off < 0 || off >= cur->end - cur->start)
		return -1;

	return cur->start[off];
}

/*
 * Point *str at the NUL-terminated string starting at the position and
 * advance past its terminator.
 *
 * Returns 0 if no terminator is found before the end. In that case *str
 * has still been set and the position is left at the end of the buffer.
 */
static inline int cursor_pull_c_str(struct cursor *cursor, const char **str)
{
	*str = (const char*)cursor->p;

	while (cursor->p < cursor->end) {
		if (*cursor->p++ == 0)
			return 1;
	}

	return 0;
}

/* Write one byte and advance. */
static inline int cursor_push_byte(struct cursor *cursor, unsigned char c)
{
	if (unlikely(cursor->p >= cursor->end))
		return 0;

	*cursor->p = c;
	cursor->p++;

	return 1;
}

/* Copy `len` bytes from the position into `data` and advance. */
static inline int cursor_pull(struct cursor *cursor, unsigned char *data, int len)
{
	if (unlikely(len < 0 || (size_t)len > cursor_bytes_left_(cursor)))
		return 0;

	memcpy(data, cursor->p, (size_t)len);
	cursor->p += len;

	return 1;
}

/*
 * Copy `len` bytes from `cursor` into `dest` at its position, advancing
 * both cursors. *data is set to where the bytes now live inside `dest`.
 */
static inline int pull_data_into_cursor(struct cursor *cursor,
					struct cursor *dest,
					unsigned char **data,
					int len)
{
	if (unlikely(len < 0))
		return 0;

	if (unlikely((size_t)len > cursor_bytes_left_(dest))) {
		printf("not enough room in dest buffer\n");
		return 0;
	}

	if (!cursor_pull(cursor, dest->p, len))
		return 0;

	*data = dest->p;
	dest->p += len;

	return 1;
}

/*
 * Move the position back by `n` elements of `size` bytes each.
 * Fails if that would go before start or if size*n is negative.
 */
static inline int cursor_dropn(struct cursor *cur, int size, int n)
{
	long long total;

	if (n == 0)
		return 1;

	/* widen before multiplying so size*n cannot overflow int */
	total = (long long)size * n;
	if (unlikely(total < 0 || (unsigned long long)total > cursor_bytes_used_(cur)))
		return 0;

	cur->p -= total;
	return 1;
}

/* Move the position back by one element of `size` bytes. */
static inline int cursor_drop(struct cursor *cur, int size)
{
	return cursor_dropn(cur, size, 1);
}

/*
 * Return a pointer to the element `n` slots below the top one, where each
 * element is `len` bytes (n == 0 is the top element). NULL if out of range.
 * The position is not moved.
 */
static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n)
{
	long long back = (long long)len * ((long long)n + 1);

	if (unlikely(back < 0 || (unsigned long long)back > cursor_bytes_used_(cur)))
		return NULL;

	return cur->p - back;
}

/* Return a pointer to the last `len` bytes written, or NULL if fewer exist. */
static inline unsigned char *cursor_top(struct cursor *cur, int len)
{
	if (unlikely(len < 0 || (size_t)len > cursor_bytes_used_(cur)))
		return NULL;

	return cur->p - len;
}

/* Read the int on top of the stack into *i without popping it. */
static inline int cursor_top_int(struct cursor *cur, int *i)
{
	unsigned char *p = cursor_top(cur, sizeof(*i));

	if (unlikely(p == NULL))
		return 0;

	/* the bytes may be unaligned, so copy instead of dereferencing */
	memcpy(i, p, sizeof(*i));
	return 1;
}

/* Move the position back `len` bytes and copy those bytes into `data`. */
static inline int cursor_pop(struct cursor *cur, unsigned char *data, int len)
{
	if (unlikely(len < 0 || (size_t)len > cursor_bytes_used_(cur)))
		return 0;

	cur->p -= len;
	memcpy(data, cur->p, (size_t)len);

	return 1;
}

/*
 * Copy `len` bytes from `data` to the position and advance.
 * If `data` already is the position, only the advance happens.
 */
static inline int cursor_push(struct cursor *cursor, unsigned char *data, int len)
{
	if (unlikely(len < 0 || (size_t)len > cursor_bytes_left_(cursor)))
		return 0;

	if (cursor->p != data)
		memcpy(cursor->p, data, (size_t)len);

	cursor->p += len;

	return 1;
}

/* Push the raw (native-endian) bytes of an int. */
static inline int cursor_push_int(struct cursor *cursor, int i)
{
	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
}

/*
 * Number of whole `elem_size`-byte elements between start and the position.
 * `elem_size` must be non-zero.
 */
static inline size_t cursor_count(struct cursor *cursor, size_t elem_size)
{
	return (cursor->p - cursor->start)/elem_size;
}

/*
 * Encode `n` as a little-endian base-128 varint (7 data bits per byte, high
 * bit set on every byte except the last) and push it.
 *
 * Returns the number of bytes written, or -1 if the buffer filled up.
 * NOTE: failure is -1, not 0, and bytes pushed before the failure stay
 * in the buffer.
 */
static inline int cursor_push_varint(struct cursor *cursor, uint64_t n)
{
	int len = 0;

	do {
		unsigned char b = (n & 0x7F) | (n > 0x7F ? 0x80 : 0);
		n >>= 7;
		if (!cursor_push_byte(cursor, b))
			return -1;
		len++;
	} while (n != 0);

	return len;
}

/*
 * Decode a varint written by cursor_push_varint() into *n.
 *
 * Returns the number of bytes consumed (1..10), or 0 if the buffer ran out.
 * NOTE(review): if all ten bytes carry the continuation bit the value is
 * still reported as a 10-byte success; this historical behaviour is kept.
 */
static inline int cursor_pull_varint(struct cursor *cursor, uint64_t *n)
{
	unsigned char b;
	int i;

	*n = 0;

	for (i = 0; i < 10; i++) {
		if (!cursor_pull_byte(cursor, &b))
			return 0;

		/* shift as unsigned: a signed shift by 63 would be undefined */
		*n |= ((uint64_t)b & 0x7F) << (i * 7);

		if ((b & 0x80) == 0)
			return i + 1;
	}

	return 10;
}

/* Decode a varint and fail if it does not fit in 32 bits. */
static inline int cursor_pull_varint_u32(struct cursor *cursor, uint32_t *v)
{
	uint64_t bigval;

	if (!cursor_pull_varint(cursor, &bigval))
		return 0;

	if (bigval > UINT32_MAX)
		return 0;

	*v = (uint32_t) bigval;
	return 1;
}

/* Pull the raw (native-endian) bytes of an int. */
static inline int cursor_pull_int(struct cursor *cursor, int *i)
{
	return cursor_pull(cursor, (unsigned char*)i, sizeof(*i));
}

/* Push the raw (native-endian) bytes of a 32-bit value. */
static inline int cursor_push_u32(struct cursor *cursor, uint32_t i)
{
	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
}

/* Push the raw (native-endian) bytes of an unsigned short. */
static inline int cursor_push_u16(struct cursor *cursor, unsigned short i)
{
	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
}

/* Pull the raw (native-endian) bytes of a 16-bit value. */
static inline int cursor_pull_u16(struct cursor *cursor, uint16_t *i)
{
	return cursor_pull(cursor, (unsigned char*)i, sizeof(*i));
}

/*
 * Return a pointer to element `index` of an array of `elem_size`-byte
 * elements stored from start, or NULL if the element's first byte is not
 * inside the buffer. Only the first byte is range-checked.
 */
static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size)
{
	size_t offset;

	if (unlikely(elem_size < 0))
		return NULL;

	/* refuse offsets that would overflow instead of letting them wrap */
	if (unlikely(index != 0 && (size_t)elem_size > SIZE_MAX / index))
		return NULL;

	offset = (size_t)elem_size * index;

	if (unlikely(offset >= (size_t)(cursor->end - cursor->start)))
		return NULL;

	return (void*)(cursor->start + offset);
}

/* Push `len` bytes of `str` (no terminator is written). */
static inline int push_sized_str(struct cursor *cursor, const char *str, int len)
{
	return cursor_push(cursor, (unsigned char*)str, len);
}

/*
 * Push `len` bytes of `str`, lowercased with tolower().
 *
 * NOTE: at least one byte must remain after the write (filling the buffer
 * exactly is refused); this historical behaviour is kept for callers.
 */
static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len)
{
	int i;

	if (unlikely(len < 0 || (size_t)len >= cursor_bytes_left_(cur)))
		return 0;

	/* <ctype.h> functions need an unsigned char value, not a plain char */
	for (i = 0; i < len; i++)
		cur->p[i] = tolower((unsigned char)str[i]);

	cur->p += len;
	return 1;
}

/* Push the bytes of a C string without its terminator. */
static inline int cursor_push_str(struct cursor *cursor, const char *str)
{
	size_t len = strlen(str);

	if (unlikely(len > INT_MAX))
		return 0;

	return cursor_push(cursor, (unsigned char*)str, (int)len);
}

/* Push a C string including its terminator; NULL pushes just a NUL byte. */
static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
{
	if (str == NULL)
		return cursor_push_byte(cursor, 0);

	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
}

/* Push a string as a varint length followed by its bytes (no terminator). */
static inline int push_prefixed_str(struct cursor *cursor, const char *str)
{
	size_t len = strlen(str);

	if (unlikely(len > INT_MAX))
		return 0;

	/* cursor_push_varint reports failure as -1, which is truthy */
	if (cursor_push_varint(cursor, len) < 0)
		return 0;

	return push_sized_str(cursor, str, (int)len);
}

/*
 * Read a string written by push_prefixed_str() from `cursor`, copy it into
 * `dest_buf` with a NUL terminator appended, and point *str at the copy.
 */
static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str)
{
	uint64_t len;

	if (!cursor_pull_varint(cursor, &len))
		return 0;

	/*
	 * len comes straight from the input: make sure it survives the
	 * narrowing to int and that len + 1 bytes (data + NUL) fit in dest.
	 */
	if (unlikely(len > INT_MAX || len >= cursor_bytes_left_(dest_buf)))
		return 0;

	if (!pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, (int)len))
		return 0;

	return cursor_push_byte(dest_buf, 0);
}

/* Bytes between the position and the end, as an int. */
static inline int cursor_remaining_capacity(struct cursor *cursor)
{
	return (int)(cursor->end - cursor->p);
}

/*
 * Debug helper: print "[pos/size]", then a hex dump of up to `range` bytes
 * either side of the position, then a line with '^' under the position.
 */
static inline void cursor_print_around(struct cursor *cur, int range)
{
	ptrdiff_t pos = cur->p - cur->start;
	ptrdiff_t size = cur->end - cur->start;
	ptrdiff_t lo = pos - range;
	ptrdiff_t hi = pos + range;
	ptrdiff_t i;

	/* clamp the window to the buffer using offsets, not raw pointers */
	if (lo < 0)
		lo = 0;
	if (hi > size)
		hi = size;

	printf("[%td/%td]\n", pos, size);

	for (i = lo; i < hi; i++)
		printf("%02x", cur->start[i]);
	printf("\n");

	for (i = lo; i < hi; i++) {
		if (i == pos)
			printf("^");
		else
			printf("  ");
	}
	printf("\n");
}

/*
 * Point *bytes at the next `count` bytes inside the buffer (no copy) and
 * advance past them.
 */
static inline int pull_bytes(struct cursor *cur, int count, const unsigned char **bytes)
{
	if (count < 0 || (size_t)count > cursor_bytes_left_(cur))
		return 0;

	*bytes = cur->p;
	cur->p += count;
	return 1;
}

/*
 * Consume `str` if the bytes at the position match it, ignoring case as
 * defined by tolower().
 *
 * NOTE: at least one byte must remain after the match (a match that ends
 * exactly at the end of the buffer is refused); this historical behaviour
 * is kept because callers may read the byte that follows.
 */
static inline int parse_str(struct cursor *cur, const char *str)
{
	size_t len = strlen(str);
	size_t i;

	if (len >= cursor_bytes_left_(cur))
		return 0;

	for (i = 0; i < len; i++) {
		if (tolower(cur->p[i]) != tolower((unsigned char)str[i]))
			return 0;
	}

	cur->p += len;

	return 1;
}

/* True for the six ASCII whitespace characters. */
static inline int is_whitespace(char c)
{
	return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
}

static inline int is_underscore(char c)
{
	return c == '_';
}

/* Non-zero when the byte has its high bit set, i.e. is not plain ASCII. */
static inline int is_utf8_byte(unsigned char c)
{
	return c & 0x80;
}

/*
 * Decode the UTF-8 character at the position WITHOUT advancing.
 *
 * On success returns 1 with *code_point set and *utf8_length set to the
 * encoded length in bytes (1..4).
 * On failure returns 0. If there was a byte to look at, *utf8_length is 1
 * and *code_point holds a partial or placeholder value. If the cursor is
 * already at the end, neither output is written.
 *
 * Only the lead byte and the continuation-byte markers are validated.
 */
static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length)
{
	unsigned char first_byte;
	int remaining_bytes;
	int i;

	if (!parse_byte(cursor, &first_byte))
		return 0; // Not enough data

	if (first_byte < 0x80) {
		*code_point = first_byte;
		*utf8_length = 1;
		return 1;
	} else if ((first_byte & 0xE0) == 0xC0) {
		remaining_bytes = 1;
		*code_point = first_byte & 0x1F;
	} else if ((first_byte & 0xF0) == 0xE0) {
		remaining_bytes = 2;
		*code_point = first_byte & 0x0F;
	} else if ((first_byte & 0xF8) == 0xF0) {
		remaining_bytes = 3;
		*code_point = first_byte & 0x07;
	} else {
		// Invalid lead byte: report a one-byte length so callers can
		// step over it.
		// TODO: We need to gracefully handle unrecognized UTF-8 characters
		// NOTE(review): this value is meant to keep the byte from
		// testing as punctuation, but lead bytes 0x80-0x8F map to
		// 0x2000, which is_punctuation() accepts. Kept as-is.
		*utf8_length = 1;
		*code_point = ((first_byte & 0xF0) << 6);
		return 0;
	}

	*utf8_length = remaining_bytes + 1;

	// Peek at the continuation bytes
	for (i = 0; i < remaining_bytes; ++i) {
		// keep the full int so byte 0xFF is not confused with the
		// -1 "out of range" sentinel
		int next_byte = peek_char(cursor, i + 1);

		if (next_byte == -1) {
			*utf8_length = 1;
			return 0; // Not enough data
		}

		if ((next_byte & 0xC0) != 0x80) {
			*utf8_length = 1;
			return 0; // Invalid byte in sequence
		}

		*code_point = (*code_point << 6) | (next_byte & 0x3F);
	}

	return 1;
}

/**
 * Checks if a given Unicode code point is a punctuation character.
 *
 * @param codepoint The Unicode code point to check.
 * @return 1 if the code point is treated as punctuation, 0 otherwise.
 */
static inline int is_punctuation(unsigned int codepoint)
{
	// Underscore is not treated as punctuation. Compare the whole code
	// point: narrowing it to char would also match U+205F, U+2E5F, ...
	if (codepoint == '_')
		return 0;

	// ASCII punctuation
	if (codepoint < 128 && ispunct((int)codepoint))
		return 1;

	// Unicode punctuation exceptions (punctuation allowed in hashtags)
	if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde
		return 0;

	// Unicode punctuation
	// NOTE: We may need to adjust the codepoint ranges in the future,
	// to include/exclude certain types of Unicode characters in hashtags.
	// Unicode Blocks Reference: https://www.compart.com/en/unicode/block
	return (
		// Latin-1 Supplement No-Break Space (NBSP): U+00A0
		(codepoint == 0x00A0) ||

		// Latin-1 Supplement Punctuation: U+00A1 to U+00BF
		(codepoint >= 0x00A1 && codepoint <= 0x00BF) ||

		// General Punctuation: U+2000 to U+206F
		(codepoint >= 0x2000 && codepoint <= 0x206F) ||

		// Currency Symbols: U+20A0 to U+20CF
		(codepoint >= 0x20A0 && codepoint <= 0x20CF) ||

		// Supplemental Punctuation: U+2E00 to U+2E7F
		(codepoint >= 0x2E00 && codepoint <= 0x2E7F) ||

		// CJK Symbols and Punctuation: U+3000 to U+303F
		(codepoint >= 0x3000 && codepoint <= 0x303F) ||

		// Ideographic Description Characters: U+2FF0 to U+2FFF
		(codepoint >= 0x2FF0 && codepoint <= 0x2FFF)
	);
}

/*
 * True when `c` (a byte or a decoded code point) ends a word: ASCII
 * whitespace or anything is_punctuation() accepts.
 */
static inline int is_right_boundary(int c)
{
	// Only ASCII values can be whitespace. Narrowing a larger code
	// point to char would make e.g. U+4E0A look like '\n'.
	if (c >= 0 && c < 0x80 && is_whitespace((char)c))
		return 1;

	return is_punctuation(c);
}

/* True when byte `c` may precede a word: a right boundary or a non-ASCII byte. */
static inline int is_left_boundary(char c)
{
	return is_right_boundary(c) || is_utf8_byte(c);
}

/* True for ASCII letters and digits. */
static inline int is_alphanumeric(char c)
{
	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
}

/*
 * Advance until the position is at a right boundary (see
 * is_right_boundary()) or at the end of the buffer, stepping over whole
 * UTF-8 characters.
 *
 * Returns 1 when stopped at a boundary or at the end, 0 when an undecodable
 * UTF-8 sequence that is not itself a boundary is hit (the position is left
 * on it).
 */
static inline int consume_until_boundary(struct cursor *cur)
{
	unsigned int c;
	unsigned int char_length;

	while (cur->p < cur->end) {
		c = *cur->p;
		char_length = 1;

		if (is_whitespace((char)c))
			return 1;

		// Non-ASCII: decode the whole (multi-byte) character
		if (is_utf8_byte(c)) {
			if (!parse_utf8_char(cur, &c, &char_length)) {
				if (!is_right_boundary(c))
					return 0;
			}
		}

		if (is_right_boundary(c))
			return 1;

		// Step over the full character, or one byte if it is truncated
		if (char_length <= cursor_bytes_left_(cur))
			cur->p += char_length;
		else
			cur->p++;
	}

	return 1;
}

/*
 * Advance to the next ASCII whitespace byte.
 * Returns whether at least one byte was consumed; if the end is reached
 * first, returns `or_end` instead.
 */
static inline int consume_until_whitespace(struct cursor *cur, int or_end)
{
	int consumed_any = 0;

	while (cur->p < cur->end) {
		if (is_whitespace(*cur->p))
			return consumed_any;

		cur->p++;
		consumed_any = 1;
	}

	return or_end;
}

/*
 * Advance to the next byte that is not an ASCII letter or digit.
 * Returns whether at least one byte was consumed; if the end is reached
 * first, returns `or_end` instead.
 */
static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end)
{
	int consumed_any = 0;

	while (cur->p < cur->end) {
		if (!is_alphanumeric(*cur->p))
			return consumed_any;

		cur->p++;
		consumed_any = 1;
	}

	return or_end;
}

/*
 * Fill the next `n` bytes with `c` and advance.
 *
 * NOTE: at least one byte must remain after the fill (filling exactly to
 * the end is refused, and n == 0 fails on a full buffer); this historical
 * behaviour is kept for callers.
 */
static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n)
{
	if (n < 0 || (size_t)n >= cursor_bytes_left_(cursor))
		return 0;

	memset(cursor->p, c, (size_t)n);
	cursor->p += n;

	return 1;
}

/*
 * Skip bytes for which is_right_boundary() is true.
 * NOTE(review): this tests raw bytes, not decoded code points, so bytes
 * 0xA0-0xBF inside a UTF-8 sequence are also skipped.
 */
static inline void consume_whitespace_or_punctuation(struct cursor *cur)
{
	while (cur->p < cur->end) {
		if (!is_right_boundary(*cur->p))
			return;
		cur->p++;
	}
}

/*
 * Zero-pad the buffer so the number of bytes used becomes a multiple of
 * `bytes`. `bytes` must be a power of two.
 */
static inline int cursor_align(struct cursor *cur, int bytes)
{
	size_t size = cur->p - cur->start;
	size_t mask = (size_t)bytes - 1;
	int pad;

	// round size up to the next multiple of `bytes`
	pad = (int)(((size + mask) & ~mask) - size);

	if (pad > 0 && !cursor_memset(cur, 0, pad))
		return 0;

	return 1;
}

#endif