cursor.h (16444B)
1 2 #ifndef JB55_CURSOR_H 3 #define JB55_CURSOR_H 4 5 #include "typedefs.h" 6 #include "varint.h" 7 8 #include <stdio.h> 9 #include <ctype.h> 10 #include <assert.h> 11 #include <string.h> 12 13 #define unlikely(x) __builtin_expect((x),0) 14 #define likely(x) __builtin_expect((x),1) 15 16 struct cursor { 17 unsigned char *start; 18 unsigned char *p; 19 unsigned char *end; 20 }; 21 22 struct array { 23 struct cursor cur; 24 unsigned int elem_size; 25 }; 26 27 static inline void reset_cursor(struct cursor *cursor) 28 { 29 cursor->p = cursor->start; 30 } 31 32 static inline void wipe_cursor(struct cursor *cursor) 33 { 34 reset_cursor(cursor); 35 memset(cursor->start, 0, cursor->end - cursor->start); 36 } 37 38 static inline void make_cursor(u8 *start, u8 *end, struct cursor *cursor) 39 { 40 cursor->start = start; 41 cursor->p = start; 42 cursor->end = end; 43 } 44 45 static inline void make_array(struct array *a, u8* start, u8 *end, unsigned int elem_size) 46 { 47 make_cursor(start, end, &a->cur); 48 a->elem_size = elem_size; 49 } 50 51 static inline int cursor_eof(struct cursor *c) 52 { 53 return c->p == c->end; 54 } 55 56 static inline void *cursor_malloc(struct cursor *mem, unsigned long size) 57 { 58 void *ret; 59 60 if (mem->p + size > mem->end) { 61 return NULL; 62 } 63 64 ret = mem->p; 65 mem->p += size; 66 67 return ret; 68 } 69 70 static inline void *cursor_alloc(struct cursor *mem, unsigned long size) 71 { 72 void *ret; 73 if (!(ret = cursor_malloc(mem, size))) { 74 return 0; 75 } 76 77 memset(ret, 0, size); 78 return ret; 79 } 80 81 static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size) 82 { 83 u8 *p; 84 if (!(p = cursor_alloc(mem, size))) { 85 return 0; 86 } 87 make_cursor(p, mem->p, slice); 88 return 1; 89 } 90 91 92 static inline void copy_cursor(struct cursor *src, struct cursor *dest) 93 { 94 dest->start = src->start; 95 dest->p = src->p; 96 dest->end = src->end; 97 } 98 99 static inline int cursor_skip(struct cursor *cursor, int n) 100 { 101 if (cursor->p + n >= cursor->end) 102 return 0; 103 104 cursor->p += n; 105 106 return 1; 107 } 108 109 static inline int pull_byte(struct cursor *cursor, u8 *c) 110 { 111 if (unlikely(cursor->p >= cursor->end)) 112 return 0; 113 114 *c = *cursor->p; 115 cursor->p++; 116 117 return 1; 118 } 119 120 static inline int parse_byte(struct cursor *cursor, u8 *c) 121 { 122 if (unlikely(cursor->p >= cursor->end)) 123 return 0; 124 125 *c = *cursor->p; 126 //cursor->p++; 127 128 return 1; 129 } 130 131 static inline int parse_char(struct cursor *cur, char c) { 132 if (cur->p >= cur->end) 133 return 0; 134 135 if (*cur->p == c) { 136 cur->p++; 137 return 1; 138 } 139 140 return 0; 141 } 142 143 static inline int peek_char(struct cursor *cur, int ind) { 144 if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end)) 145 return -1; 146 147 return *(cur->p + ind); 148 } 149 150 static inline int cursor_pull_c_str(struct cursor *cursor, const char **str) 151 { 152 *str = (const char*)cursor->p; 153 154 for (; cursor->p < cursor->end; cursor->p++) { 155 if (*cursor->p == 0) { 156 cursor->p++; 157 return 1; 158 } 159 } 160 161 return 0; 162 } 163 164 165 static inline int cursor_push_byte(struct cursor *cursor, u8 c) 166 { 167 if (unlikely(cursor->p + 1 > cursor->end)) { 168 return 0; 169 } 170 171 *cursor->p = c; 172 cursor->p++; 173 174 return 1; 175 } 176 177 static inline int cursor_pull(struct cursor *cursor, u8 *data, int len) 178 { 179 if (unlikely(cursor->p + len > cursor->end)) { 180 return 0; 181 } 182 183 memcpy(data, cursor->p, len); 184 cursor->p += len; 185 186 return 1; 187 } 188 189 static inline int pull_data_into_cursor(struct cursor *cursor, 190 struct cursor *dest, 191 unsigned char **data, 192 int len) 193 { 194 int ok; 195 196 if (unlikely(dest->p + len > dest->end)) { 197 printf("not enough room in dest buffer\n"); 198 return 0; 199 } 200 201 ok = cursor_pull(cursor, dest->p, len); 202 if (!ok) return 0; 203 204 *data = dest->p; 205 dest->p += len; 206 207 return 1; 208 } 209 210 static inline int cursor_dropn(struct cursor *cur, int size, int n) 211 { 212 if (n == 0) 213 return 1; 214 215 if (unlikely(cur->p - size*n < cur->start)) { 216 return 0; 217 } 218 219 cur->p -= size*n; 220 return 1; 221 } 222 223 static inline int cursor_drop(struct cursor *cur, int size) 224 { 225 return cursor_dropn(cur, size, 1); 226 } 227 228 static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n) 229 { 230 n += 1; 231 if (unlikely(cur->p - len*n < cur->start)) { 232 return NULL; 233 } 234 return cur->p - len*n; 235 } 236 237 static inline unsigned char *cursor_top(struct cursor *cur, int len) 238 { 239 if (unlikely(cur->p - len < cur->start)) { 240 return NULL; 241 } 242 return cur->p - len; 243 } 244 245 static inline int cursor_top_int(struct cursor *cur, int *i) 246 { 247 u8 *p; 248 if (unlikely(!(p = cursor_top(cur, sizeof(*i))))) { 249 return 0; 250 } 251 *i = *((int*)p); 252 return 1; 253 } 254 255 static inline int cursor_pop(struct cursor *cur, u8 *data, int len) 256 { 257 if (unlikely(cur->p - len < cur->start)) { 258 return 0; 259 } 260 261 cur->p -= len; 262 memcpy(data, cur->p, len); 263 264 return 1; 265 } 266 267 static inline int cursor_push(struct cursor *cursor, u8 *data, int len) 268 { 269 if (unlikely(cursor->p + len >= cursor->end)) { 270 return 0; 271 } 272 273 if (cursor->p != data) 274 memcpy(cursor->p, data, len); 275 276 cursor->p += len; 277 278 return 1; 279 } 280 281 static inline int cursor_push_int(struct cursor *cursor, int i) 282 { 283 return cursor_push(cursor, (u8*)&i, sizeof(i)); 284 } 285 286 static inline size_t cursor_count(struct cursor *cursor, size_t elem_size) 287 { 288 return (cursor->p - cursor->start)/elem_size; 289 } 290 291 /* TODO: push_varint */ 292 static inline int push_varint(struct cursor *cursor, int n) 293 { 294 int ok, len; 295 unsigned char b; 296 len = 0; 297 298 while (1) { 299 b = (n & 0xFF) | 0x80; 300 n >>= 7; 301 if (n == 0) { 302 b &= 0x7F; 303 ok = cursor_push_byte(cursor, b); 304 len++; 305 if (!ok) return 0; 306 break; 307 } 308 309 ok = cursor_push_byte(cursor, b); 310 len++; 311 if (!ok) return 0; 312 } 313 314 return len; 315 } 316 317 /* TODO: pull_varint */ 318 static inline int pull_varint(struct cursor *cursor, int *n) 319 { 320 int ok, i; 321 unsigned char b; 322 *n = 0; 323 324 for (i = 0;; i++) { 325 ok = pull_byte(cursor, &b); 326 if (!ok) return 0; 327 328 *n |= ((int)b & 0x7F) << (i * 7); 329 330 /* is_last */ 331 if ((b & 0x80) == 0) { 332 return i+1; 333 } 334 335 if (i == 4) return 0; 336 } 337 338 return 0; 339 } 340 341 static inline int cursor_pull_int(struct cursor *cursor, int *i) 342 { 343 return cursor_pull(cursor, (u8*)i, sizeof(*i)); 344 } 345 346 static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) { 347 return cursor_push(cursor, (unsigned char*)&i, sizeof(i)); 348 } 349 350 static inline int cursor_push_u16(struct cursor *cursor, u16 i) 351 { 352 return cursor_push(cursor, (u8*)&i, sizeof(i)); 353 } 354 355 static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size) 356 { 357 u8 *p; 358 p = &cursor->start[elem_size * index]; 359 360 if (unlikely(p >= cursor->end)) 361 return NULL; 362 363 return (void*)p; 364 } 365 366 367 static inline int push_sized_str(struct cursor *cursor, const char *str, int len) 368 { 369 return cursor_push(cursor, (u8*)str, len); 370 } 371 372 static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len) 373 { 374 int i; 375 376 if (unlikely(cur->p + len >= cur->end)) 377 return 0; 378 379 for (i = 0; i < len; i++) 380 cur->p[i] = tolower(str[i]); 381 382 cur->p += len; 383 return 1; 384 } 385 386 static inline int cursor_push_str(struct cursor *cursor, const char *str) 387 { 388 return cursor_push(cursor, (u8*)str, (int)strlen(str)); 389 } 390 391 static inline int cursor_push_c_str(struct cursor *cursor, const char *str) 392 { 393 return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0); 394 } 395 396 /* TODO: push varint size */ 397 static inline int push_prefixed_str(struct cursor *cursor, const char *str) 398 { 399 int ok, len; 400 len = (int)strlen(str); 401 ok = push_varint(cursor, len); 402 if (!ok) return 0; 403 return push_sized_str(cursor, str, len); 404 } 405 406 static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str) 407 { 408 int len, ok; 409 410 ok = pull_varint(cursor, &len); 411 if (!ok) return 0; 412 413 if (unlikely(dest_buf->p + len > dest_buf->end)) { 414 return 0; 415 } 416 417 ok = pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, len); 418 if (!ok) return 0; 419 420 ok = cursor_push_byte(dest_buf, 0); 421 422 return 1; 423 } 424 425 static inline int cursor_remaining_capacity(struct cursor *cursor) 426 { 427 return (int)(cursor->end - cursor->p); 428 } 429 430 431 #define max(a,b) ((a) > (b) ? (a) : (b)) 432 static inline void cursor_print_around(struct cursor *cur, int range) 433 { 434 unsigned char *c; 435 436 printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start); 437 438 c = max(cur->p - range, cur->start); 439 for (; c < cur->end && c < (cur->p + range); c++) { 440 printf("%02x", *c); 441 } 442 printf("\n"); 443 444 c = max(cur->p - range, cur->start); 445 for (; c < cur->end && c < (cur->p + range); c++) { 446 if (c == cur->p) { 447 printf("^"); 448 continue; 449 } 450 printf(" "); 451 } 452 printf("\n"); 453 } 454 #undef max 455 456 static inline int pull_bytes(struct cursor *cur, int count, const u8 **bytes) { 457 if (cur->p + count > cur->end) 458 return 0; 459 460 *bytes = cur->p; 461 cur->p += count; 462 return 1; 463 } 464 465 static inline int parse_str(struct cursor *cur, const char *str) { 466 int i; 467 char c, cs; 468 unsigned long len; 469 470 len = strlen(str); 471 472 if (cur->p + len >= cur->end) 473 return 0; 474 475 for (i = 0; i < len; i++) { 476 c = tolower(cur->p[i]); 477 cs = tolower(str[i]); 478 479 if (c != cs) 480 return 0; 481 } 482 483 cur->p += len; 484 485 return 1; 486 } 487 488 static inline int is_whitespace(int c) { 489 return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; 490 } 491 492 493 static inline int next_char_is_whitespace(unsigned char *curChar, unsigned char *endChar) { 494 unsigned char * next = curChar + 1; 495 if(next > endChar) return 0; 496 else if(next == endChar) return 1; 497 return is_whitespace(*next); 498 } 499 500 static int char_disallowed_at_end_url(char c){ 501 return c == '.' || c == ','; 502 } 503 504 static inline int is_final_url_char(unsigned char *curChar, unsigned char *endChar){ 505 if(is_whitespace(*curChar)){ 506 return 1; 507 } 508 else if(next_char_is_whitespace(curChar, endChar)) { 509 // next char is whitespace so this char could be the final char in the url 510 return char_disallowed_at_end_url(*curChar); 511 } 512 else{ 513 // next char isn't whitespace so it can't be a final char 514 return 0; 515 } 516 } 517 518 static inline int is_underscore(int c) { 519 return c == '_'; 520 } 521 522 static inline int is_utf8_byte(u8 c) { 523 return c & 0x80; 524 } 525 526 static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length) 527 { 528 u8 first_byte; 529 if (!parse_byte(cursor, &first_byte)) 530 return 0; // Not enough data 531 532 // Determine the number of bytes in this UTF-8 character 533 int remaining_bytes = 0; 534 if (first_byte < 0x80) { 535 *code_point = first_byte; 536 return 1; 537 } else if ((first_byte & 0xE0) == 0xC0) { 538 remaining_bytes = 1; 539 *utf8_length = remaining_bytes + 1; 540 *code_point = first_byte & 0x1F; 541 } else if ((first_byte & 0xF0) == 0xE0) { 542 remaining_bytes = 2; 543 *utf8_length = remaining_bytes + 1; 544 *code_point = first_byte & 0x0F; 545 } else if ((first_byte & 0xF8) == 0xF0) { 546 remaining_bytes = 3; 547 *utf8_length = remaining_bytes + 1; 548 *code_point = first_byte & 0x07; 549 } else { 550 remaining_bytes = 0; 551 *utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters 552 // TODO: We need to gracefully handle unrecognized UTF-8 characters 553 printf("Invalid UTF-8 byte: %x\n", *code_point); 554 *code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation 555 return 0; // Invalid first byte 556 } 557 558 // Peek at remaining bytes 559 for (int i = 0; i < remaining_bytes; ++i) { 560 signed char next_byte; 561 if ((next_byte = peek_char(cursor, i+1)) == -1) { 562 *utf8_length = 1; 563 return 0; // Not enough data 564 } 565 566 // Debugging lines 567 //printf("Cursor: %s\n", cursor->p); 568 //printf("Codepoint: %x\n", *code_point); 569 //printf("Codepoint <<6: %x\n", ((*code_point << 6) | (next_byte & 0x3F))); 570 //printf("Remaining bytes: %x\n", remaining_bytes); 571 //printf("First byte: %x\n", first_byte); 572 //printf("Next byte: %x\n", next_byte); 573 //printf("Bitwise AND result: %x\n", (next_byte & 0xC0)); 574 575 if ((next_byte & 0xC0) != 0x80) { 576 *utf8_length = 1; 577 return 0; // Invalid byte in sequence 578 } 579 580 *code_point = (*code_point << 6) | (next_byte & 0x3F); 581 } 582 583 return 1; 584 } 585 586 /** 587 * Checks if a given Unicode code point is a punctuation character 588 * 589 * @param codepoint The Unicode code point to check. @return true if the 590 * code point is a punctuation character, false otherwise. 591 */ 592 static inline int is_punctuation(unsigned int codepoint) { 593 594 // Check for underscore (underscore is not treated as punctuation) 595 if (is_underscore(codepoint)) 596 return 0; 597 598 // Check for ASCII punctuation 599 if (ispunct(codepoint)) 600 return 1; 601 602 // Check for Unicode punctuation exceptions (punctuation allowed in hashtags) 603 if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde 604 return 0; 605 606 // Check for Unicode punctuation 607 // NOTE: We may need to adjust the codepoint ranges in the future, 608 // to include/exclude certain types of Unicode characters in hashtags. 609 // Unicode Blocks Reference: https://www.compart.com/en/unicode/block 610 return ( 611 // Latin-1 Supplement No-Break Space (NBSP): U+00A0 612 (codepoint == 0x00A0) || 613 614 // Latin-1 Supplement Punctuation: U+00A1 to U+00BF 615 (codepoint >= 0x00A1 && codepoint <= 0x00BF) || 616 617 // General Punctuation: U+2000 to U+206F 618 (codepoint >= 0x2000 && codepoint <= 0x206F) || 619 620 // Currency Symbols: U+20A0 to U+20CF 621 (codepoint >= 0x20A0 && codepoint <= 0x20CF) || 622 623 // Supplemental Punctuation: U+2E00 to U+2E7F 624 (codepoint >= 0x2E00 && codepoint <= 0x2E7F) || 625 626 // CJK Symbols and Punctuation: U+3000 to U+303F 627 (codepoint >= 0x3000 && codepoint <= 0x303F) || 628 629 // Ideographic Description Characters: U+2FF0 to U+2FFF 630 (codepoint >= 0x2FF0 && codepoint <= 0x2FFF) 631 ); 632 } 633 634 static inline int is_right_boundary(int c) { 635 return is_whitespace(c) || is_punctuation(c); 636 } 637 638 static inline int is_left_boundary(char c) { 639 return is_right_boundary(c) || is_utf8_byte(c); 640 } 641 642 static inline int is_alphanumeric(char c) { 643 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'); 644 } 645 646 static inline int consume_until_boundary(struct cursor *cur) { 647 unsigned int c; 648 unsigned int char_length = 1; 649 unsigned int *utf8_char_length = &char_length; 650 651 while (cur->p < cur->end) { 652 c = *cur->p; 653 654 *utf8_char_length = 1; 655 656 if (is_whitespace(c)) 657 return 1; 658 659 // Need to check for UTF-8 characters, which can be multiple bytes long 660 if (is_utf8_byte(c)) { 661 if (!parse_utf8_char(cur, &c, utf8_char_length)) { 662 if (!is_right_boundary(c)){ 663 // TODO: We should work towards handling all UTF-8 characters. 664 printf("Invalid UTF-8 code point: %x\n", c); 665 } 666 } 667 } 668 669 if (is_right_boundary(c)) 670 return 1; 671 672 // Need to use a variable character byte length for UTF-8 (2-4 bytes) 673 if (cur->p + *utf8_char_length <= cur->end) 674 cur->p += *utf8_char_length; 675 else 676 cur->p++; 677 } 678 679 return 1; 680 } 681 682 static inline int consume_until_whitespace(struct cursor *cur, int or_end) { 683 char c; 684 int consumedAtLeastOne = 0; 685 686 while (cur->p < cur->end) { 687 c = *cur->p; 688 689 if (is_whitespace(c)) 690 return consumedAtLeastOne; 691 692 cur->p++; 693 consumedAtLeastOne = 1; 694 } 695 696 return or_end; 697 } 698 699 static inline int consume_until_end_url(struct cursor *cur, int or_end) { 700 char c; 701 int consumedAtLeastOne = 0; 702 703 while (cur->p < cur->end) { 704 c = *cur->p; 705 706 if (is_final_url_char(cur->p, cur->end)) 707 return consumedAtLeastOne; 708 709 cur->p++; 710 consumedAtLeastOne = 1; 711 } 712 713 return or_end; 714 } 715 716 static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) { 717 char c; 718 int consumedAtLeastOne = 0; 719 720 while (cur->p < cur->end) { 721 c = *cur->p; 722 723 if (!is_alphanumeric(c)) 724 return consumedAtLeastOne; 725 726 cur->p++; 727 consumedAtLeastOne = 1; 728 } 729 730 return or_end; 731 } 732 733 734 static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n) 735 { 736 if (cursor->p + n >= cursor->end) 737 return 0; 738 739 memset(cursor->p, c, n); 740 cursor->p += n; 741 742 return 1; 743 } 744 745 746 #endif