cursor.h (15449B)
1 2 #ifndef JB55_CURSOR_H 3 #define JB55_CURSOR_H 4 5 #include "typedefs.h" 6 7 #include <stdio.h> 8 #include <ctype.h> 9 #include <assert.h> 10 #include <string.h> 11 12 #define unlikely(x) __builtin_expect((x),0) 13 #define likely(x) __builtin_expect((x),1) 14 15 struct cursor { 16 unsigned char *start; 17 unsigned char *p; 18 unsigned char *end; 19 }; 20 21 struct array { 22 struct cursor cur; 23 unsigned int elem_size; 24 }; 25 26 static inline void reset_cursor(struct cursor *cursor) 27 { 28 cursor->p = cursor->start; 29 } 30 31 static inline void wipe_cursor(struct cursor *cursor) 32 { 33 reset_cursor(cursor); 34 memset(cursor->start, 0, cursor->end - cursor->start); 35 } 36 37 static inline void make_cursor(u8 *start, u8 *end, struct cursor *cursor) 38 { 39 cursor->start = start; 40 cursor->p = start; 41 cursor->end = end; 42 } 43 44 static inline void make_array(struct array *a, u8* start, u8 *end, unsigned int elem_size) 45 { 46 make_cursor(start, end, &a->cur); 47 a->elem_size = elem_size; 48 } 49 50 static inline int cursor_eof(struct cursor *c) 51 { 52 return c->p == c->end; 53 } 54 55 static inline void *cursor_malloc(struct cursor *mem, unsigned long size) 56 { 57 void *ret; 58 59 if (mem->p + size > mem->end) { 60 return NULL; 61 } 62 63 ret = mem->p; 64 mem->p += size; 65 66 return ret; 67 } 68 69 static inline void *cursor_alloc(struct cursor *mem, unsigned long size) 70 { 71 void *ret; 72 if (!(ret = cursor_malloc(mem, size))) { 73 return 0; 74 } 75 76 memset(ret, 0, size); 77 return ret; 78 } 79 80 static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size) 81 { 82 u8 *p; 83 if (!(p = cursor_alloc(mem, size))) { 84 return 0; 85 } 86 make_cursor(p, mem->p, slice); 87 return 1; 88 } 89 90 91 static inline void copy_cursor(struct cursor *src, struct cursor *dest) 92 { 93 dest->start = src->start; 94 dest->p = src->p; 95 dest->end = src->end; 96 } 97 98 static inline int cursor_skip(struct cursor *cursor, int n) 99 { 100 if (cursor->p + n >= cursor->end) 101 return 0; 102 103 cursor->p += n; 104 105 return 1; 106 } 107 108 static inline int pull_byte(struct cursor *cursor, u8 *c) 109 { 110 if (unlikely(cursor->p >= cursor->end)) 111 return 0; 112 113 *c = *cursor->p; 114 cursor->p++; 115 116 return 1; 117 } 118 119 static inline int parse_byte(struct cursor *cursor, u8 *c) 120 { 121 if (unlikely(cursor->p >= cursor->end)) 122 return 0; 123 124 *c = *cursor->p; 125 //cursor->p++; 126 127 return 1; 128 } 129 130 static inline int parse_char(struct cursor *cur, char c) { 131 if (cur->p >= cur->end) 132 return 0; 133 134 if (*cur->p == c) { 135 cur->p++; 136 return 1; 137 } 138 139 return 0; 140 } 141 142 static inline int peek_char(struct cursor *cur, int ind) { 143 if ((cur->p + ind < cur->start) || (cur->p + ind >= cur->end)) 144 return -1; 145 146 return *(cur->p + ind); 147 } 148 149 static inline int cursor_pull_c_str(struct cursor *cursor, const char **str) 150 { 151 *str = (const char*)cursor->p; 152 153 for (; cursor->p < cursor->end; cursor->p++) { 154 if (*cursor->p == 0) { 155 cursor->p++; 156 return 1; 157 } 158 } 159 160 return 0; 161 } 162 163 164 static inline int cursor_push_byte(struct cursor *cursor, u8 c) 165 { 166 if (unlikely(cursor->p + 1 > cursor->end)) { 167 return 0; 168 } 169 170 *cursor->p = c; 171 cursor->p++; 172 173 return 1; 174 } 175 176 static inline int cursor_pull(struct cursor *cursor, u8 *data, int len) 177 { 178 if (unlikely(cursor->p + len > cursor->end)) { 179 return 0; 180 } 181 182 memcpy(data, cursor->p, len); 183 cursor->p += len; 184 185 return 1; 186 } 187 188 static inline int pull_data_into_cursor(struct cursor *cursor, 189 struct cursor *dest, 190 unsigned char **data, 191 int len) 192 { 193 int ok; 194 195 if (unlikely(dest->p + len > dest->end)) { 196 printf("not enough room in dest buffer\n"); 197 return 0; 198 } 199 200 ok = cursor_pull(cursor, dest->p, len); 201 if (!ok) return 0; 202 203 *data = dest->p; 204 dest->p += len; 205 206 return 1; 207 } 208 209 static inline int cursor_dropn(struct cursor *cur, int size, int n) 210 { 211 if (n == 0) 212 return 1; 213 214 if (unlikely(cur->p - size*n < cur->start)) { 215 return 0; 216 } 217 218 cur->p -= size*n; 219 return 1; 220 } 221 222 static inline int cursor_drop(struct cursor *cur, int size) 223 { 224 return cursor_dropn(cur, size, 1); 225 } 226 227 static inline unsigned char *cursor_topn(struct cursor *cur, int len, int n) 228 { 229 n += 1; 230 if (unlikely(cur->p - len*n < cur->start)) { 231 return NULL; 232 } 233 return cur->p - len*n; 234 } 235 236 static inline unsigned char *cursor_top(struct cursor *cur, int len) 237 { 238 if (unlikely(cur->p - len < cur->start)) { 239 return NULL; 240 } 241 return cur->p - len; 242 } 243 244 static inline int cursor_top_int(struct cursor *cur, int *i) 245 { 246 u8 *p; 247 if (unlikely(!(p = cursor_top(cur, sizeof(*i))))) { 248 return 0; 249 } 250 *i = *((int*)p); 251 return 1; 252 } 253 254 static inline int cursor_pop(struct cursor *cur, u8 *data, int len) 255 { 256 if (unlikely(cur->p - len < cur->start)) { 257 return 0; 258 } 259 260 cur->p -= len; 261 memcpy(data, cur->p, len); 262 263 return 1; 264 } 265 266 static inline int cursor_push(struct cursor *cursor, u8 *data, int len) 267 { 268 if (unlikely(cursor->p + len >= cursor->end)) { 269 return 0; 270 } 271 272 if (cursor->p != data) 273 memcpy(cursor->p, data, len); 274 275 cursor->p += len; 276 277 return 1; 278 } 279 280 static inline int cursor_push_int(struct cursor *cursor, int i) 281 { 282 return cursor_push(cursor, (u8*)&i, sizeof(i)); 283 } 284 285 static inline size_t cursor_count(struct cursor *cursor, size_t elem_size) 286 { 287 return (cursor->p - cursor->start)/elem_size; 288 } 289 290 /* TODO: push_varint */ 291 static inline int push_varint(struct cursor *cursor, int n) 292 { 293 int ok, len; 294 unsigned char b; 295 len = 0; 296 297 while (1) { 298 b = (n & 0xFF) | 0x80; 299 n >>= 7; 300 if (n == 0) { 301 b &= 0x7F; 302 ok = cursor_push_byte(cursor, b); 303 len++; 304 if (!ok) return 0; 305 break; 306 } 307 308 ok = cursor_push_byte(cursor, b); 309 len++; 310 if (!ok) return 0; 311 } 312 313 return len; 314 } 315 316 /* TODO: pull_varint */ 317 static inline int pull_varint(struct cursor *cursor, int *n) 318 { 319 int ok, i; 320 unsigned char b; 321 *n = 0; 322 323 for (i = 0;; i++) { 324 ok = pull_byte(cursor, &b); 325 if (!ok) return 0; 326 327 *n |= ((int)b & 0x7F) << (i * 7); 328 329 /* is_last */ 330 if ((b & 0x80) == 0) { 331 return i+1; 332 } 333 334 if (i == 4) return 0; 335 } 336 337 return 0; 338 } 339 340 static inline int cursor_pull_int(struct cursor *cursor, int *i) 341 { 342 return cursor_pull(cursor, (u8*)i, sizeof(*i)); 343 } 344 345 static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) { 346 return cursor_push(cursor, (unsigned char*)&i, sizeof(i)); 347 } 348 349 static inline int cursor_push_u16(struct cursor *cursor, u16 i) 350 { 351 return cursor_push(cursor, (u8*)&i, sizeof(i)); 352 } 353 354 static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size) 355 { 356 u8 *p; 357 p = &cursor->start[elem_size * index]; 358 359 if (unlikely(p >= cursor->end)) 360 return NULL; 361 362 return (void*)p; 363 } 364 365 366 static inline int push_sized_str(struct cursor *cursor, const char *str, int len) 367 { 368 return cursor_push(cursor, (u8*)str, len); 369 } 370 371 static inline int cursor_push_lowercase(struct cursor *cur, const char *str, int len) 372 { 373 int i; 374 375 if (unlikely(cur->p + len >= cur->end)) 376 return 0; 377 378 for (i = 0; i < len; i++) 379 cur->p[i] = tolower(str[i]); 380 381 cur->p += len; 382 return 1; 383 } 384 385 static inline int cursor_push_str(struct cursor *cursor, const char *str) 386 { 387 return cursor_push(cursor, (u8*)str, (int)strlen(str)); 388 } 389 390 static inline int cursor_push_c_str(struct cursor *cursor, const char *str) 391 { 392 return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0); 393 } 394 395 /* TODO: push varint size */ 396 static inline int push_prefixed_str(struct cursor *cursor, const char *str) 397 { 398 int ok, len; 399 len = (int)strlen(str); 400 ok = push_varint(cursor, len); 401 if (!ok) return 0; 402 return push_sized_str(cursor, str, len); 403 } 404 405 static inline int pull_prefixed_str(struct cursor *cursor, struct cursor *dest_buf, const char **str) 406 { 407 int len, ok; 408 409 ok = pull_varint(cursor, &len); 410 if (!ok) return 0; 411 412 if (unlikely(dest_buf->p + len > dest_buf->end)) { 413 return 0; 414 } 415 416 ok = pull_data_into_cursor(cursor, dest_buf, (unsigned char**)str, len); 417 if (!ok) return 0; 418 419 ok = cursor_push_byte(dest_buf, 0); 420 421 return 1; 422 } 423 424 static inline int cursor_remaining_capacity(struct cursor *cursor) 425 { 426 return (int)(cursor->end - cursor->p); 427 } 428 429 430 #define max(a,b) ((a) > (b) ? (a) : (b)) 431 static inline void cursor_print_around(struct cursor *cur, int range) 432 { 433 unsigned char *c; 434 435 printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start); 436 437 c = max(cur->p - range, cur->start); 438 for (; c < cur->end && c < (cur->p + range); c++) { 439 printf("%02x", *c); 440 } 441 printf("\n"); 442 443 c = max(cur->p - range, cur->start); 444 for (; c < cur->end && c < (cur->p + range); c++) { 445 if (c == cur->p) { 446 printf("^"); 447 continue; 448 } 449 printf(" "); 450 } 451 printf("\n"); 452 } 453 #undef max 454 455 static inline int pull_bytes(struct cursor *cur, int count, const u8 **bytes) { 456 if (cur->p + count > cur->end) 457 return 0; 458 459 *bytes = cur->p; 460 cur->p += count; 461 return 1; 462 } 463 464 static inline int parse_str(struct cursor *cur, const char *str) { 465 char c, cs; 466 unsigned long i, len; 467 468 len = strlen(str); 469 470 if (cur->p + len >= cur->end) 471 return 0; 472 473 for (i = 0; i < len; i++) { 474 c = tolower(cur->p[i]); 475 cs = tolower(str[i]); 476 477 if (c != cs) 478 return 0; 479 } 480 481 cur->p += len; 482 483 return 1; 484 } 485 486 static inline int is_whitespace(char c) { 487 return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; 488 } 489 490 static inline int is_underscore(char c) { 491 return c == '_'; 492 } 493 494 static inline int is_utf8_byte(u8 c) { 495 return c & 0x80; 496 } 497 498 static inline int parse_utf8_char(struct cursor *cursor, unsigned int *code_point, unsigned int *utf8_length) 499 { 500 u8 first_byte; 501 if (!parse_byte(cursor, &first_byte)) 502 return 0; // Not enough data 503 504 // Determine the number of bytes in this UTF-8 character 505 int remaining_bytes = 0; 506 if (first_byte < 0x80) { 507 *code_point = first_byte; 508 return 1; 509 } else if ((first_byte & 0xE0) == 0xC0) { 510 remaining_bytes = 1; 511 *utf8_length = remaining_bytes + 1; 512 *code_point = first_byte & 0x1F; 513 } else if ((first_byte & 0xF0) == 0xE0) { 514 remaining_bytes = 2; 515 *utf8_length = remaining_bytes + 1; 516 *code_point = first_byte & 0x0F; 517 } else if ((first_byte & 0xF8) == 0xF0) { 518 remaining_bytes = 3; 519 *utf8_length = remaining_bytes + 1; 520 *code_point = first_byte & 0x07; 521 } else { 522 remaining_bytes = 0; 523 *utf8_length = 1; // Assume 1 byte length for unrecognized UTF-8 characters 524 // TODO: We need to gracefully handle unrecognized UTF-8 characters 525 //printf("Invalid UTF-8 byte: %x\n", *code_point); 526 *code_point = ((first_byte & 0xF0) << 6); // Prevent testing as punctuation 527 return 0; // Invalid first byte 528 } 529 530 // Peek at remaining bytes 531 for (int i = 0; i < remaining_bytes; ++i) { 532 signed char next_byte; 533 if ((next_byte = peek_char(cursor, i+1)) == -1) { 534 *utf8_length = 1; 535 return 0; // Not enough data 536 } 537 538 // Debugging lines 539 //printf("Cursor: %s\n", cursor->p); 540 //printf("Codepoint: %x\n", *code_point); 541 //printf("Codepoint <<6: %x\n", ((*code_point << 6) | (next_byte & 0x3F))); 542 //printf("Remaining bytes: %x\n", remaining_bytes); 543 //printf("First byte: %x\n", first_byte); 544 //printf("Next byte: %x\n", next_byte); 545 //printf("Bitwise AND result: %x\n", (next_byte & 0xC0)); 546 547 if ((next_byte & 0xC0) != 0x80) { 548 *utf8_length = 1; 549 return 0; // Invalid byte in sequence 550 } 551 552 *code_point = (*code_point << 6) | (next_byte & 0x3F); 553 } 554 555 return 1; 556 } 557 558 /** 559 * Checks if a given Unicode code point is a punctuation character 560 * 561 * @param codepoint The Unicode code point to check. @return true if the 562 * code point is a punctuation character, false otherwise. 563 */ 564 static inline int is_punctuation(unsigned int codepoint) { 565 566 // Check for underscore (underscore is not treated as punctuation) 567 if (is_underscore(codepoint)) 568 return 0; 569 570 // Check for ASCII punctuation 571 if (ispunct(codepoint)) 572 return 1; 573 574 // Check for Unicode punctuation exceptions (punctuation allowed in hashtags) 575 if (codepoint == 0x301C || codepoint == 0xFF5E) // Japanese Wave Dash / Tilde 576 return 0; 577 578 // Check for Unicode punctuation 579 // NOTE: We may need to adjust the codepoint ranges in the future, 580 // to include/exclude certain types of Unicode characters in hashtags. 581 // Unicode Blocks Reference: https://www.compart.com/en/unicode/block 582 return ( 583 // Latin-1 Supplement No-Break Space (NBSP): U+00A0 584 (codepoint == 0x00A0) || 585 586 // Latin-1 Supplement Punctuation: U+00A1 to U+00BF 587 (codepoint >= 0x00A1 && codepoint <= 0x00BF) || 588 589 // General Punctuation: U+2000 to U+206F 590 (codepoint >= 0x2000 && codepoint <= 0x206F) || 591 592 // Currency Symbols: U+20A0 to U+20CF 593 (codepoint >= 0x20A0 && codepoint <= 0x20CF) || 594 595 // Supplemental Punctuation: U+2E00 to U+2E7F 596 (codepoint >= 0x2E00 && codepoint <= 0x2E7F) || 597 598 // CJK Symbols and Punctuation: U+3000 to U+303F 599 (codepoint >= 0x3000 && codepoint <= 0x303F) || 600 601 // Ideographic Description Characters: U+2FF0 to U+2FFF 602 (codepoint >= 0x2FF0 && codepoint <= 0x2FFF) 603 ); 604 } 605 606 static inline int is_right_boundary(int c) { 607 return is_whitespace(c) || is_punctuation(c); 608 } 609 610 static inline int is_left_boundary(char c) { 611 return is_right_boundary(c) || is_utf8_byte(c); 612 } 613 614 static inline int is_alphanumeric(char c) { 615 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'); 616 } 617 618 static inline int consume_until_boundary(struct cursor *cur) { 619 unsigned int c; 620 unsigned int char_length = 1; 621 unsigned int *utf8_char_length = &char_length; 622 623 while (cur->p < cur->end) { 624 c = *cur->p; 625 626 *utf8_char_length = 1; 627 628 if (is_whitespace(c)) 629 return 1; 630 631 // Need to check for UTF-8 characters, which can be multiple bytes long 632 if (is_utf8_byte(c)) { 633 if (!parse_utf8_char(cur, &c, utf8_char_length)) { 634 if (!is_right_boundary(c)){ 635 // TODO: We should work towards handling all UTF-8 characters. 636 //printf("Invalid UTF-8 code point: %x\n", c); 637 } 638 } 639 } 640 641 if (is_right_boundary(c)) 642 return 1; 643 644 // Need to use a variable character byte length for UTF-8 (2-4 bytes) 645 if (cur->p + *utf8_char_length <= cur->end) 646 cur->p += *utf8_char_length; 647 else 648 cur->p++; 649 } 650 651 return 1; 652 } 653 654 static inline int consume_until_whitespace(struct cursor *cur, int or_end) { 655 char c; 656 int consumedAtLeastOne = 0; 657 658 while (cur->p < cur->end) { 659 c = *cur->p; 660 661 if (is_whitespace(c)) 662 return consumedAtLeastOne; 663 664 cur->p++; 665 consumedAtLeastOne = 1; 666 } 667 668 return or_end; 669 } 670 671 static inline int consume_until_non_alphanumeric(struct cursor *cur, int or_end) { 672 char c; 673 int consumedAtLeastOne = 0; 674 675 while (cur->p < cur->end) { 676 c = *cur->p; 677 678 if (!is_alphanumeric(c)) 679 return consumedAtLeastOne; 680 681 cur->p++; 682 consumedAtLeastOne = 1; 683 } 684 685 return or_end; 686 } 687 688 689 static inline int cursor_memset(struct cursor *cursor, unsigned char c, int n) 690 { 691 if (cursor->p + n >= cursor->end) 692 return 0; 693 694 memset(cursor->p, c, n); 695 cursor->p += n; 696 697 return 1; 698 } 699 700 static void consume_whitespace_or_punctuation(struct cursor *cur) 701 { 702 while (cur->p < cur->end) { 703 if (!is_right_boundary(*cur->p)) 704 return; 705 cur->p++; 706 } 707 } 708 709 #endif