content_parser.c (12922B)
1 #include "cursor.h" 2 #include "nostr_bech32.h" 3 #include "block.h" 4 #include "nostrdb.h" 5 #include "invoice.h" 6 7 #ifndef _WIN32 8 #include "bolt11/bolt11.h" 9 #endif 10 11 #include "bolt11/bech32.h" 12 13 #include <stdlib.h> 14 #include <string.h> 15 16 #include "cursor.h" 17 18 struct ndb_content_parser { 19 int bech32_strs; 20 struct cursor buffer; 21 struct cursor content; 22 struct ndb_blocks *blocks; 23 }; 24 25 static int parse_digit(struct cursor *cur, int *digit) { 26 int c; 27 if ((c = peek_char(cur, 0)) == -1) { 28 *digit = 0; 29 return 0; 30 } 31 32 c -= '0'; 33 34 if (c >= 0 && c <= 9) { 35 *digit = c; 36 cur->p++; 37 return 1; 38 } 39 return 0; 40 } 41 42 43 static int parse_mention_index(struct cursor *cur, struct ndb_block *block) { 44 int d1, d2, d3, ind; 45 unsigned char *start = cur->p; 46 47 if (!parse_str(cur, "#[")) 48 return 0; 49 50 if (!parse_digit(cur, &d1)) { 51 cur->p = start; 52 return 0; 53 } 54 55 ind = d1; 56 57 if (parse_digit(cur, &d2)) 58 ind = (d1 * 10) + d2; 59 60 if (parse_digit(cur, &d3)) 61 ind = (d1 * 100) + (d2 * 10) + d3; 62 63 if (!parse_char(cur, ']')) { 64 cur->p = start; 65 return 0; 66 } 67 68 block->type = BLOCK_MENTION_INDEX; 69 block->block.mention_index = ind; 70 71 return 1; 72 } 73 74 static int parse_hashtag(struct cursor *cur, struct ndb_block *block) { 75 int c; 76 unsigned char *start = cur->p; 77 78 if (!parse_char(cur, '#')) 79 return 0; 80 81 c = peek_char(cur, 0); 82 if (c == -1 || is_whitespace(c) || c == '#') { 83 cur->p = start; 84 return 0; 85 } 86 87 consume_until_boundary(cur); 88 89 block->type = BLOCK_HASHTAG; 90 block->block.str.str = (const char*)(start + 1); 91 block->block.str.len = cur->p - (start + 1); 92 93 return 1; 94 } 95 96 // 97 // decode and push a bech32 mention into our blocks output buffer. 98 // 99 // bech32 blocks are stored as: 100 // 101 // bech32_buffer_size : u16 102 // nostr_bech32_type : varint 103 // bech32_data : [u8] 104 // 105 // The TLV form is compact already, so we just use it directly 106 // 107 // This allows us to not duplicate all of the TLV encoding and decoding code 108 // for our on-disk nostrdb format. 109 // 110 static int push_bech32_mention(struct ndb_content_parser *p, struct ndb_str_block *bech32) 111 { 112 // we decode the raw bech32 directly into the output buffer 113 struct cursor u8, u5; 114 unsigned char *start; 115 uint16_t *u8_size; 116 enum nostr_bech32_type type; 117 size_t u5_out_len, u8_out_len; 118 static const int MAX_PREFIX = 8; 119 char prefix[9] = {0}; 120 121 start = p->buffer.p; 122 123 if (!parse_nostr_bech32_type(bech32->str, &type)) 124 goto fail; 125 126 // make sure to push the str block! 127 if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32)) 128 goto fail; 129 // 130 // save a spot for the raw bech32 buffer size 131 u8_size = (uint16_t*)p->buffer.p; 132 if (!cursor_skip(&p->buffer, 2)) 133 goto fail; 134 135 if (!cursor_push_varint(&p->buffer, type)) 136 goto fail; 137 138 if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len)) 139 goto fail; 140 141 if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len)) 142 goto fail; 143 144 if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str, 145 bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) { 146 goto fail; 147 } 148 149 u5.p += u5_out_len; 150 151 if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0)) 152 goto fail; 153 154 u8.p += u8_out_len; 155 156 // move the out cursor to the end of the 8-bit buffer 157 p->buffer.p = u8.p; 158 159 if (u8_out_len > UINT16_MAX) 160 goto fail; 161 162 // mark the size of the bech32 buffer 163 *u8_size = (uint16_t)u8_out_len; 164 165 return 1; 166 167 fail: 168 p->buffer.p = start; 169 return 0; 170 } 171 172 static int push_invoice_str(struct ndb_content_parser *p, struct ndb_str_block *str) 173 { 174 #ifdef _WIN32 175 // we shouldn't be pushing invoices on windows until we fix 176 // bolt11 parser portability 177 return 0; 178 #else 179 unsigned char *start; 180 struct bolt11 *bolt11; 181 char *fail; 182 183 if (!(bolt11 = bolt11_decode_minimal(NULL, str->str, &fail))) 184 return 0; 185 186 start = p->buffer.p; 187 188 // push the text block just incase we don't care for the invoice 189 if (!push_str_block(&p->buffer, (const char*)p->content.start, str)) 190 return 0; 191 192 // push decoded invoice data for quick access 193 if (!ndb_encode_invoice(&p->buffer, bolt11)) { 194 p->buffer.p = start; 195 tal_free(bolt11); 196 return 0; 197 } 198 199 tal_free(bolt11); 200 return 1; 201 #endif 202 } 203 204 int push_block(struct ndb_content_parser *p, struct ndb_block *block); 205 static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end) 206 { 207 struct ndb_block b; 208 209 if (start == end) 210 return 1; 211 212 b.type = BLOCK_TEXT; 213 b.block.str.str = start; 214 b.block.str.len = end - start; 215 216 return push_block(p, &b); 217 } 218 219 220 int push_block(struct ndb_content_parser *p, struct ndb_block *block) 221 { 222 unsigned char *start = p->buffer.p; 223 224 // push the tag 225 if (!cursor_push_varint(&p->buffer, block->type)) 226 return 0; 227 228 switch (block->type) { 229 case BLOCK_HASHTAG: 230 case BLOCK_TEXT: 231 case BLOCK_URL: 232 if (!push_str_block(&p->buffer, (const char*)p->content.start, 233 &block->block.str)) 234 goto fail; 235 break; 236 237 case BLOCK_MENTION_INDEX: 238 if (!cursor_push_varint(&p->buffer, block->block.mention_index)) 239 goto fail; 240 break; 241 case BLOCK_MENTION_BECH32: 242 // we only push bech32 strs here 243 if (!push_bech32_mention(p, &block->block.str)) { 244 // if we fail for some reason, try pushing just a text block 245 p->buffer.p = start; 246 if (!add_text_block(p, block->block.str.str, 247 block->block.str.str + 248 block->block.str.len)) { 249 goto fail; 250 } 251 } 252 break; 253 254 case BLOCK_INVOICE: 255 // we only push invoice strs here 256 if (!push_invoice_str(p, &block->block.str)) { 257 // if we fail for some reason, try pushing just a text block 258 p->buffer.p = start; 259 if (!add_text_block(p, block->block.str.str, 260 block->block.str.str + block->block.str.len)) { 261 goto fail; 262 } 263 } 264 break; 265 } 266 267 p->blocks->num_blocks++; 268 269 return 1; 270 271 fail: 272 p->buffer.p = start; 273 return 0; 274 } 275 276 277 278 static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) { 279 unsigned char *next = cur + 1; 280 281 if (next > end) 282 return 0; 283 284 if (next == end) 285 return 1; 286 287 return is_whitespace(*next); 288 } 289 290 static inline int char_disallowed_at_end_url(char c) 291 { 292 return c == '.' || c == ','; 293 294 } 295 296 static int is_final_url_char(unsigned char *cur, unsigned char *end) 297 { 298 if (is_whitespace(*cur)) 299 return 1; 300 301 if (next_char_is_whitespace(cur, end)) { 302 // next char is whitespace so this char could be the final char in the url 303 return char_disallowed_at_end_url(*cur); 304 } 305 306 // next char isn't whitespace so it can't be a final char 307 return 0; 308 } 309 310 static int consume_until_end_url(struct cursor *cur, int or_end) { 311 unsigned char *start = cur->p; 312 313 while (cur->p < cur->end) { 314 if (is_final_url_char(cur->p, cur->end)) 315 return cur->p != start; 316 317 cur->p++; 318 } 319 320 return or_end; 321 } 322 323 static int consume_url_fragment(struct cursor *cur) 324 { 325 int c; 326 327 if ((c = peek_char(cur, 0)) < 0) 328 return 1; 329 330 if (c != '#' && c != '?') { 331 return 1; 332 } 333 334 cur->p++; 335 336 return consume_until_end_url(cur, 1); 337 } 338 339 static int consume_url_path(struct cursor *cur) 340 { 341 int c; 342 343 if ((c = peek_char(cur, 0)) < 0) 344 return 1; 345 346 if (c != '/') { 347 return 1; 348 } 349 350 while (cur->p < cur->end) { 351 c = *cur->p; 352 353 if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) { 354 return 1; 355 } 356 357 cur->p++; 358 } 359 360 return 1; 361 } 362 363 static int consume_url_host(struct cursor *cur) 364 { 365 char c; 366 int count = 0; 367 368 while (cur->p < cur->end) { 369 c = *cur->p; 370 // TODO: handle IDNs 371 if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end)) 372 { 373 count++; 374 cur->p++; 375 continue; 376 } 377 378 return count != 0; 379 } 380 381 382 // this means the end of the URL hostname is the end of the buffer and we finished 383 return count != 0; 384 } 385 386 static int parse_url(struct cursor *cur, struct ndb_block *block) { 387 unsigned char *start = cur->p; 388 unsigned char *host; 389 unsigned char tmp[4096]; 390 int host_len; 391 struct cursor path_cur, tmp_cur; 392 enum nostr_bech32_type type; 393 make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur); 394 395 if (!parse_str(cur, "http")) 396 return 0; 397 398 if (parse_char(cur, 's') || parse_char(cur, 'S')) { 399 if (!parse_str(cur, "://")) { 400 cur->p = start; 401 return 0; 402 } 403 } else { 404 if (!parse_str(cur, "://")) { 405 cur->p = start; 406 return 0; 407 } 408 } 409 410 // make sure to save the hostname. We will use this to detect damus.io links 411 host = cur->p; 412 413 if (!consume_url_host(cur)) { 414 cur->p = start; 415 return 0; 416 } 417 418 // get the length of the host string 419 host_len = (int)(cur->p - host); 420 421 // save the current parse state so that we can continue from here when 422 // parsing the bech32 in the damus.io link if we have it 423 copy_cursor(cur, &path_cur); 424 425 // skip leading / 426 cursor_skip(&path_cur, 1); 427 428 if (!consume_url_path(cur)) { 429 cur->p = start; 430 return 0; 431 } 432 433 if (!consume_url_fragment(cur)) { 434 cur->p = start; 435 return 0; 436 } 437 438 // smart parens 439 if ((start - 1) >= cur->start && 440 start < cur->end && 441 *(start - 1) == '(' && 442 (cur->p - 1) < cur->end && 443 *(cur->p - 1) == ')') 444 { 445 cur->p--; 446 } 447 448 // save the bech32 string pos in case we hit a damus.io link 449 block->block.str.str = (const char *)path_cur.p; 450 451 // if we have a damus link, make it a mention 452 if (host_len == 8 453 && !strncmp((const char *)host, "damus.io", 8) 454 && parse_nostr_bech32_str(&path_cur, &type)) 455 { 456 block->block.str.len = path_cur.p - path_cur.start; 457 block->type = BLOCK_MENTION_BECH32; 458 return 1; 459 } 460 461 block->type = BLOCK_URL; 462 block->block.str.str = (const char *)start; 463 block->block.str.len = cur->p - start; 464 465 return 1; 466 } 467 468 static int parse_invoice(struct cursor *cur, struct ndb_block *block) { 469 unsigned char *start, *end; 470 471 #ifdef _WIN32 472 // bolt11 stuff requires non-portable cc stuff, so ignore for now 473 return 0; 474 #else 475 476 // optional 477 parse_str(cur, "lightning:"); 478 479 start = cur->p; 480 481 if (!parse_str(cur, "lnbc")) 482 return 0; 483 484 if (!consume_until_whitespace(cur, 1)) { 485 cur->p = start; 486 return 0; 487 } 488 489 end = cur->p; 490 491 block->type = BLOCK_INVOICE; 492 493 block->block.str.str = (const char*)start; 494 block->block.str.len = end - start; 495 496 cur->p = end; 497 498 return 1; 499 #endif 500 } 501 502 503 static int parse_mention_bech32(struct cursor *cur, struct ndb_block *block) { 504 unsigned char *start = cur->p; 505 enum nostr_bech32_type type; 506 507 parse_char(cur, '@'); 508 parse_str(cur, "nostr:"); 509 510 block->block.str.str = (const char *)cur->p; 511 512 if (!parse_nostr_bech32_str(cur, &type)) { 513 cur->p = start; 514 return 0; 515 } 516 517 block->block.str.len = cur->p - (unsigned char*)block->block.str.str; 518 block->type = BLOCK_MENTION_BECH32; 519 520 return 1; 521 } 522 523 static int add_text_then_block(struct ndb_content_parser *p, 524 struct ndb_block *block, 525 unsigned char **start, 526 const unsigned char *pre_mention) 527 { 528 if (!add_text_block(p, (const char *)*start, (const char*)pre_mention)) 529 return 0; 530 531 *start = (unsigned char*)p->content.p; 532 533 return push_block(p, block); 534 } 535 536 int ndb_parse_content(unsigned char *buf, int buf_size, 537 const char *content, int content_len, 538 struct ndb_blocks **blocks_p) 539 { 540 int cp, c; 541 struct ndb_content_parser parser; 542 struct ndb_block block; 543 544 unsigned char *start, *pre_mention, *blocks_start; 545 546 make_cursor(buf, buf + buf_size, &parser.buffer); 547 548 // allocate some space for the blocks header 549 *blocks_p = parser.blocks = (struct ndb_blocks *)buf; 550 parser.buffer.p += sizeof(struct ndb_blocks); 551 552 make_cursor((unsigned char *)content, 553 (unsigned char*)content + content_len, &parser.content); 554 555 parser.blocks->words = 0; 556 parser.blocks->num_blocks = 0; 557 parser.blocks->blocks_size = 0; 558 parser.blocks->flags = 0; 559 parser.blocks->version = 1; 560 561 blocks_start = start = parser.content.p; 562 while (parser.content.p < parser.content.end) { 563 cp = peek_char(&parser.content, -1); 564 c = peek_char(&parser.content, 0); 565 566 // new word 567 if (is_whitespace(cp) && !is_whitespace(c)) 568 parser.blocks->words++; 569 570 pre_mention = parser.content.p; 571 if (cp == -1 || is_left_boundary(cp) || c == '#') { 572 if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) { 573 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 574 return 0; 575 continue; 576 } else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) { 577 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 578 return 0; 579 continue; 580 } else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) { 581 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 582 return 0; 583 continue; 584 } else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) { 585 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 586 return 0; 587 continue; 588 } 589 } 590 591 parser.content.p++; 592 } 593 594 if (parser.content.p - start > 0) { 595 if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p)) 596 return 0; 597 } 598 599 parser.blocks->blocks_size = parser.buffer.p - blocks_start; 600 601 // 602 // pad to 8-byte alignment 603 // 604 if (!cursor_align(&parser.buffer, 8)) 605 return 0; 606 assert((parser.buffer.p - parser.buffer.start) % 8 == 0); 607 parser.blocks->total_size = parser.buffer.p - parser.buffer.start; 608 609 return 1; 610 } 611