content_parser.c (12903B)
1 #include "cursor.h" 2 #include "nostr_bech32.h" 3 #include "block.h" 4 #include "nostrdb.h" 5 #include "invoice.h" 6 7 #ifndef _WIN32 8 #include "bolt11/bolt11.h" 9 #endif 10 11 #include "bolt11/bech32.h" 12 13 #include <stdlib.h> 14 #include <string.h> 15 16 #include "cursor.h" 17 18 struct ndb_content_parser { 19 int bech32_strs; 20 struct cursor buffer; 21 struct cursor content; 22 struct ndb_blocks *blocks; 23 }; 24 25 static int parse_digit(struct cursor *cur, int *digit) { 26 int c; 27 if ((c = peek_char(cur, 0)) == -1) 28 return 0; 29 30 c -= '0'; 31 32 if (c >= 0 && c <= 9) { 33 *digit = c; 34 cur->p++; 35 return 1; 36 } 37 return 0; 38 } 39 40 41 static int parse_mention_index(struct cursor *cur, struct ndb_block *block) { 42 int d1, d2, d3, ind; 43 unsigned char *start = cur->p; 44 45 if (!parse_str(cur, "#[")) 46 return 0; 47 48 if (!parse_digit(cur, &d1)) { 49 cur->p = start; 50 return 0; 51 } 52 53 ind = d1; 54 55 if (parse_digit(cur, &d2)) 56 ind = (d1 * 10) + d2; 57 58 if (parse_digit(cur, &d3)) 59 ind = (d1 * 100) + (d2 * 10) + d3; 60 61 if (!parse_char(cur, ']')) { 62 cur->p = start; 63 return 0; 64 } 65 66 block->type = BLOCK_MENTION_INDEX; 67 block->block.mention_index = ind; 68 69 return 1; 70 } 71 72 static int parse_hashtag(struct cursor *cur, struct ndb_block *block) { 73 int c; 74 unsigned char *start = cur->p; 75 76 if (!parse_char(cur, '#')) 77 return 0; 78 79 c = peek_char(cur, 0); 80 if (c == -1 || is_whitespace(c) || c == '#') { 81 cur->p = start; 82 return 0; 83 } 84 85 consume_until_boundary(cur); 86 87 block->type = BLOCK_HASHTAG; 88 block->block.str.str = (const char*)(start + 1); 89 block->block.str.len = cur->p - (start + 1); 90 91 return 1; 92 } 93 94 // 95 // decode and push a bech32 mention into our blocks output buffer. 96 // 97 // bech32 blocks are stored as: 98 // 99 // bech32_buffer_size : u16 100 // nostr_bech32_type : varint 101 // bech32_data : [u8] 102 // 103 // The TLV form is compact already, so we just use it directly 104 // 105 // This allows us to not duplicate all of the TLV encoding and decoding code 106 // for our on-disk nostrdb format. 107 // 108 static int push_bech32_mention(struct ndb_content_parser *p, struct ndb_str_block *bech32) 109 { 110 // we decode the raw bech32 directly into the output buffer 111 struct cursor u8, u5; 112 unsigned char *start; 113 uint16_t *u8_size; 114 enum nostr_bech32_type type; 115 size_t u5_out_len, u8_out_len; 116 static const int MAX_PREFIX = 8; 117 char prefix[9] = {0}; 118 119 start = p->buffer.p; 120 121 if (!parse_nostr_bech32_type(bech32->str, &type)) 122 goto fail; 123 124 // make sure to push the str block! 125 if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32)) 126 goto fail; 127 // 128 // save a spot for the raw bech32 buffer size 129 u8_size = (uint16_t*)p->buffer.p; 130 if (!cursor_skip(&p->buffer, 2)) 131 goto fail; 132 133 if (!cursor_push_varint(&p->buffer, type)) 134 goto fail; 135 136 if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len)) 137 goto fail; 138 139 if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len)) 140 goto fail; 141 142 if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str, 143 bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) { 144 goto fail; 145 } 146 147 u5.p += u5_out_len; 148 149 if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0)) 150 goto fail; 151 152 u8.p += u8_out_len; 153 154 // move the out cursor to the end of the 8-bit buffer 155 p->buffer.p = u8.p; 156 157 if (u8_out_len > UINT16_MAX) 158 goto fail; 159 160 // mark the size of the bech32 buffer 161 *u8_size = (uint16_t)u8_out_len; 162 163 return 1; 164 165 fail: 166 p->buffer.p = start; 167 return 0; 168 } 169 170 static int push_invoice_str(struct ndb_content_parser *p, struct ndb_str_block *str) 171 { 172 #ifdef _WIN32 173 // we shouldn't be pushing invoices on windows until we fix 174 // bolt11 parser portability 175 return 0; 176 #else 177 unsigned char *start; 178 struct bolt11 *bolt11; 179 char *fail; 180 181 if (!(bolt11 = bolt11_decode_minimal(NULL, str->str, &fail))) 182 return 0; 183 184 start = p->buffer.p; 185 186 // push the text block just incase we don't care for the invoice 187 if (!push_str_block(&p->buffer, (const char*)p->content.start, str)) 188 return 0; 189 190 // push decoded invoice data for quick access 191 if (!ndb_encode_invoice(&p->buffer, bolt11)) { 192 p->buffer.p = start; 193 tal_free(bolt11); 194 return 0; 195 } 196 197 tal_free(bolt11); 198 return 1; 199 #endif 200 } 201 202 int push_block(struct ndb_content_parser *p, struct ndb_block *block); 203 static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end) 204 { 205 struct ndb_block b; 206 207 if (start == end) 208 return 1; 209 210 b.type = BLOCK_TEXT; 211 b.block.str.str = start; 212 b.block.str.len = end - start; 213 214 return push_block(p, &b); 215 } 216 217 218 int push_block(struct ndb_content_parser *p, struct ndb_block *block) 219 { 220 unsigned char *start = p->buffer.p; 221 222 // push the tag 223 if (!cursor_push_varint(&p->buffer, block->type)) 224 return 0; 225 226 switch (block->type) { 227 case BLOCK_HASHTAG: 228 case BLOCK_TEXT: 229 case BLOCK_URL: 230 if (!push_str_block(&p->buffer, (const char*)p->content.start, 231 &block->block.str)) 232 goto fail; 233 break; 234 235 case BLOCK_MENTION_INDEX: 236 if (!cursor_push_varint(&p->buffer, block->block.mention_index)) 237 goto fail; 238 break; 239 case BLOCK_MENTION_BECH32: 240 // we only push bech32 strs here 241 if (!push_bech32_mention(p, &block->block.str)) { 242 // if we fail for some reason, try pushing just a text block 243 p->buffer.p = start; 244 if (!add_text_block(p, block->block.str.str, 245 block->block.str.str + 246 block->block.str.len)) { 247 goto fail; 248 } 249 } 250 break; 251 252 case BLOCK_INVOICE: 253 // we only push invoice strs here 254 if (!push_invoice_str(p, &block->block.str)) { 255 // if we fail for some reason, try pushing just a text block 256 p->buffer.p = start; 257 if (!add_text_block(p, block->block.str.str, 258 block->block.str.str + block->block.str.len)) { 259 goto fail; 260 } 261 } 262 break; 263 } 264 265 p->blocks->num_blocks++; 266 267 return 1; 268 269 fail: 270 p->buffer.p = start; 271 return 0; 272 } 273 274 275 276 static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) { 277 unsigned char *next = cur + 1; 278 279 if (next > end) 280 return 0; 281 282 if (next == end) 283 return 1; 284 285 return is_whitespace(*next); 286 } 287 288 static inline int char_disallowed_at_end_url(char c) 289 { 290 return c == '.' || c == ','; 291 292 } 293 294 static int is_final_url_char(unsigned char *cur, unsigned char *end) 295 { 296 if (is_whitespace(*cur)) 297 return 1; 298 299 if (next_char_is_whitespace(cur, end)) { 300 // next char is whitespace so this char could be the final char in the url 301 return char_disallowed_at_end_url(*cur); 302 } 303 304 // next char isn't whitespace so it can't be a final char 305 return 0; 306 } 307 308 static int consume_until_end_url(struct cursor *cur, int or_end) { 309 unsigned char *start = cur->p; 310 311 while (cur->p < cur->end) { 312 if (is_final_url_char(cur->p, cur->end)) 313 return cur->p != start; 314 315 cur->p++; 316 } 317 318 return or_end; 319 } 320 321 static int consume_url_fragment(struct cursor *cur) 322 { 323 int c; 324 325 if ((c = peek_char(cur, 0)) < 0) 326 return 1; 327 328 if (c != '#' && c != '?') { 329 return 1; 330 } 331 332 cur->p++; 333 334 return consume_until_end_url(cur, 1); 335 } 336 337 static int consume_url_path(struct cursor *cur) 338 { 339 int c; 340 341 if ((c = peek_char(cur, 0)) < 0) 342 return 1; 343 344 if (c != '/') { 345 return 1; 346 } 347 348 while (cur->p < cur->end) { 349 c = *cur->p; 350 351 if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) { 352 return 1; 353 } 354 355 cur->p++; 356 } 357 358 return 1; 359 } 360 361 static int consume_url_host(struct cursor *cur) 362 { 363 char c; 364 int count = 0; 365 366 while (cur->p < cur->end) { 367 c = *cur->p; 368 // TODO: handle IDNs 369 if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end)) 370 { 371 count++; 372 cur->p++; 373 continue; 374 } 375 376 return count != 0; 377 } 378 379 380 // this means the end of the URL hostname is the end of the buffer and we finished 381 return count != 0; 382 } 383 384 static int parse_url(struct cursor *cur, struct ndb_block *block) { 385 unsigned char *start = cur->p; 386 unsigned char *host; 387 unsigned char tmp[4096]; 388 int host_len; 389 struct cursor path_cur, tmp_cur; 390 enum nostr_bech32_type type; 391 make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur); 392 393 if (!parse_str(cur, "http")) 394 return 0; 395 396 if (parse_char(cur, 's') || parse_char(cur, 'S')) { 397 if (!parse_str(cur, "://")) { 398 cur->p = start; 399 return 0; 400 } 401 } else { 402 if (!parse_str(cur, "://")) { 403 cur->p = start; 404 return 0; 405 } 406 } 407 408 // make sure to save the hostname. We will use this to detect damus.io links 409 host = cur->p; 410 411 if (!consume_url_host(cur)) { 412 cur->p = start; 413 return 0; 414 } 415 416 // get the length of the host string 417 host_len = (int)(cur->p - host); 418 419 // save the current parse state so that we can continue from here when 420 // parsing the bech32 in the damus.io link if we have it 421 copy_cursor(cur, &path_cur); 422 423 // skip leading / 424 cursor_skip(&path_cur, 1); 425 426 if (!consume_url_path(cur)) { 427 cur->p = start; 428 return 0; 429 } 430 431 if (!consume_url_fragment(cur)) { 432 cur->p = start; 433 return 0; 434 } 435 436 // smart parens 437 if ((start - 1) >= cur->start && 438 start < cur->end && 439 *(start - 1) == '(' && 440 (cur->p - 1) < cur->end && 441 *(cur->p - 1) == ')') 442 { 443 cur->p--; 444 } 445 446 // save the bech32 string pos in case we hit a damus.io link 447 block->block.str.str = (const char *)path_cur.p; 448 449 // if we have a damus link, make it a mention 450 if (host_len == 8 451 && !strncmp((const char *)host, "damus.io", 8) 452 && parse_nostr_bech32_str(&path_cur, &type)) 453 { 454 block->block.str.len = path_cur.p - path_cur.start; 455 block->type = BLOCK_MENTION_BECH32; 456 return 1; 457 } 458 459 block->type = BLOCK_URL; 460 block->block.str.str = (const char *)start; 461 block->block.str.len = cur->p - start; 462 463 return 1; 464 } 465 466 static int parse_invoice(struct cursor *cur, struct ndb_block *block) { 467 unsigned char *start, *end; 468 469 #ifdef _WIN32 470 // bolt11 stuff requires non-portable cc stuff, so ignore for now 471 return 0; 472 #else 473 474 // optional 475 parse_str(cur, "lightning:"); 476 477 start = cur->p; 478 479 if (!parse_str(cur, "lnbc")) 480 return 0; 481 482 if (!consume_until_whitespace(cur, 1)) { 483 cur->p = start; 484 return 0; 485 } 486 487 end = cur->p; 488 489 block->type = BLOCK_INVOICE; 490 491 block->block.str.str = (const char*)start; 492 block->block.str.len = end - start; 493 494 cur->p = end; 495 496 return 1; 497 #endif 498 } 499 500 501 static int parse_mention_bech32(struct cursor *cur, struct ndb_block *block) { 502 unsigned char *start = cur->p; 503 enum nostr_bech32_type type; 504 505 parse_char(cur, '@'); 506 parse_str(cur, "nostr:"); 507 508 block->block.str.str = (const char *)cur->p; 509 510 if (!parse_nostr_bech32_str(cur, &type)) { 511 cur->p = start; 512 return 0; 513 } 514 515 block->block.str.len = cur->p - (unsigned char*)block->block.str.str; 516 block->type = BLOCK_MENTION_BECH32; 517 518 return 1; 519 } 520 521 static int add_text_then_block(struct ndb_content_parser *p, 522 struct ndb_block *block, 523 unsigned char **start, 524 const unsigned char *pre_mention) 525 { 526 if (!add_text_block(p, (const char *)*start, (const char*)pre_mention)) 527 return 0; 528 529 *start = (unsigned char*)p->content.p; 530 531 return push_block(p, block); 532 } 533 534 int ndb_parse_content(unsigned char *buf, int buf_size, 535 const char *content, int content_len, 536 struct ndb_blocks **blocks_p) 537 { 538 int cp, c; 539 struct ndb_content_parser parser; 540 struct ndb_block block; 541 542 unsigned char *start, *pre_mention, *blocks_start; 543 544 make_cursor(buf, buf + buf_size, &parser.buffer); 545 546 // allocate some space for the blocks header 547 *blocks_p = parser.blocks = (struct ndb_blocks *)buf; 548 parser.buffer.p += sizeof(struct ndb_blocks); 549 550 make_cursor((unsigned char *)content, 551 (unsigned char*)content + content_len, &parser.content); 552 553 parser.blocks->words = 0; 554 parser.blocks->num_blocks = 0; 555 parser.blocks->blocks_size = 0; 556 parser.blocks->flags = 0; 557 parser.blocks->version = 1; 558 559 blocks_start = start = parser.content.p; 560 while (parser.content.p < parser.content.end) { 561 cp = peek_char(&parser.content, -1); 562 c = peek_char(&parser.content, 0); 563 564 // new word 565 if (is_whitespace(cp) && !is_whitespace(c)) 566 parser.blocks->words++; 567 568 pre_mention = parser.content.p; 569 if (cp == -1 || is_left_boundary(cp) || c == '#') { 570 if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) { 571 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 572 return 0; 573 continue; 574 } else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) { 575 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 576 return 0; 577 continue; 578 } else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) { 579 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 580 return 0; 581 continue; 582 } else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) { 583 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 584 return 0; 585 continue; 586 } 587 } 588 589 parser.content.p++; 590 } 591 592 if (parser.content.p - start > 0) { 593 if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p)) 594 return 0; 595 } 596 597 parser.blocks->blocks_size = parser.buffer.p - blocks_start; 598 599 // 600 // pad to 8-byte alignment 601 // 602 if (!cursor_align(&parser.buffer, 8)) 603 return 0; 604 assert((parser.buffer.p - parser.buffer.start) % 8 == 0); 605 parser.blocks->total_size = parser.buffer.p - parser.buffer.start; 606 607 return 1; 608 } 609