/* content_parser.c (12643B) */
1 #include "cursor.h" 2 #include "nostr_bech32.h" 3 #include "block.h" 4 #include "nostrdb.h" 5 #include "invoice.h" 6 #include "bolt11/bolt11.h" 7 #include "bolt11/bech32.h" 8 #include <stdlib.h> 9 #include <string.h> 10 11 #include "cursor.h" 12 13 struct ndb_content_parser { 14 int bech32_strs; 15 struct cursor buffer; 16 struct cursor content; 17 struct ndb_blocks *blocks; 18 }; 19 20 static int parse_digit(struct cursor *cur, int *digit) { 21 int c; 22 if ((c = peek_char(cur, 0)) == -1) 23 return 0; 24 25 c -= '0'; 26 27 if (c >= 0 && c <= 9) { 28 *digit = c; 29 cur->p++; 30 return 1; 31 } 32 return 0; 33 } 34 35 36 static int parse_mention_index(struct cursor *cur, struct ndb_block *block) { 37 int d1, d2, d3, ind; 38 unsigned char *start = cur->p; 39 40 if (!parse_str(cur, "#[")) 41 return 0; 42 43 if (!parse_digit(cur, &d1)) { 44 cur->p = start; 45 return 0; 46 } 47 48 ind = d1; 49 50 if (parse_digit(cur, &d2)) 51 ind = (d1 * 10) + d2; 52 53 if (parse_digit(cur, &d3)) 54 ind = (d1 * 100) + (d2 * 10) + d3; 55 56 if (!parse_char(cur, ']')) { 57 cur->p = start; 58 return 0; 59 } 60 61 block->type = BLOCK_MENTION_INDEX; 62 block->block.mention_index = ind; 63 64 return 1; 65 } 66 67 static int parse_hashtag(struct cursor *cur, struct ndb_block *block) { 68 int c; 69 unsigned char *start = cur->p; 70 71 if (!parse_char(cur, '#')) 72 return 0; 73 74 c = peek_char(cur, 0); 75 if (c == -1 || is_whitespace(c) || c == '#') { 76 cur->p = start; 77 return 0; 78 } 79 80 consume_until_boundary(cur); 81 82 block->type = BLOCK_HASHTAG; 83 block->block.str.str = (const char*)(start + 1); 84 block->block.str.len = cur->p - (start + 1); 85 86 return 1; 87 } 88 89 // 90 // decode and push a bech32 mention into our blocks output buffer. 
91 // 92 // bech32 blocks are stored as: 93 // 94 // bech32_buffer_size : u16 95 // nostr_bech32_type : varint 96 // bech32_data : [u8] 97 // 98 // The TLV form is compact already, so we just use it directly 99 // 100 // This allows us to not duplicate all of the TLV encoding and decoding code 101 // for our on-disk nostrdb format. 102 // 103 static int push_bech32_mention(struct ndb_content_parser *p, struct ndb_str_block *bech32) 104 { 105 // we decode the raw bech32 directly into the output buffer 106 struct cursor u8, u5; 107 unsigned char *start; 108 uint16_t *u8_size; 109 enum nostr_bech32_type type; 110 size_t u5_out_len, u8_out_len; 111 static const int MAX_PREFIX = 8; 112 char prefix[9] = {0}; 113 114 start = p->buffer.p; 115 116 if (!parse_nostr_bech32_type(bech32->str, &type)) 117 goto fail; 118 119 // make sure to push the str block! 120 if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32)) 121 goto fail; 122 // 123 // save a spot for the raw bech32 buffer size 124 u8_size = (uint16_t*)p->buffer.p; 125 if (!cursor_skip(&p->buffer, 2)) 126 goto fail; 127 128 if (!cursor_push_varint(&p->buffer, type)) 129 goto fail; 130 131 if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len)) 132 goto fail; 133 134 if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len)) 135 goto fail; 136 137 if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str, 138 bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) { 139 goto fail; 140 } 141 142 u5.p += u5_out_len; 143 144 if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0)) 145 goto fail; 146 147 u8.p += u8_out_len; 148 149 // move the out cursor to the end of the 8-bit buffer 150 p->buffer.p = u8.p; 151 152 if (u8_out_len > UINT16_MAX) 153 goto fail; 154 155 // mark the size of the bech32 buffer 156 *u8_size = (uint16_t)u8_out_len; 157 158 return 1; 159 160 fail: 161 p->buffer.p = start; 162 return 0; 163 } 164 165 static int push_invoice_str(struct ndb_content_parser *p, struct 
ndb_str_block *str) 166 { 167 unsigned char *start; 168 struct bolt11 *bolt11; 169 char *fail; 170 171 if (!(bolt11 = bolt11_decode_minimal(NULL, str->str, &fail))) 172 return 0; 173 174 start = p->buffer.p; 175 176 // push the text block just incase we don't care for the invoice 177 if (!push_str_block(&p->buffer, (const char*)p->content.start, str)) 178 return 0; 179 180 // push decoded invoice data for quick access 181 if (!ndb_encode_invoice(&p->buffer, bolt11)) { 182 p->buffer.p = start; 183 tal_free(bolt11); 184 return 0; 185 } 186 187 tal_free(bolt11); 188 return 1; 189 } 190 191 int push_block(struct ndb_content_parser *p, struct ndb_block *block); 192 static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end) 193 { 194 struct ndb_block b; 195 196 if (start == end) 197 return 1; 198 199 b.type = BLOCK_TEXT; 200 b.block.str.str = start; 201 b.block.str.len = end - start; 202 203 return push_block(p, &b); 204 } 205 206 207 int push_block(struct ndb_content_parser *p, struct ndb_block *block) 208 { 209 unsigned char *start = p->buffer.p; 210 211 // push the tag 212 if (!cursor_push_varint(&p->buffer, block->type)) 213 return 0; 214 215 switch (block->type) { 216 case BLOCK_HASHTAG: 217 case BLOCK_TEXT: 218 case BLOCK_URL: 219 if (!push_str_block(&p->buffer, (const char*)p->content.start, 220 &block->block.str)) 221 goto fail; 222 break; 223 224 case BLOCK_MENTION_INDEX: 225 if (!cursor_push_varint(&p->buffer, block->block.mention_index)) 226 goto fail; 227 break; 228 case BLOCK_MENTION_BECH32: 229 // we only push bech32 strs here 230 if (!push_bech32_mention(p, &block->block.str)) { 231 // if we fail for some reason, try pushing just a text block 232 p->buffer.p = start; 233 if (!add_text_block(p, block->block.str.str, 234 block->block.str.str + 235 block->block.str.len)) { 236 goto fail; 237 } 238 } 239 break; 240 241 case BLOCK_INVOICE: 242 // we only push invoice strs here 243 if (!push_invoice_str(p, &block->block.str)) { 
244 // if we fail for some reason, try pushing just a text block 245 p->buffer.p = start; 246 if (!add_text_block(p, block->block.str.str, 247 block->block.str.str + block->block.str.len)) { 248 goto fail; 249 } 250 } 251 break; 252 } 253 254 p->blocks->num_blocks++; 255 256 return 1; 257 258 fail: 259 p->buffer.p = start; 260 return 0; 261 } 262 263 264 265 static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) { 266 unsigned char *next = cur + 1; 267 268 if (next > end) 269 return 0; 270 271 if (next == end) 272 return 1; 273 274 return is_whitespace(*next); 275 } 276 277 static inline int char_disallowed_at_end_url(char c) 278 { 279 return c == '.' || c == ','; 280 281 } 282 283 static int is_final_url_char(unsigned char *cur, unsigned char *end) 284 { 285 if (is_whitespace(*cur)) 286 return 1; 287 288 if (next_char_is_whitespace(cur, end)) { 289 // next char is whitespace so this char could be the final char in the url 290 return char_disallowed_at_end_url(*cur); 291 } 292 293 // next char isn't whitespace so it can't be a final char 294 return 0; 295 } 296 297 static int consume_until_end_url(struct cursor *cur, int or_end) { 298 unsigned char *start = cur->p; 299 300 while (cur->p < cur->end) { 301 if (is_final_url_char(cur->p, cur->end)) 302 return cur->p != start; 303 304 cur->p++; 305 } 306 307 return or_end; 308 } 309 310 static int consume_url_fragment(struct cursor *cur) 311 { 312 int c; 313 314 if ((c = peek_char(cur, 0)) < 0) 315 return 1; 316 317 if (c != '#' && c != '?') { 318 return 1; 319 } 320 321 cur->p++; 322 323 return consume_until_end_url(cur, 1); 324 } 325 326 static int consume_url_path(struct cursor *cur) 327 { 328 int c; 329 330 if ((c = peek_char(cur, 0)) < 0) 331 return 1; 332 333 if (c != '/') { 334 return 1; 335 } 336 337 while (cur->p < cur->end) { 338 c = *cur->p; 339 340 if (c == '?' 
|| c == '#' || is_final_url_char(cur->p, cur->end)) { 341 return 1; 342 } 343 344 cur->p++; 345 } 346 347 return 1; 348 } 349 350 static int consume_url_host(struct cursor *cur) 351 { 352 char c; 353 int count = 0; 354 355 while (cur->p < cur->end) { 356 c = *cur->p; 357 // TODO: handle IDNs 358 if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end)) 359 { 360 count++; 361 cur->p++; 362 continue; 363 } 364 365 return count != 0; 366 } 367 368 369 // this means the end of the URL hostname is the end of the buffer and we finished 370 return count != 0; 371 } 372 373 static int parse_url(struct cursor *cur, struct ndb_block *block) { 374 unsigned char *start = cur->p; 375 unsigned char *host; 376 unsigned char tmp[4096]; 377 int host_len; 378 struct cursor path_cur, tmp_cur; 379 enum nostr_bech32_type type; 380 make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur); 381 382 if (!parse_str(cur, "http")) 383 return 0; 384 385 if (parse_char(cur, 's') || parse_char(cur, 'S')) { 386 if (!parse_str(cur, "://")) { 387 cur->p = start; 388 return 0; 389 } 390 } else { 391 if (!parse_str(cur, "://")) { 392 cur->p = start; 393 return 0; 394 } 395 } 396 397 // make sure to save the hostname. 
We will use this to detect damus.io links 398 host = cur->p; 399 400 if (!consume_url_host(cur)) { 401 cur->p = start; 402 return 0; 403 } 404 405 // get the length of the host string 406 host_len = (int)(cur->p - host); 407 408 // save the current parse state so that we can continue from here when 409 // parsing the bech32 in the damus.io link if we have it 410 copy_cursor(cur, &path_cur); 411 412 // skip leading / 413 cursor_skip(&path_cur, 1); 414 415 if (!consume_url_path(cur)) { 416 cur->p = start; 417 return 0; 418 } 419 420 if (!consume_url_fragment(cur)) { 421 cur->p = start; 422 return 0; 423 } 424 425 // smart parens 426 if ((start - 1) >= cur->start && 427 start < cur->end && 428 *(start - 1) == '(' && 429 (cur->p - 1) < cur->end && 430 *(cur->p - 1) == ')') 431 { 432 cur->p--; 433 } 434 435 // save the bech32 string pos in case we hit a damus.io link 436 block->block.str.str = (const char *)path_cur.p; 437 438 // if we have a damus link, make it a mention 439 if (host_len == 8 440 && !strncmp((const char *)host, "damus.io", 8) 441 && parse_nostr_bech32_str(&path_cur, &type)) 442 { 443 block->block.str.len = path_cur.p - path_cur.start; 444 block->type = BLOCK_MENTION_BECH32; 445 return 1; 446 } 447 448 block->type = BLOCK_URL; 449 block->block.str.str = (const char *)start; 450 block->block.str.len = cur->p - start; 451 452 return 1; 453 } 454 455 static int parse_invoice(struct cursor *cur, struct ndb_block *block) { 456 unsigned char *start, *end; 457 458 // optional 459 parse_str(cur, "lightning:"); 460 461 start = cur->p; 462 463 if (!parse_str(cur, "lnbc")) 464 return 0; 465 466 if (!consume_until_whitespace(cur, 1)) { 467 cur->p = start; 468 return 0; 469 } 470 471 end = cur->p; 472 473 block->type = BLOCK_INVOICE; 474 475 block->block.str.str = (const char*)start; 476 block->block.str.len = end - start; 477 478 cur->p = end; 479 480 return 1; 481 } 482 483 484 static int parse_mention_bech32(struct cursor *cur, struct ndb_block *block) { 485 
unsigned char *start = cur->p; 486 enum nostr_bech32_type type; 487 488 parse_char(cur, '@'); 489 parse_str(cur, "nostr:"); 490 491 block->block.str.str = (const char *)cur->p; 492 493 if (!parse_nostr_bech32_str(cur, &type)) { 494 cur->p = start; 495 return 0; 496 } 497 498 block->block.str.len = cur->p - (unsigned char*)block->block.str.str; 499 block->type = BLOCK_MENTION_BECH32; 500 501 return 1; 502 } 503 504 static int add_text_then_block(struct ndb_content_parser *p, 505 struct ndb_block *block, 506 unsigned char **start, 507 const unsigned char *pre_mention) 508 { 509 if (!add_text_block(p, (const char *)*start, (const char*)pre_mention)) 510 return 0; 511 512 *start = (unsigned char*)p->content.p; 513 514 return push_block(p, block); 515 } 516 517 int ndb_parse_content(unsigned char *buf, int buf_size, 518 const char *content, int content_len, 519 struct ndb_blocks **blocks_p) 520 { 521 int cp, c; 522 struct ndb_content_parser parser; 523 struct ndb_block block; 524 525 unsigned char *start, *pre_mention, *blocks_start; 526 527 make_cursor(buf, buf + buf_size, &parser.buffer); 528 529 // allocate some space for the blocks header 530 *blocks_p = parser.blocks = (struct ndb_blocks *)buf; 531 parser.buffer.p += sizeof(struct ndb_blocks); 532 533 make_cursor((unsigned char *)content, 534 (unsigned char*)content + content_len, &parser.content); 535 536 parser.blocks->words = 0; 537 parser.blocks->num_blocks = 0; 538 parser.blocks->blocks_size = 0; 539 parser.blocks->flags = 0; 540 parser.blocks->version = 1; 541 542 blocks_start = start = parser.content.p; 543 while (parser.content.p < parser.content.end) { 544 cp = peek_char(&parser.content, -1); 545 c = peek_char(&parser.content, 0); 546 547 // new word 548 if (is_whitespace(cp) && !is_whitespace(c)) 549 parser.blocks->words++; 550 551 pre_mention = parser.content.p; 552 if (cp == -1 || is_left_boundary(cp) || c == '#') { 553 if (c == '#' && (parse_mention_index(&parser.content, &block) || 
parse_hashtag(&parser.content, &block))) { 554 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 555 return 0; 556 continue; 557 } else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) { 558 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 559 return 0; 560 continue; 561 } else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) { 562 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 563 return 0; 564 continue; 565 } else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) { 566 if (!add_text_then_block(&parser, &block, &start, pre_mention)) 567 return 0; 568 continue; 569 } 570 } 571 572 parser.content.p++; 573 } 574 575 if (parser.content.p - start > 0) { 576 if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p)) 577 return 0; 578 } 579 580 parser.blocks->blocks_size = parser.buffer.p - blocks_start; 581 582 // 583 // pad to 8-byte alignment 584 // 585 if (!cursor_align(&parser.buffer, 8)) 586 return 0; 587 assert((parser.buffer.p - parser.buffer.start) % 8 == 0); 588 parser.blocks->total_size = parser.buffer.p - parser.buffer.start; 589 590 return 1; 591 } 592