nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

content_parser.c (12903B)


      1 #include "cursor.h"
      2 #include "nostr_bech32.h"
      3 #include "block.h"
      4 #include "nostrdb.h"
      5 #include "invoice.h"
      6 
      7 #ifndef _WIN32
      8 #include "bolt11/bolt11.h"
      9 #endif
     10 
     11 #include "bolt11/bech32.h"
     12 
     13 #include <stdlib.h>
     14 #include <string.h>
     15 
     16 #include "cursor.h"
     17 
     18 struct ndb_content_parser {
     19 	int bech32_strs;
     20 	struct cursor buffer;
     21 	struct cursor content;
     22 	struct ndb_blocks *blocks;
     23 };
     24 
     25 static int parse_digit(struct cursor *cur, int *digit) {
     26 	int c;
     27 	if ((c = peek_char(cur, 0)) == -1)
     28 		return 0;
     29 	
     30 	c -= '0';
     31 	
     32 	if (c >= 0 && c <= 9) {
     33 		*digit = c;
     34 		cur->p++;
     35 		return 1;
     36 	}
     37 	return 0;
     38 }
     39 
     40 
     41 static int parse_mention_index(struct cursor *cur, struct ndb_block *block) {
     42 	int d1, d2, d3, ind;
     43 	unsigned char *start = cur->p;
     44 	
     45 	if (!parse_str(cur, "#["))
     46 		return 0;
     47 	
     48 	if (!parse_digit(cur, &d1)) {
     49 		cur->p = start;
     50 		return 0;
     51 	}
     52 	
     53 	ind = d1;
     54 	
     55 	if (parse_digit(cur, &d2))
     56 		ind = (d1 * 10) + d2;
     57 	
     58 	if (parse_digit(cur, &d3))
     59 		ind = (d1 * 100) + (d2 * 10) + d3;
     60 	
     61 	if (!parse_char(cur, ']')) {
     62 		cur->p = start;
     63 		return 0;
     64 	}
     65 	
     66 	block->type = BLOCK_MENTION_INDEX;
     67 	block->block.mention_index = ind;
     68 	
     69 	return 1;
     70 }
     71 
     72 static int parse_hashtag(struct cursor *cur, struct ndb_block *block) {
     73 	int c;
     74 	unsigned char *start = cur->p;
     75 	
     76 	if (!parse_char(cur, '#'))
     77 		return 0;
     78 	
     79 	c = peek_char(cur, 0);
     80 	if (c == -1 || is_whitespace(c) || c == '#') {
     81 		cur->p = start;
     82 		return 0;
     83 	}
     84 	
     85 	consume_until_boundary(cur);
     86 	
     87 	block->type = BLOCK_HASHTAG;
     88 	block->block.str.str = (const char*)(start + 1);
     89 	block->block.str.len = cur->p - (start + 1);
     90 	
     91 	return 1;
     92 }
     93 
     94 //
     95 // decode and push a bech32 mention into our blocks output buffer.
     96 //
     97 // bech32 blocks are stored as:
     98 //
     99 //     bech32_buffer_size : u16
    100 //     nostr_bech32_type  : varint
    101 //     bech32_data        : [u8]
    102 //
    103 // The TLV form is compact already, so we just use it directly
    104 //
    105 // This allows us to not duplicate all of the TLV encoding and decoding code
    106 // for our on-disk nostrdb format.
    107 //
    108 static int push_bech32_mention(struct ndb_content_parser *p, struct ndb_str_block *bech32)
    109 {
    110 	// we decode the raw bech32 directly into the output buffer
    111 	struct cursor u8, u5;
    112 	unsigned char *start;
    113 	uint16_t *u8_size;
    114 	enum nostr_bech32_type type;
    115 	size_t u5_out_len, u8_out_len;
    116 	static const int MAX_PREFIX = 8;
    117 	char prefix[9] = {0};
    118 
    119 	start = p->buffer.p;
    120 
    121 	if (!parse_nostr_bech32_type(bech32->str, &type))
    122 		goto fail;
    123 
    124 	// make sure to push the str block!
    125 	if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32))
    126 		goto fail;
    127 	//
    128 	// save a spot for the raw bech32 buffer size
    129 	u8_size = (uint16_t*)p->buffer.p;
    130 	if (!cursor_skip(&p->buffer, 2))
    131 		goto fail;
    132 
    133 	if (!cursor_push_varint(&p->buffer, type))
    134 		goto fail;
    135 
    136 	if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len))
    137 		goto fail;
    138 
    139 	if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len))
    140 		goto fail;
    141 	
    142 	if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str,
    143 			      bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) {
    144 		goto fail;
    145 	}
    146 
    147 	u5.p += u5_out_len;
    148 
    149 	if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0))
    150 		goto fail;
    151 
    152 	u8.p += u8_out_len;
    153 
    154 	// move the out cursor to the end of the 8-bit buffer
    155 	p->buffer.p = u8.p;
    156 
    157 	if (u8_out_len > UINT16_MAX)
    158 		goto fail;
    159 
    160 	// mark the size of the bech32 buffer
    161 	*u8_size = (uint16_t)u8_out_len;
    162 
    163 	return 1;
    164 
    165 fail:
    166 	p->buffer.p = start;
    167 	return 0;
    168 }
    169 
    170 static int push_invoice_str(struct ndb_content_parser *p, struct ndb_str_block *str)
    171 {
    172 #ifdef _WIN32
    173 	// we shouldn't be pushing invoices on windows until we fix
    174 	// bolt11 parser portability
    175 	return 0;
    176 #else
    177 	unsigned char *start;
    178 	struct bolt11 *bolt11;
    179 	char *fail;
    180 
    181 	if (!(bolt11 = bolt11_decode_minimal(NULL, str->str, &fail)))
    182 		return 0;
    183 
    184 	start = p->buffer.p;
    185 
    186 	// push the text block just incase we don't care for the invoice
    187 	if (!push_str_block(&p->buffer, (const char*)p->content.start, str))
    188 		return 0;
    189 
    190 	// push decoded invoice data for quick access
    191 	if (!ndb_encode_invoice(&p->buffer, bolt11)) {
    192 		p->buffer.p = start;
    193 		tal_free(bolt11);
    194 		return 0;
    195 	}
    196 
    197 	tal_free(bolt11);
    198 	return 1;
    199 #endif
    200 }
    201 
    202 int push_block(struct ndb_content_parser *p, struct ndb_block *block);
    203 static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end)
    204 {
    205 	struct ndb_block b;
    206 	
    207 	if (start == end)
    208 		return 1;
    209 	
    210 	b.type = BLOCK_TEXT;
    211 	b.block.str.str = start;
    212 	b.block.str.len = end - start;
    213 	
    214 	return push_block(p, &b);
    215 }
    216 
    217 
    218 int push_block(struct ndb_content_parser *p, struct ndb_block *block)
    219 {
    220 	unsigned char *start = p->buffer.p;
    221 
    222 	// push the tag
    223 	if (!cursor_push_varint(&p->buffer, block->type))
    224 		return 0;
    225 
    226 	switch (block->type) {
    227 	case BLOCK_HASHTAG:
    228 	case BLOCK_TEXT:
    229 	case BLOCK_URL:
    230 		if (!push_str_block(&p->buffer, (const char*)p->content.start,
    231 			       &block->block.str))
    232 			goto fail;
    233 		break;
    234 
    235 	case BLOCK_MENTION_INDEX:
    236 		if (!cursor_push_varint(&p->buffer, block->block.mention_index))
    237 			goto fail;
    238 		break;
    239 	case BLOCK_MENTION_BECH32:
    240 		// we only push bech32 strs here
    241 		if (!push_bech32_mention(p, &block->block.str)) {
    242 			// if we fail for some reason, try pushing just a text block
    243 			p->buffer.p = start;
    244 			if (!add_text_block(p, block->block.str.str,
    245 					       block->block.str.str +
    246 					       block->block.str.len)) {
    247 				goto fail;
    248 			}
    249 		}
    250 		break;
    251 
    252 	case BLOCK_INVOICE:
    253 		// we only push invoice strs here
    254 		if (!push_invoice_str(p, &block->block.str)) {
    255 			// if we fail for some reason, try pushing just a text block
    256 			p->buffer.p = start;
    257 			if (!add_text_block(p, block->block.str.str,
    258 					    block->block.str.str + block->block.str.len)) {
    259 				goto fail;
    260 			}
    261 		}
    262 		break;
    263 	}
    264 
    265 	p->blocks->num_blocks++;
    266 
    267 	return 1;
    268 
    269 fail:
    270 	p->buffer.p = start;
    271 	return 0;
    272 }
    273 
    274 
    275 
    276 static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) {
    277 	unsigned char *next = cur + 1;
    278 
    279 	if (next > end)
    280 		return 0;
    281 
    282 	if (next == end)
    283 		return 1;
    284 
    285 	return is_whitespace(*next);
    286 }
    287 
    288 static inline int char_disallowed_at_end_url(char c)
    289 {
    290 	return c == '.' || c == ',';
    291  
    292 }
    293 
    294 static int is_final_url_char(unsigned char *cur, unsigned char *end) 
    295 {
    296 	if (is_whitespace(*cur))
    297 		return 1;
    298 
    299 	if (next_char_is_whitespace(cur, end)) {
    300 		// next char is whitespace so this char could be the final char in the url
    301 		return char_disallowed_at_end_url(*cur);
    302 	}
    303 
    304 	// next char isn't whitespace so it can't be a final char
    305 	return 0;
    306 }
    307 
    308 static int consume_until_end_url(struct cursor *cur, int or_end) {
    309 	unsigned char *start = cur->p;
    310 
    311 	while (cur->p < cur->end) {
    312 		if (is_final_url_char(cur->p, cur->end))
    313 			return cur->p != start;
    314 
    315 		cur->p++;
    316 	}
    317 
    318 	return or_end;
    319 }
    320 
    321 static int consume_url_fragment(struct cursor *cur)
    322 {
    323 	int c;
    324 
    325 	if ((c = peek_char(cur, 0)) < 0)
    326 		return 1;
    327 
    328 	if (c != '#' && c != '?') {
    329 		return 1;
    330 	}
    331 
    332 	cur->p++;
    333 
    334 	return consume_until_end_url(cur, 1);
    335 }
    336 
    337 static int consume_url_path(struct cursor *cur)
    338 {
    339 	int c;
    340 
    341 	if ((c = peek_char(cur, 0)) < 0)
    342 		return 1;
    343 
    344 	if (c != '/') {
    345 		return 1;
    346 	}
    347 
    348 	while (cur->p < cur->end) {
    349 		c = *cur->p;
    350 
    351 		if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) {
    352 			return 1;
    353 		}
    354 
    355 		cur->p++;
    356 	}
    357 
    358 	return 1;
    359 }
    360 
    361 static int consume_url_host(struct cursor *cur)
    362 {
    363 	char c;
    364 	int count = 0;
    365 
    366 	while (cur->p < cur->end) {
    367 		c = *cur->p;
    368 		// TODO: handle IDNs
    369 		if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end))
    370 		{
    371 			count++;
    372 			cur->p++;
    373 			continue;
    374 		}
    375 
    376 		return count != 0;
    377 	}
    378 
    379 
    380 	// this means the end of the URL hostname is the end of the buffer and we finished
    381 	return count != 0;
    382 }
    383 
    384 static int parse_url(struct cursor *cur, struct ndb_block *block) {
    385 	unsigned char *start = cur->p;
    386 	unsigned char *host;
    387 	unsigned char tmp[4096];
    388 	int host_len;
    389 	struct cursor path_cur, tmp_cur;
    390 	enum nostr_bech32_type type;
    391 	make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur);
    392 	
    393 	if (!parse_str(cur, "http"))
    394 		return 0;
    395 	
    396 	if (parse_char(cur, 's') || parse_char(cur, 'S')) {
    397 		if (!parse_str(cur, "://")) {
    398 			cur->p = start;
    399 			return 0;
    400 		}
    401 	} else {
    402 		if (!parse_str(cur, "://")) {
    403 			cur->p = start;
    404 			return 0;
    405 		}
    406 	}
    407 
    408 	// make sure to save the hostname. We will use this to detect damus.io links
    409 	host = cur->p;
    410 
    411 	if (!consume_url_host(cur)) {
    412 		cur->p = start;
    413 		return 0;
    414 	}
    415 
    416 	// get the length of the host string
    417 	host_len = (int)(cur->p - host);
    418 
    419 	// save the current parse state so that we can continue from here when
    420 	// parsing the bech32 in the damus.io link if we have it
    421 	copy_cursor(cur, &path_cur);
    422 
    423 	// skip leading /
    424 	cursor_skip(&path_cur, 1);
    425 
    426 	if (!consume_url_path(cur)) {
    427 		cur->p = start;
    428 		return 0;
    429 	}
    430 
    431 	if (!consume_url_fragment(cur)) {
    432 		cur->p = start;
    433 		return 0;
    434 	}
    435 
    436 	// smart parens
    437 	if ((start - 1) >= cur->start &&
    438 		start < cur->end &&
    439 		*(start - 1) == '(' &&
    440 		(cur->p - 1) < cur->end &&
    441 		*(cur->p - 1) == ')')
    442 	{
    443 		cur->p--;
    444 	}
    445 
    446 	// save the bech32 string pos in case we hit a damus.io link
    447 	block->block.str.str = (const char *)path_cur.p;
    448 
    449 	// if we have a damus link, make it a mention
    450 	if (host_len == 8
    451 	&& !strncmp((const char *)host, "damus.io", 8)
    452 	&& parse_nostr_bech32_str(&path_cur, &type))
    453 	{
    454 		block->block.str.len = path_cur.p - path_cur.start;
    455 		block->type = BLOCK_MENTION_BECH32;
    456 		return 1;
    457 	}
    458 
    459 	block->type = BLOCK_URL;
    460 	block->block.str.str = (const char *)start;
    461 	block->block.str.len = cur->p - start;
    462 	
    463 	return 1;
    464 }
    465 
    466 static int parse_invoice(struct cursor *cur, struct ndb_block *block) {
    467 	unsigned char *start, *end;
    468 
    469 #ifdef _WIN32
    470 	// bolt11 stuff requires non-portable cc stuff, so ignore for now
    471 	return 0;
    472 #else
    473 
    474 	// optional
    475 	parse_str(cur, "lightning:");
    476 	
    477 	start = cur->p;
    478 	
    479 	if (!parse_str(cur, "lnbc"))
    480 		return 0;
    481 	
    482 	if (!consume_until_whitespace(cur, 1)) {
    483 		cur->p = start;
    484 		return 0;
    485 	}
    486 	
    487 	end = cur->p;
    488 	
    489 	block->type = BLOCK_INVOICE;
    490 	
    491 	block->block.str.str = (const char*)start;
    492 	block->block.str.len = end - start;
    493 	
    494 	cur->p = end;
    495 	
    496 	return 1;
    497 #endif
    498 }
    499 
    500 
    501 static int parse_mention_bech32(struct cursor *cur, struct ndb_block *block) {
    502 	unsigned char *start = cur->p;
    503 	enum nostr_bech32_type type;
    504 	
    505 	parse_char(cur, '@');
    506 	parse_str(cur, "nostr:");
    507 
    508 	block->block.str.str = (const char *)cur->p;
    509 	
    510 	if (!parse_nostr_bech32_str(cur, &type)) {
    511 		cur->p = start;
    512 		return 0;
    513 	}
    514 	
    515 	block->block.str.len = cur->p - (unsigned char*)block->block.str.str;
    516 	block->type = BLOCK_MENTION_BECH32;
    517 
    518 	return 1;
    519 }
    520 
    521 static int add_text_then_block(struct ndb_content_parser *p,
    522 			       struct ndb_block *block,
    523 			       unsigned char **start,
    524 			       const unsigned char *pre_mention)
    525 {
    526 	if (!add_text_block(p, (const char *)*start, (const char*)pre_mention))
    527 		return 0;
    528 	
    529 	*start = (unsigned char*)p->content.p;
    530 	
    531 	return push_block(p, block);
    532 }
    533 
    534 int ndb_parse_content(unsigned char *buf, int buf_size,
    535 		      const char *content, int content_len,
    536 		      struct ndb_blocks **blocks_p)
    537 {
    538 	int cp, c;
    539 	struct ndb_content_parser parser;
    540 	struct ndb_block block;
    541 
    542 	unsigned char *start, *pre_mention, *blocks_start;
    543 	
    544 	make_cursor(buf, buf + buf_size, &parser.buffer);
    545 
    546 	// allocate some space for the blocks header
    547 	*blocks_p = parser.blocks = (struct ndb_blocks *)buf;
    548 	parser.buffer.p += sizeof(struct ndb_blocks);
    549 
    550 	make_cursor((unsigned char *)content,
    551 		    (unsigned char*)content + content_len, &parser.content);
    552 
    553 	parser.blocks->words = 0;
    554 	parser.blocks->num_blocks = 0;
    555 	parser.blocks->blocks_size = 0;
    556 	parser.blocks->flags = 0;
    557 	parser.blocks->version = 1;
    558 
    559 	blocks_start = start = parser.content.p;
    560 	while (parser.content.p < parser.content.end) {
    561 		cp = peek_char(&parser.content, -1);
    562 		c  = peek_char(&parser.content, 0);
    563 		
    564 		// new word
    565 		if (is_whitespace(cp) && !is_whitespace(c))
    566 			parser.blocks->words++;
    567 		
    568 		pre_mention = parser.content.p;
    569 		if (cp == -1 || is_left_boundary(cp) || c == '#') {
    570 			if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) {
    571 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    572 					return 0;
    573 				continue;
    574 			} else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) {
    575 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    576 					return 0;
    577 				continue;
    578 			} else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) {
    579 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    580 					return 0;
    581 				continue;
    582 			} else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) {
    583 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    584 					return 0;
    585 				continue;
    586 			}
    587 		}
    588 		
    589 		parser.content.p++;
    590 	}
    591 	
    592 	if (parser.content.p - start > 0) {
    593 		if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p))
    594 			return 0;
    595 	}
    596 
    597 	parser.blocks->blocks_size = parser.buffer.p - blocks_start;
    598 
    599 	//
    600 	// pad to 8-byte alignment
    601 	//
    602 	if (!cursor_align(&parser.buffer, 8))
    603 		return 0;
    604 	assert((parser.buffer.p - parser.buffer.start) % 8 == 0);
    605 	parser.blocks->total_size = parser.buffer.p - parser.buffer.start;
    606 
    607 	return 1;
    608 }
    609