content_parser.c - nostrdb - an unfairly fast embedded nostr database backed by lmdb

content_parser.c (12922B)
      1 #include "cursor.h"
      2 #include "nostr_bech32.h"
      3 #include "block.h"
      4 #include "nostrdb.h"
      5 #include "invoice.h"
      6 
      7 #ifndef _WIN32
      8 #include "bolt11/bolt11.h"
      9 #endif
     10 
     11 #include "bolt11/bech32.h"
     12 
     13 #include <stdlib.h>
     14 #include <string.h>
     15 
     16 #include "cursor.h"
     17 
     18 struct ndb_content_parser {
     19 	int bech32_strs;
     20 	struct cursor buffer;
     21 	struct cursor content;
     22 	struct ndb_blocks *blocks;
     23 };
     24 
     25 static int parse_digit(struct cursor *cur, int *digit) {
     26 	int c;
     27 	if ((c = peek_char(cur, 0)) == -1) {
     28 		*digit = 0;
     29 		return 0;
     30 	}
     31 	
     32 	c -= '0';
     33 	
     34 	if (c >= 0 && c <= 9) {
     35 		*digit = c;
     36 		cur->p++;
     37 		return 1;
     38 	}
     39 	return 0;
     40 }
     41 
     42 
     43 static int parse_mention_index(struct cursor *cur, struct ndb_block *block) {
     44 	int d1, d2, d3, ind;
     45 	unsigned char *start = cur->p;
     46 	
     47 	if (!parse_str(cur, "#["))
     48 		return 0;
     49 	
     50 	if (!parse_digit(cur, &d1)) {
     51 		cur->p = start;
     52 		return 0;
     53 	}
     54 	
     55 	ind = d1;
     56 	
     57 	if (parse_digit(cur, &d2))
     58 		ind = (d1 * 10) + d2;
     59 	
     60 	if (parse_digit(cur, &d3))
     61 		ind = (d1 * 100) + (d2 * 10) + d3;
     62 	
     63 	if (!parse_char(cur, ']')) {
     64 		cur->p = start;
     65 		return 0;
     66 	}
     67 	
     68 	block->type = BLOCK_MENTION_INDEX;
     69 	block->block.mention_index = ind;
     70 	
     71 	return 1;
     72 }
     73 
     74 static int parse_hashtag(struct cursor *cur, struct ndb_block *block) {
     75 	int c;
     76 	unsigned char *start = cur->p;
     77 	
     78 	if (!parse_char(cur, '#'))
     79 		return 0;
     80 	
     81 	c = peek_char(cur, 0);
     82 	if (c == -1 || is_whitespace(c) || c == '#') {
     83 		cur->p = start;
     84 		return 0;
     85 	}
     86 	
     87 	consume_until_boundary(cur);
     88 	
     89 	block->type = BLOCK_HASHTAG;
     90 	block->block.str.str = (const char*)(start + 1);
     91 	block->block.str.len = cur->p - (start + 1);
     92 	
     93 	return 1;
     94 }
     95 
     96 //
     97 // decode and push a bech32 mention into our blocks output buffer.
     98 //
     99 // bech32 blocks are stored as:
    100 //
    101 //     bech32_buffer_size : u16
    102 //     nostr_bech32_type  : varint
    103 //     bech32_data        : [u8]
    104 //
    105 // The TLV form is compact already, so we just use it directly
    106 //
    107 // This allows us to not duplicate all of the TLV encoding and decoding code
    108 // for our on-disk nostrdb format.
    109 //
    110 static int push_bech32_mention(struct ndb_content_parser *p, struct ndb_str_block *bech32)
    111 {
    112 	// we decode the raw bech32 directly into the output buffer
    113 	struct cursor u8, u5;
    114 	unsigned char *start;
    115 	uint16_t *u8_size;
    116 	enum nostr_bech32_type type;
    117 	size_t u5_out_len, u8_out_len;
    118 	static const int MAX_PREFIX = 8;
    119 	char prefix[9] = {0};
    120 
    121 	start = p->buffer.p;
    122 
    123 	if (!parse_nostr_bech32_type(bech32->str, &type))
    124 		goto fail;
    125 
    126 	// make sure to push the str block!
    127 	if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32))
    128 		goto fail;
    129 	//
    130 	// save a spot for the raw bech32 buffer size
    131 	u8_size = (uint16_t*)p->buffer.p;
    132 	if (!cursor_skip(&p->buffer, 2))
    133 		goto fail;
    134 
    135 	if (!cursor_push_varint(&p->buffer, type))
    136 		goto fail;
    137 
    138 	if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len))
    139 		goto fail;
    140 
    141 	if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len))
    142 		goto fail;
    143 	
    144 	if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str,
    145 			      bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) {
    146 		goto fail;
    147 	}
    148 
    149 	u5.p += u5_out_len;
    150 
    151 	if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0))
    152 		goto fail;
    153 
    154 	u8.p += u8_out_len;
    155 
    156 	// move the out cursor to the end of the 8-bit buffer
    157 	p->buffer.p = u8.p;
    158 
    159 	if (u8_out_len > UINT16_MAX)
    160 		goto fail;
    161 
    162 	// mark the size of the bech32 buffer
    163 	*u8_size = (uint16_t)u8_out_len;
    164 
    165 	return 1;
    166 
    167 fail:
    168 	p->buffer.p = start;
    169 	return 0;
    170 }
    171 
    172 static int push_invoice_str(struct ndb_content_parser *p, struct ndb_str_block *str)
    173 {
    174 #ifdef _WIN32
    175 	// we shouldn't be pushing invoices on windows until we fix
    176 	// bolt11 parser portability
    177 	return 0;
    178 #else
    179 	unsigned char *start;
    180 	struct bolt11 *bolt11;
    181 	char *fail;
    182 
    183 	if (!(bolt11 = bolt11_decode_minimal(NULL, str->str, &fail)))
    184 		return 0;
    185 
    186 	start = p->buffer.p;
    187 
    188 	// push the text block just incase we don't care for the invoice
    189 	if (!push_str_block(&p->buffer, (const char*)p->content.start, str))
    190 		return 0;
    191 
    192 	// push decoded invoice data for quick access
    193 	if (!ndb_encode_invoice(&p->buffer, bolt11)) {
    194 		p->buffer.p = start;
    195 		tal_free(bolt11);
    196 		return 0;
    197 	}
    198 
    199 	tal_free(bolt11);
    200 	return 1;
    201 #endif
    202 }
    203 
    204 int push_block(struct ndb_content_parser *p, struct ndb_block *block);
    205 static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end)
    206 {
    207 	struct ndb_block b;
    208 	
    209 	if (start == end)
    210 		return 1;
    211 	
    212 	b.type = BLOCK_TEXT;
    213 	b.block.str.str = start;
    214 	b.block.str.len = end - start;
    215 	
    216 	return push_block(p, &b);
    217 }
    218 
    219 
    220 int push_block(struct ndb_content_parser *p, struct ndb_block *block)
    221 {
    222 	unsigned char *start = p->buffer.p;
    223 
    224 	// push the tag
    225 	if (!cursor_push_varint(&p->buffer, block->type))
    226 		return 0;
    227 
    228 	switch (block->type) {
    229 	case BLOCK_HASHTAG:
    230 	case BLOCK_TEXT:
    231 	case BLOCK_URL:
    232 		if (!push_str_block(&p->buffer, (const char*)p->content.start,
    233 			       &block->block.str))
    234 			goto fail;
    235 		break;
    236 
    237 	case BLOCK_MENTION_INDEX:
    238 		if (!cursor_push_varint(&p->buffer, block->block.mention_index))
    239 			goto fail;
    240 		break;
    241 	case BLOCK_MENTION_BECH32:
    242 		// we only push bech32 strs here
    243 		if (!push_bech32_mention(p, &block->block.str)) {
    244 			// if we fail for some reason, try pushing just a text block
    245 			p->buffer.p = start;
    246 			if (!add_text_block(p, block->block.str.str,
    247 					       block->block.str.str +
    248 					       block->block.str.len)) {
    249 				goto fail;
    250 			}
    251 		}
    252 		break;
    253 
    254 	case BLOCK_INVOICE:
    255 		// we only push invoice strs here
    256 		if (!push_invoice_str(p, &block->block.str)) {
    257 			// if we fail for some reason, try pushing just a text block
    258 			p->buffer.p = start;
    259 			if (!add_text_block(p, block->block.str.str,
    260 					    block->block.str.str + block->block.str.len)) {
    261 				goto fail;
    262 			}
    263 		}
    264 		break;
    265 	}
    266 
    267 	p->blocks->num_blocks++;
    268 
    269 	return 1;
    270 
    271 fail:
    272 	p->buffer.p = start;
    273 	return 0;
    274 }
    275 
    276 
    277 
    278 static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) {
    279 	unsigned char *next = cur + 1;
    280 
    281 	if (next > end)
    282 		return 0;
    283 
    284 	if (next == end)
    285 		return 1;
    286 
    287 	return is_whitespace(*next);
    288 }
    289 
    290 static inline int char_disallowed_at_end_url(char c)
    291 {
    292 	return c == '.' || c == ',';
    293  
    294 }
    295 
    296 static int is_final_url_char(unsigned char *cur, unsigned char *end) 
    297 {
    298 	if (is_whitespace(*cur))
    299 		return 1;
    300 
    301 	if (next_char_is_whitespace(cur, end)) {
    302 		// next char is whitespace so this char could be the final char in the url
    303 		return char_disallowed_at_end_url(*cur);
    304 	}
    305 
    306 	// next char isn't whitespace so it can't be a final char
    307 	return 0;
    308 }
    309 
    310 static int consume_until_end_url(struct cursor *cur, int or_end) {
    311 	unsigned char *start = cur->p;
    312 
    313 	while (cur->p < cur->end) {
    314 		if (is_final_url_char(cur->p, cur->end))
    315 			return cur->p != start;
    316 
    317 		cur->p++;
    318 	}
    319 
    320 	return or_end;
    321 }
    322 
    323 static int consume_url_fragment(struct cursor *cur)
    324 {
    325 	int c;
    326 
    327 	if ((c = peek_char(cur, 0)) < 0)
    328 		return 1;
    329 
    330 	if (c != '#' && c != '?') {
    331 		return 1;
    332 	}
    333 
    334 	cur->p++;
    335 
    336 	return consume_until_end_url(cur, 1);
    337 }
    338 
    339 static int consume_url_path(struct cursor *cur)
    340 {
    341 	int c;
    342 
    343 	if ((c = peek_char(cur, 0)) < 0)
    344 		return 1;
    345 
    346 	if (c != '/') {
    347 		return 1;
    348 	}
    349 
    350 	while (cur->p < cur->end) {
    351 		c = *cur->p;
    352 
    353 		if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) {
    354 			return 1;
    355 		}
    356 
    357 		cur->p++;
    358 	}
    359 
    360 	return 1;
    361 }
    362 
    363 static int consume_url_host(struct cursor *cur)
    364 {
    365 	char c;
    366 	int count = 0;
    367 
    368 	while (cur->p < cur->end) {
    369 		c = *cur->p;
    370 		// TODO: handle IDNs
    371 		if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end))
    372 		{
    373 			count++;
    374 			cur->p++;
    375 			continue;
    376 		}
    377 
    378 		return count != 0;
    379 	}
    380 
    381 
    382 	// this means the end of the URL hostname is the end of the buffer and we finished
    383 	return count != 0;
    384 }
    385 
    386 static int parse_url(struct cursor *cur, struct ndb_block *block) {
    387 	unsigned char *start = cur->p;
    388 	unsigned char *host;
    389 	unsigned char tmp[4096];
    390 	int host_len;
    391 	struct cursor path_cur, tmp_cur;
    392 	enum nostr_bech32_type type;
    393 	make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur);
    394 	
    395 	if (!parse_str(cur, "http"))
    396 		return 0;
    397 	
    398 	if (parse_char(cur, 's') || parse_char(cur, 'S')) {
    399 		if (!parse_str(cur, "://")) {
    400 			cur->p = start;
    401 			return 0;
    402 		}
    403 	} else {
    404 		if (!parse_str(cur, "://")) {
    405 			cur->p = start;
    406 			return 0;
    407 		}
    408 	}
    409 
    410 	// make sure to save the hostname. We will use this to detect damus.io links
    411 	host = cur->p;
    412 
    413 	if (!consume_url_host(cur)) {
    414 		cur->p = start;
    415 		return 0;
    416 	}
    417 
    418 	// get the length of the host string
    419 	host_len = (int)(cur->p - host);
    420 
    421 	// save the current parse state so that we can continue from here when
    422 	// parsing the bech32 in the damus.io link if we have it
    423 	copy_cursor(cur, &path_cur);
    424 
    425 	// skip leading /
    426 	cursor_skip(&path_cur, 1);
    427 
    428 	if (!consume_url_path(cur)) {
    429 		cur->p = start;
    430 		return 0;
    431 	}
    432 
    433 	if (!consume_url_fragment(cur)) {
    434 		cur->p = start;
    435 		return 0;
    436 	}
    437 
    438 	// smart parens
    439 	if ((start - 1) >= cur->start &&
    440 		start < cur->end &&
    441 		*(start - 1) == '(' &&
    442 		(cur->p - 1) < cur->end &&
    443 		*(cur->p - 1) == ')')
    444 	{
    445 		cur->p--;
    446 	}
    447 
    448 	// save the bech32 string pos in case we hit a damus.io link
    449 	block->block.str.str = (const char *)path_cur.p;
    450 
    451 	// if we have a damus link, make it a mention
    452 	if (host_len == 8
    453 	&& !strncmp((const char *)host, "damus.io", 8)
    454 	&& parse_nostr_bech32_str(&path_cur, &type))
    455 	{
    456 		block->block.str.len = path_cur.p - path_cur.start;
    457 		block->type = BLOCK_MENTION_BECH32;
    458 		return 1;
    459 	}
    460 
    461 	block->type = BLOCK_URL;
    462 	block->block.str.str = (const char *)start;
    463 	block->block.str.len = cur->p - start;
    464 	
    465 	return 1;
    466 }
    467 
    468 static int parse_invoice(struct cursor *cur, struct ndb_block *block) {
    469 	unsigned char *start, *end;
    470 
    471 #ifdef _WIN32
    472 	// bolt11 stuff requires non-portable cc stuff, so ignore for now
    473 	return 0;
    474 #else
    475 
    476 	// optional
    477 	parse_str(cur, "lightning:");
    478 	
    479 	start = cur->p;
    480 	
    481 	if (!parse_str(cur, "lnbc"))
    482 		return 0;
    483 	
    484 	if (!consume_until_whitespace(cur, 1)) {
    485 		cur->p = start;
    486 		return 0;
    487 	}
    488 	
    489 	end = cur->p;
    490 	
    491 	block->type = BLOCK_INVOICE;
    492 	
    493 	block->block.str.str = (const char*)start;
    494 	block->block.str.len = end - start;
    495 	
    496 	cur->p = end;
    497 	
    498 	return 1;
    499 #endif
    500 }
    501 
    502 
    503 static int parse_mention_bech32(struct cursor *cur, struct ndb_block *block) {
    504 	unsigned char *start = cur->p;
    505 	enum nostr_bech32_type type;
    506 	
    507 	parse_char(cur, '@');
    508 	parse_str(cur, "nostr:");
    509 
    510 	block->block.str.str = (const char *)cur->p;
    511 	
    512 	if (!parse_nostr_bech32_str(cur, &type)) {
    513 		cur->p = start;
    514 		return 0;
    515 	}
    516 	
    517 	block->block.str.len = cur->p - (unsigned char*)block->block.str.str;
    518 	block->type = BLOCK_MENTION_BECH32;
    519 
    520 	return 1;
    521 }
    522 
    523 static int add_text_then_block(struct ndb_content_parser *p,
    524 			       struct ndb_block *block,
    525 			       unsigned char **start,
    526 			       const unsigned char *pre_mention)
    527 {
    528 	if (!add_text_block(p, (const char *)*start, (const char*)pre_mention))
    529 		return 0;
    530 	
    531 	*start = (unsigned char*)p->content.p;
    532 	
    533 	return push_block(p, block);
    534 }
    535 
    536 int ndb_parse_content(unsigned char *buf, int buf_size,
    537 		      const char *content, int content_len,
    538 		      struct ndb_blocks **blocks_p)
    539 {
    540 	int cp, c;
    541 	struct ndb_content_parser parser;
    542 	struct ndb_block block;
    543 
    544 	unsigned char *start, *pre_mention, *blocks_start;
    545 	
    546 	make_cursor(buf, buf + buf_size, &parser.buffer);
    547 
    548 	// allocate some space for the blocks header
    549 	*blocks_p = parser.blocks = (struct ndb_blocks *)buf;
    550 	parser.buffer.p += sizeof(struct ndb_blocks);
    551 
    552 	make_cursor((unsigned char *)content,
    553 		    (unsigned char*)content + content_len, &parser.content);
    554 
    555 	parser.blocks->words = 0;
    556 	parser.blocks->num_blocks = 0;
    557 	parser.blocks->blocks_size = 0;
    558 	parser.blocks->flags = 0;
    559 	parser.blocks->version = 1;
    560 
    561 	blocks_start = start = parser.content.p;
    562 	while (parser.content.p < parser.content.end) {
    563 		cp = peek_char(&parser.content, -1);
    564 		c  = peek_char(&parser.content, 0);
    565 		
    566 		// new word
    567 		if (is_whitespace(cp) && !is_whitespace(c))
    568 			parser.blocks->words++;
    569 		
    570 		pre_mention = parser.content.p;
    571 		if (cp == -1 || is_left_boundary(cp) || c == '#') {
    572 			if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) {
    573 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    574 					return 0;
    575 				continue;
    576 			} else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) {
    577 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    578 					return 0;
    579 				continue;
    580 			} else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) {
    581 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    582 					return 0;
    583 				continue;
    584 			} else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) {
    585 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    586 					return 0;
    587 				continue;
    588 			}
    589 		}
    590 		
    591 		parser.content.p++;
    592 	}
    593 	
    594 	if (parser.content.p - start > 0) {
    595 		if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p))
    596 			return 0;
    597 	}
    598 
    599 	parser.blocks->blocks_size = parser.buffer.p - blocks_start;
    600 
    601 	//
    602 	// pad to 8-byte alignment
    603 	//
    604 	if (!cursor_align(&parser.buffer, 8))
    605 		return 0;
    606 	assert((parser.buffer.p - parser.buffer.start) % 8 == 0);
    607 	parser.blocks->total_size = parser.buffer.p - parser.buffer.start;
    608 
    609 	return 1;
    610 }
    611
	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log \| Files \| Refs \| Submodules \| README \| LICENSE