nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

content_parser.c (12643B)


      1 #include "cursor.h"
      2 #include "nostr_bech32.h"
      3 #include "block.h"
      4 #include "nostrdb.h"
      5 #include "invoice.h"
      6 #include "bolt11/bolt11.h"
      7 #include "bolt11/bech32.h"
      8 #include <stdlib.h>
      9 #include <string.h>
     10 
     11 #include "cursor.h"
     12 
     13 struct ndb_content_parser {
     14 	int bech32_strs;
     15 	struct cursor buffer;
     16 	struct cursor content;
     17 	struct ndb_blocks *blocks;
     18 };
     19 
     20 static int parse_digit(struct cursor *cur, int *digit) {
     21 	int c;
     22 	if ((c = peek_char(cur, 0)) == -1)
     23 		return 0;
     24 	
     25 	c -= '0';
     26 	
     27 	if (c >= 0 && c <= 9) {
     28 		*digit = c;
     29 		cur->p++;
     30 		return 1;
     31 	}
     32 	return 0;
     33 }
     34 
     35 
     36 static int parse_mention_index(struct cursor *cur, struct ndb_block *block) {
     37 	int d1, d2, d3, ind;
     38 	unsigned char *start = cur->p;
     39 	
     40 	if (!parse_str(cur, "#["))
     41 		return 0;
     42 	
     43 	if (!parse_digit(cur, &d1)) {
     44 		cur->p = start;
     45 		return 0;
     46 	}
     47 	
     48 	ind = d1;
     49 	
     50 	if (parse_digit(cur, &d2))
     51 		ind = (d1 * 10) + d2;
     52 	
     53 	if (parse_digit(cur, &d3))
     54 		ind = (d1 * 100) + (d2 * 10) + d3;
     55 	
     56 	if (!parse_char(cur, ']')) {
     57 		cur->p = start;
     58 		return 0;
     59 	}
     60 	
     61 	block->type = BLOCK_MENTION_INDEX;
     62 	block->block.mention_index = ind;
     63 	
     64 	return 1;
     65 }
     66 
     67 static int parse_hashtag(struct cursor *cur, struct ndb_block *block) {
     68 	int c;
     69 	unsigned char *start = cur->p;
     70 	
     71 	if (!parse_char(cur, '#'))
     72 		return 0;
     73 	
     74 	c = peek_char(cur, 0);
     75 	if (c == -1 || is_whitespace(c) || c == '#') {
     76 		cur->p = start;
     77 		return 0;
     78 	}
     79 	
     80 	consume_until_boundary(cur);
     81 	
     82 	block->type = BLOCK_HASHTAG;
     83 	block->block.str.str = (const char*)(start + 1);
     84 	block->block.str.len = cur->p - (start + 1);
     85 	
     86 	return 1;
     87 }
     88 
     89 //
     90 // decode and push a bech32 mention into our blocks output buffer.
     91 //
     92 // bech32 blocks are stored as:
     93 //
     94 //     bech32_buffer_size : u16
     95 //     nostr_bech32_type  : varint
     96 //     bech32_data        : [u8]
     97 //
     98 // The TLV form is compact already, so we just use it directly
     99 //
    100 // This allows us to not duplicate all of the TLV encoding and decoding code
    101 // for our on-disk nostrdb format.
    102 //
    103 static int push_bech32_mention(struct ndb_content_parser *p, struct ndb_str_block *bech32)
    104 {
    105 	// we decode the raw bech32 directly into the output buffer
    106 	struct cursor u8, u5;
    107 	unsigned char *start;
    108 	uint16_t *u8_size;
    109 	enum nostr_bech32_type type;
    110 	size_t u5_out_len, u8_out_len;
    111 	static const int MAX_PREFIX = 8;
    112 	char prefix[9] = {0};
    113 
    114 	start = p->buffer.p;
    115 
    116 	if (!parse_nostr_bech32_type(bech32->str, &type))
    117 		goto fail;
    118 
    119 	// make sure to push the str block!
    120 	if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32))
    121 		goto fail;
    122 	//
    123 	// save a spot for the raw bech32 buffer size
    124 	u8_size = (uint16_t*)p->buffer.p;
    125 	if (!cursor_skip(&p->buffer, 2))
    126 		goto fail;
    127 
    128 	if (!cursor_push_varint(&p->buffer, type))
    129 		goto fail;
    130 
    131 	if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len))
    132 		goto fail;
    133 
    134 	if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len))
    135 		goto fail;
    136 	
    137 	if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str,
    138 			      bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) {
    139 		goto fail;
    140 	}
    141 
    142 	u5.p += u5_out_len;
    143 
    144 	if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0))
    145 		goto fail;
    146 
    147 	u8.p += u8_out_len;
    148 
    149 	// move the out cursor to the end of the 8-bit buffer
    150 	p->buffer.p = u8.p;
    151 
    152 	if (u8_out_len > UINT16_MAX)
    153 		goto fail;
    154 
    155 	// mark the size of the bech32 buffer
    156 	*u8_size = (uint16_t)u8_out_len;
    157 
    158 	return 1;
    159 
    160 fail:
    161 	p->buffer.p = start;
    162 	return 0;
    163 }
    164 
    165 static int push_invoice_str(struct ndb_content_parser *p, struct ndb_str_block *str)
    166 {
    167 	unsigned char *start;
    168 	struct bolt11 *bolt11;
    169 	char *fail;
    170 
    171 	if (!(bolt11 = bolt11_decode_minimal(NULL, str->str, &fail)))
    172 		return 0;
    173 
    174 	start = p->buffer.p;
    175 
    176 	// push the text block just incase we don't care for the invoice
    177 	if (!push_str_block(&p->buffer, (const char*)p->content.start, str))
    178 		return 0;
    179 
    180 	// push decoded invoice data for quick access
    181 	if (!ndb_encode_invoice(&p->buffer, bolt11)) {
    182 		p->buffer.p = start;
    183 		tal_free(bolt11);
    184 		return 0;
    185 	}
    186 
    187 	tal_free(bolt11);
    188 	return 1;
    189 }
    190 
    191 int push_block(struct ndb_content_parser *p, struct ndb_block *block);
    192 static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end)
    193 {
    194 	struct ndb_block b;
    195 	
    196 	if (start == end)
    197 		return 1;
    198 	
    199 	b.type = BLOCK_TEXT;
    200 	b.block.str.str = start;
    201 	b.block.str.len = end - start;
    202 	
    203 	return push_block(p, &b);
    204 }
    205 
    206 
    207 int push_block(struct ndb_content_parser *p, struct ndb_block *block)
    208 {
    209 	unsigned char *start = p->buffer.p;
    210 
    211 	// push the tag
    212 	if (!cursor_push_varint(&p->buffer, block->type))
    213 		return 0;
    214 
    215 	switch (block->type) {
    216 	case BLOCK_HASHTAG:
    217 	case BLOCK_TEXT:
    218 	case BLOCK_URL:
    219 		if (!push_str_block(&p->buffer, (const char*)p->content.start,
    220 			       &block->block.str))
    221 			goto fail;
    222 		break;
    223 
    224 	case BLOCK_MENTION_INDEX:
    225 		if (!cursor_push_varint(&p->buffer, block->block.mention_index))
    226 			goto fail;
    227 		break;
    228 	case BLOCK_MENTION_BECH32:
    229 		// we only push bech32 strs here
    230 		if (!push_bech32_mention(p, &block->block.str)) {
    231 			// if we fail for some reason, try pushing just a text block
    232 			p->buffer.p = start;
    233 			if (!add_text_block(p, block->block.str.str,
    234 					       block->block.str.str +
    235 					       block->block.str.len)) {
    236 				goto fail;
    237 			}
    238 		}
    239 		break;
    240 
    241 	case BLOCK_INVOICE:
    242 		// we only push invoice strs here
    243 		if (!push_invoice_str(p, &block->block.str)) {
    244 			// if we fail for some reason, try pushing just a text block
    245 			p->buffer.p = start;
    246 			if (!add_text_block(p, block->block.str.str,
    247 					    block->block.str.str + block->block.str.len)) {
    248 				goto fail;
    249 			}
    250 		}
    251 		break;
    252 	}
    253 
    254 	p->blocks->num_blocks++;
    255 
    256 	return 1;
    257 
    258 fail:
    259 	p->buffer.p = start;
    260 	return 0;
    261 }
    262 
    263 
    264 
    265 static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) {
    266 	unsigned char *next = cur + 1;
    267 
    268 	if (next > end)
    269 		return 0;
    270 
    271 	if (next == end)
    272 		return 1;
    273 
    274 	return is_whitespace(*next);
    275 }
    276 
    277 static inline int char_disallowed_at_end_url(char c)
    278 {
    279 	return c == '.' || c == ',';
    280  
    281 }
    282 
    283 static int is_final_url_char(unsigned char *cur, unsigned char *end) 
    284 {
    285 	if (is_whitespace(*cur))
    286 		return 1;
    287 
    288 	if (next_char_is_whitespace(cur, end)) {
    289 		// next char is whitespace so this char could be the final char in the url
    290 		return char_disallowed_at_end_url(*cur);
    291 	}
    292 
    293 	// next char isn't whitespace so it can't be a final char
    294 	return 0;
    295 }
    296 
    297 static int consume_until_end_url(struct cursor *cur, int or_end) {
    298 	unsigned char *start = cur->p;
    299 
    300 	while (cur->p < cur->end) {
    301 		if (is_final_url_char(cur->p, cur->end))
    302 			return cur->p != start;
    303 
    304 		cur->p++;
    305 	}
    306 
    307 	return or_end;
    308 }
    309 
    310 static int consume_url_fragment(struct cursor *cur)
    311 {
    312 	int c;
    313 
    314 	if ((c = peek_char(cur, 0)) < 0)
    315 		return 1;
    316 
    317 	if (c != '#' && c != '?') {
    318 		return 1;
    319 	}
    320 
    321 	cur->p++;
    322 
    323 	return consume_until_end_url(cur, 1);
    324 }
    325 
    326 static int consume_url_path(struct cursor *cur)
    327 {
    328 	int c;
    329 
    330 	if ((c = peek_char(cur, 0)) < 0)
    331 		return 1;
    332 
    333 	if (c != '/') {
    334 		return 1;
    335 	}
    336 
    337 	while (cur->p < cur->end) {
    338 		c = *cur->p;
    339 
    340 		if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) {
    341 			return 1;
    342 		}
    343 
    344 		cur->p++;
    345 	}
    346 
    347 	return 1;
    348 }
    349 
    350 static int consume_url_host(struct cursor *cur)
    351 {
    352 	char c;
    353 	int count = 0;
    354 
    355 	while (cur->p < cur->end) {
    356 		c = *cur->p;
    357 		// TODO: handle IDNs
    358 		if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end))
    359 		{
    360 			count++;
    361 			cur->p++;
    362 			continue;
    363 		}
    364 
    365 		return count != 0;
    366 	}
    367 
    368 
    369 	// this means the end of the URL hostname is the end of the buffer and we finished
    370 	return count != 0;
    371 }
    372 
    373 static int parse_url(struct cursor *cur, struct ndb_block *block) {
    374 	unsigned char *start = cur->p;
    375 	unsigned char *host;
    376 	unsigned char tmp[4096];
    377 	int host_len;
    378 	struct cursor path_cur, tmp_cur;
    379 	enum nostr_bech32_type type;
    380 	make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur);
    381 	
    382 	if (!parse_str(cur, "http"))
    383 		return 0;
    384 	
    385 	if (parse_char(cur, 's') || parse_char(cur, 'S')) {
    386 		if (!parse_str(cur, "://")) {
    387 			cur->p = start;
    388 			return 0;
    389 		}
    390 	} else {
    391 		if (!parse_str(cur, "://")) {
    392 			cur->p = start;
    393 			return 0;
    394 		}
    395 	}
    396 
    397 	// make sure to save the hostname. We will use this to detect damus.io links
    398 	host = cur->p;
    399 
    400 	if (!consume_url_host(cur)) {
    401 		cur->p = start;
    402 		return 0;
    403 	}
    404 
    405 	// get the length of the host string
    406 	host_len = (int)(cur->p - host);
    407 
    408 	// save the current parse state so that we can continue from here when
    409 	// parsing the bech32 in the damus.io link if we have it
    410 	copy_cursor(cur, &path_cur);
    411 
    412 	// skip leading /
    413 	cursor_skip(&path_cur, 1);
    414 
    415 	if (!consume_url_path(cur)) {
    416 		cur->p = start;
    417 		return 0;
    418 	}
    419 
    420 	if (!consume_url_fragment(cur)) {
    421 		cur->p = start;
    422 		return 0;
    423 	}
    424 
    425 	// smart parens
    426 	if ((start - 1) >= cur->start &&
    427 		start < cur->end &&
    428 		*(start - 1) == '(' &&
    429 		(cur->p - 1) < cur->end &&
    430 		*(cur->p - 1) == ')')
    431 	{
    432 		cur->p--;
    433 	}
    434 
    435 	// save the bech32 string pos in case we hit a damus.io link
    436 	block->block.str.str = (const char *)path_cur.p;
    437 
    438 	// if we have a damus link, make it a mention
    439 	if (host_len == 8
    440 	&& !strncmp((const char *)host, "damus.io", 8)
    441 	&& parse_nostr_bech32_str(&path_cur, &type))
    442 	{
    443 		block->block.str.len = path_cur.p - path_cur.start;
    444 		block->type = BLOCK_MENTION_BECH32;
    445 		return 1;
    446 	}
    447 
    448 	block->type = BLOCK_URL;
    449 	block->block.str.str = (const char *)start;
    450 	block->block.str.len = cur->p - start;
    451 	
    452 	return 1;
    453 }
    454 
    455 static int parse_invoice(struct cursor *cur, struct ndb_block *block) {
    456 	unsigned char *start, *end;
    457 
    458 	// optional
    459 	parse_str(cur, "lightning:");
    460 	
    461 	start = cur->p;
    462 	
    463 	if (!parse_str(cur, "lnbc"))
    464 		return 0;
    465 	
    466 	if (!consume_until_whitespace(cur, 1)) {
    467 		cur->p = start;
    468 		return 0;
    469 	}
    470 	
    471 	end = cur->p;
    472 	
    473 	block->type = BLOCK_INVOICE;
    474 	
    475 	block->block.str.str = (const char*)start;
    476 	block->block.str.len = end - start;
    477 	
    478 	cur->p = end;
    479 	
    480 	return 1;
    481 }
    482 
    483 
    484 static int parse_mention_bech32(struct cursor *cur, struct ndb_block *block) {
    485 	unsigned char *start = cur->p;
    486 	enum nostr_bech32_type type;
    487 	
    488 	parse_char(cur, '@');
    489 	parse_str(cur, "nostr:");
    490 
    491 	block->block.str.str = (const char *)cur->p;
    492 	
    493 	if (!parse_nostr_bech32_str(cur, &type)) {
    494 		cur->p = start;
    495 		return 0;
    496 	}
    497 	
    498 	block->block.str.len = cur->p - (unsigned char*)block->block.str.str;
    499 	block->type = BLOCK_MENTION_BECH32;
    500 
    501 	return 1;
    502 }
    503 
    504 static int add_text_then_block(struct ndb_content_parser *p,
    505 			       struct ndb_block *block,
    506 			       unsigned char **start,
    507 			       const unsigned char *pre_mention)
    508 {
    509 	if (!add_text_block(p, (const char *)*start, (const char*)pre_mention))
    510 		return 0;
    511 	
    512 	*start = (unsigned char*)p->content.p;
    513 	
    514 	return push_block(p, block);
    515 }
    516 
    517 int ndb_parse_content(unsigned char *buf, int buf_size,
    518 		      const char *content, int content_len,
    519 		      struct ndb_blocks **blocks_p)
    520 {
    521 	int cp, c;
    522 	struct ndb_content_parser parser;
    523 	struct ndb_block block;
    524 
    525 	unsigned char *start, *pre_mention, *blocks_start;
    526 	
    527 	make_cursor(buf, buf + buf_size, &parser.buffer);
    528 
    529 	// allocate some space for the blocks header
    530 	*blocks_p = parser.blocks = (struct ndb_blocks *)buf;
    531 	parser.buffer.p += sizeof(struct ndb_blocks);
    532 
    533 	make_cursor((unsigned char *)content,
    534 		    (unsigned char*)content + content_len, &parser.content);
    535 
    536 	parser.blocks->words = 0;
    537 	parser.blocks->num_blocks = 0;
    538 	parser.blocks->blocks_size = 0;
    539 	parser.blocks->flags = 0;
    540 	parser.blocks->version = 1;
    541 
    542 	blocks_start = start = parser.content.p;
    543 	while (parser.content.p < parser.content.end) {
    544 		cp = peek_char(&parser.content, -1);
    545 		c  = peek_char(&parser.content, 0);
    546 		
    547 		// new word
    548 		if (is_whitespace(cp) && !is_whitespace(c))
    549 			parser.blocks->words++;
    550 		
    551 		pre_mention = parser.content.p;
    552 		if (cp == -1 || is_left_boundary(cp) || c == '#') {
    553 			if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) {
    554 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    555 					return 0;
    556 				continue;
    557 			} else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) {
    558 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    559 					return 0;
    560 				continue;
    561 			} else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) {
    562 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    563 					return 0;
    564 				continue;
    565 			} else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) {
    566 				if (!add_text_then_block(&parser, &block, &start, pre_mention))
    567 					return 0;
    568 				continue;
    569 			}
    570 		}
    571 		
    572 		parser.content.p++;
    573 	}
    574 	
    575 	if (parser.content.p - start > 0) {
    576 		if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p))
    577 			return 0;
    578 	}
    579 
    580 	parser.blocks->blocks_size = parser.buffer.p - blocks_start;
    581 
    582 	//
    583 	// pad to 8-byte alignment
    584 	//
    585 	if (!cursor_align(&parser.buffer, 8))
    586 		return 0;
    587 	assert((parser.buffer.p - parser.buffer.start) % 8 == 0);
    588 	parser.blocks->total_size = parser.buffer.p - parser.buffer.start;
    589 
    590 	return 1;
    591 }
    592