nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

commit bbf04f50bb7f7fc3d0caabc394f2f79773f64392
parent 02a405e873ca5409a238fa9934285708a560b06a
Author: William Casarin <jb55@jb55.com>
Date:   Fri, 22 Dec 2023 16:56:53 -0800

add initial content parser

Diffstat:
A src/content_parser.c | 388 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 388 insertions(+), 0 deletions(-)

diff --git a/src/content_parser.c b/src/content_parser.c @@ -0,0 +1,388 @@ +#include "damus.h" +#include "cursor.h" +#include "bolt11.h" +#include "bech32.h" +#include <stdlib.h> +#include <string.h> + +#include "cursor.h" + +static int parse_digit(struct cursor *cur, int *digit) { + int c; + if ((c = peek_char(cur, 0)) == -1) + return 0; + + c -= '0'; + + if (c >= 0 && c <= 9) { + *digit = c; + cur->p++; + return 1; + } + return 0; +} + + +static int parse_mention_index(struct cursor *cur, struct note_block *block) { + int d1, d2, d3, ind; + u8 *start = cur->p; + + if (!parse_str(cur, "#[")) + return 0; + + if (!parse_digit(cur, &d1)) { + cur->p = start; + return 0; + } + + ind = d1; + + if (parse_digit(cur, &d2)) + ind = (d1 * 10) + d2; + + if (parse_digit(cur, &d3)) + ind = (d1 * 100) + (d2 * 10) + d3; + + if (!parse_char(cur, ']')) { + cur->p = start; + return 0; + } + + block->type = BLOCK_MENTION_INDEX; + block->block.mention_index = ind; + + return 1; +} + +static int parse_hashtag(struct cursor *cur, struct note_block *block) { + int c; + u8 *start = cur->p; + + if (!parse_char(cur, '#')) + return 0; + + c = peek_char(cur, 0); + if (c == -1 || is_whitespace(c) || c == '#') { + cur->p = start; + return 0; + } + + consume_until_boundary(cur); + + block->type = BLOCK_HASHTAG; + block->block.str.start = (const char*)(start + 1); + block->block.str.end = (const char*)cur->p; + + return 1; +} + +static int add_block(struct note_blocks *blocks, struct note_block block) +{ + if (blocks->num_blocks + 1 >= MAX_BLOCKS) + return 0; + + blocks->blocks[blocks->num_blocks++] = block; + return 1; +} + +static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8 *end) +{ + struct note_block b; + + if (start == end) + return 1; + + b.type = BLOCK_TEXT; + b.block.str.start = (const char*)start; + b.block.str.end = (const char*)end; + + return add_block(blocks, b); +} + +static int consume_url_fragment(struct cursor *cur) +{ + int c; + + if ((c = 
peek_char(cur, 0)) < 0) + return 1; + + if (c != '#' && c != '?') { + return 1; + } + + cur->p++; + + return consume_until_whitespace(cur, 1); +} + +static int consume_url_path(struct cursor *cur) +{ + int c; + + if ((c = peek_char(cur, 0)) < 0) + return 1; + + if (c != '/') { + return 1; + } + + while (cur->p < cur->end) { + c = *cur->p; + + if (c == '?' || c == '#' || is_whitespace(c)) { + return 1; + } + + cur->p++; + } + + return 1; +} + +static int consume_url_host(struct cursor *cur) +{ + char c; + int count = 0; + + while (cur->p < cur->end) { + c = *cur->p; + // TODO: handle IDNs + if (is_alphanumeric(c) || c == '.' || c == '-') + { + count++; + cur->p++; + continue; + } + + return count != 0; + } + + + // this means the end of the URL hostname is the end of the buffer and we finished + return count != 0; +} + +static int parse_url(struct cursor *cur, struct note_block *block) { + u8 *start = cur->p; + u8 *host; + int host_len; + struct cursor path_cur; + + if (!parse_str(cur, "http")) + return 0; + + if (parse_char(cur, 's') || parse_char(cur, 'S')) { + if (!parse_str(cur, "://")) { + cur->p = start; + return 0; + } + } else { + if (!parse_str(cur, "://")) { + cur->p = start; + return 0; + } + } + + // make sure to save the hostname. 
We will use this to detect damus.io links + host = cur->p; + + if (!consume_url_host(cur)) { + cur->p = start; + return 0; + } + + // get the length of the host string + host_len = (int)(cur->p - host); + + // save the current parse state so that we can continue from here when + // parsing the bech32 in the damus.io link if we have it + copy_cursor(cur, &path_cur); + + // skip leading / + cursor_skip(&path_cur, 1); + + if (!consume_url_path(cur)) { + cur->p = start; + return 0; + } + + if (!consume_url_fragment(cur)) { + cur->p = start; + return 0; + } + + // smart parens + if (start - 1 >= 0 && + start < cur->end && + *(start - 1) == '(' && + (cur->p - 1) < cur->end && + *(cur->p - 1) == ')') + { + cur->p--; + } + + // save the bech32 string pos in case we hit a damus.io link + block->block.str.start = (const char *)path_cur.p; + + // if we have a damus link, make it a mention + if (host_len == 8 + && !strncmp((const char *)host, "damus.io", 8) + && parse_nostr_bech32(&path_cur, &block->block.mention_bech32.bech32)) + { + block->block.str.end = (const char *)path_cur.p; + block->type = BLOCK_MENTION_BECH32; + return 1; + } + + block->type = BLOCK_URL; + block->block.str.start = (const char *)start; + block->block.str.end = (const char *)cur->p; + + return 1; +} + +static int parse_invoice(struct cursor *cur, struct note_block *block) { + u8 *start, *end; + char *fail; + struct bolt11 *bolt11; + // optional + parse_str(cur, "lightning:"); + + start = cur->p; + + if (!parse_str(cur, "lnbc")) + return 0; + + if (!consume_until_whitespace(cur, 1)) { + cur->p = start; + return 0; + } + + end = cur->p; + + char str[end - start + 1]; + str[end - start] = 0; + memcpy(str, start, end - start); + + if (!(bolt11 = bolt11_decode(NULL, str, &fail))) { + cur->p = start; + return 0; + } + + block->type = BLOCK_INVOICE; + + block->block.invoice.invstr.start = (const char*)start; + block->block.invoice.invstr.end = (const char*)end; + block->block.invoice.bolt11 = bolt11; + + 
cur->p = end; + + return 1; +} + + +static int parse_mention_bech32(struct cursor *cur, struct note_block *block) { + u8 *start = cur->p; + + parse_char(cur, '@'); + parse_str(cur, "nostr:"); + + block->block.str.start = (const char *)cur->p; + + if (!parse_nostr_bech32(cur, &block->block.mention_bech32.bech32)) { + cur->p = start; + return 0; + } + + block->block.str.end = (const char *)cur->p; + + block->type = BLOCK_MENTION_BECH32; + + return 1; +} + +static int add_text_then_block(struct cursor *cur, struct note_blocks *blocks, struct note_block block, u8 **start, const u8 *pre_mention) +{ + if (!add_text_block(blocks, *start, pre_mention)) + return 0; + + *start = (u8*)cur->p; + + if (!add_block(blocks, block)) + return 0; + + return 1; +} + +int ndb_parse_content(struct note_blocks *blocks, const char *content) { + int cp, c; + struct cursor cur; + struct note_block block; + u8 *start, *pre_mention; + + blocks->words = 0; + blocks->num_blocks = 0; + make_cursor((u8*)content, (u8*)content + strlen(content), &cur); + + start = cur.p; + while (cur.p < cur.end && blocks->num_blocks < MAX_BLOCKS) { + cp = peek_char(&cur, -1); + c = peek_char(&cur, 0); + + // new word + if (is_whitespace(cp) && !is_whitespace(c)) { + blocks->words++; + } + + pre_mention = cur.p; + if (cp == -1 || is_left_boundary(cp) || c == '#') { + if (c == '#' && (parse_mention_index(&cur, &block) || parse_hashtag(&cur, &block))) { + if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + return 0; + continue; + } else if ((c == 'h' || c == 'H') && parse_url(&cur, &block)) { + if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + return 0; + continue; + } else if ((c == 'l' || c == 'L') && parse_invoice(&cur, &block)) { + if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + return 0; + continue; + } else if ((c == 'n' || c == '@') && parse_mention_bech32(&cur, &block)) { + if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + return 
0; + continue; + } + } + + cur.p++; + } + + if (cur.p - start > 0) { + if (!add_text_block(blocks, start, cur.p)) + return 0; + } + + return 1; +} + +void blocks_init(struct note_blocks *blocks) { + blocks->blocks = malloc(sizeof(struct note_block) * MAX_BLOCKS); + blocks->num_blocks = 0; +} + +void blocks_free(struct note_blocks *blocks) { + if (!blocks->blocks) { + return; + } + + for (int i = 0; i < blocks->num_blocks; ++i) { + if (blocks->blocks[i].type == BLOCK_MENTION_BECH32) { + free(blocks->blocks[i].block.mention_bech32.bech32.buffer); + blocks->blocks[i].block.mention_bech32.bech32.buffer = NULL; + } + } + + free(blocks->blocks); + blocks->num_blocks = 0; +}