content_parser: add initial db decoders - nostrdb - an unfairly fast embedded nostr database backed by lmdb

commit dbb9cabf34f1f5dd58a4023e16a31929c0fa378e
parent d5dcaef57ca6f9bc0ab861aeecdb2e9e038816d9
Author: William Casarin <jb55@jb55.com>
Date:   Wed, 27 Dec 2023 14:55:17 -0800

content_parser: add initial db decoders

We need to pull the data out as well! Let's add some initial decoders.
We still need tests to make sure it's working.

Diffstat:
M Makefile  | 2 +-
M shell.nix  | 2 +-
M src/block.h  | 7 +++----
M src/content_parser.c  | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M src/cursor.h  | 5 +++++
M src/nostrdb.h  | 8 ++++++++

6 files changed, 167 insertions(+), 36 deletions(-)
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 CFLAGS = -Wall -Wno-misleading-indentation -Wno-unused-function -Werror -O2 -g -Isrc -Ideps/secp256k1/include -Ideps/lmdb -Ideps/flatcc/include
-HEADERS = deps/lmdb/lmdb.h deps/secp256k1/include/secp256k1.h src/sha256.h src/nostrdb.h src/cursor.h src/hex.h src/jsmn.h src/config.h src/sha256.h src/random.h src/memchr.h src/cpu.h $(C_BINDINGS) 
+HEADERS = deps/lmdb/lmdb.h deps/secp256k1/include/secp256k1.h src/sha256.h src/nostrdb.h src/cursor.h src/hex.h src/jsmn.h src/config.h src/sha256.h src/random.h src/memchr.h src/cpu.h src/nostr_bech32.h src/block.h src/str_block.h $(C_BINDINGS) 
 FLATCC_SRCS=deps/flatcc/src/runtime/json_parser.c deps/flatcc/src/runtime/verifier.c deps/flatcc/src/runtime/builder.c deps/flatcc/src/runtime/emitter.c deps/flatcc/src/runtime/refmap.c
 BOLT11_SRCS = src/bolt11/bolt11.c src/bolt11/bech32.c src/bolt11/tal.c src/bolt11/talstr.c src/bolt11/take.c src/bolt11/list.c src/bolt11/utf8.c src/bolt11/amount.c src/bolt11/hash_u5.c
 SRCS = src/nostrdb.c src/sha256.c src/invoice.c src/nostr_bech32.c src/content_parser.c $(BOLT11_SRCS) $(FLATCC_SRCS)
diff --git a/shell.nix b/shell.nix
@@ -1,7 +1,7 @@
 { pkgs ? import <nixpkgs> {} }:
 with pkgs;
 mkShell {
-  buildInputs = [ autoreconfHook flatbuffers flatcc pkg-config ];
+  buildInputs = [ autoreconfHook flatbuffers flatcc gdb pkg-config ];
 
   LIBCLANG_PATH="${llvmPackages.libclang}/lib";
 }
diff --git a/src/block.h b/src/block.h
@@ -38,19 +38,18 @@ struct ndb_mention_bech32_block {
 	struct nostr_bech32 bech32;
 };
 
-struct invoice_block {
+struct ndb_invoice_block {
 	struct str_block invstr;
 	struct ndb_invoice invoice;
-	struct bolt11 *bolt11;
 };
 
 struct note_block {
 	enum block_type type;
 	union {
 		struct str_block str;
-		struct invoice_block invoice;
+		struct ndb_invoice_block invoice;
 		struct ndb_mention_bech32_block mention_bech32;
-		int mention_index;
+		uint32_t mention_index;
 	} block;
 };
 
diff --git a/src/content_parser.c b/src/content_parser.c
@@ -103,7 +103,7 @@ static int pull_str_block(struct cursor *buf, const char *content, struct str_bl
 }
 
 //
-// decode and push a bech32 string into our blocks output buffer.
+// decode and push a bech32 mention into our blocks output buffer.
 //
 // bech32 blocks are stored as:
 //
@@ -116,7 +116,7 @@ static int pull_str_block(struct cursor *buf, const char *content, struct str_bl
 // This allows us to not duplicate all of the TLV encoding and decoding code
 // for our on-disk nostrdb format.
 //
-static int push_bech32_str(struct ndb_content_parser *p, struct str_block *bech32)
+static int push_bech32_mention(struct ndb_content_parser *p, struct str_block *bech32)
 {
 	// we decode the raw bech32 directly into the output buffer
 	struct cursor u8, u5;
@@ -132,6 +132,10 @@ static int push_bech32_str(struct ndb_content_parser *p, struct str_block *bech3
 	if (!parse_nostr_bech32_type(bech32->str, &type))
 		goto fail;
 
+	// make sure to push the str block!
+	if (!push_str_block(&p->buffer, (const char*)p->content.start, bech32))
+		goto fail;
+
 	if (!cursor_push_varint(&p->buffer, type))
 		goto fail;
 
@@ -174,7 +178,7 @@ fail:
 	return 0;
 }
 
-static int push_invoice_str(struct cursor *buf, struct str_block *str)
+static int push_invoice_str(struct ndb_content_parser *p, struct str_block *str)
 {
 	unsigned char *start;
 	struct bolt11 *bolt11;
@@ -183,9 +187,15 @@ static int push_invoice_str(struct cursor *buf, struct str_block *str)
 	if (!(bolt11 = bolt11_decode(NULL, str->str, &fail)))
 		return 0;
 
-	start = buf->p;
-	if (!ndb_encode_invoice(buf, bolt11)) {
-		buf->p = start;
+	start = p->buffer.p;
+
+	// push the text block just incase we don't care for the invoice
+	if (!push_str_block(&p->buffer, (const char*)p->content.start, str))
+		return 0;
+
+	// push decoded invoice data for quick access
+	if (!ndb_encode_invoice(&p->buffer, bolt11)) {
+		p->buffer.p = start;
 		tal_free(bolt11);
 		return 0;
 	}
@@ -194,45 +204,95 @@ static int push_invoice_str(struct cursor *buf, struct str_block *str)
 	return 1;
 }
 
-static int push_block(struct ndb_content_parser *p, struct note_block *block)
+static int pull_nostr_bech32_type(struct cursor *cur, enum nostr_bech32_type *type)
 {
-	// push the tag
-	if (!cursor_push_varint(&p->buffer, block->type))
+	uint64_t inttype;
+	if (!cursor_pull_varint(cur, &inttype))
+		return 0;
+
+	if (inttype > NOSTR_BECH32_KNOWN_TYPES)
+		return 0;
+
+	*type = inttype;
+	return 1;
+}
+
+static int pull_bech32_mention(const char *content, struct cursor *cur,
+			       struct ndb_mention_bech32_block *block) {
+	uint16_t size;
+	unsigned char *start;
+	enum nostr_bech32_type type;
+
+	start = cur->p;
+
+	if (!pull_str_block(cur, content, &block->str))
+		return 0;
+
+	if (!cursor_pull_u16(cur, &size))
+		return 0;
+
+	if (!pull_nostr_bech32_type(cur, &type))
+		return 0;
+
+	if (!parse_nostr_bech32_buffer(cur, type, &block->bech32))
+		return 0;
+
+	cur->p = start + size;
+	return 1;
+}
+
+static int pull_invoice(const char *content, struct cursor *cur,
+			struct ndb_invoice_block *block)
+{
+	if (!pull_str_block(cur, content, &block->invstr))
 		return 0;
 
+	return ndb_decode_invoice(cur, &block->invoice);
+}
+
+static int pull_block(const char *content, struct cursor *cur, struct note_block *block)
+{
+	unsigned char *start = cur->p;
+	uint32_t type;
+
+	if (!cursor_pull_varint_u32(cur, &type))
+		return 0;
+
+	block->type = type;
+
 	switch (block->type) {
 	case BLOCK_HASHTAG:
 	case BLOCK_TEXT:
 	case BLOCK_URL:
-		if (!push_str_block(&p->buffer, (const char*)p->content.start,
-			       &block->block.str))
-			return 0;
+		if (!pull_str_block(cur, content, &block->block.str))
+			goto fail;
 		break;
 
 	case BLOCK_MENTION_INDEX:
-		if (!cursor_push_varint(&p->buffer, block->block.mention_index))
-			return 0;
+		if (!cursor_pull_varint_u32(cur, &block->block.mention_index))
+			goto fail;
 		break;
+
 	case BLOCK_MENTION_BECH32:
-		// we only push bech32 strs here
-		if (!push_bech32_str(p, &block->block.str))
-			return 0;
+		if (!pull_bech32_mention(content, cur, &block->block.mention_bech32))
+			goto fail;
 		break;
-		
+
 	case BLOCK_INVOICE:
 		// we only push invoice strs here
-		if (!push_invoice_str(&p->buffer, &block->block.str))
-			return 0;
+		if (!pull_invoice(content, cur, &block->block.invoice))
+			goto fail;
 		break;
 	}
 
-	p->blocks->num_blocks++;
-
 	return 1;
+fail:
+	cur->p = start;
+	return 0;
 }
 
-static int add_text_block(struct ndb_content_parser *p,
-			  const unsigned char *start, const unsigned char *end)
+int push_block(struct ndb_content_parser *p, struct note_block *block);
+static int add_text_block(struct ndb_content_parser *p, const char *start, const char *end)
 {
 	struct note_block b;
 	
@@ -240,12 +300,69 @@ static int add_text_block(struct ndb_content_parser *p,
 		return 1;
 	
 	b.type = BLOCK_TEXT;
-	b.block.str.str = (const char*)start;
+	b.block.str.str = start;
 	b.block.str.len = end - start;
 	
 	return push_block(p, &b);
 }
 
+
+int push_block(struct ndb_content_parser *p, struct note_block *block)
+{
+	unsigned char *start = p->buffer.p;
+
+	// push the tag
+	if (!cursor_push_varint(&p->buffer, block->type))
+		return 0;
+
+	switch (block->type) {
+	case BLOCK_HASHTAG:
+	case BLOCK_TEXT:
+	case BLOCK_URL:
+		if (!push_str_block(&p->buffer, (const char*)p->content.start,
+			       &block->block.str))
+			goto fail;
+		break;
+
+	case BLOCK_MENTION_INDEX:
+		if (!cursor_push_varint(&p->buffer, block->block.mention_index))
+			goto fail;
+		break;
+	case BLOCK_MENTION_BECH32:
+		// we only push bech32 strs here
+		if (!push_bech32_mention(p, &block->block.str)) {
+			// if we fail for some reason, try pushing just a text block
+			p->buffer.p = start;
+			if (!add_text_block(p, block->block.str.str,
+					       block->block.str.str +
+					       block->block.str.len)) {
+				goto fail;
+			}
+		}
+		break;
+
+	case BLOCK_INVOICE:
+		// we only push invoice strs here
+		if (!push_invoice_str(p, &block->block.str)) {
+			// if we fail for some reason, try pushing just a text block
+			p->buffer.p = start;
+			if (!add_text_block(p, block->block.str.str,
+					    block->block.str.str + block->block.str.len)) {
+				goto fail;
+			}
+		}
+		break;
+	}
+
+	p->blocks->num_blocks++;
+
+	return 1;
+
+fail:
+	p->buffer.p = start;
+	return 0;
+}
+
 static int consume_url_fragment(struct cursor *cur)
 {
 	int c;
@@ -315,6 +432,7 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
 	unsigned char tmp[4096];
 	int host_len;
 	struct cursor path_cur, tmp_cur;
+	enum nostr_bech32_type type;
 	make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur);
 	
 	if (!parse_str(cur, "http"))
@@ -376,7 +494,7 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
 	// if we have a damus link, make it a mention
 	if (host_len == 8
 	&& !strncmp((const char *)host, "damus.io", 8)
-	&& parse_nostr_bech32_str(&path_cur))
+	&& parse_nostr_bech32_str(&path_cur, &type))
 	{
 		block->block.str.len = path_cur.p - path_cur.start;
 		block->type = BLOCK_MENTION_BECH32;
@@ -421,13 +539,14 @@ static int parse_invoice(struct cursor *cur, struct note_block *block) {
 
 static int parse_mention_bech32(struct cursor *cur, struct note_block *block) {
 	unsigned char *start = cur->p;
+	enum nostr_bech32_type type;
 	
 	parse_char(cur, '@');
 	parse_str(cur, "nostr:");
 
 	block->block.str.str = (const char *)cur->p;
 	
-	if (!parse_nostr_bech32_str(cur)) {
+	if (!parse_nostr_bech32_str(cur, &type)) {
 		cur->p = start;
 		return 0;
 	}
@@ -443,7 +562,7 @@ static int add_text_then_block(struct ndb_content_parser *p,
 			       unsigned char **start,
 			       const unsigned char *pre_mention)
 {
-	if (!add_text_block(p, *start, pre_mention))
+	if (!add_text_block(p, (const char *)*start, (const char*)pre_mention))
 		return 0;
 	
 	*start = (unsigned char*)p->content.p;
@@ -457,8 +576,8 @@ int ndb_parse_content(unsigned char *buf, int buf_size,
 {
 	int cp, c;
 	struct ndb_content_parser parser;
-
 	struct note_block block;
+
 	unsigned char *start, *pre_mention;
 	
 	make_cursor(buf, buf + buf_size, &parser.buffer);
@@ -508,7 +627,7 @@ int ndb_parse_content(unsigned char *buf, int buf_size,
 	}
 	
 	if (parser.content.p - start > 0) {
-		if (!add_text_block(&parser, start, parser.content.p))
+		if (!add_text_block(&parser, (const char*)start, (const char *)parser.content.p))
 			return 0;
 	}
 
diff --git a/src/cursor.h b/src/cursor.h
@@ -360,6 +360,11 @@ static inline int cursor_push_u16(struct cursor *cursor, unsigned short i)
 	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
 }
 
+static inline int cursor_pull_u16(struct cursor *cursor, uint16_t *i)
+{
+	return cursor_pull(cursor, (unsigned char*)i, sizeof(*i));
+}
+
 static inline void *index_cursor(struct cursor *cursor, unsigned int index, int elem_size)
 {
 	unsigned char *p;
diff --git a/src/nostrdb.h b/src/nostrdb.h
@@ -198,6 +198,12 @@ struct ndb_builder {
 	struct ndb_tag *current_tag;
 };
 
+/*
+struct ndb_block_iterator {
+	struct note_block block;
+};
+*/
+
 struct ndb_iterator {
 	struct ndb_note *note;
 	struct ndb_tag *tag;
@@ -390,4 +396,6 @@ int ndb_parse_content(unsigned char *buf, int buf_size,
 		      const char *content, int content_len,
 		      struct ndb_note_blocks **blocks_p);
 
+//int ndb_blocks_iterate_next(struct ndb_block_iterator *iter);
+
 #endif

	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	Makefile	\|	2	+-
M	shell.nix	\|	2	+-
M	src/block.h	\|	7	+++----
M	src/content_parser.c	\|	179	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M	src/cursor.h	\|	5	+++++
M	src/nostrdb.h	\|	8	++++++++