nostrdb: the unfairly fast nostr database - nostrdb - an unfairly fast embedded nostr database backed by lmdb

commit 281b363654a4ca5be6332b446e66795015118dab
Author: William Casarin <jb55@jb55.com>
Date:   Wed, 19 Jul 2023 12:10:28 -0700

nostrdb: the unfairly fast nostr database

Diffstat:
A .gitignore  | 4 ++++
A COPYING  | 1 +
A LICENSE  | 1 +
A Makefile  | 16 ++++++++++++++++
A README.md  | 11 +++++++++++
A cursor.h  | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A nostr_index.fbs  | 32 ++++++++++++++++++++++++++++++++
A nostrdb.c  | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A nostrdb.h  | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test.c  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

10 files changed, 574 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+tags
+test
+.buildcmd
+.build-result
diff --git a/COPYING b/COPYING
@@ -0,0 +1 @@
+MIT
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1 @@
+MIT
diff --git a/Makefile b/Makefile
@@ -0,0 +1,16 @@
+
+CFLAGS = -Wall
+
+check: test
+	./test
+
+tags:
+	ctags *.c *.h
+
+test: test.c nostrdb.c nostrdb.h
+	$(CC) test.c nostrdb.c -o $@
+
+%.o: %.c
+	$(CC) $(CFLAGS)
+
+.PHONY: tags
diff --git a/README.md b/README.md
@@ -0,0 +1,11 @@
+
+# nostrdb
+
+The unfairly fast nostr database backed by lmdb.
+
+nostrdb stores nostr events as a custom in-memory representation that enables
+zero-copy and O(1) access to all note fields. This is similar to flatbuffers
+but it is custom built for nostr events.
+
+These events are then memory-mapped inside lmdb, enabling insanely fast,
+zero-copy access and querying.
diff --git a/cursor.h b/cursor.h
@@ -0,0 +1,125 @@
+
+#ifndef JB55_CURSOR_H
+#define JB55_CURSOR_H
+
+//#include <ctype.h>
+//#include <assert.h>
+
+#include <string.h>
+#include <ctype.h>
+
+#define unlikely(x) __builtin_expect((x),0)
+#define likely(x)   __builtin_expect((x),1)
+
+struct cursor {
+	unsigned char *start;
+	unsigned char *p;
+	unsigned char *end;
+};
+
+static inline void make_cursor(unsigned char *start, unsigned char *end, struct cursor *cursor)
+{
+	cursor->start = start;
+	cursor->p = start;
+	cursor->end = end;
+}
+
+static inline int cursor_push_byte(struct cursor *cursor, unsigned char c)
+{
+	if (unlikely(cursor->p + 1 > cursor->end)) {
+		return 0;
+	}
+
+	*cursor->p = c;
+	cursor->p++;
+
+	return 1;
+}
+
+static inline int cursor_push(struct cursor *cursor, unsigned char *data, int len)
+{
+	if (unlikely(cursor->p + len >= cursor->end)) {
+		return 0;
+	}
+
+	if (cursor->p != data)
+		memcpy(cursor->p, data, len);
+
+	cursor->p += len;
+
+	return 1;
+}
+
+static inline int cursor_push_str(struct cursor *cursor, const char *str)
+{
+	return cursor_push(cursor, (unsigned char*)str, (int)strlen(str));
+}
+
+static inline int cursor_push_c_str(struct cursor *cursor, const char *str)
+{
+	return cursor_push_str(cursor, str) && cursor_push_byte(cursor, 0);
+}
+
+static inline void *cursor_malloc(struct cursor *mem, unsigned long size)
+{
+	void *ret;
+
+	if (mem->p + size > mem->end) {
+		return 0;
+	}
+
+	ret = mem->p;
+	mem->p += size;
+
+	return ret;
+}
+
+static inline int cursor_slice(struct cursor *mem, struct cursor *slice, size_t size)
+{
+	unsigned char *p;
+	if (!(p = cursor_malloc(mem, size))) {
+		return 0;
+	}
+	make_cursor(p, mem->p, slice);
+	return 1;
+}
+
+static inline size_t cursor_count(struct cursor *cursor, size_t elem_size) {
+	return (cursor->p - cursor->start)/elem_size;
+}
+
+static inline int cursor_push_u32(struct cursor *cursor, uint32_t i) {
+	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
+}
+
+static inline int cursor_push_u16(struct cursor *cursor, uint16_t i) {
+	return cursor_push(cursor, (unsigned char*)&i, sizeof(i));
+}
+
+#define max(a,b) ((a) > (b) ? (a) : (b))
+#include <stdio.h>
+static inline void cursor_print_around(struct cursor *cur, int range)
+{
+	unsigned char *c;
+
+	printf("[%ld/%ld]\n", cur->p - cur->start, cur->end - cur->start);
+
+	c = max(cur->p - range, cur->start);
+	for (; c < cur->end && c < (cur->p + range); c++) {
+		printf("%02x", *c);
+	}
+	printf("\n");
+
+	c = max(cur->p - range, cur->start);
+	for (; c < cur->end && c < (cur->p + range); c++) {
+		if (c == cur->p) {
+			printf("^");
+			continue;
+		}
+		printf("  ");
+	}
+	printf("\n");
+}
+#undef max
+
+#endif
diff --git a/nostr_index.fbs b/nostr_index.fbs
@@ -0,0 +1,32 @@
+namespace NostrIndex;
+
+struct Fixed32Bytes {
+    val: [ubyte:32];
+}
+
+struct Fixed64Bytes {
+    val: [ubyte:64];
+}
+
+table TagGeneral {
+    key: uint8;
+    val: [ubyte];
+}
+
+table TagFixed32 {
+    key: uint8;
+    val: Fixed32Bytes;
+}
+
+table Event {
+    id: Fixed32Bytes;
+    pubkey: Fixed32Bytes;
+    created_at: uint64;
+    kind: uint64;
+    tagsGeneral: [TagGeneral];
+    tagsFixed32: [TagFixed32];
+    signature: Fixed64Bytes;
+}
+
+table Empty {}
+root_type Empty;
diff --git a/nostrdb.c b/nostrdb.c
@@ -0,0 +1,144 @@
+
+#include "nostrdb.h"
+#include <stdlib.h>
+
+int ndb_builder_new(struct ndb_builder *builder, int *bufsize) {
+	struct ndb_note *note;
+	struct cursor mem;
+	// 1MB + 0.5MB
+	builder->size = 1024 * 1024;
+	if (bufsize)
+		builder->size = *bufsize;
+
+	int str_indices_size = sizeof(uint32_t) * (2<<14);
+	unsigned char *bytes = malloc(builder->size + str_indices_size);
+	if (!bytes) return 0;
+
+	make_cursor(bytes, bytes + builder->size + str_indices_size, &mem);
+
+	note = builder->note = (struct ndb_note *)bytes;
+	size_t half = builder->size >> 1;
+
+	if (!cursor_slice(&mem, &builder->note_cur, half)) return 0;
+	if (!cursor_slice(&mem, &builder->strings, half)) return 0;
+	if (!cursor_slice(&mem, &builder->str_indices, str_indices_size)) return 0;
+
+	memset(note, 0, sizeof(*note));
+	builder->note_cur.p += sizeof(*note);
+
+	note->version = 1;
+
+	return 1;
+}
+
+int ndb_builder_finalize(struct ndb_builder *builder, struct ndb_note **note) {
+	int strings_len = builder->strings.p - builder->strings.start;
+	unsigned char *end = builder->note_cur.p + strings_len;
+	int total_size = end - builder->note_cur.start;
+
+	// move the strings buffer next to the end of our ndb_note
+	memmove(builder->note_cur.p, builder->strings.start, strings_len);
+
+	// set the strings location
+	builder->note->strings = builder->note_cur.p - builder->note_cur.start;
+
+	// remove any extra data
+	*note = realloc(builder->note_cur.start, total_size);
+
+	return total_size;
+}
+
+struct ndb_note *ndb_builder_note(struct ndb_builder *builder) {
+	return builder->note;
+}
+
+int ndb_builder_make_string(struct ndb_builder *builder, const char *str, union packed_str *pstr) {
+	int len = strlen(str);
+	uint32_t loc;
+
+	if (len == 0) {
+		*pstr = ndb_char_to_packed_str(0);
+		return 1;
+	} else if (len == 1) {
+		*pstr = ndb_char_to_packed_str(str[0]);
+		return 1;
+	} else if (len == 2) {
+		*pstr = ndb_chars_to_packed_str(str[0], str[1]);
+		return 1;
+	}
+
+	// find existing matching string to avoid duplicate strings
+	int indices = cursor_count(&builder->str_indices, sizeof(uint32_t));
+	for (int i = 0; i < indices; i++) {
+		uint32_t index = ((uint32_t*)builder->str_indices.start)[i];
+		const char *some_str = (const char*)builder->strings.start + index;
+
+		if (!strcmp(some_str, str)) {
+			// found an existing matching str, use that index
+			*pstr = ndb_offset_str(index);
+			return 1;
+		}
+	}
+
+	// no string found, push a new one
+	loc = builder->strings.p - builder->strings.start;
+	if (!(cursor_push(&builder->strings, (unsigned char*)str, len) &&
+	      cursor_push_byte(&builder->strings, '\0'))) {
+		return 0;
+	}
+	*pstr = ndb_offset_str(loc);
+
+	// record in builder indices. ignore return value, if we can't cache it
+	// then whatever
+	cursor_push_u32(&builder->str_indices, loc);
+
+	return 1;
+}
+
+int ndb_builder_set_content(struct ndb_builder *builder, const char *content) {
+	return ndb_builder_make_string(builder, content, &builder->note->content);
+}
+
+void ndb_builder_set_pubkey(struct ndb_builder *builder, unsigned char *pubkey) {
+	memcpy(builder->note->pubkey, pubkey, 32);
+}
+
+void ndb_builder_set_id(struct ndb_builder *builder, unsigned char *id) {
+	memcpy(builder->note->id, id, 32);
+}
+
+void ndb_builder_set_signature(struct ndb_builder *builder, unsigned char *signature) {
+	memcpy(builder->note->signature, signature, 64);
+}
+
+void ndb_builder_set_kind(struct ndb_builder *builder, uint32_t kind) {
+	builder->note->kind = kind;
+}
+
+static inline int cursor_push_tag(struct cursor *cur, struct ndb_tag *tag) {
+	return cursor_push_u16(cur, tag->count);
+}
+
+int ndb_builder_add_tag(struct ndb_builder *builder, const char **strs, uint16_t num_strs) {
+	int i;
+	union packed_str pstr;
+	const char *str;
+	struct ndb_tag tag;
+
+	builder->note->tags.count++;
+	tag.count = num_strs;
+
+	if (!cursor_push_tag(&builder->note_cur, &tag))
+		return 0;
+
+	for (i = 0; i < num_strs; i++) {
+		str = strs[i];
+		if (!ndb_builder_make_string(builder, str, &pstr))
+			return 0;
+		if (!cursor_push_u32(&builder->note_cur, pstr.offset))
+			return 0;
+	}
+
+	return 1;
+}
+
diff --git a/nostrdb.h b/nostrdb.h
@@ -0,0 +1,172 @@
+#ifndef NOSTRDB_H
+#define NOSTRDB_H
+
+#include <inttypes.h>
+#include "cursor.h"
+
+// these must be byte-aligned, they are directly accessing the serialized data
+// representation
+#pragma pack(push, 1)
+
+// we assume little endian everywhere. sorry not sorry.
+union packed_str {
+	uint32_t offset;
+
+	struct {
+		char str[3];
+		unsigned char flag;
+	} packed;
+
+	unsigned char bytes[4];
+};
+
+struct ndb_tag {
+	uint16_t count;
+	union packed_str strs[0];
+};
+
+struct ndb_tags {
+	uint16_t count;
+	struct ndb_tag tag[0];
+};
+
+// v1
+struct ndb_note {
+	unsigned char version;    // v=1
+	unsigned char padding[3]; // keep things aligned
+	unsigned char id[32];
+	unsigned char pubkey[32];
+	unsigned char signature[64];
+
+	uint32_t created_at;
+	uint32_t kind;
+	union packed_str content;
+	uint32_t strings;
+	uint32_t json;
+
+	// nothing can come after tags since it contains variadic data
+	struct ndb_tags tags;
+};
+
+#pragma pack(pop)
+
+struct ndb_builder {
+	struct cursor note_cur;
+	struct cursor strings;
+	struct cursor str_indices;
+	struct ndb_note *note;
+	int size;
+};
+
+struct ndb_iterator {
+	struct ndb_note *note;
+	struct ndb_tag *tag;
+
+	// current outer index
+	int index;
+};
+
+int ndb_builder_new(struct ndb_builder *builder, int *bufsize);
+int ndb_builder_finalize(struct ndb_builder *builder, struct ndb_note **note);
+int ndb_builder_set_content(struct ndb_builder *builder, const char *content);
+void ndb_builder_set_signature(struct ndb_builder *builder, unsigned char *signature);
+void ndb_builder_set_pubkey(struct ndb_builder *builder, unsigned char *pubkey);
+void ndb_builder_set_id(struct ndb_builder *builder, unsigned char *id);
+void ndb_builder_set_kind(struct ndb_builder *builder, uint32_t kind);
+int ndb_builder_add_tag(struct ndb_builder *builder, const char **strs, uint16_t num_strs);
+
+static inline unsigned char *ndb_note_id(struct ndb_note *note) {
+	return note->id;
+}
+
+static inline unsigned char *ndb_note_pubkey(struct ndb_note *note) {
+	return note->pubkey;
+}
+
+static inline unsigned char *ndb_note_signature(struct ndb_note *note) {
+	return note->signature;
+}
+
+static inline uint32_t ndb_note_created_at(struct ndb_note *note) {
+	return note->created_at;
+}
+
+static inline int ndb_str_is_packed(union packed_str str) {
+	return (str.offset >> 31) & 0x1;
+}
+
+static inline struct ndb_note *ndb_note_from_bytes(unsigned char *bytes) {
+	struct ndb_note *note = (struct ndb_note *)bytes;
+	if (note->version != 1)
+		return 0;
+	return note;
+}
+
+static inline union packed_str ndb_offset_str(uint32_t offset) {
+	// ensure accidents like -1 don't corrupt our packed_str
+	union packed_str str;
+	str.offset = offset & 0x7FFFFFFF;
+	return str;
+}
+
+static inline union packed_str
+ndb_char_to_packed_str(char c) {
+	union packed_str str;
+	str.packed.flag = 0xFF;
+	str.packed.str[0] = c;
+	str.packed.str[1] = '\0';
+	return str;
+}
+
+static inline union packed_str
+ndb_chars_to_packed_str(char c1, char c2) {
+	union packed_str str;
+	str.packed.flag = 0xFF;
+	str.packed.str[0] = c1;
+	str.packed.str[1] = c2;
+	str.packed.str[2] = '\0';
+	return str;
+}
+
+static inline const char *
+ndb_note_string(struct ndb_note *note, union packed_str *str) {
+	if (ndb_str_is_packed(*str))
+		return str->packed.str;
+
+	return ((const char *)note) + note->strings + str->offset;
+}
+
+static inline const char *
+ndb_note_tag_index(struct ndb_note *note, struct ndb_tag *tag, int index) {
+	if (index >= tag->count) {
+		return 0;
+	}
+
+	return ndb_note_string(note, &tag->strs[index]);
+}
+
+static inline int
+ndb_tags_iterate_start(struct ndb_note *note, struct ndb_iterator *iter) {
+	uint16_t count = note->tags.count;
+
+	iter->note = note;
+	iter->tag = note->tags.tag;
+	iter->index = 0;
+
+	return iter->tag->count != 0;
+}
+
+static inline int
+ndb_tags_iterate_next(struct ndb_iterator *iter) {
+	struct ndb_tags *tags = &iter->note->tags;
+
+	if (++iter->index < tags->count) {
+		uint32_t tag_data_size = iter->tag->count * sizeof(iter->tag->strs[0]);
+		iter->tag = (struct ndb_tag *)(iter->tag->strs[0].bytes + tag_data_size);
+		return 1;
+	}
+
+	return 0;
+}
+
+#endif
diff --git a/test.c b/test.c
@@ -0,0 +1,68 @@
+
+#include "nostrdb.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
+
+int main(int argc, const char *argv[]) {
+	struct ndb_builder builder, *b = &builder;
+	struct ndb_note *note;
+	int len, ok;
+
+	unsigned char id[32];
+	memset(id, 1, 32);
+
+	unsigned char pubkey[32];
+	memset(pubkey, 2, 32);
+
+	unsigned char sig[64];
+	memset(sig, 3, 64);
+
+
+	const char *hex_pk = "5d9b81b2d4d5609c5565286fc3b511dc6b9a1b3d7d1174310c624d61d1f82bb9";
+	const char *tag[] = { "p", hex_pk };
+	const char *word_tag[] = { "word", "words", "w" };
+
+	ndb_builder_new(b, 0);
+	note = builder.note;
+
+	memset(note->padding, 3, sizeof(note->padding));
+
+	ndb_builder_set_content(b, "hello, world!");
+	ndb_builder_set_id(b, id);
+	ndb_builder_set_pubkey(b, pubkey);
+	ndb_builder_set_signature(b, sig);
+	ndb_builder_add_tag(b, tag, ARRAY_SIZE(tag));
+	ndb_builder_add_tag(b, word_tag, ARRAY_SIZE(word_tag));
+
+	len = ndb_builder_finalize(b, &note);
+
+	assert(note->tags.count == 2);
+
+	// test iterator
+	struct ndb_iterator iter, *it = &iter;
+	
+	ok = ndb_tags_iterate_start(note, it);
+	assert(ok);
+	assert(it->tag->count == 2);
+	const char *p   = ndb_note_string(note, &it->tag->strs[0]);
+	const char *hpk = ndb_note_string(note, &it->tag->strs[1]);
+	assert(hpk);
+	assert(!ndb_str_is_packed(it->tag->strs[1]));
+	assert(!strcmp(hpk, hex_pk));
+	assert(!strcmp(p, "p"));
+
+	ok = ndb_tags_iterate_next(it);
+	assert(ok);
+	assert(it->tag->count == 3);
+	assert(!strcmp(ndb_note_string(note, &it->tag->strs[0]), "word"));
+	assert(!strcmp(ndb_note_string(note, &it->tag->strs[1]), "words"));
+	assert(!strcmp(ndb_note_string(note, &it->tag->strs[2]), "w"));
+
+	ok = ndb_tags_iterate_next(it);
+	assert(!ok);
+
+	fwrite(note, len, 1, stdout);
+}

	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log \| Files \| Refs \| README \| LICENSE

A	.gitignore	\|	4	++++
A	COPYING	\|	1	+
A	LICENSE	\|	1	+
A	Makefile	\|	16	++++++++++++++++
A	README.md	\|	11	+++++++++++
A	cursor.h	\|	125	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	nostr_index.fbs	\|	32	++++++++++++++++++++++++++++++++
A	nostrdb.c	\|	144	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	nostrdb.h	\|	172	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test.c	\|	68	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++