filters: add initial filter interface - nostrdb - an unfairly fast embedded nostr database backed by lmdb

commit 30ed801285dd80b07f4441138ca502c9a1ce25f2
parent 7c5ddd44b06e1e15e3f44e83fb34187537c35d7d
Author: William Casarin <jb55@jb55.com>
Date:   Wed, 22 Nov 2023 13:46:12 -0800

filters: add initial filter interface

This adds a way to construct filters and match them against notes. This
will be used by our query interface in the future.

Diffstat:
M nostrdb.c  | 388 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M nostrdb.h  | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test.c  | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 512 insertions(+), 0 deletions(-)
diff --git a/nostrdb.c b/nostrdb.c
@@ -29,6 +29,9 @@ static const int THREAD_QUEUE_BATCH = 4096;
 // the maximum size of inbox queues
 static const int DEFAULT_QUEUE_SIZE = 1000000;
 
+// increase if we need bigger filters
+#define NDB_FILTER_PAGES 64
+
 #define ndb_flag_set(flags, f) ((flags & f) == f)
 
 #define NDB_PARSED_ID           (1 << 0)
@@ -144,6 +147,391 @@ static void lowercase_strncpy(char *dst, const char *src, int n) {
 	}
 }
 
+int ndb_filter_init(struct ndb_filter *filter)
+{
+	struct cursor cur;
+	int page_size, elem_pages, data_pages, buf_size;
+
+	page_size = 4096; // assuming this, not a big deal if we're wrong
+	elem_pages = NDB_FILTER_PAGES / 4;
+	data_pages = NDB_FILTER_PAGES - elem_pages;
+	buf_size = page_size * NDB_FILTER_PAGES;
+
+	unsigned char *buf = malloc(buf_size);
+	if (!buf)
+		return 0;
+
+	// init memory arena for the cursor
+	make_cursor(buf, buf + buf_size, &cur);
+
+	cursor_slice(&cur, &filter->elem_buf, page_size * elem_pages);
+	cursor_slice(&cur, &filter->data_buf, page_size * data_pages);
+
+	// make sure we are fully allocated
+	assert(cur.p == cur.end);
+
+	// make sure elem_buf is the start of the buffer
+	assert(filter->elem_buf.start == cur.start);
+
+	filter->num_elements = 0;
+	filter->elements[0] = (struct ndb_filter_elements*) buf;
+	filter->current = NULL;
+
+	return 1;
+}
+
+void ndb_filter_reset(struct ndb_filter *filter)
+{
+	filter->num_elements = 0;
+	filter->elem_buf.p = filter->elem_buf.start;
+	filter->data_buf.p = filter->data_buf.start;
+	filter->current = NULL;
+}
+
+void ndb_filter_free(struct ndb_filter *filter)
+{
+	if (filter->elem_buf.start)
+		free(filter->elem_buf.start);
+
+	memset(filter, 0, sizeof(*filter));
+}
+
+static const char *ndb_filter_field_name(enum ndb_filter_fieldtype field)
+{
+	switch (field) {
+	case NDB_FILTER_IDS: return "ids";
+	case NDB_FILTER_AUTHORS: return "authors";
+	case NDB_FILTER_KINDS: return "kinds";
+	case NDB_FILTER_GENERIC: return "generic";
+	case NDB_FILTER_SINCE: return "since";
+	case NDB_FILTER_UNTIL: return "until";
+	case NDB_FILTER_LIMIT: return "limit";
+	}
+
+	return "unknown";
+}
+
+static int ndb_filter_start_field_impl(struct ndb_filter *filter, enum ndb_filter_fieldtype field, char generic)
+{
+	int i;
+	struct ndb_filter_elements *els, *el;
+
+	if (filter->current) {
+		fprintf(stderr, "ndb_filter_start_field: filter field already in progress, did you forget to call ndb_filter_end_field?\n");
+		return 0;
+	}
+
+	// you can only start and end fields once
+	for (i = 0; i < filter->num_elements; i++) {
+		el = filter->elements[i];
+		if (el->field.type == field) {
+			fprintf(stderr, "ndb_filter_start_field: field '%s' already exists\n",
+					ndb_filter_field_name(field));
+			return 0;
+		}
+	}
+
+	els = (struct ndb_filter_elements *) filter->elem_buf.p ;
+	filter->current = els;
+
+	// advance elem buffer to the variable data section
+	if (!cursor_skip(&filter->elem_buf, sizeof(struct ndb_filter_elements))) {
+		fprintf(stderr, "ndb_filter_start_field: '%s' oom (todo: realloc?)\n",
+				ndb_filter_field_name(field));
+		return 0;
+	}
+
+	els->field.type = field;
+	els->field.generic = generic;
+	els->field.elem_type = 0;
+	els->count = 0;
+
+	return 1;
+}
+
+int ndb_filter_start_field(struct ndb_filter *filter, enum ndb_filter_fieldtype field)
+{
+	return ndb_filter_start_field_impl(filter, field, 0);
+}
+
+int ndb_filter_start_generic_field(struct ndb_filter *filter, char tag)
+{
+	return ndb_filter_start_field_impl(filter, NDB_FILTER_GENERIC, tag);
+}
+
+static int ndb_filter_add_element(struct ndb_filter *filter, union ndb_filter_element el)
+{
+	unsigned char *data;
+	const char *str;
+
+	if (!filter->current)
+		return 0;
+
+	data = filter->data_buf.p;
+
+	switch (filter->current->field.type) {
+	case NDB_FILTER_IDS:
+	case NDB_FILTER_AUTHORS:
+		if (!cursor_push(&filter->data_buf, (unsigned char *)el.id, 32))
+			return 0;
+		el.id = data;
+		break;
+	case NDB_FILTER_KINDS:
+		break;
+	case NDB_FILTER_SINCE:
+	case NDB_FILTER_UNTIL:
+	case NDB_FILTER_LIMIT:
+		// only one allowed for since/until
+		if (filter->current->count != 0)
+			return 0;
+		break;
+	case NDB_FILTER_GENERIC:
+		str = (const char *)filter->data_buf.p;
+		if (!cursor_push_c_str(&filter->data_buf, el.string))
+			return 0;
+		// push a pointer of the string in the databuf as an element
+		el.string = str;
+		break;
+	}
+
+	if (!cursor_push(&filter->elem_buf, (unsigned char*)&el, sizeof(el)))
+		return 0;
+
+	filter->current->count++;
+
+	return 1;
+}
+
+static int ndb_filter_set_elem_type(struct ndb_filter *filter,
+				    enum ndb_generic_element_type elem_type)
+{
+	enum ndb_generic_element_type current_elem_type;
+
+	if (!filter->current)
+		return 0;
+
+	current_elem_type = filter->current->field.elem_type;
+
+	// element types must be uniform
+	if (current_elem_type != elem_type && current_elem_type != NDB_ELEMENT_UNKNOWN) {
+		fprintf(stderr, "ndb_filter_set_elem_type: element types must be uniform\n");
+		return 0;
+	}
+
+	filter->current->field.elem_type = elem_type;
+
+	return 1;
+}
+
+int ndb_filter_add_str_element(struct ndb_filter *filter, const char *str)
+{
+	union ndb_filter_element el;
+
+	if (!filter->current)
+		return 0;
+
+	// only generic queries are allowed to have strings
+	switch (filter->current->field.type) {
+	case NDB_FILTER_SINCE:
+	case NDB_FILTER_UNTIL:
+	case NDB_FILTER_LIMIT:
+	case NDB_FILTER_IDS:
+	case NDB_FILTER_AUTHORS:
+	case NDB_FILTER_KINDS:
+		return 0;
+	case NDB_FILTER_GENERIC:
+		break;
+	}
+
+	if (!ndb_filter_set_elem_type(filter, NDB_ELEMENT_STRING))
+		return 0;
+
+	el.string = str;
+	return ndb_filter_add_element(filter, el);
+}
+
+int ndb_filter_add_int_element(struct ndb_filter *filter, uint64_t integer)
+{
+	union ndb_filter_element el;
+	if (!filter->current)
+		return 0;
+
+	switch (filter->current->field.type) {
+	case NDB_FILTER_IDS:
+	case NDB_FILTER_AUTHORS:
+	case NDB_FILTER_GENERIC:
+		return 0;
+	case NDB_FILTER_KINDS:
+	case NDB_FILTER_SINCE:
+	case NDB_FILTER_UNTIL:
+	case NDB_FILTER_LIMIT:
+		break;
+	}
+
+	el.integer = integer;
+
+	return ndb_filter_add_element(filter, el);
+}
+
+int ndb_filter_add_id_element(struct ndb_filter *filter, const unsigned char *id)
+{
+	union ndb_filter_element el;
+
+	if (!filter->current)
+		return 0;
+
+	// only certain filter types allow pushing id elements
+	switch (filter->current->field.type) {
+	case NDB_FILTER_SINCE:
+	case NDB_FILTER_UNTIL:
+	case NDB_FILTER_LIMIT:
+		return 0;
+	case NDB_FILTER_IDS:
+	case NDB_FILTER_AUTHORS:
+	case NDB_FILTER_KINDS:
+	case NDB_FILTER_GENERIC:
+		break;
+	}
+
+	if (!ndb_filter_set_elem_type(filter, NDB_ELEMENT_ID))
+		return 0;
+
+	// this is needed so that generic filters know its an id
+	el.id = id;
+
+	return ndb_filter_add_element(filter, el);
+}
+
+// TODO: build a hashtable so this is O(1)
+static int ndb_generic_filter_matches(struct ndb_filter_elements *els,
+				      struct ndb_note *note)
+{
+	int i;
+	union ndb_filter_element el;
+	struct ndb_iterator iter, *it = &iter;
+	struct ndb_str str;
+
+	ndb_tags_iterate_start(note, it);
+
+	while (ndb_tags_iterate_next(it)) {
+		// we're looking for tags with 2 or more entries: ["p", id], etc
+		if (it->tag->count < 2)
+			continue;
+
+		str = ndb_note_str(note, &it->tag->strs[0]);
+
+		// we only care about packed strings (single char, etc)
+		if (str.flag != NDB_PACKED_STR)
+			continue;
+
+		// do we have #e matching e (or p, etc)
+		if (str.str[0] != els->field.generic)
+			continue;
+
+		str = ndb_note_str(note, &it->tag->strs[1]);
+
+		switch (els->field.elem_type) {
+		case NDB_ELEMENT_ID:
+			// if our filter element type is an id, then we
+			// expect a packed id in the tag, otherwise skip
+			if (str.flag != NDB_PACKED_ID)
+				continue;
+			break;
+		case NDB_ELEMENT_STRING:
+			// if our filter element type is a string, then
+			// we should not expect an id
+			if (str.flag == NDB_PACKED_ID)
+				continue;
+			break;
+		case NDB_ELEMENT_UNKNOWN:
+		default:
+			// For some reason the element type is not set. It's
+			// possible nothing was added to the generic filter?
+			// Let's just fail here and log a note for debugging
+			fprintf(stderr, "UNUSUAL ndb_generic_filter_matches: have unknown element type %d\n", els->field.elem_type);
+			return 0;
+		}
+
+		for (i = 0; i < els->count; i++) {
+			el = els->elements[i];
+			switch (els->field.elem_type) {
+			case NDB_ELEMENT_ID:
+				if (!memcmp(el.id, str.id, 32))
+					return 1;
+				break;
+			case NDB_ELEMENT_STRING:
+				if (!strcmp(el.string, str.str))
+					return 1;
+				break;
+			case NDB_ELEMENT_UNKNOWN:
+				return 0;
+			}
+		}
+	}
+
+	return 0;
+}
+
+// returns 1 if a filter matches a note
+int ndb_filter_matches(struct ndb_filter *filter, struct ndb_note *note)
+{
+	int i, j;
+	struct ndb_filter_elements *els;
+
+	for (i = 0; i < filter->num_elements; i++) {
+		els = filter->elements[i];
+
+		switch (els->field.type) {
+		case NDB_FILTER_KINDS:
+			for (j = 0; j < els->count; j++) {
+				if ((unsigned int)els->elements[j].integer == note->kind)
+					goto cont;
+			}
+			break;
+		// TODO: add filter hashtable for large id lists
+		case NDB_FILTER_IDS:
+			for (j = 0; j < els->count; j++) {
+				if (!memcmp(els->elements[j].id, note->id, 32))
+					goto cont;
+			}
+			break;
+		case NDB_FILTER_AUTHORS:
+			for (j = 0; j < els->count; j++) {
+				if (!memcmp(els->elements[j].id, note->pubkey, 32))
+					goto cont;
+			}
+			break;
+		case NDB_FILTER_GENERIC:
+			if (ndb_generic_filter_matches(els, note))
+				continue;
+			break;
+		case NDB_FILTER_SINCE:
+			assert(els->count == 1);
+			if (note->created_at >= els->elements[0].integer)
+				continue;
+			break;
+		case NDB_FILTER_UNTIL:
+			assert(els->count == 1);
+			if (note->created_at < els->elements[0].integer)
+				continue;
+		case NDB_FILTER_LIMIT:
+cont:
+			continue;
+		}
+
+		// all need to match
+		return 0;
+	}
+
+	return 1;
+}
+
+void ndb_filter_end_field(struct ndb_filter *filter)
+{
+	filter->elements[filter->num_elements++] = filter->current;
+	filter->current = NULL;
+}
+
 static void ndb_make_search_key(struct ndb_search_key *key, unsigned char *id,
 			        uint64_t timestamp, const char *search)
 {
diff --git a/nostrdb.h b/nostrdb.h
@@ -223,6 +223,51 @@ struct ndb_iterator {
 	int index;
 };
 
+enum ndb_filter_fieldtype {
+	NDB_FILTER_IDS     = 1,
+	NDB_FILTER_AUTHORS = 2,
+	NDB_FILTER_KINDS   = 3,
+	NDB_FILTER_GENERIC = 4,
+	NDB_FILTER_SINCE   = 5,
+	NDB_FILTER_UNTIL   = 6,
+	NDB_FILTER_LIMIT   = 7,
+};
+#define NDB_NUM_FILTERS 7
+
+// when matching generic tags, we need to know if we're dealing with
+// a pointer to a 32-byte ID or a null terminated string
+enum ndb_generic_element_type {
+	NDB_ELEMENT_UNKNOWN = 0,
+	NDB_ELEMENT_STRING  = 1,
+	NDB_ELEMENT_ID      = 2,
+};
+
+union ndb_filter_element {
+	const char *string;
+	const unsigned char *id;
+	uint64_t integer;
+};
+
+struct ndb_filter_field {
+	enum ndb_filter_fieldtype type;
+	enum ndb_generic_element_type elem_type;
+	char generic; // for generic queries like #t
+};
+
+struct ndb_filter_elements {
+	struct ndb_filter_field field;
+	int count;
+	union ndb_filter_element elements[0];
+};
+
+struct ndb_filter {
+	struct cursor elem_buf;
+	struct cursor data_buf;
+	int num_elements;
+	struct ndb_filter_elements *current;
+	struct ndb_filter_elements *elements[NDB_NUM_FILTERS];
+};
+
 // HELPERS
 int ndb_calculate_id(struct ndb_note *note, unsigned char *buf, int buflen);
 int ndb_sign_id(struct ndb_keypair *keypair, unsigned char id[32], unsigned char sig[64]);
@@ -269,6 +314,18 @@ void ndb_builder_set_kind(struct ndb_builder *builder, uint32_t kind);
 int ndb_builder_new_tag(struct ndb_builder *builder);
 int ndb_builder_push_tag_str(struct ndb_builder *builder, const char *str, int len);
 
+// FILTERS
+int ndb_filter_init(struct ndb_filter *);
+int ndb_filter_add_id_element(struct ndb_filter *, const unsigned char *id);
+int ndb_filter_add_int_element(struct ndb_filter *, uint64_t integer);
+int ndb_filter_add_str_element(struct ndb_filter *, const char *str);
+int ndb_filter_start_field(struct ndb_filter *, enum ndb_filter_fieldtype);
+int ndb_filter_start_generic_field(struct ndb_filter *, char tag);
+int ndb_filter_matches(struct ndb_filter *, struct ndb_note *);
+void ndb_filter_reset(struct ndb_filter *);
+void ndb_filter_end_field(struct ndb_filter *);
+void ndb_filter_free(struct ndb_filter *filter);
+
 // stats
 int ndb_stat(struct ndb *ndb, struct ndb_stat *stat);
 void ndb_stat_counts_init(struct ndb_stat_counts *counts);
diff --git a/test.c b/test.c
@@ -47,6 +47,72 @@ static void print_search(struct ndb_txn *txn, struct ndb_search *search)
 }
 
 
+static void test_filters()
+{
+	struct ndb_filter filter, *f;
+	struct ndb_note *note;
+	unsigned char buffer[4096];
+
+	const char *test_note = "{\"id\": \"160e76ca67405d7ce9ef7d2dd72f3f36401c8661a73d45498af842d40b01b736\",\"pubkey\": \"67c67870aebc327eb2a2e765e6dbb42f0f120d2c4e4e28dc16b824cf72a5acc1\",\"created_at\": 1700688516,\"kind\": 1337,\"tags\": [[\"t\",\"hashtag\"],[\"t\",\"grownostr\"],[\"p\",\"4d2e7a6a8e08007ace5a03391d21735f45caf1bf3d67b492adc28967ab46525e\"]],\"content\": \"\",\"sig\": \"20c2d070261ed269559ada40ca5ac395c389681ee3b5f7d50de19dd9b328dd70cf27d9d13875e87c968d9b49fa05f66e90f18037be4529b9e582c7e2afac3f06\"}";
+
+	assert(ndb_note_from_json(test_note, strlen(test_note), &note, buffer, sizeof(buffer)));
+
+	f = &filter;
+
+	assert(ndb_filter_init(f));
+	assert(ndb_filter_start_field(f, NDB_FILTER_KINDS));
+	assert(ndb_filter_add_int_element(f, 1337));
+	assert(ndb_filter_add_int_element(f, 2));
+
+	assert(f->current->count == 2);
+	assert(f->current->field.type == NDB_FILTER_KINDS);
+
+	// can't start if we've already started
+	assert(ndb_filter_start_field(f, NDB_FILTER_KINDS) == 0);
+	assert(ndb_filter_start_field(f, NDB_FILTER_GENERIC) == 0);
+	ndb_filter_end_field(f);
+
+	// try matching the filter
+	assert(ndb_filter_matches(f, note));
+
+	note->kind = 1;
+
+	// inverse match
+	assert(!ndb_filter_matches(f, note));
+
+	// should also match 2
+	note->kind = 2;
+	assert(ndb_filter_matches(f, note));
+
+	// don't free, just reset data pointers
+	ndb_filter_reset(f);
+
+	// now try generic matches
+	assert(ndb_filter_start_generic_field(f, 't'));
+	assert(ndb_filter_add_str_element(f, "grownostr"));
+	ndb_filter_end_field(f);
+	assert(ndb_filter_start_field(f, NDB_FILTER_KINDS));
+	assert(ndb_filter_add_int_element(f, 3));
+	ndb_filter_end_field(f);
+
+	// shouldn't match the kind filter
+	assert(!ndb_filter_matches(f, note));
+
+	note->kind = 3;
+
+	// now it should
+	assert(ndb_filter_matches(f, note));
+
+	ndb_filter_reset(f);
+	assert(ndb_filter_start_field(f, NDB_FILTER_AUTHORS));
+	assert(ndb_filter_add_id_element(f, note->pubkey));
+	ndb_filter_end_field(f);
+	assert(f->current == NULL);
+	assert(ndb_filter_matches(f, note));
+
+	ndb_filter_free(f);
+}
+
 // Test fetched_at profile records. These are saved when new profiles are
 // processed, or the last time we've fetched the profile.
 static void test_fetched_at()
@@ -877,6 +943,7 @@ static void test_fast_strchr()
 }
 
 int main(int argc, const char *argv[]) {
+	test_filters();
 	test_migrate();
 	test_fetched_at();
 	test_profile_updates();

	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	nostrdb.c	\|	388	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	nostrdb.h	\|	57	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test.c	\|	67	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++