nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

commit 30ed801285dd80b07f4441138ca502c9a1ce25f2
parent 7c5ddd44b06e1e15e3f44e83fb34187537c35d7d
Author: William Casarin <jb55@jb55.com>
Date:   Wed, 22 Nov 2023 13:46:12 -0800

filters: add initial filter interface

This adds a way to construct filters and match them against notes. This
will be used by our query interface in the future.

Diffstat:
Mnostrdb.c | 388+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mnostrdb.h | 57+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest.c | 67+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 512 insertions(+), 0 deletions(-)

diff --git a/nostrdb.c b/nostrdb.c @@ -29,6 +29,9 @@ static const int THREAD_QUEUE_BATCH = 4096; // the maximum size of inbox queues static const int DEFAULT_QUEUE_SIZE = 1000000; +// increase if we need bigger filters +#define NDB_FILTER_PAGES 64 + #define ndb_flag_set(flags, f) ((flags & f) == f) #define NDB_PARSED_ID (1 << 0) @@ -144,6 +147,391 @@ static void lowercase_strncpy(char *dst, const char *src, int n) { } } +int ndb_filter_init(struct ndb_filter *filter) +{ + struct cursor cur; + int page_size, elem_pages, data_pages, buf_size; + + page_size = 4096; // assuming this, not a big deal if we're wrong + elem_pages = NDB_FILTER_PAGES / 4; + data_pages = NDB_FILTER_PAGES - elem_pages; + buf_size = page_size * NDB_FILTER_PAGES; + + unsigned char *buf = malloc(buf_size); + if (!buf) + return 0; + + // init memory arena for the cursor + make_cursor(buf, buf + buf_size, &cur); + + cursor_slice(&cur, &filter->elem_buf, page_size * elem_pages); + cursor_slice(&cur, &filter->data_buf, page_size * data_pages); + + // make sure we are fully allocated + assert(cur.p == cur.end); + + // make sure elem_buf is the start of the buffer + assert(filter->elem_buf.start == cur.start); + + filter->num_elements = 0; + filter->elements[0] = (struct ndb_filter_elements*) buf; + filter->current = NULL; + + return 1; +} + +void ndb_filter_reset(struct ndb_filter *filter) +{ + filter->num_elements = 0; + filter->elem_buf.p = filter->elem_buf.start; + filter->data_buf.p = filter->data_buf.start; + filter->current = NULL; +} + +void ndb_filter_free(struct ndb_filter *filter) +{ + if (filter->elem_buf.start) + free(filter->elem_buf.start); + + memset(filter, 0, sizeof(*filter)); +} + +static const char *ndb_filter_field_name(enum ndb_filter_fieldtype field) +{ + switch (field) { + case NDB_FILTER_IDS: return "ids"; + case NDB_FILTER_AUTHORS: return "authors"; + case NDB_FILTER_KINDS: return "kinds"; + case NDB_FILTER_GENERIC: return "generic"; + case NDB_FILTER_SINCE: return "since"; + case NDB_FILTER_UNTIL: return "until"; + case NDB_FILTER_LIMIT: return "limit"; + } + + return "unknown"; +} + +static int ndb_filter_start_field_impl(struct ndb_filter *filter, enum ndb_filter_fieldtype field, char generic) +{ + int i; + struct ndb_filter_elements *els, *el; + + if (filter->current) { + fprintf(stderr, "ndb_filter_start_field: filter field already in progress, did you forget to call ndb_filter_end_field?\n"); + return 0; + } + + // you can only start and end fields once + for (i = 0; i < filter->num_elements; i++) { + el = filter->elements[i]; + if (el->field.type == field) { + fprintf(stderr, "ndb_filter_start_field: field '%s' already exists\n", + ndb_filter_field_name(field)); + return 0; + } + } + + els = (struct ndb_filter_elements *) filter->elem_buf.p ; + filter->current = els; + + // advance elem buffer to the variable data section + if (!cursor_skip(&filter->elem_buf, sizeof(struct ndb_filter_elements))) { + fprintf(stderr, "ndb_filter_start_field: '%s' oom (todo: realloc?)\n", + ndb_filter_field_name(field)); + return 0; + } + + els->field.type = field; + els->field.generic = generic; + els->field.elem_type = 0; + els->count = 0; + + return 1; +} + +int ndb_filter_start_field(struct ndb_filter *filter, enum ndb_filter_fieldtype field) +{ + return ndb_filter_start_field_impl(filter, field, 0); +} + +int ndb_filter_start_generic_field(struct ndb_filter *filter, char tag) +{ + return ndb_filter_start_field_impl(filter, NDB_FILTER_GENERIC, tag); +} + +static int ndb_filter_add_element(struct ndb_filter *filter, union ndb_filter_element el) +{ + unsigned char *data; + const char *str; + + if (!filter->current) + return 0; + + data = filter->data_buf.p; + + switch (filter->current->field.type) { + case NDB_FILTER_IDS: + case NDB_FILTER_AUTHORS: + if (!cursor_push(&filter->data_buf, (unsigned char *)el.id, 32)) + return 0; + el.id = data; + break; + case NDB_FILTER_KINDS: + break; + case NDB_FILTER_SINCE: + case NDB_FILTER_UNTIL: + case NDB_FILTER_LIMIT: + // only one allowed for since/until + if (filter->current->count != 0) + return 0; + break; + case NDB_FILTER_GENERIC: + str = (const char *)filter->data_buf.p; + if (!cursor_push_c_str(&filter->data_buf, el.string)) + return 0; + // push a pointer of the string in the databuf as an element + el.string = str; + break; + } + + if (!cursor_push(&filter->elem_buf, (unsigned char*)&el, sizeof(el))) + return 0; + + filter->current->count++; + + return 1; +} + +static int ndb_filter_set_elem_type(struct ndb_filter *filter, + enum ndb_generic_element_type elem_type) +{ + enum ndb_generic_element_type current_elem_type; + + if (!filter->current) + return 0; + + current_elem_type = filter->current->field.elem_type; + + // element types must be uniform + if (current_elem_type != elem_type && current_elem_type != NDB_ELEMENT_UNKNOWN) { + fprintf(stderr, "ndb_filter_set_elem_type: element types must be uniform\n"); + return 0; + } + + filter->current->field.elem_type = elem_type; + + return 1; +} + +int ndb_filter_add_str_element(struct ndb_filter *filter, const char *str) +{ + union ndb_filter_element el; + + if (!filter->current) + return 0; + + // only generic queries are allowed to have strings + switch (filter->current->field.type) { + case NDB_FILTER_SINCE: + case NDB_FILTER_UNTIL: + case NDB_FILTER_LIMIT: + case NDB_FILTER_IDS: + case NDB_FILTER_AUTHORS: + case NDB_FILTER_KINDS: + return 0; + case NDB_FILTER_GENERIC: + break; + } + + if (!ndb_filter_set_elem_type(filter, NDB_ELEMENT_STRING)) + return 0; + + el.string = str; + return ndb_filter_add_element(filter, el); +} + +int ndb_filter_add_int_element(struct ndb_filter *filter, uint64_t integer) +{ + union ndb_filter_element el; + if (!filter->current) + return 0; + + switch (filter->current->field.type) { + case NDB_FILTER_IDS: + case NDB_FILTER_AUTHORS: + case NDB_FILTER_GENERIC: + return 0; + case NDB_FILTER_KINDS: + case NDB_FILTER_SINCE: + case NDB_FILTER_UNTIL: + case NDB_FILTER_LIMIT: + break; + } + + el.integer = integer; + + return ndb_filter_add_element(filter, el); +} + +int ndb_filter_add_id_element(struct ndb_filter *filter, const unsigned char *id) +{ + union ndb_filter_element el; + + if (!filter->current) + return 0; + + // only certain filter types allow pushing id elements + switch (filter->current->field.type) { + case NDB_FILTER_SINCE: + case NDB_FILTER_UNTIL: + case NDB_FILTER_LIMIT: + return 0; + case NDB_FILTER_IDS: + case NDB_FILTER_AUTHORS: + case NDB_FILTER_KINDS: + case NDB_FILTER_GENERIC: + break; + } + + if (!ndb_filter_set_elem_type(filter, NDB_ELEMENT_ID)) + return 0; + + // this is needed so that generic filters know its an id + el.id = id; + + return ndb_filter_add_element(filter, el); +} + +// TODO: build a hashtable so this is O(1) +static int ndb_generic_filter_matches(struct ndb_filter_elements *els, + struct ndb_note *note) +{ + int i; + union ndb_filter_element el; + struct ndb_iterator iter, *it = &iter; + struct ndb_str str; + + ndb_tags_iterate_start(note, it); + + while (ndb_tags_iterate_next(it)) { + // we're looking for tags with 2 or more entries: ["p", id], etc + if (it->tag->count < 2) + continue; + + str = ndb_note_str(note, &it->tag->strs[0]); + + // we only care about packed strings (single char, etc) + if (str.flag != NDB_PACKED_STR) + continue; + + // do we have #e matching e (or p, etc) + if (str.str[0] != els->field.generic) + continue; + + str = ndb_note_str(note, &it->tag->strs[1]); + + switch (els->field.elem_type) { + case NDB_ELEMENT_ID: + // if our filter element type is an id, then we + // expect a packed id in the tag, otherwise skip + if (str.flag != NDB_PACKED_ID) + continue; + break; + case NDB_ELEMENT_STRING: + // if our filter element type is a string, then + // we should not expect an id + if (str.flag == NDB_PACKED_ID) + continue; + break; + case NDB_ELEMENT_UNKNOWN: + default: + // For some reason the element type is not set. It's + // possible nothing was added to the generic filter? + // Let's just fail here and log a note for debugging + fprintf(stderr, "UNUSUAL ndb_generic_filter_matches: have unknown element type %d\n", els->field.elem_type); + return 0; + } + + for (i = 0; i < els->count; i++) { + el = els->elements[i]; + switch (els->field.elem_type) { + case NDB_ELEMENT_ID: + if (!memcmp(el.id, str.id, 32)) + return 1; + break; + case NDB_ELEMENT_STRING: + if (!strcmp(el.string, str.str)) + return 1; + break; + case NDB_ELEMENT_UNKNOWN: + return 0; + } + } + } + + return 0; +} + +// returns 1 if a filter matches a note +int ndb_filter_matches(struct ndb_filter *filter, struct ndb_note *note) +{ + int i, j; + struct ndb_filter_elements *els; + + for (i = 0; i < filter->num_elements; i++) { + els = filter->elements[i]; + + switch (els->field.type) { + case NDB_FILTER_KINDS: + for (j = 0; j < els->count; j++) { + if ((unsigned int)els->elements[j].integer == note->kind) + goto cont; + } + break; + // TODO: add filter hashtable for large id lists + case NDB_FILTER_IDS: + for (j = 0; j < els->count; j++) { + if (!memcmp(els->elements[j].id, note->id, 32)) + goto cont; + } + break; + case NDB_FILTER_AUTHORS: + for (j = 0; j < els->count; j++) { + if (!memcmp(els->elements[j].id, note->pubkey, 32)) + goto cont; + } + break; + case NDB_FILTER_GENERIC: + if (ndb_generic_filter_matches(els, note)) + continue; + break; + case NDB_FILTER_SINCE: + assert(els->count == 1); + if (note->created_at >= els->elements[0].integer) + continue; + break; + case NDB_FILTER_UNTIL: + assert(els->count == 1); + if (note->created_at < els->elements[0].integer) + continue; + case NDB_FILTER_LIMIT: +cont: + continue; + } + + // all need to match + return 0; + } + + return 1; +} + +void ndb_filter_end_field(struct ndb_filter *filter) +{ + filter->elements[filter->num_elements++] = filter->current; + filter->current = NULL; +} + static void ndb_make_search_key(struct ndb_search_key *key, unsigned char *id, uint64_t timestamp, const char *search) { diff --git a/nostrdb.h b/nostrdb.h @@ -223,6 +223,51 @@ struct ndb_iterator { int index; }; +enum ndb_filter_fieldtype { + NDB_FILTER_IDS = 1, + NDB_FILTER_AUTHORS = 2, + NDB_FILTER_KINDS = 3, + NDB_FILTER_GENERIC = 4, + NDB_FILTER_SINCE = 5, + NDB_FILTER_UNTIL = 6, + NDB_FILTER_LIMIT = 7, +}; +#define NDB_NUM_FILTERS 7 + +// when matching generic tags, we need to know if we're dealing with +// a pointer to a 32-byte ID or a null terminated string +enum ndb_generic_element_type { + NDB_ELEMENT_UNKNOWN = 0, + NDB_ELEMENT_STRING = 1, + NDB_ELEMENT_ID = 2, +}; + +union ndb_filter_element { + const char *string; + const unsigned char *id; + uint64_t integer; +}; + +struct ndb_filter_field { + enum ndb_filter_fieldtype type; + enum ndb_generic_element_type elem_type; + char generic; // for generic queries like #t +}; + +struct ndb_filter_elements { + struct ndb_filter_field field; + int count; + union ndb_filter_element elements[0]; +}; + +struct ndb_filter { + struct cursor elem_buf; + struct cursor data_buf; + int num_elements; + struct ndb_filter_elements *current; + struct ndb_filter_elements *elements[NDB_NUM_FILTERS]; +}; + // HELPERS int ndb_calculate_id(struct ndb_note *note, unsigned char *buf, int buflen); int ndb_sign_id(struct ndb_keypair *keypair, unsigned char id[32], unsigned char sig[64]); @@ -269,6 +314,18 @@ void ndb_builder_set_kind(struct ndb_builder *builder, uint32_t kind); int ndb_builder_new_tag(struct ndb_builder *builder); int ndb_builder_push_tag_str(struct ndb_builder *builder, const char *str, int len); +// FILTERS +int ndb_filter_init(struct ndb_filter *); +int ndb_filter_add_id_element(struct ndb_filter *, const unsigned char *id); +int ndb_filter_add_int_element(struct ndb_filter *, uint64_t integer); +int ndb_filter_add_str_element(struct ndb_filter *, const char *str); +int ndb_filter_start_field(struct ndb_filter *, enum ndb_filter_fieldtype); +int ndb_filter_start_generic_field(struct ndb_filter *, char tag); +int ndb_filter_matches(struct ndb_filter *, struct ndb_note *); +void ndb_filter_reset(struct ndb_filter *); +void ndb_filter_end_field(struct ndb_filter *); +void ndb_filter_free(struct ndb_filter *filter); + // stats int ndb_stat(struct ndb *ndb, struct ndb_stat *stat); void ndb_stat_counts_init(struct ndb_stat_counts *counts); diff --git a/test.c b/test.c @@ -47,6 +47,72 @@ static void print_search(struct ndb_txn *txn, struct ndb_search *search) } +static void test_filters() +{ + struct ndb_filter filter, *f; + struct ndb_note *note; + unsigned char buffer[4096]; + + const char *test_note = "{\"id\": \"160e76ca67405d7ce9ef7d2dd72f3f36401c8661a73d45498af842d40b01b736\",\"pubkey\": \"67c67870aebc327eb2a2e765e6dbb42f0f120d2c4e4e28dc16b824cf72a5acc1\",\"created_at\": 1700688516,\"kind\": 1337,\"tags\": [[\"t\",\"hashtag\"],[\"t\",\"grownostr\"],[\"p\",\"4d2e7a6a8e08007ace5a03391d21735f45caf1bf3d67b492adc28967ab46525e\"]],\"content\": \"\",\"sig\": \"20c2d070261ed269559ada40ca5ac395c389681ee3b5f7d50de19dd9b328dd70cf27d9d13875e87c968d9b49fa05f66e90f18037be4529b9e582c7e2afac3f06\"}"; + + assert(ndb_note_from_json(test_note, strlen(test_note), &note, buffer, sizeof(buffer))); + + f = &filter; + + assert(ndb_filter_init(f)); + assert(ndb_filter_start_field(f, NDB_FILTER_KINDS)); + assert(ndb_filter_add_int_element(f, 1337)); + assert(ndb_filter_add_int_element(f, 2)); + + assert(f->current->count == 2); + assert(f->current->field.type == NDB_FILTER_KINDS); + + // can't start if we've already started + assert(ndb_filter_start_field(f, NDB_FILTER_KINDS) == 0); + assert(ndb_filter_start_field(f, NDB_FILTER_GENERIC) == 0); + ndb_filter_end_field(f); + + // try matching the filter + assert(ndb_filter_matches(f, note)); + + note->kind = 1; + + // inverse match + assert(!ndb_filter_matches(f, note)); + + // should also match 2 + note->kind = 2; + assert(ndb_filter_matches(f, note)); + + // don't free, just reset data pointers + ndb_filter_reset(f); + + // now try generic matches + assert(ndb_filter_start_generic_field(f, 't')); + assert(ndb_filter_add_str_element(f, "grownostr")); + ndb_filter_end_field(f); + assert(ndb_filter_start_field(f, NDB_FILTER_KINDS)); + assert(ndb_filter_add_int_element(f, 3)); + ndb_filter_end_field(f); + + // shouldn't match the kind filter + assert(!ndb_filter_matches(f, note)); + + note->kind = 3; + + // now it should + assert(ndb_filter_matches(f, note)); + + ndb_filter_reset(f); + assert(ndb_filter_start_field(f, NDB_FILTER_AUTHORS)); + assert(ndb_filter_add_id_element(f, note->pubkey)); + ndb_filter_end_field(f); + assert(f->current == NULL); + assert(ndb_filter_matches(f, note)); + + ndb_filter_free(f); +} + // Test fetched_at profile records. These are saved when new profiles are // processed, or the last time we've fetched the profile. static void test_fetched_at() @@ -877,6 +943,7 @@ static void test_fast_strchr() } int main(int argc, const char *argv[]) { + test_filters(); test_migrate(); test_fetched_at(); test_profile_updates();