nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

commit 7e3651889ac7abc1e2e14ae9503e094bd2a4cdad
parent 2a4eb709b2be3236d7417e9a407b757241ff90bb
Author: William Casarin <jb55@jb55.com>
Date:   Mon, 13 Jan 2025 18:19:22 -0800

Implement nip50 fulltext searching

This adds support for nip50 fulltext searches. This allows you to use
the nostrdb query interface for executing fulltext searches instead of
the typical `ndb_text_search` api. The benefits of this include a
standardized query interface that also further filters on other fields
in the filter.

Changelog-Added: Add nip50 search filters and queries

Diffstat:
Mndb.c | 50++++++++++++--------------------------------------
Msrc/nostrdb.c | 116++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
Msrc/nostrdb.h | 4++++
3 files changed, 113 insertions(+), 57 deletions(-)

diff --git a/ndb.c b/ndb.c @@ -17,8 +17,7 @@ static int usage() printf("usage: ndb [--skip-verification] [-d db_dir] <command>\n\n"); printf("commands\n\n"); printf(" stat\n"); - printf(" search [--oldest-first] [--limit 42] <fulltext query>\n"); - printf(" query [-k 42] [-k 1337] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n"); + printf(" query [-k 42] [-k 1337] [--search term] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n"); printf(" profile <pubkey>\n"); printf(" print-search-keys\n"); printf(" print-kind-keys\n"); @@ -136,14 +135,11 @@ int main(int argc, char *argv[]) long nanos; struct ndb_stat stat; struct ndb_txn txn; - struct ndb_text_search_results results; - struct ndb_text_search_result *result; const char *dir; unsigned char *data; size_t data_len; struct ndb_config config; struct timespec t1, t2; - struct ndb_text_search_config search_config; unsigned char tmp_id[32]; // profiles @@ -154,7 +150,6 @@ int main(int argc, char *argv[]) res = 0; ndb_default_config(&config); - ndb_default_text_search_config(&search_config); ndb_config_set_mapsize(&config, 1024ULL * 1024ULL * 1024ULL * 1024ULL /* 1 TiB */); if (argc < 2) { @@ -184,38 +179,7 @@ int main(int argc, char *argv[]) return 2; } - if (argc >= 3 && !strcmp(argv[1], "search")) { - for (i = 0; i < 2; i++) { - if (!strcmp(argv[2], "--oldest-first")) { - ndb_text_search_config_set_order(&search_config, NDB_ORDER_ASCENDING); - argv++; - argc--; - } else if (!strcmp(argv[2], "--limit") || !strcmp(argv[2], "-l")) { - limit = atoi(argv[3]); - ndb_text_search_config_set_limit(&search_config, limit); - argv += 2; - argc -= 2; - } - } - - ndb_begin_query(ndb, &txn); - clock_gettime(CLOCK_MONOTONIC, &t1); - ndb_text_search(&txn, argv[2], &results, &search_config); - clock_gettime(CLOCK_MONOTONIC, &t2); - - nanos = (t2.tv_sec - t1.tv_sec) * (long)1e9 + (t2.tv_nsec - t1.tv_nsec); - - fprintf(stderr, "%d results in %f ms\n", results.num_results, nanos/1000000.0); - - // print results for now - for (i = 0; i < results.num_results; i++) { - result = &results.results[i]; - //fprintf(stderr, "[%02d] ", i+1); - ndb_print_text_search_result(&txn, result); - } - - ndb_end_query(&txn); - } else if (argc == 2 && !strcmp(argv[1], "stat")) { + if (argc == 2 && !strcmp(argv[1], "stat")) { if (!ndb_stat(ndb, &stat)) { res = 3; goto cleanup; @@ -282,6 +246,16 @@ int main(int argc, char *argv[]) ndb_filter_end_field(f); argv += 2; argc -= 2; + } else if (!strcmp(argv[0], "--search") || !strcmp(argv[0], "-S")) { + if (current_field) { + ndb_filter_end_field(f); + current_field = 0; + } + ndb_filter_start_field(f, NDB_FILTER_SEARCH); + ndb_filter_add_str_element(f, argv[1]); + ndb_filter_end_field(f); + argv += 2; + argc -= 2; } else if (!strcmp(argv[0], "-e")) { if (current_field != 'e') { if (!ndb_filter_start_tag_field(f, 'e')) { diff --git a/src/nostrdb.c b/src/nostrdb.c @@ -230,6 +230,7 @@ enum ndb_query_plan { NDB_PLAN_AUTHOR_KINDS, NDB_PLAN_CREATED, NDB_PLAN_TAGS, + NDB_PLAN_SEARCH, }; // A id + u64 + timestamp @@ -334,27 +335,27 @@ static int ndb_make_noted_text_search_key(unsigned char *buf, int bufsize, static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize, int wordlen, const char *word, + uint64_t since, int *keysize) { - uint64_t timestamp, note_id; - timestamp = 0; + uint64_t note_id; note_id = 0; return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word, - timestamp, note_id, keysize); + since, note_id, keysize); } static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize, int wordlen, const char *word, + uint64_t until, int *keysize) { - uint64_t timestamp, note_id; - timestamp = INT32_MAX; + uint64_t note_id; note_id = INT32_MAX; return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word, - timestamp, note_id, keysize); + until, note_id, keysize); } -typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, int *keysize); +typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, uint64_t timestamp, int *keysize); /** From LMDB: Compare two items lexically */ static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) { @@ -3044,6 +3045,43 @@ static int query_is_full(struct ndb_query_results *results, int limit) return cursor_count(&results->cur, sizeof(struct ndb_query_result)) >= limit; } +static int ndb_query_plan_execute_search(struct ndb_txn *txn, + struct ndb_filter *filter, + struct ndb_query_results *results, + int limit) +{ + const char *search; + int i; + struct ndb_text_search_results text_results; + struct ndb_text_search_result *text_result; + struct ndb_text_search_config config; + struct ndb_query_result result; + + ndb_default_text_search_config(&config); + + if (!(search = ndb_filter_find_search(filter))) + return 0; + + if (!ndb_text_search_with(txn, search, &text_results, &config, filter)) + return 0; + + for (i = 0; i < text_results.num_results; i++) { + if (query_is_full(results, limit)) + break; + + text_result = &text_results.results[i]; + + result.note = text_result->note; + result.note_size = text_result->note_size; + result.note_id = text_result->key.note_id; + + if (!push_query_result(results, &result)) + break; + } + + return 1; +} + static int ndb_query_plan_execute_ids(struct ndb_txn *txn, struct ndb_filter *filter, struct ndb_query_results *results, @@ -3456,15 +3494,18 @@ next: static enum ndb_query_plan ndb_filter_plan(struct ndb_filter *filter) { - struct ndb_filter_elements *ids, *kinds, *authors, *tags; + struct ndb_filter_elements *ids, *kinds, *authors, *tags, *search; ids = ndb_filter_find_elements(filter, NDB_FILTER_IDS); + search = ndb_filter_find_elements(filter, NDB_FILTER_SEARCH); kinds = ndb_filter_find_elements(filter, NDB_FILTER_KINDS); authors = ndb_filter_find_elements(filter, NDB_FILTER_AUTHORS); tags = ndb_filter_find_elements(filter, NDB_FILTER_TAGS); // this is rougly similar to the heuristic in strfry's dbscan - if (ids) { + if (search) { + return NDB_PLAN_SEARCH; + } else if (ids) { return NDB_PLAN_IDS; } else if (kinds && authors && authors->count <= 10) { return NDB_PLAN_AUTHOR_KINDS; @@ -3483,6 +3524,7 @@ static const char *ndb_query_plan_name(int plan_id) { switch (plan_id) { case NDB_PLAN_IDS: return "ids"; + case NDB_PLAN_SEARCH: return "search"; case NDB_PLAN_KINDS: return "kinds"; case NDB_PLAN_TAGS: return "tags"; case NDB_PLAN_CREATED: return "created"; @@ -3518,6 +3560,11 @@ static int ndb_query_filter(struct ndb_txn *txn, struct ndb_filter *filter, return 0; break; + case NDB_PLAN_SEARCH: + if (!ndb_query_plan_execute_search(txn, filter, &results, limit)) + return 0; + break; + // We have just kinds, just scan the kind index case NDB_PLAN_KINDS: if (!ndb_query_plan_execute_kinds(txn, filter, &results, limit)) @@ -4031,6 +4078,7 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query, struct ndb_word *search_word; struct ndb_note *note; struct cursor cur; + uint64_t since, until, timestamp_op, *pint, note_size; ndb_text_search_key_order_fn key_order_fn; MDB_dbi text_db; MDB_cursor *cursor; @@ -4038,17 +4086,36 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query, int i, j, keysize, saved_size, limit; MDB_cursor_op op, order_op; + note_size = 0; + note = 0; saved = NULL; ndb_text_search_results_init(results); ndb_search_words_init(&search_words); - // search config + until = UINT64_MAX; + since = 0; limit = MAX_TEXT_SEARCH_RESULTS; + + // until, since from filter + if (filter != NULL) { + if ((pint = ndb_filter_get_int(filter, NDB_FILTER_UNTIL))) + until = *pint; + + if ((pint = ndb_filter_get_int(filter, NDB_FILTER_SINCE))) + since = *pint; + + if ((pint = ndb_filter_get_int(filter, NDB_FILTER_LIMIT))) + limit = *pint; + } + order_op = MDB_PREV; key_order_fn = ndb_make_text_search_key_high; + timestamp_op = until; if (config) { if (config->order == NDB_ORDER_ASCENDING) { order_op = MDB_NEXT; + // set the min timestamp value to since when ascending + timestamp_op = since; key_order_fn = ndb_make_text_search_key_low; } limit = min(limit, config->limit); @@ -4067,9 +4134,11 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query, return 0; } - // TODO: sort words from largest to smallest. This should complete the - // query quicker because the larger words are likely to have fewer - // entries in the search index. + // This should complete the query quicker because the larger words are + // likely to have fewer entries in the search index. This is not always + // true. Words with higher frequency (like bitcoin on nostr in 2024) + // may be slower. TODO: Skip word recursion by leveraging a minimal + // perfect hashmap of parsed words on a note sort_largest_to_smallest(&search_words); // for each word, we recursively find all of the submatches @@ -4099,7 +4168,9 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query, // match if (!key_order_fn(buffer, sizeof(buffer), search_words.words[0].word_len, - search_words.words[0].word, &keysize)) + search_words.words[0].word, + timestamp_op, + &keysize)) { // word is too big to fit in 1024-sized key continue; @@ -4172,10 +4243,12 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query, // save the first key match, since we will continue from // this on the next root word result - if (j == 0 && !saved) { - memcpy(saved_buf, k.mv_data, k.mv_size); - saved = saved_buf; - saved_size = k.mv_size; + if (j == 0) { + if (!saved) { + memcpy(saved_buf, k.mv_data, k.mv_size); + saved = saved_buf; + saved_size = k.mv_size; + } // since we will be trying to match the same // note_id on all subsequent word matches, @@ -4185,15 +4258,20 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query, // remaining word queries if (filter) { if ((note = ndb_get_note_by_key(txn, - result->key.note_id, NULL))) + result->key.note_id, + &note_size))) { if (!ndb_filter_matches(filter, note)) { break; } + result->note = note; + result->note_size = note_size; } } } + result->note = note; + result->note_size = note_size; last_candidate = *result; last_result = &last_candidate; } diff --git a/src/nostrdb.h b/src/nostrdb.h @@ -314,6 +314,10 @@ struct ndb_text_search_key struct ndb_text_search_result { struct ndb_text_search_key key; int prefix_chars; + + // This is only set if we passed a filter for nip50 searches + struct ndb_note *note; + uint64_t note_size; }; struct ndb_text_search_results {