Implement nip50 fulltext searching - nostrdb - an unfairly fast embedded nostr database backed by lmdb

commit 7e3651889ac7abc1e2e14ae9503e094bd2a4cdad
parent 2a4eb709b2be3236d7417e9a407b757241ff90bb
Author: William Casarin <jb55@jb55.com>
Date:   Mon, 13 Jan 2025 18:19:22 -0800

Implement nip50 fulltext searching

This adds support for nip50 fulltext searches. This allows you to use
the nostrdb query interface for executing fulltext searches instead of
the typical `ndb_text_search` api. The benefits of this include a
standardized query interface that also further filters on other fields
in the filter.

Changelog-Added: Add nip50 search filters and queries

Diffstat:
M ndb.c  | 50 ++++++++++++--------------------------------------
M src/nostrdb.c  | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M src/nostrdb.h  | 4 ++++

3 files changed, 113 insertions(+), 57 deletions(-)
diff --git a/ndb.c b/ndb.c
@@ -17,8 +17,7 @@ static int usage()
 	printf("usage: ndb [--skip-verification] [-d db_dir] <command>\n\n");
 	printf("commands\n\n");
 	printf("	stat\n");
-	printf("	search [--oldest-first] [--limit 42] <fulltext query>\n");
-	printf("	query [-k 42] [-k 1337] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n");
+	printf("	query [-k 42] [-k 1337] [--search term] [-l 42] [-e abcdef...] [-a abcdef... -a bcdef...]\n");
 	printf("	profile <pubkey>\n");
 	printf("	print-search-keys\n");
 	printf("	print-kind-keys\n");
@@ -136,14 +135,11 @@ int main(int argc, char *argv[])
 	long nanos;
 	struct ndb_stat stat;
 	struct ndb_txn txn;
-	struct ndb_text_search_results results;
-	struct ndb_text_search_result *result;
 	const char *dir;
 	unsigned char *data;
 	size_t data_len;
 	struct ndb_config config;
 	struct timespec t1, t2;
-	struct ndb_text_search_config search_config;
 	unsigned char tmp_id[32];
 
 	// profiles
@@ -154,7 +150,6 @@ int main(int argc, char *argv[])
 
 	res = 0;
 	ndb_default_config(&config);
-	ndb_default_text_search_config(&search_config);
 	ndb_config_set_mapsize(&config, 1024ULL * 1024ULL * 1024ULL * 1024ULL /* 1 TiB */);
 
 	if (argc < 2) {
@@ -184,38 +179,7 @@ int main(int argc, char *argv[])
 		return 2;
 	}
 
-	if (argc >= 3 && !strcmp(argv[1], "search")) {
-		for (i = 0; i < 2; i++) {
-			if (!strcmp(argv[2], "--oldest-first")) {
-				ndb_text_search_config_set_order(&search_config, NDB_ORDER_ASCENDING);
-				argv++;
-				argc--;
-			} else if (!strcmp(argv[2], "--limit") || !strcmp(argv[2], "-l")) {
-				limit = atoi(argv[3]);
-				ndb_text_search_config_set_limit(&search_config, limit);
-				argv += 2;
-				argc -= 2;
-			}
-		}
-
-		ndb_begin_query(ndb, &txn);
-		clock_gettime(CLOCK_MONOTONIC, &t1);
-		ndb_text_search(&txn, argv[2], &results, &search_config);
-		clock_gettime(CLOCK_MONOTONIC, &t2);
-
-		nanos = (t2.tv_sec - t1.tv_sec) * (long)1e9 + (t2.tv_nsec - t1.tv_nsec);
-
-		fprintf(stderr, "%d results in %f ms\n", results.num_results, nanos/1000000.0);
-
-		// print results for now
-		for (i = 0; i < results.num_results; i++) {
-			result = &results.results[i];
-			//fprintf(stderr, "[%02d] ", i+1);
-			ndb_print_text_search_result(&txn, result);
-		}
-
-		ndb_end_query(&txn);
-	} else if (argc == 2 && !strcmp(argv[1], "stat")) {
+	if (argc == 2 && !strcmp(argv[1], "stat")) {
 		if (!ndb_stat(ndb, &stat)) {
 			res = 3;
 			goto cleanup;
@@ -282,6 +246,16 @@ int main(int argc, char *argv[])
 				ndb_filter_end_field(f);
 				argv += 2;
 				argc -= 2;
+			} else if (!strcmp(argv[0], "--search") || !strcmp(argv[0], "-S")) {
+				if (current_field) {
+					ndb_filter_end_field(f);
+					current_field = 0;
+				}
+				ndb_filter_start_field(f, NDB_FILTER_SEARCH);
+				ndb_filter_add_str_element(f, argv[1]);
+				ndb_filter_end_field(f);
+				argv += 2;
+				argc -= 2;
 			} else if (!strcmp(argv[0], "-e")) {
 				if (current_field != 'e') {
 					if (!ndb_filter_start_tag_field(f, 'e')) {
diff --git a/src/nostrdb.c b/src/nostrdb.c
@@ -230,6 +230,7 @@ enum ndb_query_plan {
 	NDB_PLAN_AUTHOR_KINDS,
 	NDB_PLAN_CREATED,
 	NDB_PLAN_TAGS,
+	NDB_PLAN_SEARCH,
 };
 
 // A id + u64 + timestamp
@@ -334,27 +335,27 @@ static int ndb_make_noted_text_search_key(unsigned char *buf, int bufsize,
 
 static int ndb_make_text_search_key_low(unsigned char *buf, int bufsize,
 					int wordlen, const char *word,
+					uint64_t since,
 					int *keysize)
 {
-	uint64_t timestamp, note_id;
-	timestamp = 0;
+	uint64_t note_id;
 	note_id = 0;
 	return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
-					timestamp, note_id, keysize);
+					since, note_id, keysize);
 }
 
 static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
 					 int wordlen, const char *word,
+					 uint64_t until,
 					 int *keysize)
 {
-	uint64_t timestamp, note_id;
-	timestamp = INT32_MAX;
+	uint64_t note_id;
 	note_id = INT32_MAX;
 	return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
-					timestamp, note_id, keysize);
+					until, note_id, keysize);
 }
 
-typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, int *keysize);
+typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, uint64_t timestamp, int *keysize);
 
 /** From LMDB: Compare two items lexically */
 static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
@@ -3044,6 +3045,43 @@ static int query_is_full(struct ndb_query_results *results, int limit)
 	return cursor_count(&results->cur, sizeof(struct ndb_query_result)) >= limit;
 }
 
+static int ndb_query_plan_execute_search(struct ndb_txn *txn,
+					 struct ndb_filter *filter,
+					 struct ndb_query_results *results,
+					 int limit)
+{
+	const char *search;
+	int i;
+	struct ndb_text_search_results text_results;
+	struct ndb_text_search_result *text_result;
+	struct ndb_text_search_config config;
+	struct ndb_query_result result;
+
+	ndb_default_text_search_config(&config);
+
+	if (!(search = ndb_filter_find_search(filter)))
+		return 0;
+
+	if (!ndb_text_search_with(txn, search, &text_results, &config, filter))
+		return 0;
+
+	for (i = 0; i < text_results.num_results; i++) {
+		if (query_is_full(results, limit))
+			break;
+
+		text_result = &text_results.results[i];
+
+		result.note = text_result->note;
+		result.note_size = text_result->note_size;
+		result.note_id = text_result->key.note_id;
+
+		if (!push_query_result(results, &result))
+			break;
+	}
+
+	return 1;
+}
+
 static int ndb_query_plan_execute_ids(struct ndb_txn *txn,
 				      struct ndb_filter *filter,
 				      struct ndb_query_results *results,
@@ -3456,15 +3494,18 @@ next:
 
 static enum ndb_query_plan ndb_filter_plan(struct ndb_filter *filter)
 {
-	struct ndb_filter_elements *ids, *kinds, *authors, *tags;
+	struct ndb_filter_elements *ids, *kinds, *authors, *tags, *search;
 
 	ids = ndb_filter_find_elements(filter, NDB_FILTER_IDS);
+	search = ndb_filter_find_elements(filter, NDB_FILTER_SEARCH);
 	kinds = ndb_filter_find_elements(filter, NDB_FILTER_KINDS);
 	authors = ndb_filter_find_elements(filter, NDB_FILTER_AUTHORS);
 	tags = ndb_filter_find_elements(filter, NDB_FILTER_TAGS);
 
 	// this is rougly similar to the heuristic in strfry's dbscan
-	if (ids) {
+	if (search) {
+		return NDB_PLAN_SEARCH;
+	} else if (ids) {
 		return NDB_PLAN_IDS;
 	} else if (kinds && authors && authors->count <= 10) {
 		return NDB_PLAN_AUTHOR_KINDS;
@@ -3483,6 +3524,7 @@ static const char *ndb_query_plan_name(int plan_id)
 {
 	switch (plan_id) {
 		case NDB_PLAN_IDS:     return "ids";
+		case NDB_PLAN_SEARCH:  return "search";
 		case NDB_PLAN_KINDS:   return "kinds";
 		case NDB_PLAN_TAGS:    return "tags";
 		case NDB_PLAN_CREATED: return "created";
@@ -3518,6 +3560,11 @@ static int ndb_query_filter(struct ndb_txn *txn, struct ndb_filter *filter,
 			return 0;
 		break;
 
+	case NDB_PLAN_SEARCH:
+		if (!ndb_query_plan_execute_search(txn, filter, &results, limit))
+			return 0;
+		break;
+
 	// We have just kinds, just scan the kind index
 	case NDB_PLAN_KINDS:
 		if (!ndb_query_plan_execute_kinds(txn, filter, &results, limit))
@@ -4031,6 +4078,7 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
 	struct ndb_word *search_word;
 	struct ndb_note *note;
 	struct cursor cur;
+	uint64_t since, until, timestamp_op, *pint, note_size;
 	ndb_text_search_key_order_fn key_order_fn;
 	MDB_dbi text_db;
 	MDB_cursor *cursor;
@@ -4038,17 +4086,36 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
 	int i, j, keysize, saved_size, limit;
 	MDB_cursor_op op, order_op;
 
+	note_size = 0;
+	note = 0;
 	saved = NULL;
 	ndb_text_search_results_init(results);
 	ndb_search_words_init(&search_words);
 
-	// search config
+	until = UINT64_MAX;
+	since = 0;
 	limit = MAX_TEXT_SEARCH_RESULTS;
+
+	// until, since from filter
+	if (filter != NULL) {
+		if ((pint = ndb_filter_get_int(filter, NDB_FILTER_UNTIL)))
+			until = *pint;
+
+		if ((pint = ndb_filter_get_int(filter, NDB_FILTER_SINCE)))
+			since = *pint;
+
+		if ((pint = ndb_filter_get_int(filter, NDB_FILTER_LIMIT)))
+			limit = *pint;
+	}
+
 	order_op = MDB_PREV;
 	key_order_fn = ndb_make_text_search_key_high;
+	timestamp_op = until;
 	if (config) {
 		if (config->order == NDB_ORDER_ASCENDING) {
 			order_op = MDB_NEXT;
+			// set the min timestamp value to since when ascending
+			timestamp_op = since;
 			key_order_fn = ndb_make_text_search_key_low;
 		}
 		limit = min(limit, config->limit);
@@ -4067,9 +4134,11 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
 		return 0;
 	}
 
-	// TODO: sort words from largest to smallest. This should complete the
-	// query quicker because the larger words are likely to have fewer
-	// entries in the search index.
+	// This should complete the query quicker because the larger words are
+	// likely to have fewer entries in the search index. This is not always
+	// true. Words with higher frequency (like bitcoin on nostr in 2024)
+	// may be slower. TODO: Skip word recursion by leveraging a minimal
+	// perfect hashmap of parsed words on a note
 	sort_largest_to_smallest(&search_words);
 
 	// for each word, we recursively find all of the submatches
@@ -4099,7 +4168,9 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
 			// match
 			if (!key_order_fn(buffer, sizeof(buffer),
 					  search_words.words[0].word_len,
-					  search_words.words[0].word, &keysize))
+					  search_words.words[0].word,
+					  timestamp_op,
+					  &keysize))
 			{
 				// word is too big to fit in 1024-sized key
 				continue;
@@ -4172,10 +4243,12 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
 
 			// save the first key match, since we will continue from
 			// this on the next root word result
-			if (j == 0 && !saved) {
-				memcpy(saved_buf, k.mv_data, k.mv_size);
-				saved = saved_buf;
-				saved_size = k.mv_size;
+			if (j == 0) {
+				if (!saved) {
+					memcpy(saved_buf, k.mv_data, k.mv_size);
+					saved = saved_buf;
+					saved_size = k.mv_size;
+				}
 
 				// since we will be trying to match the same
 				// note_id on all subsequent word matches,
@@ -4185,15 +4258,20 @@ int ndb_text_search_with(struct ndb_txn *txn, const char *query,
 				// remaining word queries
 				if (filter) {
 					if ((note = ndb_get_note_by_key(txn,
-						result->key.note_id, NULL)))
+							result->key.note_id,
+							&note_size)))
 					{
 						if (!ndb_filter_matches(filter, note)) {
 							break;
 						}
+						result->note = note;
+						result->note_size = note_size;
 					}
 				}
 			}
 
+			result->note = note;
+			result->note_size = note_size;
 			last_candidate = *result;
 			last_result = &last_candidate;
 		}
diff --git a/src/nostrdb.h b/src/nostrdb.h
@@ -314,6 +314,10 @@ struct ndb_text_search_key
 struct ndb_text_search_result {
 	struct ndb_text_search_key key;
 	int prefix_chars;
+
+	// This is only set if we passed a filter for nip50 searches
+	struct ndb_note *note;
+	uint64_t note_size;
 };
 
 struct ndb_text_search_results {

	nostrdb an unfairly fast embedded nostr database backed by lmdb
	git clone git://jb55.com/nostrdb
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	ndb.c	\|	50	++++++++++++--------------------------------------
M	src/nostrdb.c	\|	116	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M	src/nostrdb.h	\|	4	++++