commit c145d69aa92de9d60fdb15b92e74f2c94260e145
parent 2146ec37179d70e7b760c6131183cd0f442c29c9
Author: William Casarin <jb55@jb55.com>
Date: Mon, 13 Jan 2025 18:11:38 -0800
search: sort search terms from largest to smallest
Add a helper for sorting search words from largest to smallest. This
should help search performance. For example, let's say our search index
is like so:
"the pokemon is cool"
the
the
the
...
* 1000
Our root word search would have to start 1000 new recursive queries. By
sorting by the largest word:
pokemon
pokemon
pokemon
...
* 10
We only have to do 10 recursive searches, assuming larger words are less
common, which will likely be the case most of the time
Diffstat:
1 file changed, 31 insertions(+), 0 deletions(-)
diff --git a/src/nostrdb.c b/src/nostrdb.c
@@ -3990,6 +3990,32 @@ void ndb_text_search_config_set_limit(struct ndb_text_search_config *cfg, int li
cfg->limit = limit;
}
+static int compare_search_words(const void *pa, const void *pb)
+{
+ struct ndb_word *a, *b;
+
+ a = (struct ndb_word *)pa;
+ b = (struct ndb_word *)pb;
+
+ if (a->word_len == b->word_len) {
+ return 0;
+ } else if (a->word_len > b->word_len) {
+ // biggest words should be at the front of the list,
+ // so we say it's "smaller" here
+ return -1;
+ } else {
+ return 1;
+ }
+}
+
+// Sort search words from largest to smallest. Larger words are less likely
+// in the index, allowing our scan to walk fewer words at the root when
+// recursively matching.
+void sort_largest_to_smallest(struct ndb_search_words *words)
+{
+ qsort(words->words, words->num_words, sizeof(words->words[0]), compare_search_words);
+}
+
int ndb_text_search(struct ndb_txn *txn, const char *query,
struct ndb_text_search_results *results,
struct ndb_text_search_config *config)
@@ -4038,6 +4064,11 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
return 0;
}
+ // TODO: sort words from largest to smallest. This should complete the
+ // query quicker because the larger words are likely to have fewer
+ // entries in the search index.
+ sort_largest_to_smallest(&search_words);
+
// for each word, we recursively find all of the submatches
while (results->num_results < limit) {
last_result = NULL;