commit a3915f898309cf07c5188347ae2492674bc70d46
parent 8eb0af2f554b700c93f3a5ffb649f862862f60a3
Author: William Casarin <jb55@jb55.com>
Date: Fri, 1 Dec 2023 17:04:01 -0800
search: allow searching from newest-to-oldest and oldest-to-newest
Diffstat:
M | README.md | | | 4 | ++-- |
M | ndb.c | | | 24 | ++++++++++++++++++++---- |
M | nostrdb.c | | | 89 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------- |
M | nostrdb.h | | | 15 | ++++++++++++++- |
4 files changed, 106 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
@@ -38,7 +38,7 @@ usage: ndb [--skip-verification] [-d db_dir] <command>
commands
stat
- search <fulltext query>
+ search [--oldest-first] [--limit 42] <fulltext query>
import <line-delimited json file>
settings
@@ -60,7 +60,7 @@ nostrdb supports fulltext queries. You can import some test events like so:
```
$ make testdata/many-events.json
$ ndb --skip-verification import testdata/many-events.json
-$ ndb search 'nosy ostrich'
+$ ndb search --limit 2 --oldest-first 'nosy ostrich'
[01] K<'ostrich' 7 1671217526 note_id:253309>
Q: What do you call a nosy ostrich?
diff --git a/ndb.c b/ndb.c
@@ -2,6 +2,7 @@
#include "nostrdb.h"
#include <stdio.h>
+#include <stdlib.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
@@ -12,7 +13,7 @@ static int usage()
printf("usage: ndb [--skip-verification] [-d db_dir] <command>\n\n");
printf("commands\n\n");
printf(" stat\n");
- printf(" search <fulltext query>\n");
+ printf(" search [--oldest-first] [--limit 42] <fulltext query>\n");
printf(" import <line-delimited json file>\n\n");
printf("settings\n\n");
printf(" --skip-verification skip signature validation\n");
@@ -113,7 +114,7 @@ static void ndb_print_text_search_result(struct ndb_txn *txn,
int main(int argc, char *argv[])
{
struct ndb *ndb;
- int i, flags;
+ int i, flags, limit;
struct ndb_stat stat;
struct ndb_txn txn;
struct ndb_text_search_results results;
@@ -122,7 +123,9 @@ int main(int argc, char *argv[])
unsigned char *data;
size_t data_len;
struct ndb_config config;
+ struct ndb_text_search_config search_config;
ndb_default_config(&config);
+ ndb_default_text_search_config(&search_config);
ndb_config_set_mapsize(&config, 1024ULL * 1024ULL * 1024ULL * 1024ULL /* 1 TiB */);
if (argc < 2) {
@@ -152,9 +155,22 @@ int main(int argc, char *argv[])
return 2;
}
- if (argc == 3 && !strcmp(argv[1], "search")) {
+ if (argc >= 3 && !strcmp(argv[1], "search")) {
+ for (i = 0; i < 2; i++) {
+ if (!strcmp(argv[2], "--oldest-first")) {
+ ndb_text_search_config_set_order(&search_config, NDB_ORDER_ASCENDING);
+ argv++;
+ argc--;
+ } else if (!strcmp(argv[2], "--limit")) {
+ limit = atoi(argv[3]);
+ ndb_text_search_config_set_limit(&search_config, limit);
+ argv += 2;
+ argc -= 2;
+ }
+ }
+
ndb_begin_query(ndb, &txn);
- ndb_text_search(&txn, argv[2], &results, 999);
+ ndb_text_search(&txn, argv[2], &results, &search_config);
// print results for now
for (i = 0; i < results.num_results; i++) {
diff --git a/nostrdb.c b/nostrdb.c
@@ -177,7 +177,7 @@ static int ndb_make_text_search_key(unsigned char *buf, int bufsize,
// TODO: need update this to uint64_t
// we push this first because our query function can pull this off
// quicky to check matches
- if (!push_varint(&cur, (int)note_id))
+ if (!push_varint(&cur, (int32_t)note_id))
return 0;
// string length
@@ -238,12 +238,14 @@ static int ndb_make_text_search_key_high(unsigned char *buf, int bufsize,
int *keysize)
{
uint64_t timestamp, note_id;
- timestamp = UINT64_MAX;
- note_id = UINT64_MAX;
+ timestamp = INT32_MAX;
+ note_id = INT32_MAX;
return ndb_make_text_search_key(buf, bufsize, 0, wordlen, word,
timestamp, note_id, keysize);
}
+typedef int (*ndb_text_search_key_order_fn)(unsigned char *buf, int bufsize, int wordlen, const char *word, int *keysize);
+
/** From LMDB: Compare two items lexically */
static int mdb_cmp_memn(const MDB_val *a, const MDB_val *b) {
int diff;
@@ -2469,7 +2471,8 @@ static int prefix_count(const char *str1, int len1, const char *str2, int len2)
static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
MDB_val *k, struct ndb_word *search_word,
struct ndb_text_search_result *last_result,
- struct ndb_text_search_result *result)
+ struct ndb_text_search_result *result,
+ MDB_cursor_op order_op)
{
struct cursor key_cursor;
MDB_val v;
@@ -2481,8 +2484,18 @@ static int ndb_text_search_next_word(MDB_cursor *cursor, MDB_cursor_op op,
// key.
//
// Subsequent searches should use MDB_NEXT
- if (mdb_cursor_get(cursor, k, &v, op))
- return 0;
+ if (mdb_cursor_get(cursor, k, &v, op)) {
+ // we should only do this if we're going in reverse
+ if (op == MDB_SET_RANGE && order_op == MDB_PREV) {
+ // if set range worked and our key exists, it should be
+ // the one right before this one
+ if (mdb_cursor_get(cursor, k, &v, MDB_PREV))
+ return 0;
+ } else {
+ return 0;
+ }
+ }
+
make_cursor(k->mv_data, k->mv_data + k->mv_size, &key_cursor);
@@ -2566,28 +2579,68 @@ static void ndb_text_search_results_init(
results->num_results = 0;
}
+static void ndb_print_text_search_key(struct ndb_text_search_key *key)
+{
+ printf("K<'%.*s' %d %" PRIu64 " note_id:%" PRIu64 ">", key->str_len, key->str,
+ key->word_index,
+ key->timestamp,
+ key->note_id);
+}
+
+void ndb_default_text_search_config(struct ndb_text_search_config *cfg)
+{
+ cfg->order = NDB_ORDER_DESCENDING;
+ cfg->limit = MAX_TEXT_SEARCH_RESULTS;
+}
+
+void ndb_text_search_config_set_order(struct ndb_text_search_config *cfg,
+ enum ndb_search_order order)
+{
+ cfg->order = order;
+}
+
+void ndb_text_search_config_set_limit(struct ndb_text_search_config *cfg, int limit)
+{
+ cfg->limit = limit;
+}
+
int ndb_text_search(struct ndb_txn *txn, const char *query,
- struct ndb_text_search_results *results, int limit)
+ struct ndb_text_search_results *results,
+ struct ndb_text_search_config *config)
{
unsigned char buffer[1024], *buf;
unsigned char saved_buf[1024], *saved;
struct ndb_text_search_result *result, *last_result;
struct ndb_text_search_result candidate, last_candidate;
struct ndb_search_words search_words;
+ //struct ndb_text_search_key search_key;
struct ndb_word *search_word;
struct cursor cur;
+ ndb_text_search_key_order_fn key_order_fn;
MDB_dbi text_db;
MDB_cursor *cursor;
MDB_val k, v;
- int i, j, keysize, saved_size;
- MDB_cursor_op op;
+ int i, j, keysize, saved_size, limit;
+ MDB_cursor_op op, order_op;
//int num_note_ids;
saved = NULL;
ndb_text_search_results_init(results);
ndb_search_words_init(&search_words);
+
+ // search config
+ limit = MAX_TEXT_SEARCH_RESULTS;
+ order_op = MDB_PREV;
+ key_order_fn = ndb_make_text_search_key_high;
+ if (config) {
+ if (config->order == NDB_ORDER_ASCENDING) {
+ order_op = MDB_NEXT;
+ key_order_fn = ndb_make_text_search_key_low;
+ }
+ limit = min(limit, config->limit);
+ }
+ // end search config
- //num_note_ids = 0;
text_db = txn->lmdb->dbs[NDB_DB_NOTE_TEXT];
make_cursor((unsigned char *)query, (unsigned char *)query + strlen(query), &cur);
@@ -2600,8 +2653,6 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
return 0;
}
- limit = min(MAX_TEXT_SEARCH_RESULTS, limit);
-
// for each word, we recursively find all of the submatches
while (results->num_results < limit) {
last_result = NULL;
@@ -2619,18 +2670,17 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
// reposition the cursor so we can continue
if (mdb_cursor_get(cursor, &k, &v, MDB_SET_RANGE))
- break;
+ return 0;
- op = MDB_NEXT;
+ op = order_op;
} else {
// construct a packed fulltext search key using this
// word this key doesn't contain any timestamp or index
// info, so it should range match instead of exact
// match
- if (!ndb_make_text_search_key_low(
- buffer, sizeof(buffer),
- search_words.words[0].word_len,
- search_words.words[0].word, &keysize))
+ if (!key_order_fn(buffer, sizeof(buffer),
+ search_words.words[0].word_len,
+ search_words.words[0].word, &keysize))
{
// word is too big to fit in 1024-sized key
continue;
@@ -2677,7 +2727,8 @@ int ndb_text_search(struct ndb_txn *txn, const char *query,
if (!ndb_text_search_next_word(cursor, op, &k,
search_word,
last_result,
- &candidate)) {
+ &candidate,
+ order_op)) {
break;
}
diff --git a/nostrdb.h b/nostrdb.h
@@ -277,6 +277,11 @@ enum ndb_generic_element_type {
NDB_ELEMENT_ID = 2,
};
+enum ndb_search_order {
+ NDB_ORDER_DESCENDING,
+ NDB_ORDER_ASCENDING,
+};
+
union ndb_filter_element {
const char *string;
const unsigned char *id;
@@ -311,6 +316,11 @@ struct ndb_config {
ndb_ingest_filter_fn ingest_filter;
};
+struct ndb_text_search_config {
+ enum ndb_search_order order;
+ int limit;
+};
+
// CONFIG
void ndb_default_config(struct ndb_config *);
void ndb_config_set_ingest_threads(struct ndb_config *config, int threads);
@@ -377,7 +387,10 @@ void ndb_filter_end_field(struct ndb_filter *);
void ndb_filter_free(struct ndb_filter *filter);
// FULLTEXT SEARCH
-int ndb_text_search(struct ndb_txn *txn, const char *query, struct ndb_text_search_results *, int limit);
+int ndb_text_search(struct ndb_txn *txn, const char *query, struct ndb_text_search_results *, struct ndb_text_search_config *);
+void ndb_default_text_search_config(struct ndb_text_search_config *);
+void ndb_text_search_config_set_order(struct ndb_text_search_config *, enum ndb_search_order);
+void ndb_text_search_config_set_limit(struct ndb_text_search_config *, int limit);
// stats
int ndb_stat(struct ndb *ndb, struct ndb_stat *stat);