nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

commit a841b449102176241485b8882abd4e7002e6b520
parent 644124f1340eb6aa92fd4d33f50259067d43af4a
Author: kernelkind <kernelkind@gmail.com>
Date:   Thu, 28 Dec 2023 13:52:24 -0800

parser: handle period at end of url

Fix URL parsing when a period appears at the end of a URL by
disallowing the period as the final character of a URL.

Some characters are not allowed to be the final character of a URL.
Presently, the disallowed characters are the period and the comma
(see char_disallowed_at_end_url()).
A character is the last character in the URL if the character after it
satisfies is_whitespace() or if it's the last character in the string.

Signed-off-by: kernelkind <kernelkind@gmail.com>
Tested-by: William Casarin <jb55@jb55.com>
Signed-off-by: William Casarin <jb55@jb55.com>

Diffstat:
Msrc/content_parser.c | 53++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/src/content_parser.c b/src/content_parser.c @@ -363,6 +363,53 @@ fail: return 0; } + + +static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) { + unsigned char *next = cur + 1; + + if (next > end) + return 0; + + if (next == end) + return 1; + + return is_whitespace(*next); +} + +static inline int char_disallowed_at_end_url(char c) +{ + return c == '.' || c == ','; + +} + +static int is_final_url_char(unsigned char *cur, unsigned char *end) +{ + if (is_whitespace(*cur)) + return 1; + + if (next_char_is_whitespace(cur, end)) { + // next char is whitespace so this char could be the final char in the url + return char_disallowed_at_end_url(*cur); + } + + // next char isn't whitespace so it can't be a final char + return 0; +} + +static int consume_until_end_url(struct cursor *cur, int or_end) { + unsigned char *start = cur->p; + + while (cur->p < cur->end) { + if (is_final_url_char(cur->p, cur->end)) + return cur->p != start; + + cur->p++; + } + + return or_end; +} + static int consume_url_fragment(struct cursor *cur) { int c; @@ -376,7 +423,7 @@ static int consume_url_fragment(struct cursor *cur) cur->p++; - return consume_until_whitespace(cur, 1); + return consume_until_end_url(cur, 1); } static int consume_url_path(struct cursor *cur) @@ -393,7 +440,7 @@ static int consume_url_path(struct cursor *cur) while (cur->p < cur->end) { c = *cur->p; - if (c == '?' || c == '#' || is_whitespace(c)) { + if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) { return 1; } @@ -411,7 +458,7 @@ static int consume_url_host(struct cursor *cur) while (cur->p < cur->end) { c = *cur->p; // TODO: handle IDNs - if (is_alphanumeric(c) || c == '.' || c == '-') + if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end)) { count++; cur->p++;