damus

nostr ios client
git clone git://jb55.com/damus
Log | Files | Refs | README | LICENSE

commit 53e9269da6be28128d083f13cf3ac7f55f1266af
parent 85930df8e3cb65a53209b21f206998b48483f29e
Author: William Casarin <jb55@jb55.com>
Date:   Sun,  6 Aug 2023 13:47:33 -0700

urls: fix wikipedia url detection with parenthesis

Fixes: f0df4aa218cc ("Strip common punctuations from URLs")
Fixes: https://github.com/damus-io/damus/issues/1027
Closes: https://github.com/damus-io/damus/pull/1063
Changelog-Fixed: Fix wikipedia url detection with parenthesis

Diffstat:
M damus-c/cursor.h | 6 +-----
M damus-c/damus.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/damus-c/cursor.h b/damus-c/cursor.h
@@ -447,12 +447,8 @@ static inline int is_left_boundary(char c) {
 	return is_right_boundary(c) || is_utf8_byte(c);
 }
 
-static inline int is_invalid_url_ending(char c) {
-	return c == '!' || c == '?' || c == ')' || c == '.' || c == ',' || c == ';';
-}
-
 static inline int is_alphanumeric(char c) {
-	return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
+	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
 }
 
 static inline int consume_until_boundary(struct cursor *cur) {
diff --git a/damus-c/damus.c b/damus-c/damus.c
@@ -104,6 +104,69 @@ static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8
 	return add_block(blocks, b);
 }
 
+static int consume_url_fragment(struct cursor *cur)
+{
+	int c;
+
+	if ((c = peek_char(cur, 0)) < 0)
+		return 1;
+
+	if (c != '#' && c != '?') {
+		return 1;
+	}
+
+	cur->p++;
+
+	return consume_until_whitespace(cur, 1);
+}
+
+static int consume_url_path(struct cursor *cur)
+{
+	int c;
+
+	if ((c = peek_char(cur, 0)) < 0)
+		return 1;
+
+	if (c != '/') {
+		return 1;
+	}
+
+	while (cur->p < cur->end) {
+		c = *cur->p;
+
+		if (c == '?' || c == '#' || is_whitespace(c)) {
+			return 1;
+		}
+
+		cur->p++;
+	}
+
+	return 1;
+}
+
+static int consume_url_host(struct cursor *cur)
+{
+	char c;
+	int count = 0;
+
+	while (cur->p < cur->end) {
+		c = *cur->p;
+		// TODO: handle IDNs
+		if (is_alphanumeric(c) || c == '.' || c == '-')
+		{
+			count++;
+			cur->p++;
+			continue;
+		}
+
+		return count != 0;
+	}
+
+
+	// this means the end of the URL hostname is the end of the buffer and we finished
+	return count != 0;
+}
+
 static int parse_url(struct cursor *cur, struct note_block *block) {
 	u8 *start = cur->p;
 
@@ -121,15 +184,15 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
 			return 0;
 		}
 	}
-
-	if (!consume_until_whitespace(cur, 1)) {
+
+	if (!(consume_url_host(cur) &&
+	      consume_url_path(cur) &&
+	      consume_url_fragment(cur)))
+	{
 		cur->p = start;
 		return 0;
 	}
 
-	// strip any unwanted characters
-	while(is_invalid_url_ending(peek_char(cur, -1))) cur->p--;
-
 	block->type = BLOCK_URL;
 	block->block.str.start = (const char *)start;
 	block->block.str.end = (const char *)cur->p;