urls: fix wikipedia url detection with parenthesis - damus

commit 53e9269da6be28128d083f13cf3ac7f55f1266af
parent 85930df8e3cb65a53209b21f206998b48483f29e
Author: William Casarin <jb55@jb55.com>
Date:   Sun,  6 Aug 2023 13:47:33 -0700

urls: fix wikipedia url detection with parenthesis

Fixes: f0df4aa218cc ("Strip common punctuations from URLs")
Fixes: https://github.com/damus-io/damus/issues/1027
Closes: https://github.com/damus-io/damus/pull/1063
Changelog-Fixed: Fix wikipedia url detection with parenthesis

Diffstat:
M damus-c/cursor.h  | 6 +-----
M damus-c/damus.c  | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----

2 files changed, 69 insertions(+), 10 deletions(-)
diff --git a/damus-c/cursor.h b/damus-c/cursor.h
@@ -447,12 +447,8 @@ static inline int is_left_boundary(char c) {
     return is_right_boundary(c) || is_utf8_byte(c);
 }
 
-static inline int is_invalid_url_ending(char c) {
-    return c == '!' || c == '?' || c == ')' || c == '.' || c == ',' || c == ';';
-}
-
 static inline int is_alphanumeric(char c) {
-    return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
 }
 
 static inline int consume_until_boundary(struct cursor *cur) {
diff --git a/damus-c/damus.c b/damus-c/damus.c
@@ -104,6 +104,69 @@ static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8 
     return add_block(blocks, b);
 }
 
+static int consume_url_fragment(struct cursor *cur)
+{
+    int c;
+
+    if ((c = peek_char(cur, 0)) < 0)
+        return 1;
+
+    if (c != '#' && c != '?') {
+        return 1;
+    }
+
+    cur->p++;
+
+    return consume_until_whitespace(cur, 1);
+}
+
+static int consume_url_path(struct cursor *cur)
+{
+    int c;
+
+    if ((c = peek_char(cur, 0)) < 0)
+        return 1;
+
+    if (c != '/') {
+        return 1;
+    }
+
+    while (cur->p < cur->end) {
+        c = *cur->p;
+
+        if (c == '?' || c == '#' || is_whitespace(c)) {
+            return 1;
+        }
+
+        cur->p++;
+    }
+
+    return 1;
+}
+
+static int consume_url_host(struct cursor *cur)
+{
+	char c;
+	int count = 0;
+
+	while (cur->p < cur->end) {
+		c = *cur->p;
+		// TODO: handle IDNs
+        if (is_alphanumeric(c) || c == '.' || c == '-')
+		{
+			count++;
+			cur->p++;
+			continue;
+		}
+
+		return count != 0;
+	}
+
+
+	// this means the end of the URL hostname is the end of the buffer and we finished
+	return count != 0;
+}
+
 static int parse_url(struct cursor *cur, struct note_block *block) {
     u8 *start = cur->p;
     
@@ -121,15 +184,15 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
             return 0;
         }
     }
-    
-    if (!consume_until_whitespace(cur, 1)) {
+
+    if (!(consume_url_host(cur) &&
+          consume_url_path(cur) &&
+          consume_url_fragment(cur)))
+    {
         cur->p = start;
         return 0;
     }
     
-    // strip any unwanted characters
-    while(is_invalid_url_ending(peek_char(cur, -1))) cur->p--;
-    
     block->type = BLOCK_URL;
     block->block.str.start = (const char *)start;
     block->block.str.end = (const char *)cur->p;

	damus nostr ios client
	git clone git://jb55.com/damus
	Log | Files | Refs | README | LICENSE

M damus-c/cursor.h  | 6 +-----
M damus-c/damus.c  | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----