commit a841b449102176241485b8882abd4e7002e6b520
parent 644124f1340eb6aa92fd4d33f50259067d43af4a
Author: kernelkind <kernelkind@gmail.com>
Date: Thu, 28 Dec 2023 13:52:24 -0800
parser: handle period at end of url
Fix URL parsing when a period appears at the end of a URL by treating
the period as a character that is not allowed to be the last character
of a URL.
Some characters are disallowed to be present at the end of URLs.
Presently, the period and comma characters are the only disallowed characters.
A character is the last character in the URL if the character after it
satisfies is_whitespace() or if it is the last character in the string.
Signed-off-by: kernelkind <kernelkind@gmail.com>
Tested-by: William Casarin <jb55@jb55.com>
Signed-off-by: William Casarin <jb55@jb55.com>
Diffstat:
1 file changed, 50 insertions(+), 3 deletions(-)
diff --git a/src/content_parser.c b/src/content_parser.c
@@ -363,6 +363,53 @@ fail:
return 0;
}
+
+// Reports whether cur is followed by whitespace: 1 if cur + 1 == end, 0 if cur + 1 > end, else is_whitespace() of the next byte.
+static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end) {
+ unsigned char *next = cur + 1;
+ // next lies beyond end, so there is no following byte to inspect
+ if (next > end)
+ return 0;
+ // cur is the last byte before end; running out of input is treated like whitespace
+ if (next == end)
+ return 1;
+ // otherwise the answer is whatever is_whitespace() says about the following byte
+ return is_whitespace(*next);
+}
+// Returns 1 if c may not be the last character of a URL (currently '.' and ','), 0 otherwise.
+static inline int char_disallowed_at_end_url(char c)
+{
+ return c == '.' || c == ',';
+
+}
+// Returns 1 if URL consumption must stop at *cur: it is whitespace, or a '.'/',' directly before whitespace or end. Reads *cur unconditionally.
+static int is_final_url_char(unsigned char *cur, unsigned char *end)
+{
+ if (is_whitespace(*cur))
+ return 1;
+ // NOTE(review): only a single trailing '.'/',' is caught; in a run such as ".." only the last byte stops the scan
+ if (next_char_is_whitespace(cur, end)) {
+ // next char is whitespace or end of input, so this char would end the url; stop here only if it is not allowed to end one
+ return char_disallowed_at_end_url(*cur);
+ }
+
+ // next char isn't whitespace, so the url continues past this char
+ return 0;
+}
+// Advances cur->p to the first byte for which is_final_url_char() is true and returns whether the cursor moved; returns or_end if cur->end is reached first.
+static int consume_until_end_url(struct cursor *cur, int or_end) {
+ unsigned char *start = cur->p;
+ // start is kept so the return value can tell whether any byte was consumed
+ while (cur->p < cur->end) {
+ if (is_final_url_char(cur->p, cur->end))
+ return cur->p != start;
+ // not a stopping byte: consume it and keep scanning
+ cur->p++;
+ }
+
+ return or_end;
+}
+
static int consume_url_fragment(struct cursor *cur)
{
int c;
@@ -376,7 +423,7 @@ static int consume_url_fragment(struct cursor *cur)
cur->p++;
- return consume_until_whitespace(cur, 1);
+ return consume_until_end_url(cur, 1);
}
static int consume_url_path(struct cursor *cur)
@@ -393,7 +440,7 @@ static int consume_url_path(struct cursor *cur)
while (cur->p < cur->end) {
c = *cur->p;
- if (c == '?' || c == '#' || is_whitespace(c)) {
+ if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) {
return 1;
}
@@ -411,7 +458,7 @@ static int consume_url_host(struct cursor *cur)
while (cur->p < cur->end) {
c = *cur->p;
// TODO: handle IDNs
- if (is_alphanumeric(c) || c == '.' || c == '-')
+ if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end))
{
count++;
cur->p++;