commit 62301a1218328c7c1933f88d641ec94d5ec90dbd
parent d591c694dd3cb93112029c43cf2ee574a1417351
Author: William Casarin <jb55@jb55.com>
Date: Fri, 30 Jun 2023 16:30:48 -0700
parser: add fast utf8 parsing
Diffstat:
M | src/parser.rs | | | 136 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------- |
1 file changed, 118 insertions(+), 18 deletions(-)
diff --git a/src/parser.rs b/src/parser.rs
@@ -1,27 +1,109 @@
use log::info;
+#[derive(Debug, PartialEq)]
struct Parser<'a> {
- data: &'a str,
+ data: &'a [u8],
pos: usize,
}
+#[derive(Debug, PartialEq)]
+enum ParseError {
+ NotFound,
+ BadUtf8Encoding,
+ EOF,
+}
+
+type Result<T> = std::result::Result<T, ParseError>;
+
impl<'a> Parser<'a> {
- fn new(data: &'a str) -> Parser {
+ fn new(data: &'a [u8]) -> Parser {
Parser { data: data, pos: 0 }
}
- fn parse_until(&mut self, needle: char) -> bool {
- let mut count = 0;
- for c in self.data[self.pos..].chars() {
- if c == needle {
- self.pos += count - 1;
- return true;
- } else {
- count += 1;
+ fn pull_byte(&mut self) -> Result<u8> {
+ if self.pos + 1 > self.data.len() {
+ return Err(ParseError::EOF);
+ }
+
+ let c = self.data[self.pos];
+ self.pos += 1;
+ return Ok(c);
+ }
+
+ pub fn peek_char(&mut self) -> Result<char> {
+ let peek = true;
+ self.pull_or_peek_char(peek)
+ }
+
+ pub fn pull_char(&mut self) -> Result<char> {
+ let peek = false;
+ self.pull_or_peek_char(peek)
+ }
+
+ fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
+ let mut codepoint: u32 = 0;
+
+ let start = self.pos;
+ let b0 = self.pull_byte()? as u32;
+
+ if b0 & 0x80 != 0 {
+ if (b0 & 0xE0) == 0xC0 {
+ // Two-byte sequence
+ let b1 = self.pull_byte()? as u32;
+ codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+ } else if (b0 & 0xF0) == 0xE0 {
+ // Three-byte sequence
+ let b1 = self.pull_byte()? as u32;
+ let b2 = self.pull_byte()? as u32;
+ codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+ } else if (b0 & 0xF8) == 0xF0 {
+ // Four-byte sequence
+ let b1 = self.pull_byte()? as u32;
+ let b2 = self.pull_byte()? as u32;
+ let b3 = self.pull_byte()? as u32;
+ codepoint =
+ ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
}
+ } else {
+ // Single-byte ASCII character
+ codepoint = b0;
+ }
+
+ if peek {
+ self.pos = start;
}
- return false;
+ match std::char::from_u32(codepoint) {
+ Some(c) => Ok(c),
+ None => Err(ParseError::BadUtf8Encoding),
+ }
+ }
+
+ fn current(&mut self) -> Result<char> {
+ let last_pos = self.pos;
+ let c = self.pull_char();
+ if c.is_ok() {
+ self.pos = last_pos;
+ }
+ return c;
+ }
+
+ fn parse_until_char(&mut self, needle: char) -> Result<()> {
+ self.parse_until(|c| c == needle)?;
+ Ok(())
+ }
+
+ fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
+ let len = self.data.len();
+ while self.pos < len {
+ let prev_pos = self.pos;
+ if matches(self.pull_char()?) {
+ self.pos = prev_pos;
+ return Ok(());
+ }
+ }
+
+ Err(ParseError::NotFound)
}
}
@@ -30,12 +112,30 @@ mod test {
use super::*;
#[test]
- fn test_parser() {
- let s = "hey there #hashtag";
- let mut parser = Parser::new(s);
- parser.parse_until('#');
- assert_eq!(parser.pos, 9);
- parser.parse_until('t');
- assert_eq!(parser.pos, 14);
+ fn test_parser() -> Result<()> {
+ // v alien v
+ // 00000000: 20f0 9f91 bd23 6861 7368 7461 670a _....#hashtag.
+ let s = " #hashtag ";
+ let mut parser = Parser::new(s.as_bytes());
+ let mut res = parser.parse_until_char('#');
+ assert_eq!(res, Ok(()));
+ assert_eq!(parser.pos, 1);
+ res = parser.parse_until_char('t');
+ assert_eq!(res, Ok(()));
+ assert_eq!(parser.pos, 6);
+ Ok(())
+ }
+
+ #[test]
+ fn test_utf8_parsing() -> Result<()> {
+ let s = "hey there #👽.";
+ let mut parser = Parser::new(s.as_bytes());
+ let _ = parser.parse_until_char('👽');
+ assert_eq!(parser.current(), Ok('👽'));
+ assert_eq!(parser.pos, 11);
+ let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
+ assert_eq!(res, Ok(()));
+ assert_eq!(parser.current(), Ok('.'));
+ Ok(())
}
}