commit 3033943e52a2ad030709d1ef86c745c0fb03b0a5
parent cc043a4ec3efd0299c130df0efef2f0b7c6ae9bb
Author: William Casarin <jb55@jb55.com>
Date: Sat, 1 Jul 2023 11:27:47 -0700
parser: add utf8 seeking functions
useful for peeking the previous utf8 char on a 0-copy utf8 buffer view
Diffstat:
4 files changed, 173 insertions(+), 35 deletions(-)
diff --git a/src/error.rs b/src/error.rs
@@ -1,6 +1,9 @@
+use crate::parser;
+
#[derive(Eq, PartialEq, Debug)]
pub enum Error {
Nostr(enostr::Error),
+ Parse(parser::Error),
Generic(String),
}
@@ -10,6 +13,12 @@ impl From<String> for Error {
}
}
+/// Wraps a parser failure in the crate-wide [`Error`], which lets `?` convert
+/// a `parser::Error` into `Error` automatically.
+impl From<parser::Error> for Error {
+    fn from(err: parser::Error) -> Self {
+        Self::Parse(err)
+    }
+}
+
impl From<enostr::Error> for Error {
fn from(err: enostr::Error) -> Self {
Error::Nostr(err)
diff --git a/src/lib.rs b/src/lib.rs
@@ -3,6 +3,7 @@ mod app;
mod contacts;
mod error;
mod parser;
+mod result;
pub use app::Damus;
pub use error::Error;
diff --git a/src/parser.rs b/src/parser.rs
@@ -1,28 +1,65 @@
-use log::info;
+use log::{debug, info};
-#[derive(Debug, PartialEq)]
-struct Parser<'a> {
+#[derive(Debug, PartialEq, Eq)]
+pub struct Parser<'a> {
data: &'a [u8],
pos: usize,
}
-#[derive(Debug, PartialEq)]
-enum ParseError {
+/// Identifies which edge of the parser's buffer a cursor movement would have crossed.
+#[derive(Debug, PartialEq, Eq)]
+pub enum Bound {
+ /// The beginning of the buffer (offset 0).
+ Start,
+ /// The end of the buffer.
+ End,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Error {
NotFound,
BadUtf8Encoding,
- EOF,
+ OutOfBounds(Bound),
}
-type Result<T> = std::result::Result<T, ParseError>;
+type Result<T> = std::result::Result<T, Error>;
+
+/// Returns `true` only when `r` is an `Err` holding [`Error::OutOfBounds`].
+///
+/// Any other error, and every `Ok` value, yields `false`. The result is taken
+/// by value and dropped.
+pub fn is_oob<T>(r: Result<T>) -> bool {
+    matches!(r, Err(Error::OutOfBounds(_)))
+}
impl<'a> Parser<'a> {
- fn new(data: &'a [u8]) -> Parser {
+ pub fn from_bytes(data: &'a [u8]) -> Parser<'a> {
Parser { data: data, pos: 0 }
}
- fn pull_byte(&mut self) -> Result<u8> {
- if self.pos + 1 > self.data.len() {
- return Err(ParseError::EOF);
+    /// Creates a parser over the UTF-8 bytes of `string`, with the cursor at offset 0.
+    ///
+    /// The string is borrowed, not copied.
+    pub fn from_str(string: &'a str) -> Parser<'a> {
+        let data = string.as_bytes();
+        Parser { data, pos: 0 }
+    }
+
+ /// Moves the cursor to byte offset `pos`.
+ ///
+ /// No check is made against the buffer length or UTF-8 character boundaries.
+ pub fn set_pos(&mut self, pos: usize) {
+ self.pos = pos;
+ }
+
+ /// Returns the cursor's current byte offset into the buffer.
+ pub fn pos(&self) -> usize {
+ self.pos
+ }
+
+    /// Returns the entire underlying byte buffer, regardless of the cursor position.
+    ///
+    /// The slice carries the buffer's own lifetime `'a` rather than the lifetime of
+    /// the `&self` borrow, so callers can keep it while mutating the parser afterwards.
+    pub fn data(&self) -> &'a [u8] {
+        self.data
+    }
+
+ /// Returns the total length of the underlying buffer in bytes.
+ ///
+ /// This is the full buffer length, not the number of bytes left after the cursor.
+ pub fn len(&self) -> usize {
+ self.data.len()
+ }
+
+ pub fn pull_byte(&mut self) -> Result<u8> {
+ if self.pos + 1 > self.len() {
+ return Err(Error::OutOfBounds(Bound::End));
}
let c = self.data[self.pos];
@@ -30,6 +67,25 @@ impl<'a> Parser<'a> {
return Ok(c);
}
+    /// Advances the cursor past every consecutive char for which `should_skip`
+    /// returns `true`.
+    ///
+    /// On success the cursor rests on the first char that was *not* skipped.
+    ///
+    /// # Errors
+    ///
+    /// * `Error::OutOfBounds(Bound::End)` when the buffer ends before a
+    ///   non-skipped char is found, including when the cursor already sits at
+    ///   the end.
+    /// * Any error returned by `pull_char` is propagated unchanged.
+    pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> {
+        while self.pos < self.data.len() {
+            let start = self.pos;
+            let c = self.pull_char()?;
+            if !should_skip(c) {
+                // Un-read the char that ended the run so the caller sees it next.
+                self.pos = start;
+                return Ok(());
+            }
+        }
+
+        Err(Error::OutOfBounds(Bound::End))
+    }
+
+ /// Skips a run of ASCII whitespace by delegating to `skip` with `char::is_ascii_whitespace`.
+ ///
+ /// Returns whatever `skip` returns, including its `OutOfBounds(Bound::End)` error when the
+ /// buffer ends before a non-whitespace char is found.
+ pub fn skip_whitespace(&mut self) -> Result<()> {
+ self.skip(|c| c.is_ascii_whitespace())
+ }
+
pub fn peek_char(&mut self) -> Result<char> {
let peek = true;
self.pull_or_peek_char(peek)
@@ -40,6 +96,65 @@ impl<'a> Parser<'a> {
self.pull_or_peek_char(peek)
}
+    /// Moves the cursor back by exactly one byte.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::OutOfBounds(Bound::Start)` and leaves the cursor untouched
+    /// when it is already at offset 0.
+    pub fn seek_prev_byte(&mut self) -> Result<()> {
+        match self.pos.checked_sub(1) {
+            Some(prev) => {
+                self.pos = prev;
+                Ok(())
+            }
+            None => Err(Error::OutOfBounds(Bound::Start)),
+        }
+    }
+
+    /// Decodes the UTF-8 char that ends immediately before the cursor, without
+    /// moving the cursor.
+    ///
+    /// The buffer is walked backwards over at most four bytes: continuation bytes
+    /// (`10xxxxxx`) contribute their six payload bits, and the first
+    /// non-continuation byte is taken as the lead byte. The lead byte's length
+    /// prefix must match the number of bytes walked.
+    ///
+    /// Overlong encodings are not rejected.
+    ///
+    /// # Errors
+    ///
+    /// * `Error::OutOfBounds(Bound::Start)` when the cursor is at offset 0.
+    /// * `Error::OutOfBounds(Bound::End)` when the cursor lies beyond the buffer.
+    /// * `Error::BadUtf8Encoding` when no lead byte is found within four bytes,
+    ///   when the lead byte's declared length disagrees with the number of
+    ///   continuation bytes seen, or when the decoded value is not a valid `char`.
+    fn peek_prev_char(&self) -> Result<char> {
+        if self.pos == 0 {
+            return Err(Error::OutOfBounds(Bound::Start));
+        }
+
+        // Payload bits gathered from continuation bytes, lowest-order byte first.
+        let mut tail: u32 = 0;
+        // Number of bits already occupied in `tail`; the lead byte's bits go above them.
+        let mut shift: u32 = 0;
+
+        // A UTF-8 sequence is at most four bytes long, and we cannot walk past offset 0.
+        for i in 1..=self.pos.min(4) {
+            let byte = self
+                .data
+                .get(self.pos - i)
+                .copied()
+                .ok_or(Error::OutOfBounds(Bound::End))? as u32;
+
+            if byte & 0b1100_0000 == 0b1000_0000 {
+                // Continuation byte: keep its six payload bits and look further back.
+                tail |= (byte & 0b0011_1111) << shift;
+                shift += 6;
+                continue;
+            }
+
+            // Lead (or ASCII) byte: its length prefix must agree with the `i` bytes walked.
+            let lead_bits = match (i, byte) {
+                (1, b) if b & 0b1000_0000 == 0 => b,
+                (2, b) if b & 0b1110_0000 == 0b1100_0000 => b & 0b0001_1111,
+                (3, b) if b & 0b1111_0000 == 0b1110_0000 => b & 0b0000_1111,
+                (4, b) if b & 0b1111_1000 == 0b1111_0000 => b & 0b0000_0111,
+                _ => return Err(Error::BadUtf8Encoding),
+            };
+
+            return parser_codepoint_char((lead_bits << shift) | tail);
+        }
+
+        // Only continuation bytes were found within the four-byte window (or before
+        // the start of the buffer), so there is no char to decode.
+        Err(Error::BadUtf8Encoding)
+    }
+
+    /// Moves the cursor back to the first byte of the previous UTF-8 char.
+    ///
+    /// The cursor steps back one byte, then keeps stepping back while the byte
+    /// under it is a continuation byte (`10xxxxxx`), stopping at offset 0 at the
+    /// latest. The encoding itself is not validated.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::OutOfBounds(Bound::Start)` when the cursor is already at
+    /// offset 0.
+    pub fn seek_prev_char(&mut self) -> Result<()> {
+        self.seek_prev_byte()?;
+
+        loop {
+            if self.pos == 0 {
+                break;
+            }
+            let is_continuation = self.data[self.pos] & 0b1100_0000 == 0b1000_0000;
+            if !is_continuation {
+                break;
+            }
+            self.pos -= 1;
+        }
+
+        Ok(())
+    }
+
fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
let mut codepoint: u32 = 0;
@@ -47,10 +162,10 @@ impl<'a> Parser<'a> {
let b0 = self.pull_byte()? as u32;
if b0 & 0x80 != 0 {
- if (b0 & 0xE0) == 0xC0 {
+ if (b0 & 0b11100000) == 0b11000000 {
// Two-byte sequence
let b1 = self.pull_byte()? as u32;
- codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+ codepoint = ((b0 & 0b00011111) << 6) | (b1 & 0b00111111);
} else if (b0 & 0xF0) == 0xE0 {
// Three-byte sequence
let b1 = self.pull_byte()? as u32;
@@ -75,35 +190,32 @@ impl<'a> Parser<'a> {
match std::char::from_u32(codepoint) {
Some(c) => Ok(c),
- None => Err(ParseError::BadUtf8Encoding),
+ None => Err(Error::BadUtf8Encoding),
}
}
- fn current(&mut self) -> Result<char> {
- let last_pos = self.pos;
- let c = self.pull_char();
- if c.is_ok() {
- self.pos = last_pos;
- }
- return c;
- }
-
- fn parse_until_char(&mut self, needle: char) -> Result<()> {
- self.parse_until(|c| c == needle)?;
- Ok(())
+ pub fn parse_until_char(&mut self, needle: char) -> Result<()> {
+ self.parse_until(|c| c == needle)
}
- fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
- let len = self.data.len();
+ pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
+ let len = self.len();
while self.pos < len {
- let prev_pos = self.pos;
+ let prev = self.pos;
if matches(self.pull_char()?) {
- self.pos = prev_pos;
+ self.pos = prev;
return Ok(());
}
}
- Err(ParseError::NotFound)
+ Err(Error::OutOfBounds(Bound::End))
+ }
+}
+
+fn parser_codepoint_char(codepoint: u32) -> Result<char> {
+ match std::char::from_u32(codepoint) {
+ Some(c) => Ok(c),
+ None => Err(Error::BadUtf8Encoding),
}
}
@@ -116,7 +228,7 @@ mod test {
// v alien v
// 00000000: 20f0 9f91 bd23 6861 7368 7461 670a _....#hashtag.
let s = " #hashtag ";
- let mut parser = Parser::new(s.as_bytes());
+ let mut parser = Parser::from_str(s);
let mut res = parser.parse_until_char('#');
assert_eq!(res, Ok(()));
assert_eq!(parser.pos, 1);
@@ -127,15 +239,28 @@ mod test {
}
#[test]
+ fn test_peek_prev_char() {
+ // Layout: '.' is 1 byte, the alien emoji is 4 bytes (f0 9f 91 bd), '.' is 1 byte.
+ let s = ".👽.";
+ let mut parser = Parser::from_str(s);
+ // Stop on the emoji's first byte, then consume the emoji.
+ let r1 = parser.parse_until_char('👽');
+ assert_eq!(r1, Ok(()));
+ let r2 = parser.pull_char();
+ assert_eq!(r2, Ok('👽'));
+ // Peeking backwards must decode the same char and leave the cursor at offset 5.
+ let r3 = parser.peek_prev_char();
+ assert_eq!(r3, Ok('👽'));
+ assert_eq!(parser.pos(), 5);
+ }
+
+ #[test]
fn test_utf8_parsing() -> Result<()> {
let s = "hey there #👽.";
- let mut parser = Parser::new(s.as_bytes());
+ let mut parser = Parser::from_str(s);
let _ = parser.parse_until_char('👽');
- assert_eq!(parser.current(), Ok('👽'));
+ assert_eq!(parser.peek_char(), Ok('👽'));
assert_eq!(parser.pos, 11);
let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
assert_eq!(res, Ok(()));
- assert_eq!(parser.current(), Ok('.'));
+ assert_eq!(parser.peek_char(), Ok('.'));
Ok(())
}
}
diff --git a/src/result.rs b/src/result.rs
@@ -0,0 +1,3 @@
+use crate::error::Error;
+
+/// Crate-wide result alias whose error type is [`Error`].
+///
+/// Declared `pub` so that modules outside `result` can name it; a private
+/// alias would be visible only inside this file.
+pub type Result<T> = std::result::Result<T, Error>;