notedeck

One damus client to rule them all
git clone git://jb55.com/notedeck
Log | Files | Refs | README | LICENSE

commit 3033943e52a2ad030709d1ef86c745c0fb03b0a5
parent cc043a4ec3efd0299c130df0efef2f0b7c6ae9bb
Author: William Casarin <jb55@jb55.com>
Date:   Sat,  1 Jul 2023 11:27:47 -0700

parser: add utf8 seeking functions

useful for peeking the previous utf8 char on a 0-copy utf8 buffer view

Diffstat:
M src/error.rs | 9 +++++++++
M src/lib.rs | 1 +
M src/parser.rs | 195 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
A src/result.rs | 3 +++
4 files changed, 173 insertions(+), 35 deletions(-)

diff --git a/src/error.rs b/src/error.rs @@ -1,6 +1,9 @@ +use crate::parser; + #[derive(Eq, PartialEq, Debug)] pub enum Error { Nostr(enostr::Error), + Parse(parser::Error), Generic(String), } @@ -10,6 +13,12 @@ impl From<String> for Error { } } +impl From<parser::Error> for Error { + fn from(s: parser::Error) -> Self { + Error::Parse(s) + } +} + impl From<enostr::Error> for Error { fn from(err: enostr::Error) -> Self { Error::Nostr(err) diff --git a/src/lib.rs b/src/lib.rs @@ -3,6 +3,7 @@ mod app; mod contacts; mod error; mod parser; +mod result; pub use app::Damus; pub use error::Error; diff --git a/src/parser.rs b/src/parser.rs @@ -1,28 +1,65 @@ -use log::info; +use log::{debug, info}; -#[derive(Debug, PartialEq)] -struct Parser<'a> { +#[derive(Debug, PartialEq, Eq)] +pub struct Parser<'a> { data: &'a [u8], pos: usize, } -#[derive(Debug, PartialEq)] -enum ParseError { +#[derive(Debug, PartialEq, Eq)] +pub enum Bound { + Start, + End, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Error { NotFound, BadUtf8Encoding, - EOF, + OutOfBounds(Bound), } -type Result<T> = std::result::Result<T, ParseError>; +type Result<T> = std::result::Result<T, Error>; + +pub fn is_oob<T>(r: Result<T>) -> bool { + match r { + Err(Error::OutOfBounds(_)) => true, + Err(_) => false, + Ok(_) => false, + } +} impl<'a> Parser<'a> { - fn new(data: &'a [u8]) -> Parser { + pub fn from_bytes(data: &'a [u8]) -> Parser<'a> { Parser { data: data, pos: 0 } } - fn pull_byte(&mut self) -> Result<u8> { - if self.pos + 1 > self.data.len() { - return Err(ParseError::EOF); + pub fn from_str(string: &'a str) -> Parser<'a> { + Parser { + data: string.as_bytes(), + pos: 0, + } + } + + pub fn set_pos(&mut self, pos: usize) { + self.pos = pos; + } + + pub fn pos(&self) -> usize { + self.pos + } + + pub fn data(&self) -> &[u8] { + self.data + } + + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn pull_byte(&mut self) -> Result<u8> { + if self.pos + 1 > self.len() { + return 
Err(Error::OutOfBounds(Bound::End)); } let c = self.data[self.pos]; @@ -30,6 +67,25 @@ impl<'a> Parser<'a> { return Ok(c); } + pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> { + let len = self.len(); + while self.pos < len { + let prev = self.pos(); + if should_skip(self.pull_char()?) { + continue; + } else { + self.set_pos(prev); + return Ok(()); + } + } + + return Err(Error::OutOfBounds(Bound::End)); + } + + pub fn skip_whitespace(&mut self) -> Result<()> { + self.skip(|c| c.is_ascii_whitespace()) + } + pub fn peek_char(&mut self) -> Result<char> { let peek = true; self.pull_or_peek_char(peek) @@ -40,6 +96,65 @@ impl<'a> Parser<'a> { self.pull_or_peek_char(peek) } + pub fn seek_prev_byte(&mut self) -> Result<()> { + if self.pos == 0 { + return Err(Error::OutOfBounds(Bound::Start)); + } + self.pos -= 1; + + Ok(()) + } + + fn peek_prev_char(&self) -> Result<char> { + let mut i = 1; + let mut codepoint = 0u32; + let mut bs: [u32; 4] = [0; 4]; + + if self.pos == 0 { + return Err(Error::OutOfBounds(Bound::Start)); + } + + while i <= 4 && ((self.pos as i32) - (i as i32) >= 0) { + let byte = self.data[self.pos - i] as u32; + let masked = byte & 0b11000000; + if masked == 0b10000000 { + // continuation byte + bs[i - 1] = byte & 0b00111111; + i += 1; + } else if masked == 0b11000000 { + // start byte + match i { + 4 => { + codepoint = ((bs[3] & 0x07) << 18) + | ((bs[2] & 0x3F) << 12) + | ((bs[1] & 0x3F) << 6) + | (bs[0] & 0x3F) + } + 3 => { + codepoint = ((bs[2] & 0x0F) << 12) | ((bs[1] & 0x3F) << 6) | (bs[0] & 0x3F) + } + 2 => codepoint = ((bs[1] & 0x0F) << 6) | (bs[0] & 0x3F), + _ => return Err(Error::BadUtf8Encoding), + } + return parser_codepoint_char(codepoint); + } else { + return parser_codepoint_char(byte); + } + } + + // If we reached here, we reached the start of the string without finding a non-continuation byte. 
+ Err(Error::BadUtf8Encoding) + } + + pub fn seek_prev_char(&mut self) -> Result<()> { + self.seek_prev_byte()?; + while self.pos > 0 && (self.data[self.pos] & 0b11000000) == 0b10000000 { + self.pos -= 1; + } + + Ok(()) + } + fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> { let mut codepoint: u32 = 0; @@ -47,10 +162,10 @@ impl<'a> Parser<'a> { let b0 = self.pull_byte()? as u32; if b0 & 0x80 != 0 { - if (b0 & 0xE0) == 0xC0 { + if (b0 & 0b11100000) == 0b11000000 { // Two-byte sequence let b1 = self.pull_byte()? as u32; - codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F); + codepoint = ((b0 & 0b00011111) << 6) | (b1 & 0b00111111); } else if (b0 & 0xF0) == 0xE0 { // Three-byte sequence let b1 = self.pull_byte()? as u32; @@ -75,35 +190,32 @@ impl<'a> Parser<'a> { match std::char::from_u32(codepoint) { Some(c) => Ok(c), - None => Err(ParseError::BadUtf8Encoding), + None => Err(Error::BadUtf8Encoding), } } - fn current(&mut self) -> Result<char> { - let last_pos = self.pos; - let c = self.pull_char(); - if c.is_ok() { - self.pos = last_pos; - } - return c; - } - - fn parse_until_char(&mut self, needle: char) -> Result<()> { - self.parse_until(|c| c == needle)?; - Ok(()) + pub fn parse_until_char(&mut self, needle: char) -> Result<()> { + self.parse_until(|c| c == needle) } - fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> { - let len = self.data.len(); + pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> { + let len = self.len(); while self.pos < len { - let prev_pos = self.pos; + let prev = self.pos; if matches(self.pull_char()?) 
{ - self.pos = prev_pos; + self.pos = prev; return Ok(()); } } - Err(ParseError::NotFound) + Err(Error::OutOfBounds(Bound::End)) + } +} + +fn parser_codepoint_char(codepoint: u32) -> Result<char> { + match std::char::from_u32(codepoint) { + Some(c) => Ok(c), + None => Err(Error::BadUtf8Encoding), } } @@ -116,7 +228,7 @@ mod test { // v alien v // 00000000: 20f0 9f91 bd23 6861 7368 7461 670a _....#hashtag. let s = " #hashtag "; - let mut parser = Parser::new(s.as_bytes()); + let mut parser = Parser::from_str(s); let mut res = parser.parse_until_char('#'); assert_eq!(res, Ok(())); assert_eq!(parser.pos, 1); @@ -127,15 +239,28 @@ mod test { } #[test] + fn test_peek_prev_char() { + let s = ".👽."; + let mut parser = Parser::from_str(s); + let r1 = parser.parse_until_char('👽'); + assert_eq!(r1, Ok(())); + let r2 = parser.pull_char(); + assert_eq!(r2, Ok('👽')); + let r3 = parser.peek_prev_char(); + assert_eq!(r3, Ok('👽')); + assert_eq!(parser.pos(), 5); + } + + #[test] fn test_utf8_parsing() -> Result<()> { let s = "hey there #👽."; - let mut parser = Parser::new(s.as_bytes()); + let mut parser = Parser::from_str(s); let _ = parser.parse_until_char('👽'); - assert_eq!(parser.current(), Ok('👽')); + assert_eq!(parser.peek_char(), Ok('👽')); assert_eq!(parser.pos, 11); let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation()); assert_eq!(res, Ok(())); - assert_eq!(parser.current(), Ok('.')); + assert_eq!(parser.peek_char(), Ok('.')); Ok(()) } } diff --git a/src/result.rs b/src/result.rs @@ -0,0 +1,3 @@ +use crate::error::Error; + +type Result<T> = std::result::Result<T, Error>;