shatter

A fast, zero-copy nostr content parser in Rust
git clone git://jb55.com/shatter
Log | Files | Refs | README

commit bc765dc91b6e8bb02fa141e39e4c3d2530ba0e08
parent 7d9c79d276a7830e4f658b5b9356259ec0addfe0
Author: William Casarin <jb55@jb55.com>
Date:   Mon,  3 Jul 2023 07:57:49 -0700

initial hashtag parsing

Diffstat:
M.gitignore | 2++
MCargo.toml | 2++
Msrc/lib.rs | 16++--------------
Asrc/parser.rs | 278+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/result.rs | 1+
Asrc/shard.rs | 270+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 555 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,5 +1,7 @@ /target .direnv +.buildcmd +.build-result # Added by cargo diff --git a/Cargo.toml b/Cargo.toml @@ -6,3 +6,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +env_logger = "0.10.0" +log = "0.4.19" diff --git a/src/lib.rs b/src/lib.rs @@ -1,14 +1,2 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} +pub mod parser; +pub mod shard; diff --git a/src/parser.rs b/src/parser.rs @@ -0,0 +1,278 @@ + + +#[derive(Debug, PartialEq, Eq)] +pub struct Parser<'a> { + data: &'a [u8], + pos: usize, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Bound { + Start, + End, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Error { + NotFound, + BadUtf8Encoding, + OutOfBounds(Bound), +} + +pub type Result<T> = std::result::Result<T, Error>; + +pub fn is_oob<T>(r: Result<T>) -> bool { + match r { + Err(Error::OutOfBounds(_)) => true, + Err(_) => false, + Ok(_) => false, + } +} + +impl<'a> Parser<'a> { + pub fn from_bytes(data: &'a [u8]) -> Parser<'a> { + Parser { data: data, pos: 0 } + } + + pub fn from_str(string: &'a str) -> Parser<'a> { + Parser { + data: string.as_bytes(), + pos: 0, + } + } + + #[inline(always)] + pub fn set_pos(&mut self, pos: usize) { + self.pos = pos; + } + + #[inline(always)] + pub fn pos(&self) -> usize { + self.pos + } + + #[inline(always)] + pub fn data(&self) -> &[u8] { + self.data + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn pull_byte(&mut self) -> Result<u8> { + if self.pos + 1 > self.len() { + return Err(Error::OutOfBounds(Bound::End)); + } + + let c = self.data[self.pos]; + self.pos += 1; + return Ok(c); + } + + pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> { + let len = self.len(); + while self.pos < len { + let prev = self.pos(); + if should_skip(self.pull_char()?) { + continue; + } else { + self.set_pos(prev); + return Ok(()); + } + } + + return Err(Error::OutOfBounds(Bound::End)); + } + + pub fn skip_whitespace(&mut self) -> Result<()> { + self.skip(|c| c.is_ascii_whitespace()) + } + + pub fn peek_char(&mut self) -> Result<char> { + let peek = true; + self.pull_or_peek_char(peek) + } + + pub fn pull_char(&mut self) -> Result<char> { + let peek = false; + self.pull_or_peek_char(peek) + } + + pub fn peek_prev_byte(&mut self) -> Result<u8> { + if self.pos == 0 { + return Err(Error::OutOfBounds(Bound::Start)); + } + + Ok(self.data[self.pos - 1]) + } + + pub fn seek_prev_byte(&mut self) -> Result<()> { + if self.pos == 0 { + return Err(Error::OutOfBounds(Bound::Start)); + } + self.pos -= 1; + + Ok(()) + } + + pub fn peek_prev_char(&self) -> Result<char> { + let mut i = 1; + let mut codepoint = 0u32; + let mut bs: [u32; 4] = [0; 4]; + + if self.pos == 0 { + return Err(Error::OutOfBounds(Bound::Start)); + } + + while i <= 4 && ((self.pos as i32) - (i as i32) >= 0) { + let byte = self.data[self.pos - i] as u32; + let masked = byte & 0b11000000; + if masked == 0b10000000 { + // continuation byte + bs[i - 1] = byte & 0b00111111; + i += 1; + } else if masked == 0b11000000 { + // start byte + match i { + 4 => { + codepoint = ((bs[3] & 0x07) << 18) + | ((bs[2] & 0x3F) << 12) + | ((bs[1] & 0x3F) << 6) + | (bs[0] & 0x3F) + } + 3 => { + codepoint = ((bs[2] & 0x0F) << 12) | ((bs[1] & 0x3F) << 6) | (bs[0] & 0x3F) + } + 2 => codepoint = ((bs[1] & 0x0F) << 6) | (bs[0] & 0x3F), + _ => return Err(Error::BadUtf8Encoding), + } + return parser_codepoint_char(codepoint); + } else { + return parser_codepoint_char(byte); + } + } + + // If we reached here, we reached the start of the string without finding a non-continuation byte. + Err(Error::BadUtf8Encoding) + } + + pub fn seek_prev_char(&mut self) -> Result<()> { + self.seek_prev_byte()?; + while self.pos > 0 && (self.data[self.pos] & 0b11000000) == 0b10000000 { + self.pos -= 1; + } + + Ok(()) + } + + fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> { + let mut codepoint: u32 = 0; + + let start = self.pos; + let b0 = self.pull_byte()? as u32; + + if b0 & 0x80 != 0 { + if (b0 & 0b11100000) == 0b11000000 { + // Two-byte sequence + let b1 = self.pull_byte()? as u32; + codepoint = ((b0 & 0b00011111) << 6) | (b1 & 0b00111111); + } else if (b0 & 0xF0) == 0xE0 { + // Three-byte sequence + let b1 = self.pull_byte()? as u32; + let b2 = self.pull_byte()? as u32; + codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); + } else if (b0 & 0xF8) == 0xF0 { + // Four-byte sequence + let b1 = self.pull_byte()? as u32; + let b2 = self.pull_byte()? as u32; + let b3 = self.pull_byte()? as u32; + codepoint = + ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); + } + } else { + // Single-byte ASCII character + return Ok((b0 as u8) as char); + } + + if peek { + self.pos = start; + } + + match std::char::from_u32(codepoint) { + Some(c) => Ok(c), + None => Err(Error::BadUtf8Encoding), + } + } + + pub fn parse_until_char(&mut self, needle: char) -> Result<()> { + self.parse_until(|c| c == needle) + } + + pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> { + let len = self.len(); + while self.pos < len { + let prev = self.pos; + if matches(self.pull_char()?) { + self.pos = prev; + return Ok(()); + } + } + + Err(Error::OutOfBounds(Bound::End)) + } +} + +fn parser_codepoint_char(codepoint: u32) -> Result<char> { + match std::char::from_u32(codepoint) { + Some(c) => Ok(c), + None => Err(Error::BadUtf8Encoding), + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_parser() -> Result<()> { + // v alien v + // 00000000: 20f0 9f91 bd23 6861 7368 7461 670a _....#hashtag. + let s = " #hashtag "; + let mut parser = Parser::from_str(s); + let mut res = parser.parse_until_char('#'); + assert_eq!(res, Ok(())); + assert_eq!(parser.pos, 1); + res = parser.parse_until_char('t'); + assert_eq!(res, Ok(())); + assert_eq!(parser.pos, 6); + Ok(()) + } + + #[test] + fn test_peek_prev_char() { + let s = ".👽."; + let mut parser = Parser::from_str(s); + let r1 = parser.parse_until_char('👽'); + assert_eq!(r1, Ok(())); + let r2 = parser.pull_char(); + assert_eq!(r2, Ok('👽')); + let r3 = parser.peek_prev_char(); + assert_eq!(r3, Ok('👽')); + assert_eq!(parser.pos(), 5); + } + + #[test] + fn test_utf8_parsing() -> Result<()> { + let s = "hey there #👽."; + let mut parser = Parser::from_str(s); + let _ = parser.parse_until_char('👽'); + assert_eq!(parser.peek_char(), Ok('👽')); + assert_eq!(parser.pos, 11); + let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation()); + assert_eq!(res, Ok(())); + assert_eq!(parser.peek_char(), Ok('.')); + Ok(()) + } +} diff --git a/src/result.rs b/src/result.rs @@ -0,0 +1 @@ + diff --git a/src/shard.rs b/src/shard.rs @@ -0,0 +1,270 @@ +use crate::parser::{Bound, Error, Parser, Result}; +use log::debug; + +#[derive(Debug, PartialEq, Eq)] +struct ByteSlice { + pos: usize, + len: usize, +} + +impl ByteSlice { + pub fn new(pos: usize, len: usize) -> ByteSlice { + ByteSlice { pos, len } + } + + pub fn bytes<'a>(&self, data: &'a [u8]) -> &'a [u8] { + &data[self.pos..self.pos + self.len] + } + + pub fn str<'a>(&self, data: &'a [u8]) -> Option<&'a str> { + std::str::from_utf8(self.bytes(data)).ok() + } +} + +#[derive(Debug, PartialEq, Eq)] +enum Shard { + Text(ByteSlice), + Mention(ByteSlice), + Hashtag(ByteSlice), + Url(ByteSlice), + //Invoice(Invoice) + //Relay(String) +} + +#[derive(Debug)] +struct Shards { + shards: Vec<Shard>, + num_words: i32, +} + +impl Shards { + fn new() -> Shards { + Shards { + shards: vec![], + num_words: 0, + } + } + + fn parse_hashtag(parser: &mut Parser) -> Result<ByteSlice> { + let start = parser.pos(); + match parser.parse_until(is_boundary_char) { + Ok(()) | Err(Error::OutOfBounds(Bound::End)) => { + debug!("got to hashtag boundary @ {}", parser.pos()); + let len = parser.pos() - start; + if len <= 0 { + return Err(Error::NotFound); + } + return Ok(ByteSlice::new(start, len)); + } + Err(err) => Err(err.into()), + } + } + + fn push_txt(&mut self, start: usize, upto: usize) { + let len = upto - start; + if len == 0 { + return; + } + + let txt_slice = ByteSlice::new(start, len); + /* + debug!( + "pushing text block {:?} @ {} '{:?}'", + txt_slice, + parser.pos(), + txt_slice.str(parser.data()) + ); + */ + self.shards.push(Shard::Text(txt_slice)); + } + + pub fn parse(content: &str) -> Result<Shards> { + let mut parser = Parser::from_str(content); + let len = parser.len(); + let mut shards = Shards::new(); + let mut start = parser.pos(); + + while parser.pos() < len { + let before_parse = parser.pos(); + let prev_boundary = is_left_boundary(&parser.peek_prev_byte()); + let c1 = parser.data()[parser.pos()] as char; + parser.set_pos(parser.pos() + 1); + + if c1 == '#' && prev_boundary { + match Shards::parse_hashtag(&mut parser) { + Ok(ht) => { + shards.push_txt(start, before_parse); + start = parser.pos(); + + debug!("pushing hashtag {:?}", ht); + shards.shards.push(Shard::Hashtag(ht)); + } + + Err(err) => { + debug!("failed parsing hashtag @ {}: {:?}", parser.pos(), err); + } + } + } + } + + shards.push_txt(start, parser.pos()); + Ok(shards) + } +} + +fn is_boundary(r: &Result<char>) -> bool { + match r { + Err(Error::OutOfBounds(_)) => true, + Err(_) => false, + Ok(c) => is_boundary_char(*c), + } +} + +fn is_left_boundary(r: &Result<u8>) -> bool { + match r { + Err(Error::OutOfBounds(_)) => true, + Err(_) => false, + Ok(c) => is_left_boundary_char(*c), + } +} + +fn is_eof<T>(r: &Result<T>) -> bool { + match r { + Err(Error::OutOfBounds(Bound::End)) => true, + _ => false, + } +} + +fn is_boundary_char(c: char) -> bool { + c.is_ascii_whitespace() || c.is_ascii_punctuation() +} + +fn is_left_boundary_char(c: u8) -> bool { + is_boundary_char(c as char) || ((c & 0b10000000) == 0b10000000) +} + +#[cfg(test)] +mod test { + use super::*; + use std::sync::Once; + + static INIT: Once = Once::new(); + + /// Setup function that is only run once, even if called multiple times. + fn setup() { + INIT.call_once(|| { + env_logger::init(); + }); + } + + #[test] + fn test_is_boundary() { + setup(); + + let content = "a"; + let parser = Parser::from_str(&content); + let res = parser.peek_prev_char(); + assert_eq!(is_boundary(&res), true); + } + + #[test] + fn test_parse_hashtag_basic() { + setup(); + + let content = "abc #😎"; + debug!("hashtag_basic content '{}'", content); + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 2); + assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4))); + assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 4))); + } + + #[test] + fn test_parse_hashtag_adjacent() { + setup(); + + let content = "aa#abc"; + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 1); + assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 6))); + } + + #[test] + fn test_parse_hashtag_start() { + setup(); + + let content = "#abc."; + debug!("test_parse_hashtag_start '{}'", content); + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 2); + assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3))); + assert_eq!(bs[1], Shard::Text(ByteSlice::new(4, 1))); + } + + #[test] + fn test_parse_hashtag_end() { + setup(); + + let content = "#abc"; + debug!("test_parse_hashtag_end '{}'", content); + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 1); + assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3))); + } + + #[test] + fn test_parse_hashtag_punc_before() { + setup(); + + let content = ".#abc"; + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 2); + assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1))); + assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 3))); + } + + #[test] + fn test_multiple_hashtags() { + setup(); + + let content = ".#alice.#bob"; + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 4); + assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1))); + assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 5))); + assert_eq!(bs[2], Shard::Text(ByteSlice::new(7, 1))); + assert_eq!(bs[3], Shard::Hashtag(ByteSlice::new(9, 3))); + } + + #[test] + fn test_multiple_adjacent_hashtags() { + setup(); + + let content = "#alice#bob"; + debug!("test_multiple_adjacent_hashtags '{}'", content); + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 2); + assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 5))); + assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(7, 3))); + } + + #[test] + fn test_parse_hashtag_emoji_before() { + setup(); + + // 00000000: f09f 98a4 2361 6263 ....#abc + let content = "😤#abc"; + let shards = Shards::parse(content).unwrap(); + let bs = shards.shards; + assert_eq!(bs.len(), 2); + assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4))); + assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 3))); + } +}