commit bc765dc91b6e8bb02fa141e39e4c3d2530ba0e08
parent 7d9c79d276a7830e4f658b5b9356259ec0addfe0
Author: William Casarin <jb55@jb55.com>
Date: Mon, 3 Jul 2023 07:57:49 -0700
initial hashtag parsing
Diffstat:
6 files changed, 555 insertions(+), 14 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
/target
.direnv
+.buildcmd
+.build-result
# Added by cargo
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,3 +6,5 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
+env_logger = "0.10.0"
+log = "0.4.19"
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,14 +1,2 @@
-pub fn add(left: usize, right: usize) -> usize {
- left + right
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn it_works() {
- let result = add(2, 2);
- assert_eq!(result, 4);
- }
-}
+pub mod parser;
+pub mod shard;
diff --git a/src/parser.rs b/src/parser.rs
@@ -0,0 +1,278 @@
+
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Parser<'a> {
+ data: &'a [u8],
+ pos: usize,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Bound {
+ Start,
+ End,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Error {
+ NotFound,
+ BadUtf8Encoding,
+ OutOfBounds(Bound),
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+pub fn is_oob<T>(r: Result<T>) -> bool {
+ match r {
+ Err(Error::OutOfBounds(_)) => true,
+ Err(_) => false,
+ Ok(_) => false,
+ }
+}
+
+impl<'a> Parser<'a> {
+ pub fn from_bytes(data: &'a [u8]) -> Parser<'a> {
+ Parser { data: data, pos: 0 }
+ }
+
+ pub fn from_str(string: &'a str) -> Parser<'a> {
+ Parser {
+ data: string.as_bytes(),
+ pos: 0,
+ }
+ }
+
+ #[inline(always)]
+ pub fn set_pos(&mut self, pos: usize) {
+ self.pos = pos;
+ }
+
+ #[inline(always)]
+ pub fn pos(&self) -> usize {
+ self.pos
+ }
+
+ #[inline(always)]
+ pub fn data(&self) -> &[u8] {
+ self.data
+ }
+
+ #[inline(always)]
+ pub fn len(&self) -> usize {
+ self.data.len()
+ }
+
+ pub fn pull_byte(&mut self) -> Result<u8> {
+ if self.pos + 1 > self.len() {
+ return Err(Error::OutOfBounds(Bound::End));
+ }
+
+ let c = self.data[self.pos];
+ self.pos += 1;
+ return Ok(c);
+ }
+
+ pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> {
+ let len = self.len();
+ while self.pos < len {
+ let prev = self.pos();
+ if should_skip(self.pull_char()?) {
+ continue;
+ } else {
+ self.set_pos(prev);
+ return Ok(());
+ }
+ }
+
+ return Err(Error::OutOfBounds(Bound::End));
+ }
+
+ pub fn skip_whitespace(&mut self) -> Result<()> {
+ self.skip(|c| c.is_ascii_whitespace())
+ }
+
+ pub fn peek_char(&mut self) -> Result<char> {
+ let peek = true;
+ self.pull_or_peek_char(peek)
+ }
+
+ pub fn pull_char(&mut self) -> Result<char> {
+ let peek = false;
+ self.pull_or_peek_char(peek)
+ }
+
+ pub fn peek_prev_byte(&mut self) -> Result<u8> {
+ if self.pos == 0 {
+ return Err(Error::OutOfBounds(Bound::Start));
+ }
+
+ Ok(self.data[self.pos - 1])
+ }
+
+ pub fn seek_prev_byte(&mut self) -> Result<()> {
+ if self.pos == 0 {
+ return Err(Error::OutOfBounds(Bound::Start));
+ }
+ self.pos -= 1;
+
+ Ok(())
+ }
+
+ pub fn peek_prev_char(&self) -> Result<char> {
+ let mut i = 1;
+ let mut codepoint = 0u32;
+ let mut bs: [u32; 4] = [0; 4];
+
+ if self.pos == 0 {
+ return Err(Error::OutOfBounds(Bound::Start));
+ }
+
+ while i <= 4 && ((self.pos as i32) - (i as i32) >= 0) {
+ let byte = self.data[self.pos - i] as u32;
+ let masked = byte & 0b11000000;
+ if masked == 0b10000000 {
+ // continuation byte
+ bs[i - 1] = byte & 0b00111111;
+ i += 1;
+ } else if masked == 0b11000000 {
+ // start byte
+ match i {
+ 4 => {
+ codepoint = ((bs[3] & 0x07) << 18)
+ | ((bs[2] & 0x3F) << 12)
+ | ((bs[1] & 0x3F) << 6)
+ | (bs[0] & 0x3F)
+ }
+ 3 => {
+ codepoint = ((bs[2] & 0x0F) << 12) | ((bs[1] & 0x3F) << 6) | (bs[0] & 0x3F)
+ }
+ 2 => codepoint = ((bs[1] & 0x0F) << 6) | (bs[0] & 0x3F),
+ _ => return Err(Error::BadUtf8Encoding),
+ }
+ return parser_codepoint_char(codepoint);
+ } else {
+ return parser_codepoint_char(byte);
+ }
+ }
+
+ // If we reached here, we reached the start of the string without finding a non-continuation byte.
+ Err(Error::BadUtf8Encoding)
+ }
+
+ pub fn seek_prev_char(&mut self) -> Result<()> {
+ self.seek_prev_byte()?;
+ while self.pos > 0 && (self.data[self.pos] & 0b11000000) == 0b10000000 {
+ self.pos -= 1;
+ }
+
+ Ok(())
+ }
+
+ fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
+ let mut codepoint: u32 = 0;
+
+ let start = self.pos;
+ let b0 = self.pull_byte()? as u32;
+
+ if b0 & 0x80 != 0 {
+ if (b0 & 0b11100000) == 0b11000000 {
+ // Two-byte sequence
+ let b1 = self.pull_byte()? as u32;
+ codepoint = ((b0 & 0b00011111) << 6) | (b1 & 0b00111111);
+ } else if (b0 & 0xF0) == 0xE0 {
+ // Three-byte sequence
+ let b1 = self.pull_byte()? as u32;
+ let b2 = self.pull_byte()? as u32;
+ codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+ } else if (b0 & 0xF8) == 0xF0 {
+ // Four-byte sequence
+ let b1 = self.pull_byte()? as u32;
+ let b2 = self.pull_byte()? as u32;
+ let b3 = self.pull_byte()? as u32;
+ codepoint =
+ ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+ }
+ } else {
+ // Single-byte ASCII character
+ return Ok((b0 as u8) as char);
+ }
+
+ if peek {
+ self.pos = start;
+ }
+
+ match std::char::from_u32(codepoint) {
+ Some(c) => Ok(c),
+ None => Err(Error::BadUtf8Encoding),
+ }
+ }
+
+ pub fn parse_until_char(&mut self, needle: char) -> Result<()> {
+ self.parse_until(|c| c == needle)
+ }
+
+ pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
+ let len = self.len();
+ while self.pos < len {
+ let prev = self.pos;
+ if matches(self.pull_char()?) {
+ self.pos = prev;
+ return Ok(());
+ }
+ }
+
+ Err(Error::OutOfBounds(Bound::End))
+ }
+}
+
+fn parser_codepoint_char(codepoint: u32) -> Result<char> {
+ match std::char::from_u32(codepoint) {
+ Some(c) => Ok(c),
+ None => Err(Error::BadUtf8Encoding),
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_parser() -> Result<()> {
+ // v alien v
+ // 00000000: 20f0 9f91 bd23 6861 7368 7461 670a _....#hashtag.
+ let s = " #hashtag ";
+ let mut parser = Parser::from_str(s);
+ let mut res = parser.parse_until_char('#');
+ assert_eq!(res, Ok(()));
+ assert_eq!(parser.pos, 1);
+ res = parser.parse_until_char('t');
+ assert_eq!(res, Ok(()));
+ assert_eq!(parser.pos, 6);
+ Ok(())
+ }
+
+ #[test]
+ fn test_peek_prev_char() {
+ let s = ".👽.";
+ let mut parser = Parser::from_str(s);
+ let r1 = parser.parse_until_char('👽');
+ assert_eq!(r1, Ok(()));
+ let r2 = parser.pull_char();
+ assert_eq!(r2, Ok('👽'));
+ let r3 = parser.peek_prev_char();
+ assert_eq!(r3, Ok('👽'));
+ assert_eq!(parser.pos(), 5);
+ }
+
+ #[test]
+ fn test_utf8_parsing() -> Result<()> {
+ let s = "hey there #👽.";
+ let mut parser = Parser::from_str(s);
+ let _ = parser.parse_until_char('👽');
+ assert_eq!(parser.peek_char(), Ok('👽'));
+ assert_eq!(parser.pos, 11);
+ let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
+ assert_eq!(res, Ok(()));
+ assert_eq!(parser.peek_char(), Ok('.'));
+ Ok(())
+ }
+}
diff --git a/src/result.rs b/src/result.rs
@@ -0,0 +1 @@
+
diff --git a/src/shard.rs b/src/shard.rs
@@ -0,0 +1,270 @@
+use crate::parser::{Bound, Error, Parser, Result};
+use log::debug;
+
+#[derive(Debug, PartialEq, Eq)]
+struct ByteSlice {
+ pos: usize,
+ len: usize,
+}
+
+impl ByteSlice {
+ pub fn new(pos: usize, len: usize) -> ByteSlice {
+ ByteSlice { pos, len }
+ }
+
+ pub fn bytes<'a>(&self, data: &'a [u8]) -> &'a [u8] {
+ &data[self.pos..self.pos + self.len]
+ }
+
+ pub fn str<'a>(&self, data: &'a [u8]) -> Option<&'a str> {
+ std::str::from_utf8(self.bytes(data)).ok()
+ }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+enum Shard {
+ Text(ByteSlice),
+ Mention(ByteSlice),
+ Hashtag(ByteSlice),
+ Url(ByteSlice),
+ //Invoice(Invoice)
+ //Relay(String)
+}
+
+#[derive(Debug)]
+struct Shards {
+ shards: Vec<Shard>,
+ num_words: i32,
+}
+
+impl Shards {
+ fn new() -> Shards {
+ Shards {
+ shards: vec![],
+ num_words: 0,
+ }
+ }
+
+ fn parse_hashtag(parser: &mut Parser) -> Result<ByteSlice> {
+ let start = parser.pos();
+ match parser.parse_until(is_boundary_char) {
+ Ok(()) | Err(Error::OutOfBounds(Bound::End)) => {
+ debug!("got to hashtag boundary @ {}", parser.pos());
+ let len = parser.pos() - start;
+ if len <= 0 {
+ return Err(Error::NotFound);
+ }
+ return Ok(ByteSlice::new(start, len));
+ }
+ Err(err) => Err(err.into()),
+ }
+ }
+
+ fn push_txt(&mut self, start: usize, upto: usize) {
+ let len = upto - start;
+ if len == 0 {
+ return;
+ }
+
+ let txt_slice = ByteSlice::new(start, len);
+ /*
+ debug!(
+ "pushing text block {:?} @ {} '{:?}'",
+ txt_slice,
+ parser.pos(),
+ txt_slice.str(parser.data())
+ );
+ */
+ self.shards.push(Shard::Text(txt_slice));
+ }
+
+ pub fn parse(content: &str) -> Result<Shards> {
+ let mut parser = Parser::from_str(content);
+ let len = parser.len();
+ let mut shards = Shards::new();
+ let mut start = parser.pos();
+
+ while parser.pos() < len {
+ let before_parse = parser.pos();
+ let prev_boundary = is_left_boundary(&parser.peek_prev_byte());
+ let c1 = parser.data()[parser.pos()] as char;
+ parser.set_pos(parser.pos() + 1);
+
+ if c1 == '#' && prev_boundary {
+ match Shards::parse_hashtag(&mut parser) {
+ Ok(ht) => {
+ shards.push_txt(start, before_parse);
+ start = parser.pos();
+
+ debug!("pushing hashtag {:?}", ht);
+ shards.shards.push(Shard::Hashtag(ht));
+ }
+
+ Err(err) => {
+ debug!("failed parsing hashtag @ {}: {:?}", parser.pos(), err);
+ }
+ }
+ }
+ }
+
+ shards.push_txt(start, parser.pos());
+ Ok(shards)
+ }
+}
+
+fn is_boundary(r: &Result<char>) -> bool {
+ match r {
+ Err(Error::OutOfBounds(_)) => true,
+ Err(_) => false,
+ Ok(c) => is_boundary_char(*c),
+ }
+}
+
+fn is_left_boundary(r: &Result<u8>) -> bool {
+ match r {
+ Err(Error::OutOfBounds(_)) => true,
+ Err(_) => false,
+ Ok(c) => is_left_boundary_char(*c),
+ }
+}
+
+fn is_eof<T>(r: &Result<T>) -> bool {
+ match r {
+ Err(Error::OutOfBounds(Bound::End)) => true,
+ _ => false,
+ }
+}
+
+fn is_boundary_char(c: char) -> bool {
+ c.is_ascii_whitespace() || c.is_ascii_punctuation()
+}
+
+fn is_left_boundary_char(c: u8) -> bool {
+ is_boundary_char(c as char) || ((c & 0b10000000) == 0b10000000)
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use std::sync::Once;
+
+ static INIT: Once = Once::new();
+
+ /// Setup function that is only run once, even if called multiple times.
+ fn setup() {
+ INIT.call_once(|| {
+ env_logger::init();
+ });
+ }
+
+ #[test]
+ fn test_is_boundary() {
+ setup();
+
+ let content = "a";
+ let parser = Parser::from_str(&content);
+ let res = parser.peek_prev_char();
+ assert_eq!(is_boundary(&res), true);
+ }
+
+ #[test]
+ fn test_parse_hashtag_basic() {
+ setup();
+
+ let content = "abc #😎";
+ debug!("hashtag_basic content '{}'", content);
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 2);
+ assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4)));
+ assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 4)));
+ }
+
+ #[test]
+ fn test_parse_hashtag_adjacent() {
+ setup();
+
+ let content = "aa#abc";
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 1);
+ assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 6)));
+ }
+
+ #[test]
+ fn test_parse_hashtag_start() {
+ setup();
+
+ let content = "#abc.";
+ debug!("test_parse_hashtag_start '{}'", content);
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 2);
+ assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3)));
+ assert_eq!(bs[1], Shard::Text(ByteSlice::new(4, 1)));
+ }
+
+ #[test]
+ fn test_parse_hashtag_end() {
+ setup();
+
+ let content = "#abc";
+ debug!("test_parse_hashtag_end '{}'", content);
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 1);
+ assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3)));
+ }
+
+ #[test]
+ fn test_parse_hashtag_punc_before() {
+ setup();
+
+ let content = ".#abc";
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 2);
+ assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1)));
+ assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 3)));
+ }
+
+ #[test]
+ fn test_multiple_hashtags() {
+ setup();
+
+ let content = ".#alice.#bob";
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 4);
+ assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1)));
+ assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 5)));
+ assert_eq!(bs[2], Shard::Text(ByteSlice::new(7, 1)));
+ assert_eq!(bs[3], Shard::Hashtag(ByteSlice::new(9, 3)));
+ }
+
+ #[test]
+ fn test_multiple_adjacent_hashtags() {
+ setup();
+
+ let content = "#alice#bob";
+ debug!("test_multiple_adjacent_hashtags '{}'", content);
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 2);
+ assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 5)));
+ assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(7, 3)));
+ }
+
+ #[test]
+ fn test_parse_hashtag_emoji_before() {
+ setup();
+
+ // 00000000: f09f 98a4 2361 6263 ....#abc
+ let content = "😤#abc";
+ let shards = Shards::parse(content).unwrap();
+ let bs = shards.shards;
+ assert_eq!(bs.len(), 2);
+ assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4)));
+ assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 3)));
+ }
+}