shatter

A fast, zero-copy nostr content parser in Rust
git clone git://jb55.com/shatter
Log | Files | Refs | README

shard.rs (9223B)


      1 use crate::parser::{Bound, Error, Parser, Result};
      2 use log::debug;
      3 
      4 /// A slice into the original buffer. Contains a position, which is an
      5 /// index into the buffer, and the length of the segment.
      6 #[derive(Debug, PartialEq, Eq)]
      7 pub struct ByteSlice {
      8     pos: u32,
      9     len: u32,
     10 }
     11 
     12 impl ByteSlice {
     13     pub fn new(pos: u32, len: u32) -> ByteSlice {
     14         ByteSlice { pos, len }
     15     }
     16 
     17     #[inline(always)]
     18     fn pos_usize(&self) -> usize {
     19         self.pos as usize
     20     }
     21 
     22     #[inline(always)]
     23     fn len_usize(&self) -> usize {
     24         self.len as usize
     25     }
     26 
     27     /// Get the slice of the buffer as a native array slice
     28     pub fn bytes<'a>(&self, data: &'a [u8]) -> &'a [u8] {
     29         &data[self.pos_usize()..self.pos_usize() + self.len_usize()]
     30     }
     31 
     32     /// Get the slice of the buffer as a string
     33     pub fn str<'a>(&self, data: &'a [u8]) -> std::result::Result<&'a str, std::str::Utf8Error> {
     34         std::str::from_utf8(self.bytes(data))
     35     }
     36 }
     37 
/// A nostr mention: nostr:bech32... #[0]... etc
#[derive(Debug, PartialEq, Eq)]
pub enum Mention {
    /// A tag mention #[1], etc. Holds the number written between the
    /// brackets.
    Index(u16),
    /// A nostr: mention, starting from the start of the bech32 string to
    /// the end. It is not parsed at this point to keep things lazy and quick.
    Bech32(ByteSlice),
}
     47 
/// A Shard represents a part of the shattered content.
///
/// Variants that carry a `ByteSlice` store offsets into the parsed content
/// rather than a copy of the text.
#[derive(Debug, PartialEq, Eq)]
pub enum Shard {
    /// Plain text that was not recognised as any other kind of shard.
    Text(ByteSlice),
    /// A mention; see `Mention` for the supported forms.
    Mention(Mention),
    /// A hashtag. The slice covers the tag text after the `#`, not the `#`
    /// itself.
    Hashtag(ByteSlice),
    /// A URL.
    /// NOTE(review): nothing in this file constructs this variant yet.
    Url(ByteSlice),
    //Invoice(Invoice)
    //Relay(String)
}
     58 
     59 #[derive(Debug)]
     60 pub struct Shards {
     61     shards: Vec<Shard>,
     62     //num_words: i32,
     63 }
     64 
     65 impl Shards {
     66     pub fn new() -> Shards {
     67         Shards {
     68             // some initial capacity so we don't have to allocate on small parses
     69             shards: Vec::with_capacity(32),
     70         }
     71     }
     72 
     73     /// Parse an indexed mention (#[0], etc)
     74     fn parse_indexed_mention(parser: &mut Parser) -> Result<u16> {
     75         let start = parser.pos();
     76         {
     77             parser.parse_char('[')?;
     78             let ind = parser.parse_digits()?;
     79             parser.parse_char(']')?;
     80             Ok(ind)
     81         }
     82         .map_err(|err| {
     83             parser.set_pos(start);
     84             err
     85         })
     86     }
     87 
     88     /// Parse a hashtag (content after the #)
     89     fn parse_hashtag(parser: &mut Parser) -> Result<ByteSlice> {
     90         let start = parser.pos();
     91         match parser.parse_until(is_boundary_char) {
     92             Ok(()) | Err(Error::OutOfBounds(Bound::End)) => {
     93                 let len = parser.pos() - start;
     94                 if len <= 0 {
     95                     return Err(Error::NotFound);
     96                 }
     97                 return Ok(ByteSlice::new(start as u32, len as u32));
     98             }
     99             Err(err) => Err(err.into()),
    100         }
    101     }
    102 
    103     fn push_txt(&mut self, start: usize, upto: usize) {
    104         let len = upto - start;
    105         if len == 0 {
    106             return;
    107         }
    108 
    109         let txt_slice = ByteSlice::new(start as u32, len as u32);
    110         /*
    111         debug!(
    112             "pushing text block {:?} @ {} '{:?}'",
    113             txt_slice,
    114             parser.pos(),
    115             txt_slice.str(parser.data())
    116         );
    117         */
    118         self.shards.push(Shard::Text(txt_slice));
    119     }
    120 
    121     /// Parse (shatter) content into shards
    122     pub fn parse(content: &str) -> Result<Shards> {
    123         let mut parser = Parser::from_str(content);
    124         let len = parser.len();
    125         let mut shards = Shards::new();
    126         let mut start = parser.pos();
    127 
    128         while parser.pos() < len {
    129             let before_parse = parser.pos();
    130             let prev_boundary = is_left_boundary(&parser.peek_prev_byte());
    131             let c1 = parser.data()[parser.pos()] as char;
    132             parser.set_pos(parser.pos() + 1);
    133 
    134             if c1 == '#' && prev_boundary {
    135                 if let Ok(ht) = Shards::parse_hashtag(&mut parser) {
    136                     shards.push_txt(start, before_parse);
    137                     start = parser.pos();
    138                     debug!("pushing hashtag {:?}", ht);
    139                     shards.shards.push(Shard::Hashtag(ht));
    140                 } else if let Ok(ind) = Shards::parse_indexed_mention(&mut parser) {
    141                     shards.push_txt(start, before_parse);
    142                     start = parser.pos();
    143                     debug!("pushing indexed mention {:?}", ind);
    144                     shards.shards.push(Shard::Mention(Mention::Index(ind)));
    145                 }
    146             }
    147         }
    148 
    149         shards.push_txt(start, parser.pos());
    150         Ok(shards)
    151     }
    152 }
    153 
    154 fn is_left_boundary(r: &Result<u8>) -> bool {
    155     match r {
    156         Err(Error::OutOfBounds(_)) => true,
    157         Err(_) => false,
    158         Ok(c) => is_left_boundary_char(*c),
    159     }
    160 }
    161 
    162 fn is_boundary_char(c: char) -> bool {
    163     c.is_ascii_whitespace() || c.is_ascii_punctuation()
    164 }
    165 
    166 fn is_left_boundary_char(c: u8) -> bool {
    167     is_boundary_char(c as char) || ((c & 0b10000000) == 0b10000000)
    168 }
    169 
    170 #[cfg(test)]
    171 mod test {
    172     use super::*;
    173     use std::sync::Once;
    174 
    175     static INIT: Once = Once::new();
    176 
    177     fn is_boundary(r: &Result<char>) -> bool {
    178         match r {
    179             Err(Error::OutOfBounds(_)) => true,
    180             Err(_) => false,
    181             Ok(c) => is_boundary_char(*c),
    182         }
    183     }
    184 
    185     /// Setup function that is only run once, even if called multiple times.
    186     fn setup() {
    187         INIT.call_once(|| {
    188             env_logger::init();
    189         });
    190     }
    191 
    192     #[test]
    193     fn test_is_boundary() {
    194         setup();
    195 
    196         let content = "a";
    197         let parser = Parser::from_str(&content);
    198         let res = parser.peek_prev_char();
    199         assert_eq!(is_boundary(&res), true);
    200     }
    201 
    202     #[test]
    203     fn test_parse_hashtag_basic() {
    204         setup();
    205 
    206         let content = "abc #😎";
    207         debug!("hashtag_basic content '{}'", content);
    208         let shards = Shards::parse(content).unwrap();
    209         let bs = shards.shards;
    210         assert_eq!(bs.len(), 2);
    211         assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4)));
    212         assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 4)));
    213     }
    214 
    215     #[test]
    216     fn test_parse_hashtag_adjacent() {
    217         setup();
    218 
    219         let content = "aa#abc";
    220         let shards = Shards::parse(content).unwrap();
    221         let bs = shards.shards;
    222         assert_eq!(bs.len(), 1);
    223         assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 6)));
    224     }
    225 
    226     #[test]
    227     fn test_parse_hashtag_start() {
    228         setup();
    229 
    230         let content = "#abc.";
    231         debug!("test_parse_hashtag_start '{}'", content);
    232         let shards = Shards::parse(content).unwrap();
    233         let bs = shards.shards;
    234         assert_eq!(bs.len(), 2);
    235         assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3)));
    236         assert_eq!(bs[1], Shard::Text(ByteSlice::new(4, 1)));
    237     }
    238 
    239     #[test]
    240     fn test_parse_hashtag_end() {
    241         setup();
    242 
    243         let content = "#abc";
    244         debug!("test_parse_hashtag_end '{}'", content);
    245         let shards = Shards::parse(content).unwrap();
    246         let bs = shards.shards;
    247         assert_eq!(bs.len(), 1);
    248         assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3)));
    249     }
    250 
    251     #[test]
    252     fn test_parse_hashtag_punc_before() {
    253         setup();
    254 
    255         let content = ".#abc";
    256         let shards = Shards::parse(content).unwrap();
    257         let bs = shards.shards;
    258         assert_eq!(bs.len(), 2);
    259         assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1)));
    260         assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 3)));
    261     }
    262 
    263     #[test]
    264     fn test_indexed_mention() {
    265         setup();
    266 
    267         let content = "this is #[19] #[1 a mention";
    268         debug!("test_indexed_mention '{}'", content);
    269         let shards = Shards::parse(content).unwrap();
    270         let bs = shards.shards;
    271         assert_eq!(bs.len(), 3);
    272         assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 8)));
    273         assert_eq!(bs[1], Shard::Mention(Mention::Index(19)));
    274         assert_eq!(bs[2], Shard::Text(ByteSlice::new(13, 14)));
    275     }
    276 
    277     #[test]
    278     fn test_multiple_hashtags() {
    279         setup();
    280 
    281         let content = ".#alice.#bob";
    282         let shards = Shards::parse(content).unwrap();
    283         let bs = shards.shards;
    284         assert_eq!(bs.len(), 4);
    285         assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1)));
    286         assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 5)));
    287         assert_eq!(bs[2], Shard::Text(ByteSlice::new(7, 1)));
    288         assert_eq!(bs[3], Shard::Hashtag(ByteSlice::new(9, 3)));
    289     }
    290 
    291     #[test]
    292     fn test_multiple_adjacent_hashtags() {
    293         setup();
    294 
    295         let content = "#alice#bob";
    296         debug!("test_multiple_adjacent_hashtags '{}'", content);
    297         let shards = Shards::parse(content).unwrap();
    298         let bs = shards.shards;
    299         assert_eq!(bs.len(), 2);
    300         assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 5)));
    301         assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(7, 3)));
    302     }
    303 
    304     #[test]
    305     fn test_parse_hashtag_emoji_before() {
    306         setup();
    307 
    308         // 00000000: f09f 98a4 2361 6263    ....#abc
    309         let content = "😤#abc";
    310         let shards = Shards::parse(content).unwrap();
    311         let bs = shards.shards;
    312         assert_eq!(bs.len(), 2);
    313         assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4)));
    314         assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 3)));
    315     }
    316 }