shard.rs (9223B)
1 use crate::parser::{Bound, Error, Parser, Result}; 2 use log::debug; 3 4 /// A slice into the original buffer. Contains a position, which is an 5 /// index into the buffer, and the length of the segment. 6 #[derive(Debug, PartialEq, Eq)] 7 pub struct ByteSlice { 8 pos: u32, 9 len: u32, 10 } 11 12 impl ByteSlice { 13 pub fn new(pos: u32, len: u32) -> ByteSlice { 14 ByteSlice { pos, len } 15 } 16 17 #[inline(always)] 18 fn pos_usize(&self) -> usize { 19 self.pos as usize 20 } 21 22 #[inline(always)] 23 fn len_usize(&self) -> usize { 24 self.len as usize 25 } 26 27 /// Get the slice of the buffer as a native array slice 28 pub fn bytes<'a>(&self, data: &'a [u8]) -> &'a [u8] { 29 &data[self.pos_usize()..self.pos_usize() + self.len_usize()] 30 } 31 32 /// Get the slice of the buffer as a string 33 pub fn str<'a>(&self, data: &'a [u8]) -> std::result::Result<&'a str, std::str::Utf8Error> { 34 std::str::from_utf8(self.bytes(data)) 35 } 36 } 37 38 /// A nostr mention: nostr:bech32... #[0]... etc 39 #[derive(Debug, PartialEq, Eq)] 40 pub enum Mention { 41 /// A tag mention #[1], etc 42 Index(u16), 43 /// A nostr: mention, starting from the start of the bech32 string to 44 /// the end It is not parsed at this point to keep things lazy and quick 45 Bech32(ByteSlice), 46 } 47 48 /// A Shard represents a part of the shattered content. 49 #[derive(Debug, PartialEq, Eq)] 50 pub enum Shard { 51 Text(ByteSlice), 52 Mention(Mention), 53 Hashtag(ByteSlice), 54 Url(ByteSlice), 55 //Invoice(Invoice) 56 //Relay(String) 57 } 58 59 #[derive(Debug)] 60 pub struct Shards { 61 shards: Vec<Shard>, 62 //num_words: i32, 63 } 64 65 impl Shards { 66 pub fn new() -> Shards { 67 Shards { 68 // some initial capacity so we don't have to allocate on small parses 69 shards: Vec::with_capacity(32), 70 } 71 } 72 73 /// Parse an indexed mention (#[0], etc) 74 fn parse_indexed_mention(parser: &mut Parser) -> Result<u16> { 75 let start = parser.pos(); 76 { 77 parser.parse_char('[')?; 78 let ind = parser.parse_digits()?; 79 parser.parse_char(']')?; 80 Ok(ind) 81 } 82 .map_err(|err| { 83 parser.set_pos(start); 84 err 85 }) 86 } 87 88 /// Parse a hashtag (content after the #) 89 fn parse_hashtag(parser: &mut Parser) -> Result<ByteSlice> { 90 let start = parser.pos(); 91 match parser.parse_until(is_boundary_char) { 92 Ok(()) | Err(Error::OutOfBounds(Bound::End)) => { 93 let len = parser.pos() - start; 94 if len <= 0 { 95 return Err(Error::NotFound); 96 } 97 return Ok(ByteSlice::new(start as u32, len as u32)); 98 } 99 Err(err) => Err(err.into()), 100 } 101 } 102 103 fn push_txt(&mut self, start: usize, upto: usize) { 104 let len = upto - start; 105 if len == 0 { 106 return; 107 } 108 109 let txt_slice = ByteSlice::new(start as u32, len as u32); 110 /* 111 debug!( 112 "pushing text block {:?} @ {} '{:?}'", 113 txt_slice, 114 parser.pos(), 115 txt_slice.str(parser.data()) 116 ); 117 */ 118 self.shards.push(Shard::Text(txt_slice)); 119 } 120 121 /// Parse (shatter) content into shards 122 pub fn parse(content: &str) -> Result<Shards> { 123 let mut parser = Parser::from_str(content); 124 let len = parser.len(); 125 let mut shards = Shards::new(); 126 let mut start = parser.pos(); 127 128 while parser.pos() < len { 129 let before_parse = parser.pos(); 130 let prev_boundary = is_left_boundary(&parser.peek_prev_byte()); 131 let c1 = parser.data()[parser.pos()] as char; 132 parser.set_pos(parser.pos() + 1); 133 134 if c1 == '#' && prev_boundary { 135 if let Ok(ht) = Shards::parse_hashtag(&mut parser) { 136 shards.push_txt(start, before_parse); 137 start = parser.pos(); 138 debug!("pushing hashtag {:?}", ht); 139 shards.shards.push(Shard::Hashtag(ht)); 140 } else if let Ok(ind) = Shards::parse_indexed_mention(&mut parser) { 141 shards.push_txt(start, before_parse); 142 start = parser.pos(); 143 debug!("pushing indexed mention {:?}", ind); 144 shards.shards.push(Shard::Mention(Mention::Index(ind))); 145 } 146 } 147 } 148 149 shards.push_txt(start, parser.pos()); 150 Ok(shards) 151 } 152 } 153 154 fn is_left_boundary(r: &Result<u8>) -> bool { 155 match r { 156 Err(Error::OutOfBounds(_)) => true, 157 Err(_) => false, 158 Ok(c) => is_left_boundary_char(*c), 159 } 160 } 161 162 fn is_boundary_char(c: char) -> bool { 163 c.is_ascii_whitespace() || c.is_ascii_punctuation() 164 } 165 166 fn is_left_boundary_char(c: u8) -> bool { 167 is_boundary_char(c as char) || ((c & 0b10000000) == 0b10000000) 168 } 169 170 #[cfg(test)] 171 mod test { 172 use super::*; 173 use std::sync::Once; 174 175 static INIT: Once = Once::new(); 176 177 fn is_boundary(r: &Result<char>) -> bool { 178 match r { 179 Err(Error::OutOfBounds(_)) => true, 180 Err(_) => false, 181 Ok(c) => is_boundary_char(*c), 182 } 183 } 184 185 /// Setup function that is only run once, even if called multiple times. 186 fn setup() { 187 INIT.call_once(|| { 188 env_logger::init(); 189 }); 190 } 191 192 #[test] 193 fn test_is_boundary() { 194 setup(); 195 196 let content = "a"; 197 let parser = Parser::from_str(&content); 198 let res = parser.peek_prev_char(); 199 assert_eq!(is_boundary(&res), true); 200 } 201 202 #[test] 203 fn test_parse_hashtag_basic() { 204 setup(); 205 206 let content = "abc #😎"; 207 debug!("hashtag_basic content '{}'", content); 208 let shards = Shards::parse(content).unwrap(); 209 let bs = shards.shards; 210 assert_eq!(bs.len(), 2); 211 assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4))); 212 assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 4))); 213 } 214 215 #[test] 216 fn test_parse_hashtag_adjacent() { 217 setup(); 218 219 let content = "aa#abc"; 220 let shards = Shards::parse(content).unwrap(); 221 let bs = shards.shards; 222 assert_eq!(bs.len(), 1); 223 assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 6))); 224 } 225 226 #[test] 227 fn test_parse_hashtag_start() { 228 setup(); 229 230 let content = "#abc."; 231 debug!("test_parse_hashtag_start '{}'", content); 232 let shards = Shards::parse(content).unwrap(); 233 let bs = shards.shards; 234 assert_eq!(bs.len(), 2); 235 assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3))); 236 assert_eq!(bs[1], Shard::Text(ByteSlice::new(4, 1))); 237 } 238 239 #[test] 240 fn test_parse_hashtag_end() { 241 setup(); 242 243 let content = "#abc"; 244 debug!("test_parse_hashtag_end '{}'", content); 245 let shards = Shards::parse(content).unwrap(); 246 let bs = shards.shards; 247 assert_eq!(bs.len(), 1); 248 assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 3))); 249 } 250 251 #[test] 252 fn test_parse_hashtag_punc_before() { 253 setup(); 254 255 let content = ".#abc"; 256 let shards = Shards::parse(content).unwrap(); 257 let bs = shards.shards; 258 assert_eq!(bs.len(), 2); 259 assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1))); 260 assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 3))); 261 } 262 263 #[test] 264 fn test_indexed_mention() { 265 setup(); 266 267 let content = "this is #[19] #[1 a mention"; 268 debug!("test_indexed_mention '{}'", content); 269 let shards = Shards::parse(content).unwrap(); 270 let bs = shards.shards; 271 assert_eq!(bs.len(), 3); 272 assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 8))); 273 assert_eq!(bs[1], Shard::Mention(Mention::Index(19))); 274 assert_eq!(bs[2], Shard::Text(ByteSlice::new(13, 14))); 275 } 276 277 #[test] 278 fn test_multiple_hashtags() { 279 setup(); 280 281 let content = ".#alice.#bob"; 282 let shards = Shards::parse(content).unwrap(); 283 let bs = shards.shards; 284 assert_eq!(bs.len(), 4); 285 assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 1))); 286 assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(2, 5))); 287 assert_eq!(bs[2], Shard::Text(ByteSlice::new(7, 1))); 288 assert_eq!(bs[3], Shard::Hashtag(ByteSlice::new(9, 3))); 289 } 290 291 #[test] 292 fn test_multiple_adjacent_hashtags() { 293 setup(); 294 295 let content = "#alice#bob"; 296 debug!("test_multiple_adjacent_hashtags '{}'", content); 297 let shards = Shards::parse(content).unwrap(); 298 let bs = shards.shards; 299 assert_eq!(bs.len(), 2); 300 assert_eq!(bs[0], Shard::Hashtag(ByteSlice::new(1, 5))); 301 assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(7, 3))); 302 } 303 304 #[test] 305 fn test_parse_hashtag_emoji_before() { 306 setup(); 307 308 // 00000000: f09f 98a4 2361 6263 ....#abc 309 let content = "😤#abc"; 310 let shards = Shards::parse(content).unwrap(); 311 let bs = shards.shards; 312 assert_eq!(bs.len(), 2); 313 assert_eq!(bs[0], Shard::Text(ByteSlice::new(0, 4))); 314 assert_eq!(bs[1], Shard::Hashtag(ByteSlice::new(5, 3))); 315 } 316 }