parser.rs (9323B)
/// Longest possible UTF-8 encoding of a single character, in bytes.
const MAX_UTF8_WIDTH: usize = 4;

/// A cursor over a borrowed byte slice that reads bytes and UTF-8 encoded
/// characters forwards, and can look or step backwards from its position.
#[derive(Debug, PartialEq, Eq)]
pub struct Parser<'a> {
    data: &'a [u8],
    pos: usize,
}

/// Identifies which end of the input a read ran past.
#[derive(Debug, PartialEq, Eq)]
pub enum Bound {
    Start,
    End,
}

/// Errors reported by [`Parser`] operations.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// The expected character or digits were not present at the cursor.
    NotFound,
    /// The bytes at the cursor are not a valid UTF-8 encoded character.
    BadUtf8Encoding,
    /// The operation would have read before the start or past the end.
    OutOfBounds(Bound),
}

pub type Result<T> = std::result::Result<T, Error>;

/// Returns `true` when `r` is an [`Error::OutOfBounds`] error (either bound).
pub fn is_oob<T>(r: Result<T>) -> bool {
    matches!(r, Err(Error::OutOfBounds(_)))
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the start of `data`.
    pub fn from_bytes(data: &'a [u8]) -> Parser<'a> {
        Parser { data, pos: 0 }
    }

    /// Creates a parser positioned at the start of the bytes of `string`.
    pub fn from_str(string: &'a str) -> Parser<'a> {
        Parser {
            data: string.as_bytes(),
            pos: 0,
        }
    }

    /// Moves the cursor to byte offset `pos`.
    ///
    /// The offset is not validated against the input length.
    #[inline]
    pub fn set_pos(&mut self, pos: usize) {
        self.pos = pos;
    }

    /// Current byte offset of the cursor.
    #[inline]
    pub fn pos(&self) -> usize {
        self.pos
    }

    /// The complete input, independent of the cursor position.
    #[inline]
    pub fn data(&self) -> &[u8] {
        self.data
    }

    /// Length of the complete input in bytes.
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Reads the byte at the cursor and advances past it.
    ///
    /// # Errors
    /// `OutOfBounds(End)` when the cursor is at or past the end of the input;
    /// the cursor is left unchanged.
    pub fn pull_byte(&mut self) -> Result<u8> {
        let byte = *self
            .data
            .get(self.pos)
            .ok_or(Error::OutOfBounds(Bound::End))?;
        self.pos += 1;
        Ok(byte)
    }

    /// Advances past every leading character for which `should_skip` returns
    /// `true`, stopping in front of the first character that is rejected.
    ///
    /// # Errors
    /// `OutOfBounds(End)` when the input is exhausted before a rejected
    /// character is found (the cursor is then at the end of the input), or
    /// any decoding error from [`Parser::pull_char`].
    pub fn skip<F: Fn(char) -> bool>(&mut self, should_skip: F) -> Result<()> {
        while self.pos < self.len() {
            let before = self.pos;
            if !should_skip(self.pull_char()?) {
                self.pos = before;
                return Ok(());
            }
        }

        Err(Error::OutOfBounds(Bound::End))
    }

    /// Skips ASCII whitespace; see [`Parser::skip`] for the error behaviour.
    pub fn skip_whitespace(&mut self) -> Result<()> {
        self.skip(|c| c.is_ascii_whitespace())
    }

    /// Parses a run of one to five ASCII digits at the cursor as a `u16`.
    ///
    /// At most five digits are consumed; further digits are left in the
    /// input. Values that do not fit in a `u16` saturate at `u16::MAX`.
    ///
    /// # Errors
    /// `NotFound` when the cursor is not on an ASCII digit; the cursor is
    /// left unchanged.
    pub fn parse_digits(&mut self) -> Result<u16> {
        const MAX_DIGITS: usize = 5;

        let rest = self.data.get(self.pos..).unwrap_or(&[]);
        let count = rest
            .iter()
            .take(MAX_DIGITS)
            .take_while(|b| b.is_ascii_digit())
            .count();

        if count == 0 {
            return Err(Error::NotFound);
        }

        let number = rest[..count].iter().fold(0u16, |acc, b| {
            acc.saturating_mul(10).saturating_add(u16::from(*b - b'0'))
        });

        self.pos += count;
        Ok(number)
    }

    /// Parses a specific character. If it is not found (or the character at
    /// the cursor cannot be decoded), the parser does not advance.
    pub fn parse_char(&mut self, matching: char) -> Result<()> {
        let start = self.pos;
        // `pull_char` leaves the cursor untouched on error, so `?` is safe.
        if self.pull_char()? == matching {
            return Ok(());
        }
        self.pos = start;
        Err(Error::NotFound)
    }

    /// Decodes the character at the cursor without advancing.
    pub fn peek_char(&mut self) -> Result<char> {
        self.pull_or_peek_char(true)
    }

    /// Decodes the character at the cursor and advances past it.
    ///
    /// # Errors
    /// `OutOfBounds(End)` when the input ends before the character is
    /// complete, `BadUtf8Encoding` when the bytes are not valid UTF-8. The
    /// cursor is left unchanged in both cases.
    pub fn pull_char(&mut self) -> Result<char> {
        self.pull_or_peek_char(false)
    }

    /// Returns the byte immediately before the cursor without moving.
    ///
    /// # Errors
    /// `OutOfBounds(Start)` at offset 0; `OutOfBounds(End)` when the cursor
    /// has been placed more than one byte past the end of the input.
    pub fn peek_prev_byte(&mut self) -> Result<u8> {
        if self.pos == 0 {
            return Err(Error::OutOfBounds(Bound::Start));
        }

        self.data
            .get(self.pos - 1)
            .copied()
            .ok_or(Error::OutOfBounds(Bound::End))
    }

    /// Moves the cursor back by one byte.
    ///
    /// # Errors
    /// `OutOfBounds(Start)` when the cursor is already at offset 0.
    pub fn seek_prev_byte(&mut self) -> Result<()> {
        if self.pos == 0 {
            return Err(Error::OutOfBounds(Bound::Start));
        }
        self.pos -= 1;

        Ok(())
    }

    /// Decodes the character that ends immediately before the cursor,
    /// without moving.
    ///
    /// # Errors
    /// `OutOfBounds(Start)` at offset 0, `OutOfBounds(End)` when the cursor is
    /// past the end of the input, and `BadUtf8Encoding` when the bytes before
    /// the cursor do not end in exactly one valid UTF-8 character.
    pub fn peek_prev_char(&self) -> Result<char> {
        if self.pos == 0 {
            return Err(Error::OutOfBounds(Bound::Start));
        }

        let consumed = self
            .data
            .get(..self.pos)
            .ok_or(Error::OutOfBounds(Bound::End))?;

        // Walk back over continuation bytes to the byte that starts the
        // character; it must appear within the last four bytes.
        let back = consumed
            .iter()
            .rev()
            .take(MAX_UTF8_WIDTH)
            .position(|&b| !is_continuation(b))
            .ok_or(Error::BadUtf8Encoding)?;

        decode_exact(&consumed[consumed.len() - 1 - back..])
    }

    /// Moves the cursor back to the first byte of the previous character by
    /// stepping back one byte and then over any continuation bytes.
    ///
    /// # Errors
    /// `OutOfBounds(Start)` when the cursor is already at offset 0.
    pub fn seek_prev_char(&mut self) -> Result<()> {
        self.seek_prev_byte()?;
        while self.pos > 0
            && matches!(self.data.get(self.pos), Some(&b) if is_continuation(b))
        {
            self.pos -= 1;
        }

        Ok(())
    }

    /// Decodes the character at the cursor; advances past it unless `peek`
    /// is set. The cursor never moves when decoding fails.
    fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
        let (c, width) = decode_char_at(self.data, self.pos)?;
        if !peek {
            self.pos += width;
        }
        Ok(c)
    }

    /// Advances until the cursor is in front of the next `needle`; see
    /// [`Parser::parse_until`].
    pub fn parse_until_char(&mut self, needle: char) -> Result<()> {
        self.parse_until(|c| c == needle)
    }

    /// Advances until the cursor is in front of the first character for
    /// which `matches` returns `true`.
    ///
    /// # Errors
    /// `OutOfBounds(End)` when no character matches (the cursor is then at
    /// the end of the input), or a decoding error from
    /// [`Parser::pull_char`], in which case the cursor rests on the
    /// offending character.
    pub fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
        let len = self.len();
        while self.pos < len {
            let byte = self.data[self.pos];
            let prev = self.pos;

            // ASCII bytes need no decoding; only take the slow path when the
            // high bit marks a multi-byte sequence.
            let chr = if is_utf8(byte) {
                self.pull_char()?
            } else {
                self.pos += 1;
                byte as char
            };

            if matches(chr) {
                self.pos = prev;
                return Ok(());
            }
        }

        Err(Error::OutOfBounds(Bound::End))
    }
}

/// Decodes the character starting at `data[pos]`, returning it together with
/// the number of bytes it occupies.
///
/// Reports `OutOfBounds(End)` when `pos` is past the data or the sequence
/// announced by the lead byte is cut off, and `BadUtf8Encoding` for an
/// invalid lead byte or an invalid sequence.
fn decode_char_at(data: &[u8], pos: usize) -> Result<(char, usize)> {
    let lead = *data.get(pos).ok_or(Error::OutOfBounds(Bound::End))?;

    let width = match lead {
        0x00..=0x7F => return Ok((lead as char, 1)),
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF7 => 4,
        // A continuation byte or 0xF8..=0xFF can never start a character.
        _ => return Err(Error::BadUtf8Encoding),
    };

    let bytes = data
        .get(pos..pos + width)
        .ok_or(Error::OutOfBounds(Bound::End))?;

    decode_exact(bytes).map(|c| (c, width))
}

/// Decodes `bytes` as UTF-8 and returns its first character; callers pass a
/// slice that is expected to hold exactly one encoded character.
fn decode_exact(bytes: &[u8]) -> Result<char> {
    std::str::from_utf8(bytes)
        .ok()
        .and_then(|s| s.chars().next())
        .ok_or(Error::BadUtf8Encoding)
}

/// `true` for UTF-8 continuation bytes (`0b10xxxxxx`).
#[inline]
fn is_continuation(byte: u8) -> bool {
    (byte & 0b1100_0000) == 0b1000_0000
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_parser() -> Result<()> {
        // '#' is at byte offset 1 and the first 't' at byte offset 6.
        let s = " #hashtag ";
        let mut parser = Parser::from_str(s);
        let mut res = parser.parse_until_char('#');
        assert_eq!(res, Ok(()));
        assert_eq!(parser.pos, 1);
        res = parser.parse_until_char('t');
        assert_eq!(res, Ok(()));
        assert_eq!(parser.pos, 6);
        Ok(())
    }

    #[test]
    fn test_parse_digits() {
        let s = "[1315]";
        let mut parser = Parser::from_str(s);
        let r1 = parser.parse_char('[');
        assert_eq!(r1, Ok(()));
        let r2 = parser.parse_digits();
        assert_eq!(r2, Ok(1315));
        assert_eq!(parser.pos(), 5);
    }

    #[test]
    fn test_peek_prev_char() {
        let s = ".👽.";
        let mut parser = Parser::from_str(s);
        let r1 = parser.parse_until_char('👽');
        assert_eq!(r1, Ok(()));
        let r2 = parser.pull_char();
        assert_eq!(r2, Ok('👽'));
        let r3 = parser.peek_prev_char();
        assert_eq!(r3, Ok('👽'));
        assert_eq!(parser.pos(), 5);
    }

    #[test]
    fn test_utf8_parsing() -> Result<()> {
        let s = "hey there #👽.";
        let mut parser = Parser::from_str(s);
        let _ = parser.parse_until_char('👽');
        assert_eq!(parser.peek_char(), Ok('👽'));
        assert_eq!(parser.pos, 11);
        let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
        assert_eq!(res, Ok(()));
        assert_eq!(parser.peek_char(), Ok('.'));
        Ok(())
    }

    #[test]
    fn test_peek_char_keeps_position_on_ascii() {
        let mut parser = Parser::from_str("ab");
        assert_eq!(parser.peek_char(), Ok('a'));
        assert_eq!(parser.pos(), 0);
    }

    #[test]
    fn test_peek_prev_char_short_sequences() {
        let mut two = Parser::from_str("é");
        two.set_pos(2);
        assert_eq!(two.peek_prev_char(), Ok('é'));

        let mut three = Parser::from_str("€");
        three.set_pos(3);
        assert_eq!(three.peek_prev_char(), Ok('€'));
    }

    #[test]
    fn test_invalid_lead_byte_is_rejected() {
        let bytes = [0x80u8];
        let mut parser = Parser::from_bytes(&bytes);
        assert_eq!(parser.pull_char(), Err(Error::BadUtf8Encoding));
        assert_eq!(parser.pos(), 0);
    }
}

/// `true` when the high bit of `byte` is set, i.e. the byte belongs to a
/// multi-byte UTF-8 sequence rather than being plain ASCII.
#[inline(always)]
fn is_utf8(byte: u8) -> bool {
    (byte & 0x80) == 0x80
}