parser.rs (3876B)
1 use log::info; 2 3 #[derive(Debug, PartialEq)] 4 struct Parser<'a> { 5 data: &'a [u8], 6 pos: usize, 7 } 8 9 #[derive(Debug, PartialEq)] 10 enum ParseError { 11 NotFound, 12 BadUtf8Encoding, 13 EOF, 14 } 15 16 type Result<T> = std::result::Result<T, ParseError>; 17 18 impl<'a> Parser<'a> { 19 fn new(data: &'a [u8]) -> Parser { 20 Parser { data: data, pos: 0 } 21 } 22 23 fn pull_byte(&mut self) -> Result<u8> { 24 if self.pos + 1 > self.data.len() { 25 return Err(ParseError::EOF); 26 } 27 28 let c = self.data[self.pos]; 29 self.pos += 1; 30 return Ok(c); 31 } 32 33 pub fn peek_char(&mut self) -> Result<char> { 34 let peek = true; 35 self.pull_or_peek_char(peek) 36 } 37 38 pub fn pull_char(&mut self) -> Result<char> { 39 let peek = false; 40 self.pull_or_peek_char(peek) 41 } 42 43 fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> { 44 let mut codepoint: u32 = 0; 45 46 let start = self.pos; 47 let b0 = self.pull_byte()? as u32; 48 49 if b0 & 0x80 != 0 { 50 if (b0 & 0xE0) == 0xC0 { 51 // Two-byte sequence 52 let b1 = self.pull_byte()? as u32; 53 codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F); 54 } else if (b0 & 0xF0) == 0xE0 { 55 // Three-byte sequence 56 let b1 = self.pull_byte()? as u32; 57 let b2 = self.pull_byte()? as u32; 58 codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); 59 } else if (b0 & 0xF8) == 0xF0 { 60 // Four-byte sequence 61 let b1 = self.pull_byte()? as u32; 62 let b2 = self.pull_byte()? as u32; 63 let b3 = self.pull_byte()? as u32; 64 codepoint = 65 ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); 66 } 67 } else { 68 // Single-byte ASCII character 69 codepoint = b0; 70 } 71 72 if peek { 73 self.pos = start; 74 } 75 76 match std::char::from_u32(codepoint) { 77 Some(c) => Ok(c), 78 None => Err(ParseError::BadUtf8Encoding), 79 } 80 } 81 82 fn current(&mut self) -> Result<char> { 83 let last_pos = self.pos; 84 let c = self.pull_char(); 85 if c.is_ok() { 86 self.pos = last_pos; 87 } 88 return c; 89 } 90 91 fn parse_until_char(&mut self, needle: char) -> Result<()> { 92 self.parse_until(|c| c == needle)?; 93 Ok(()) 94 } 95 96 fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> { 97 let len = self.data.len(); 98 while self.pos < len { 99 let prev_pos = self.pos; 100 if matches(self.pull_char()?) { 101 self.pos = prev_pos; 102 return Ok(()); 103 } 104 } 105 106 Err(ParseError::NotFound) 107 } 108 } 109 110 #[cfg(test)] 111 mod test { 112 use super::*; 113 114 #[test] 115 fn test_parser() -> Result<()> { 116 // v alien v 117 // 00000000: 20f0 9f91 bd23 6861 7368 7461 670a _....#hashtag. 118 let s = " #hashtag "; 119 let mut parser = Parser::new(s.as_bytes()); 120 let mut res = parser.parse_until_char('#'); 121 assert_eq!(res, Ok(())); 122 assert_eq!(parser.pos, 1); 123 res = parser.parse_until_char('t'); 124 assert_eq!(res, Ok(())); 125 assert_eq!(parser.pos, 6); 126 Ok(()) 127 } 128 129 #[test] 130 fn test_utf8_parsing() -> Result<()> { 131 let s = "hey there #👽."; 132 let mut parser = Parser::new(s.as_bytes()); 133 let _ = parser.parse_until_char('👽'); 134 assert_eq!(parser.current(), Ok('👽')); 135 assert_eq!(parser.pos, 11); 136 let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation()); 137 assert_eq!(res, Ok(())); 138 assert_eq!(parser.current(), Ok('.')); 139 Ok(()) 140 } 141 }