domus

One damus client to rule them all
git clone git://jb55.com/domus
Log | Files | Refs | README

parser.rs (3876B)


      1 use log::info;
      2 
      3 #[derive(Debug, PartialEq)]
      4 struct Parser<'a> {
      5     data: &'a [u8],
      6     pos: usize,
      7 }
      8 
      9 #[derive(Debug, PartialEq)]
     10 enum ParseError {
     11     NotFound,
     12     BadUtf8Encoding,
     13     EOF,
     14 }
     15 
     16 type Result<T> = std::result::Result<T, ParseError>;
     17 
     18 impl<'a> Parser<'a> {
     19     fn new(data: &'a [u8]) -> Parser {
     20         Parser { data: data, pos: 0 }
     21     }
     22 
     23     fn pull_byte(&mut self) -> Result<u8> {
     24         if self.pos + 1 > self.data.len() {
     25             return Err(ParseError::EOF);
     26         }
     27 
     28         let c = self.data[self.pos];
     29         self.pos += 1;
     30         return Ok(c);
     31     }
     32 
     33     pub fn peek_char(&mut self) -> Result<char> {
     34         let peek = true;
     35         self.pull_or_peek_char(peek)
     36     }
     37 
     38     pub fn pull_char(&mut self) -> Result<char> {
     39         let peek = false;
     40         self.pull_or_peek_char(peek)
     41     }
     42 
     43     fn pull_or_peek_char(&mut self, peek: bool) -> Result<char> {
     44         let mut codepoint: u32 = 0;
     45 
     46         let start = self.pos;
     47         let b0 = self.pull_byte()? as u32;
     48 
     49         if b0 & 0x80 != 0 {
     50             if (b0 & 0xE0) == 0xC0 {
     51                 // Two-byte sequence
     52                 let b1 = self.pull_byte()? as u32;
     53                 codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
     54             } else if (b0 & 0xF0) == 0xE0 {
     55                 // Three-byte sequence
     56                 let b1 = self.pull_byte()? as u32;
     57                 let b2 = self.pull_byte()? as u32;
     58                 codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
     59             } else if (b0 & 0xF8) == 0xF0 {
     60                 // Four-byte sequence
     61                 let b1 = self.pull_byte()? as u32;
     62                 let b2 = self.pull_byte()? as u32;
     63                 let b3 = self.pull_byte()? as u32;
     64                 codepoint =
     65                     ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
     66             }
     67         } else {
     68             // Single-byte ASCII character
     69             codepoint = b0;
     70         }
     71 
     72         if peek {
     73             self.pos = start;
     74         }
     75 
     76         match std::char::from_u32(codepoint) {
     77             Some(c) => Ok(c),
     78             None => Err(ParseError::BadUtf8Encoding),
     79         }
     80     }
     81 
     82     fn current(&mut self) -> Result<char> {
     83         let last_pos = self.pos;
     84         let c = self.pull_char();
     85         if c.is_ok() {
     86             self.pos = last_pos;
     87         }
     88         return c;
     89     }
     90 
     91     fn parse_until_char(&mut self, needle: char) -> Result<()> {
     92         self.parse_until(|c| c == needle)?;
     93         Ok(())
     94     }
     95 
     96     fn parse_until<F: Fn(char) -> bool>(&mut self, matches: F) -> Result<()> {
     97         let len = self.data.len();
     98         while self.pos < len {
     99             let prev_pos = self.pos;
    100             if matches(self.pull_char()?) {
    101                 self.pos = prev_pos;
    102                 return Ok(());
    103             }
    104         }
    105 
    106         Err(ParseError::NotFound)
    107     }
    108 }
    109 
    110 #[cfg(test)]
    111 mod test {
    112     use super::*;
    113 
    114     #[test]
    115     fn test_parser() -> Result<()> {
    116         //             v alien  v
    117         // 00000000: 20f0 9f91 bd23 6861 7368 7461 670a       _....#hashtag.
    118         let s = " #hashtag ";
    119         let mut parser = Parser::new(s.as_bytes());
    120         let mut res = parser.parse_until_char('#');
    121         assert_eq!(res, Ok(()));
    122         assert_eq!(parser.pos, 1);
    123         res = parser.parse_until_char('t');
    124         assert_eq!(res, Ok(()));
    125         assert_eq!(parser.pos, 6);
    126         Ok(())
    127     }
    128 
    129     #[test]
    130     fn test_utf8_parsing() -> Result<()> {
    131         let s = "hey there #👽.";
    132         let mut parser = Parser::new(s.as_bytes());
    133         let _ = parser.parse_until_char('👽');
    134         assert_eq!(parser.current(), Ok('👽'));
    135         assert_eq!(parser.pos, 11);
    136         let res = parser.parse_until(|c| c.is_ascii_whitespace() || c.is_ascii_punctuation());
    137         assert_eq!(res, Ok(()));
    138         assert_eq!(parser.current(), Ok('.'));
    139         Ok(())
    140     }
    141 }