notedeck

One damus client to rule them all
git clone git://jb55.com/notedeck
Log | Files | Refs | README | LICENSE

tokenizer.rs (6037B)


      1 use std::fmt;
      2 
/// A token from the s-expression tokenizer.
/// String references are zero-copy slices into the input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    /// `(` — opens a list.
    Open,
    /// `)` — closes a list.
    Close,
    /// A bare identifier; per the scanner, starts with `a-z` and continues
    /// with `a-z`, `0-9`, `-`, or `_`.
    Symbol(&'a str),
    /// Contents of a double-quoted string, quotes excluded. Escape
    /// sequences are NOT unescaped — backslashes are preserved verbatim.
    Str(&'a str),
    /// An unparsed numeric literal slice, e.g. `10`, `-5`, `3.14`.
    Number(&'a str),
}
     13 
/// Error produced by the tokenizer, carrying a position into the input.
#[derive(Debug)]
pub struct TokenError {
    /// Human-readable description of what went wrong.
    pub msg: String,
    /// Byte offset in the input where the error was detected.
    pub pos: usize,
}
     19 
     20 impl fmt::Display for TokenError {
     21     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
     22         write!(f, "token error at position {}: {}", self.pos, self.msg)
     23     }
     24 }
     25 
     26 impl std::error::Error for TokenError {}
     27 
     28 fn is_symbol_start(c: u8) -> bool {
     29     c.is_ascii_lowercase()
     30 }
     31 
     32 fn is_symbol_char(c: u8) -> bool {
     33     c.is_ascii_lowercase() || c.is_ascii_digit() || c == b'-' || c == b'_'
     34 }
     35 
     36 fn scan_symbol(input: &[u8], start: usize) -> Result<usize, TokenError> {
     37     if start >= input.len() || !is_symbol_start(input[start]) {
     38         return Err(TokenError {
     39             msg: "symbol must start with a-z".into(),
     40             pos: start,
     41         });
     42     }
     43     let mut end = start + 1;
     44     while end < input.len() {
     45         let c = input[end];
     46         if c.is_ascii_whitespace() || c == b')' || c == b'(' {
     47             break;
     48         }
     49         if !is_symbol_char(c) {
     50             return Err(TokenError {
     51                 msg: format!("invalid symbol character '{}'", c as char),
     52                 pos: end,
     53             });
     54         }
     55         end += 1;
     56     }
     57     Ok(end)
     58 }
     59 
     60 fn scan_number(input: &[u8], start: usize) -> Result<usize, TokenError> {
     61     if start >= input.len() {
     62         return Err(TokenError {
     63             msg: "unexpected end of input in number".into(),
     64             pos: start,
     65         });
     66     }
     67     let first = input[start];
     68     if !first.is_ascii_digit() && first != b'-' {
     69         return Err(TokenError {
     70             msg: "number must start with 0-9 or -".into(),
     71             pos: start,
     72         });
     73     }
     74     let mut end = start + 1;
     75     while end < input.len() {
     76         let c = input[end];
     77         if c.is_ascii_whitespace() || c == b')' || c == b'(' {
     78             break;
     79         }
     80         if !c.is_ascii_digit() && c != b'.' {
     81             return Err(TokenError {
     82                 msg: format!("invalid number character '{}'", c as char),
     83                 pos: end,
     84             });
     85         }
     86         end += 1;
     87     }
     88     Ok(end)
     89 }
     90 
     91 fn scan_string(input: &[u8], start: usize) -> Result<(usize, usize), TokenError> {
     92     // start should point at the opening quote
     93     if start >= input.len() || input[start] != b'"' {
     94         return Err(TokenError {
     95             msg: "string must start with '\"'".into(),
     96             pos: start,
     97         });
     98     }
     99     let content_start = start + 1;
    100     let mut i = content_start;
    101     while i < input.len() {
    102         if input[i] == b'\\' {
    103             i += 2; // skip escaped char
    104             continue;
    105         }
    106         if input[i] == b'"' {
    107             return Ok((content_start, i)); // i points at closing quote
    108         }
    109         i += 1;
    110     }
    111     Err(TokenError {
    112         msg: "unterminated string".into(),
    113         pos: start,
    114     })
    115 }
    116 
    117 /// Tokenize an s-expression input string into a sequence of tokens.
    118 /// Token string/symbol/number values are zero-copy references into the input.
    119 pub fn tokenize(input: &str) -> Result<Vec<Token<'_>>, TokenError> {
    120     let mut tokens = Vec::new();
    121     let bytes = input.as_bytes();
    122     let mut i = 0;
    123 
    124     while i < bytes.len() {
    125         let c = bytes[i];
    126 
    127         if c.is_ascii_whitespace() {
    128             i += 1;
    129             continue;
    130         }
    131 
    132         match c {
    133             b'(' => {
    134                 tokens.push(Token::Open);
    135                 i += 1;
    136             }
    137             b')' => {
    138                 tokens.push(Token::Close);
    139                 i += 1;
    140             }
    141             b'"' => {
    142                 let (content_start, content_end) = scan_string(bytes, i)?;
    143                 tokens.push(Token::Str(&input[content_start..content_end]));
    144                 i = content_end + 1; // skip closing quote
    145             }
    146             b'a'..=b'z' => {
    147                 let end = scan_symbol(bytes, i)?;
    148                 tokens.push(Token::Symbol(&input[i..end]));
    149                 i = end;
    150             }
    151             b'0'..=b'9' | b'-' => {
    152                 let end = scan_number(bytes, i)?;
    153                 tokens.push(Token::Number(&input[i..end]));
    154                 i = end;
    155             }
    156             _ => {
    157                 return Err(TokenError {
    158                     msg: format!("unexpected character '{}'", c as char),
    159                     pos: i,
    160                 });
    161             }
    162         }
    163     }
    164 
    165     Ok(tokens)
    166 }
    167 
    168 #[cfg(test)]
    169 mod tests {
    170     use super::*;
    171 
    172     #[test]
    173     fn test_tokenize_simple() {
    174         let tokens = tokenize("(room (name \"hello\"))").unwrap();
    175         assert_eq!(
    176             tokens,
    177             vec![
    178                 Token::Open,
    179                 Token::Symbol("room"),
    180                 Token::Open,
    181                 Token::Symbol("name"),
    182                 Token::Str("hello"),
    183                 Token::Close,
    184                 Token::Close,
    185             ]
    186         );
    187     }
    188 
    189     #[test]
    190     fn test_tokenize_number() {
    191         let tokens = tokenize("(width 10)").unwrap();
    192         assert_eq!(
    193             tokens,
    194             vec![
    195                 Token::Open,
    196                 Token::Symbol("width"),
    197                 Token::Number("10"),
    198                 Token::Close,
    199             ]
    200         );
    201     }
    202 
    203     #[test]
    204     fn test_tokenize_symbol_with_dash() {
    205         let tokens = tokenize("(id welcome-desk)").unwrap();
    206         assert_eq!(
    207             tokens,
    208             vec![
    209                 Token::Open,
    210                 Token::Symbol("id"),
    211                 Token::Symbol("welcome-desk"),
    212                 Token::Close,
    213             ]
    214         );
    215     }
    216 
    217     #[test]
    218     fn test_tokenize_negative_number() {
    219         let tokens = tokenize("(height -5)").unwrap();
    220         assert_eq!(
    221             tokens,
    222             vec![
    223                 Token::Open,
    224                 Token::Symbol("height"),
    225                 Token::Number("-5"),
    226                 Token::Close,
    227             ]
    228         );
    229     }
    230 }