tokenizer.rs (6037B)
use std::fmt;

/// A token from the s-expression tokenizer.
/// String references are zero-copy slices into the input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    Open,
    Close,
    Symbol(&'a str),
    Str(&'a str),
    Number(&'a str),
}

/// Tokenizer error: a human-readable message plus the byte offset in the
/// input where the problem was detected.
#[derive(Debug)]
pub struct TokenError {
    pub msg: String,
    pub pos: usize,
}

impl fmt::Display for TokenError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "token error at position {}: {}", self.pos, self.msg)
    }
}

impl std::error::Error for TokenError {}

/// True for bytes that may begin a symbol (lowercase ASCII only).
fn is_symbol_start(c: u8) -> bool {
    c.is_ascii_lowercase()
}

/// True for bytes allowed inside a symbol after the first character.
fn is_symbol_char(c: u8) -> bool {
    c.is_ascii_lowercase() || c.is_ascii_digit() || c == b'-' || c == b'_'
}

/// True for bytes that terminate a symbol/number token without belonging
/// to it: whitespace and parentheses.
fn is_delimiter(c: u8) -> bool {
    c.is_ascii_whitespace() || c == b'(' || c == b')'
}

/// Scan a symbol starting at `start`; return the exclusive end index.
///
/// A symbol starts with `a-z` and continues with `a-z`, `0-9`, `-`, `_`
/// until a delimiter or end of input.
fn scan_symbol(input: &[u8], start: usize) -> Result<usize, TokenError> {
    if start >= input.len() || !is_symbol_start(input[start]) {
        return Err(TokenError {
            msg: "symbol must start with a-z".into(),
            pos: start,
        });
    }
    let mut end = start + 1;
    while end < input.len() {
        let c = input[end];
        if is_delimiter(c) {
            break;
        }
        if !is_symbol_char(c) {
            return Err(TokenError {
                msg: format!("invalid symbol character '{}'", c as char),
                pos: end,
            });
        }
        end += 1;
    }
    Ok(end)
}

/// Scan a number starting at `start`; return the exclusive end index.
///
/// A number is an optional leading `-`, then digits with at most one `.`.
/// At least one digit is required (a lone `-` is rejected), and a second
/// decimal point is an error — previously both malformed forms were
/// silently accepted and passed downstream.
fn scan_number(input: &[u8], start: usize) -> Result<usize, TokenError> {
    if start >= input.len() {
        return Err(TokenError {
            msg: "unexpected end of input in number".into(),
            pos: start,
        });
    }
    let first = input[start];
    if !first.is_ascii_digit() && first != b'-' {
        return Err(TokenError {
            msg: "number must start with 0-9 or -".into(),
            pos: start,
        });
    }
    let mut digits = usize::from(first.is_ascii_digit());
    let mut seen_dot = false;
    let mut end = start + 1;
    while end < input.len() {
        let c = input[end];
        if is_delimiter(c) {
            break;
        }
        if c == b'.' {
            if seen_dot {
                return Err(TokenError {
                    msg: "number has more than one decimal point".into(),
                    pos: end,
                });
            }
            seen_dot = true;
        } else if c.is_ascii_digit() {
            digits += 1;
        } else {
            return Err(TokenError {
                msg: format!("invalid number character '{}'", c as char),
                pos: end,
            });
        }
        end += 1;
    }
    if digits == 0 {
        // Catches a lone "-" (or "-."), which is not a number.
        return Err(TokenError {
            msg: "number must contain at least one digit".into(),
            pos: start,
        });
    }
    Ok(end)
}

/// Scan a double-quoted string whose opening quote is at `start`.
///
/// Returns `(content_start, content_end)`: the half-open byte range of the
/// raw content between the quotes. Escape sequences are NOT decoded — the
/// backslash and escaped byte are included verbatim (zero-copy contract);
/// a backslash merely prevents the following byte from closing the string.
fn scan_string(input: &[u8], start: usize) -> Result<(usize, usize), TokenError> {
    // `start` must point at the opening quote.
    if start >= input.len() || input[start] != b'"' {
        return Err(TokenError {
            msg: "string must start with '\"'".into(),
            pos: start,
        });
    }
    let content_start = start + 1;
    let mut i = content_start;
    while i < input.len() {
        if input[i] == b'\\' {
            i += 2; // skip escaped char (may step past end; loop guard handles it)
            continue;
        }
        if input[i] == b'"' {
            return Ok((content_start, i)); // i points at closing quote
        }
        i += 1;
    }
    Err(TokenError {
        msg: "unterminated string".into(),
        pos: start,
    })
}

/// Tokenize an s-expression input string into a sequence of tokens.
/// Token string/symbol/number values are zero-copy references into the input.
///
/// # Errors
/// Returns a [`TokenError`] (with byte position) on the first malformed
/// symbol, number, unterminated string, or unexpected character.
pub fn tokenize(input: &str) -> Result<Vec<Token<'_>>, TokenError> {
    let mut tokens = Vec::new();
    let bytes = input.as_bytes();
    let mut i = 0;

    // Byte-wise scan is safe for slicing: every boundary we slice at is an
    // ASCII delimiter/quote byte, which never occurs inside a multi-byte
    // UTF-8 sequence.
    while i < bytes.len() {
        let c = bytes[i];

        if c.is_ascii_whitespace() {
            i += 1;
            continue;
        }

        match c {
            b'(' => {
                tokens.push(Token::Open);
                i += 1;
            }
            b')' => {
                tokens.push(Token::Close);
                i += 1;
            }
            b'"' => {
                let (content_start, content_end) = scan_string(bytes, i)?;
                tokens.push(Token::Str(&input[content_start..content_end]));
                i = content_end + 1; // skip closing quote
            }
            b'a'..=b'z' => {
                let end = scan_symbol(bytes, i)?;
                tokens.push(Token::Symbol(&input[i..end]));
                i = end;
            }
            b'0'..=b'9' | b'-' => {
                let end = scan_number(bytes, i)?;
                tokens.push(Token::Number(&input[i..end]));
                i = end;
            }
            _ => {
                return Err(TokenError {
                    msg: format!("unexpected character '{}'", c as char),
                    pos: i,
                });
            }
        }
    }

    Ok(tokens)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_simple() {
        let tokens = tokenize("(room (name \"hello\"))").unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Open,
                Token::Symbol("room"),
                Token::Open,
                Token::Symbol("name"),
                Token::Str("hello"),
                Token::Close,
                Token::Close,
            ]
        );
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = tokenize("(width 10)").unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Open,
                Token::Symbol("width"),
                Token::Number("10"),
                Token::Close,
            ]
        );
    }

    #[test]
    fn test_tokenize_symbol_with_dash() {
        let tokens = tokenize("(id welcome-desk)").unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Open,
                Token::Symbol("id"),
                Token::Symbol("welcome-desk"),
                Token::Close,
            ]
        );
    }

    #[test]
    fn test_tokenize_negative_number() {
        let tokens = tokenize("(height -5)").unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Open,
                Token::Symbol("height"),
                Token::Number("-5"),
                Token::Close,
            ]
        );
    }

    #[test]
    fn test_tokenize_decimal_number() {
        let tokens = tokenize("(scale 4.5)").unwrap();
        assert_eq!(tokens[2], Token::Number("4.5"));
    }

    #[test]
    fn test_reject_lone_minus() {
        // Regression: "-" used to be accepted as Number("-").
        assert!(tokenize("(x -)").is_err());
    }

    #[test]
    fn test_reject_multiple_decimal_points() {
        // Regression: "1.2.3" used to be accepted as a single Number.
        assert!(tokenize("(x 1.2.3)").is_err());
    }

    #[test]
    fn test_string_with_escaped_quote() {
        let tokens = tokenize("(s \"a\\\"b\")").unwrap();
        // Content is raw (escapes not decoded).
        assert_eq!(tokens[2], Token::Str("a\\\"b"));
    }

    #[test]
    fn test_unterminated_string() {
        assert!(tokenize("(\"abc").is_err());
    }
}