diff --git a/src/lexer.rs b/src/lexer.rs
index e4e39aa..d0f5c71 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -1,9 +1,19 @@
 use std::iter::Peekable;
 
+/// Character classes selectable through the `\d`, `\s` and `\w` escapes.
+#[derive(Debug)]
+pub enum CharacterClass {
+    Digit,
+    Whitespace,
+    WordCharacter,
+}
+
 #[derive(Debug)]
 pub enum Token {
     Character(char),
-    CharacterRange(char, char)
+    CharacterRange(char, char),
+    /// A class escape; the flag is `true` for the upper-case (`\D`, `\S`, `\W`) form.
+    CharacterClass(CharacterClass, bool),
 }
 
 #[derive(Debug)]
@@ -24,6 +34,53 @@ where
         }
     }
 
+    /// Reads the longest run of digits valid in `radix` and parses it as a `u32`.
+    ///
+    /// Only the digits are consumed; the character following them stays in the
+    /// scanner. Returns `None` (after logging) if no digit is present or the
+    /// value does not fit in a `u32`.
+    fn read_integer(&mut self, radix: u32) -> Option<u32> {
+        let mut digits = String::new();
+
+        // `take_while` would also swallow the first non-digit, so peek instead.
+        while let Some(digit) = self.scanner.next_if(|c| c.is_digit(radix)) {
+            digits.push(digit);
+        }
+
+        match u32::from_str_radix(&digits, radix) {
+            Ok(result) => Some(result),
+            Err(e) => {
+                eprintln!("Failed to read DecimalIntegerLiteral: {e}");
+                None
+            }
+        }
+    }
+
+    /// Reads exactly `hex_digits` hexadecimal digits and parses them as a `u32`.
+    ///
+    /// Returns `None` if the input ends early or any of the characters is not
+    /// a hexadecimal digit. Up to `hex_digits` characters are consumed either way.
+    fn read_hex_number(&mut self, hex_digits: usize) -> Option<u32> {
+        let number_string = self.scanner.by_ref().take(hex_digits).collect::<String>();
+
+        // Validate explicitly: `from_str_radix` alone would accept a leading `+`,
+        // and comparing `len()` (bytes) is only meaningful for ASCII input.
+        if number_string.len() != hex_digits
+            || !number_string.chars().all(|c| c.is_ascii_hexdigit())
+        {
+            eprintln!("Invalid hexadecimal escape digits '{number_string}'");
+            return None;
+        }
+
+        match u32::from_str_radix(&number_string, 16) {
+            Ok(result) => Some(result),
+            Err(e) => {
+                eprintln!("Failed to read hexadecimal escape: {e}");
+                None
+            }
+        }
+    }
+
     fn handle_character(&mut self) -> Option<Token> {
         let character = self.scanner.next()?;
 
@@ -35,6 +92,72 @@ where
 
         Some(Token::Character(character))
     }
+
+    /// Lexes an escape sequence; the scanner must be positioned on the backslash.
+    ///
+    /// Dispatches to class escapes (`\d`, `\s`, `\w` and upper-case forms),
+    /// decimal escapes (`\228`) or single-character escapes.
+    fn handle_escape(&mut self) -> Option<Token> {
+        // discard leading backslash
+        self.scanner.next()?;
+
+        // Lower-cased copy is used for dispatch only; the helpers re-read the
+        // original character from the scanner.
+        let escapee = self.scanner.peek()?.to_ascii_lowercase();
+
+        if "dsw".contains(escapee) {
+            return self.handle_character_class_escape();
+        }
+
+        if escapee.is_ascii_digit() {
+            let code_point = self.read_integer(10)?;
+            return char::from_u32(code_point).map(Token::Character);
+        }
+
+        self.handle_character_escape()
+    }
+
+    /// Lexes the class letter of a `\d`, `\s` or `\w` escape (either case).
+    ///
+    /// # Panics
+    ///
+    /// Panics if the next character is not one of `dDsSwW`; callers must peek first.
+    fn handle_character_class_escape(&mut self) -> Option<Token> {
+        match self.scanner.next()? {
+            'd' => Some(Token::CharacterClass(CharacterClass::Digit, false)),
+            'D' => Some(Token::CharacterClass(CharacterClass::Digit, true)),
+            's' => Some(Token::CharacterClass(CharacterClass::Whitespace, false)),
+            'S' => Some(Token::CharacterClass(CharacterClass::Whitespace, true)),
+            'w' => Some(Token::CharacterClass(CharacterClass::WordCharacter, false)),
+            'W' => Some(Token::CharacterClass(CharacterClass::WordCharacter, true)),
+            other => unreachable!("'{}' is not a character class escape", other),
+        }
+    }
+
+    /// Lexes a single-character escape such as `\n`, `\xAF` or `\u2F55`.
+    ///
+    /// Returns `None` for unsupported or invalid sequences.
+    fn handle_character_escape(&mut self) -> Option<Token> {
+        match self.scanner.next()? {
+            'f' => Some(Token::Character('\x0C')),
+            'n' => Some(Token::Character('\n')),
+            'r' => Some(Token::Character('\r')),
+            't' => Some(Token::Character('\t')),
+            'v' => Some(Token::Character('\x0B')),
+
+            'c' => {
+                eprintln!("Control sequences are not supported yet");
+                None
+            }
+
+            'x' => Some(Token::Character(char::from_u32(self.read_hex_number(2)?)?)),
+            'u' => Some(Token::Character(char::from_u32(self.read_hex_number(4)?)?)),
+            c => {
+                eprintln!("Invalid escape sequence '\\{c}'");
+                None
+            }
+        }
+    }
 }
 
 impl Iterator for Lexer
@@ -50,7 +173,7 @@ where
             '-' => todo!(),
             '^' => todo!(),
             '$' => todo!(),
-            '\\' => todo!(),
+            '\\' => self.handle_escape(),
             '.' => todo!(),
             '*' => todo!(),
             '+' => todo!(),
diff --git a/src/main.rs b/src/main.rs
index f227f7a..46801d9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,7 +2,7 @@ mod parser;
 mod lexer;
 
 fn main() {
-    let output = match parser::parse_string("Hello A-Ztesting!") {
+    let output = match parser::parse_string(r"Hello A-Ztesting! \s\D \228 \xAF \u2F55 \o") {
         Ok(val) => val,
         Err(e) => panic!("{e}")
     };