Files
projectname/frontend/src/lexer/mod.rs
T

196 lines
5.0 KiB
Rust

use std::{fmt::Debug, str::FromStr};
use crate::{input::Input, lexer::token::Keyword};
use self::token::{Punct, Token};
pub mod token;
/// Errors that the lexer can report to its caller.
///
/// `E` is the error type of the character input the lexer reads from.
#[derive(Debug)]
pub enum LexerError<E>
where
    E: Debug,
{
    /// The underlying character input reported an error; the original error
    /// is carried unchanged.
    InputError(E),
}
/// Turns a stream of characters into a stream of [`Token`]s.
pub struct Lexer<I>
where
    I: Input<char>,
{
    /// Character source the tokens are read from.
    input: I,
    /// Look-ahead slot: a token that has already been lexed but not yet
    /// handed out, or `None` when nothing is buffered.
    buffer: Option<Token>,
}
impl<I: Input<char>> Lexer<I> {
pub fn new(input: I) -> Self {
Self {
input,
buffer: None,
}
}
fn skip_whitespace(&mut self) -> Result<(), LexerError<I::Error>> {
loop {
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
break;
};
if !ch.is_whitespace() {
break;
}
self.input.next().ok();
}
Ok(())
}
fn lex_ident_or_keyword(&mut self) -> Result<Token, LexerError<I::Error>> {
let mut buffer = String::new();
loop {
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
break;
};
if !token::is_ident_rest(ch) {
break;
}
buffer.push(ch);
self.input.next().ok();
}
Ok(if let Ok(kw) = Keyword::from_str(&buffer) {
Token::Keyword(kw)
} else {
Token::Ident(buffer)
})
}
fn lex_number(&mut self) -> Result<Token, LexerError<I::Error>> {
let mut buffer = String::new();
loop {
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
break;
};
if !ch.is_alphanumeric() || ch == '.' {
break;
}
buffer.push(ch);
self.input.next().ok();
}
let value = if let Some(value) = buffer
.strip_prefix("0x")
.and_then(|v| u64::from_str_radix(v, 16).ok())
{
value
} else if let Ok(value) = u64::from_str_radix(&buffer, 10) {
value
} else {
todo!("Invalid number: {:?}", buffer);
};
Ok(Token::LitInt(value))
}
fn lex_operator(&mut self) -> Result<Token, LexerError<I::Error>> {
let mut buffer = String::new();
loop {
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
break;
};
if !token::is_operator(ch) {
break;
}
buffer.push(ch);
self.input.next().ok();
}
Ok(Token::Operator(buffer))
}
fn lex_token(&mut self) -> Result<Option<Token>, LexerError<I::Error>> {
self.skip_whitespace()?;
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
return Ok(None);
};
Ok(if token::is_ident_head(ch) {
Some(self.lex_ident_or_keyword()?)
} else if ch.is_digit(10) {
Some(self.lex_number()?)
} else if token::is_operator(ch) {
Some(self.lex_operator()?)
} else if let Ok(punct) = Punct::try_from(ch) {
self.input.next().ok();
Some(Token::Punct(punct))
} else {
todo!()
})
}
}
/// Exposes the lexer as an input of tokens with one token of look-ahead.
impl<I: Input<char>> Input<Token> for Lexer<I> {
    type Error = LexerError<I::Error>;

    /// Returns a copy of the next token without consuming it, or `Ok(None)`
    /// at end of input. The token is lexed on first use and kept in the
    /// look-ahead buffer until `next` takes it.
    ///
    /// # Errors
    /// Returns the error produced while lexing the token.
    fn peek(&mut self) -> Result<Option<Token>, Self::Error> {
        if self.buffer.is_none() {
            self.buffer = self.lex_token()?;
        }
        Ok(self.buffer.clone())
    }

    /// Consumes and returns the next token, or `Ok(None)` at end of input.
    ///
    /// A token buffered by an earlier `peek` is handed out as-is; otherwise
    /// exactly one token is lexed. Nothing is lexed ahead of what is
    /// returned, so a failure in a later token cannot discard this one.
    ///
    /// # Errors
    /// Returns the error produced while lexing the token.
    fn next(&mut self) -> Result<Option<Token>, Self::Error> {
        match self.buffer.take() {
            Some(token) => Ok(Some(token)),
            None => self.lex_token(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::{Lexer, LexerError};
    use crate::{
        input::{Input, SliceInput},
        lexer::token::{Keyword, Token},
    };

    /// Lexes `s` and returns only the first token it produces.
    fn single_token(s: &str) -> Result<Option<Token>, LexerError<!>> {
        Lexer::new(SliceInput::new(s.chars())).next()
    }

    /// Keywords and identifiers are told apart within one input.
    #[test]
    fn kw_or_ident() {
        let ident = |name: &str| Token::Ident(name.to_owned());
        let expected = [
            ident("a"),
            Token::Keyword(Keyword::As),
            ident("b"),
            Token::Keyword(Keyword::Where),
            ident("c"),
        ];

        let mut lex = Lexer::new(SliceInput::new("a as b where c".chars()));
        for token in expected {
            assert_eq!(lex.next().unwrap(), Some(token));
        }
        // The input is exhausted after the last identifier.
        assert_eq!(lex.next().unwrap(), None);
    }

    /// Decimal and `0x`-prefixed literals, with and without leading zeros.
    #[test]
    fn lit_int() {
        let cases = [
            ("0", 0),
            ("000", 0),
            ("0x0", 0),
            ("0x123", 0x123),
            ("0123", 123),
        ];
        for (source, value) in cases {
            assert_eq!(single_token(source).unwrap(), Some(Token::LitInt(value)));
        }
    }
}