196 lines
5.0 KiB
Rust
196 lines
5.0 KiB
Rust
use std::{fmt::Debug, str::FromStr};
|
|
|
|
use crate::{input::Input, lexer::token::Keyword};
|
|
|
|
use self::token::{Punct, Token};
|
|
|
|
pub mod token;
|
|
|
|
/// Errors that can be reported while lexing.
///
/// `E` is the error type of the character input the lexer reads from.
#[derive(Debug)]
pub enum LexerError<E>
where
    E: Debug,
{
    /// Reading from the underlying character input failed; wraps the
    /// input's own error value unchanged.
    InputError(E),
}
|
|
|
|
pub struct Lexer<I: Input<char>> {
|
|
input: I,
|
|
buffer: Option<Token>,
|
|
}
|
|
|
|
impl<I: Input<char>> Lexer<I> {
|
|
pub fn new(input: I) -> Self {
|
|
Self {
|
|
input,
|
|
buffer: None,
|
|
}
|
|
}
|
|
|
|
fn skip_whitespace(&mut self) -> Result<(), LexerError<I::Error>> {
|
|
loop {
|
|
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
|
|
break;
|
|
};
|
|
|
|
if !ch.is_whitespace() {
|
|
break;
|
|
}
|
|
|
|
self.input.next().ok();
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn lex_ident_or_keyword(&mut self) -> Result<Token, LexerError<I::Error>> {
|
|
let mut buffer = String::new();
|
|
|
|
loop {
|
|
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
|
|
break;
|
|
};
|
|
|
|
if !token::is_ident_rest(ch) {
|
|
break;
|
|
}
|
|
|
|
buffer.push(ch);
|
|
|
|
self.input.next().ok();
|
|
}
|
|
|
|
Ok(if let Ok(kw) = Keyword::from_str(&buffer) {
|
|
Token::Keyword(kw)
|
|
} else {
|
|
Token::Ident(buffer)
|
|
})
|
|
}
|
|
|
|
fn lex_number(&mut self) -> Result<Token, LexerError<I::Error>> {
|
|
let mut buffer = String::new();
|
|
|
|
loop {
|
|
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
|
|
break;
|
|
};
|
|
|
|
if !ch.is_alphanumeric() || ch == '.' {
|
|
break;
|
|
}
|
|
|
|
buffer.push(ch);
|
|
self.input.next().ok();
|
|
}
|
|
|
|
let value = if let Some(value) = buffer
|
|
.strip_prefix("0x")
|
|
.and_then(|v| u64::from_str_radix(v, 16).ok())
|
|
{
|
|
value
|
|
} else if let Ok(value) = u64::from_str_radix(&buffer, 10) {
|
|
value
|
|
} else {
|
|
todo!("Invalid number: {:?}", buffer);
|
|
};
|
|
|
|
Ok(Token::LitInt(value))
|
|
}
|
|
|
|
fn lex_operator(&mut self) -> Result<Token, LexerError<I::Error>> {
|
|
let mut buffer = String::new();
|
|
|
|
loop {
|
|
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
|
|
break;
|
|
};
|
|
|
|
if !token::is_operator(ch) {
|
|
break;
|
|
}
|
|
|
|
buffer.push(ch);
|
|
self.input.next().ok();
|
|
}
|
|
|
|
Ok(Token::Operator(buffer))
|
|
}
|
|
|
|
fn lex_token(&mut self) -> Result<Option<Token>, LexerError<I::Error>> {
|
|
self.skip_whitespace()?;
|
|
|
|
let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
|
|
return Ok(None);
|
|
};
|
|
|
|
Ok(if token::is_ident_head(ch) {
|
|
Some(self.lex_ident_or_keyword()?)
|
|
} else if ch.is_digit(10) {
|
|
Some(self.lex_number()?)
|
|
} else if token::is_operator(ch) {
|
|
Some(self.lex_operator()?)
|
|
} else if let Ok(punct) = Punct::try_from(ch) {
|
|
self.input.next().ok();
|
|
Some(Token::Punct(punct))
|
|
} else {
|
|
todo!()
|
|
})
|
|
}
|
|
}
|
|
|
|
impl<I: Input<char>> Input<Token> for Lexer<I> {
|
|
type Error = LexerError<I::Error>;
|
|
|
|
fn peek(&mut self) -> Result<Option<Token>, Self::Error> {
|
|
if self.buffer.is_none() {
|
|
self.buffer = self.lex_token()?;
|
|
}
|
|
|
|
Ok(self.buffer.clone())
|
|
}
|
|
|
|
fn next(&mut self) -> Result<Option<Token>, Self::Error> {
|
|
let result = if let Some(token) = self.buffer.as_ref() {
|
|
Some(token.clone())
|
|
} else {
|
|
self.lex_token()?
|
|
};
|
|
self.buffer = self.lex_token()?;
|
|
Ok(result)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::{
        input::{Input, SliceInput},
        lexer::token::{Keyword, Token},
    };

    use super::{Lexer, LexerError};

    /// Lexes `s` and returns only the first token it yields.
    fn single_token(s: &str) -> Result<Option<Token>, LexerError<!>> {
        Lexer::new(SliceInput::new(s.chars())).next()
    }

    /// Keywords and identifiers are told apart, and the stream ends with
    /// `None` once the input is exhausted.
    #[test]
    fn kw_or_ident() {
        let mut lex = Lexer::new(SliceInput::new("a as b where c".chars()));

        let expected = [
            Token::Ident("a".to_owned()),
            Token::Keyword(Keyword::As),
            Token::Ident("b".to_owned()),
            Token::Keyword(Keyword::Where),
            Token::Ident("c".to_owned()),
        ];

        for token in expected {
            assert_eq!(lex.next().unwrap(), Some(token));
        }

        assert_eq!(lex.next().unwrap(), None);
    }

    /// Decimal (including leading zeros) and `0x`-prefixed hexadecimal
    /// literals evaluate to the expected integer.
    #[test]
    fn lit_int() {
        let cases = [
            ("0", 0),
            ("000", 0),
            ("0x0", 0),
            ("0x123", 0x123),
            ("0123", 123),
        ];

        for (source, value) in cases {
            assert_eq!(
                single_token(source).unwrap(),
                Some(Token::LitInt(value)),
                "input: {:?}",
                source
            );
        }
    }
}
|