Basic lexing + ty/typedef/binary parsing

This commit is contained in:
2023-12-17 23:21:07 +02:00
parent c91f9ddd6c
commit 2dfdd61149
12 changed files with 717 additions and 23 deletions
Generated
+3
View File
@@ -13,3 +13,6 @@ version = "0.1.0"
[[package]]
name = "frontend"
version = "0.1.0"
dependencies = [
"ast",
]
+24 -11
View File
@@ -1,14 +1,27 @@
pub fn add(left: usize, right: usize) -> usize {
left + right
use std::collections::HashMap;
/// A node of the expression syntax tree.
#[derive(Debug, PartialEq)]
pub enum Node {
/// An identifier, stored by name.
Ident(String),
/// An unsigned 64-bit integer literal.
LitInt(u64),
/// A binary operation: the operator text, then the left and right operands.
Binary(String, Box<Node>, Box<Node>),
/// An expression that was written inside parentheses.
Parenthesized(Box<Node>),
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
/// A type expression.
#[derive(Debug, PartialEq)]
pub enum Type {
/// A type referred to by name.
Ident(String),
/// An arrow type `lhs -> rhs`: the type left of the arrow, then the type right of it
/// (presumably a function type -- confirm against the language design).
Arrow(Box<Type>, Box<Type>),
}
/// A named type definition together with its constructors.
#[derive(Debug, PartialEq)]
pub struct TypeDefinition {
/// Name of the type being defined.
pub name: String,
/// Constructors of the type, keyed by constructor name.
pub constructors: HashMap<String, TypeConstructor>,
}
/// The payload shape of a single type constructor.
#[derive(Debug, PartialEq)]
pub enum TypeConstructor {
/// A constructor carrying types keyed by `String`
/// (presumably field names, as in `A { x: T, y: U }` -- confirm with the parser once implemented).
Record(HashMap<String, Box<Type>>),
/// A constructor carrying an ordered list of argument types, as in `A Int Bool`.
Variant(Vec<Box<Type>>),
}
+1
View File
@@ -6,3 +6,4 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
ast = { path = "../ast" }
+51
View File
@@ -0,0 +1,51 @@
use std::{fmt::Debug, iter::Peekable, slice::Iter, str::Chars};
/// A fallible source of items of type `T` that supports looking at the upcoming item.
pub trait Input<T> {
/// Error reported when reading from the underlying source fails.
type Error: Debug;
/// Returns the upcoming item, or `Ok(None)` when there is none.
/// Takes `&mut self` so that implementations may fill an internal buffer.
fn peek(&mut self) -> Result<Option<T>, Self::Error>;
/// Returns the upcoming item and advances past it, or `Ok(None)` when there is none.
fn next(&mut self) -> Result<Option<T>, Self::Error>;
}
/// An [`Input`] backed by an arbitrary iterator `I` yielding items of type `T`.
pub struct SliceInput<T, I: Iterator<Item = T>> {
// The wrapped iterator; `Peekable` supplies the one-item lookahead.
it: Peekable<I>,
}
impl<T, I: Iterator<Item = T>> SliceInput<T, I> {
    /// Builds an input from any value that converts into the iterator type `I`
    /// (for example a `Vec<T>` or `str::chars()`).
    pub fn new<R: IntoIterator<IntoIter = I>>(it: R) -> Self {
        let it = it.into_iter().peekable();
        Self { it }
    }
}
impl<T: Clone, I: Iterator<Item = T>> Input<T> for SliceInput<T, I> {
    /// Reading from an in-memory iterator cannot fail.
    type Error = !;

    /// Returns a clone of the upcoming item without advancing the iterator.
    fn peek(&mut self) -> Result<Option<T>, Self::Error> {
        let upcoming = self.it.peek();
        Ok(upcoming.cloned())
    }

    /// Advances the iterator and returns the item it yielded, if any.
    fn next(&mut self) -> Result<Option<T>, Self::Error> {
        let consumed = self.it.next();
        Ok(consumed)
    }
}
#[cfg(test)]
mod tests {
    use crate::input::{Input, SliceInput};

    /// Walks a character input to its end, checking that `peek` does not consume.
    #[test]
    fn string_input() {
        let mut input = SliceInput::new("a123".chars());

        // Peeking first must leave 'a' available for the following `next`.
        assert_eq!(input.peek().unwrap().unwrap(), 'a');
        for expected in ['a', '1', '2', '3'] {
            assert_eq!(input.next().unwrap().unwrap(), expected);
        }

        // Exhausted input reports `None` from both operations.
        assert!(input.peek().unwrap().is_none());
        assert!(input.next().unwrap().is_none());
    }
}
+195
View File
@@ -0,0 +1,195 @@
use std::{fmt::Debug, str::FromStr};
use crate::{input::Input, lexer::token::Keyword};
use self::token::{Punct, Token};
pub mod token;
/// Errors the lexer can report; `E` is the error type of the underlying character input.
#[derive(Debug)]
pub enum LexerError<E: Debug> {
/// The underlying character input failed; carries its error.
InputError(E),
}
/// Turns a character input into a stream of [`Token`]s.
pub struct Lexer<I: Input<char>> {
// Source of characters.
input: I,
// A token that has already been lexed but not yet handed out by `next`, if any.
buffer: Option<Token>,
}
impl<I: Input<char>> Lexer<I> {
/// Creates a lexer that reads characters from `input`; no token is buffered initially.
pub fn new(input: I) -> Self {
    let buffer = None;
    Self { input, buffer }
}
/// Consumes characters up to, but not including, the first non-whitespace
/// character or the end of input.
///
/// # Errors
/// Returns [`LexerError::InputError`] if peeking at or advancing the input fails.
fn skip_whitespace(&mut self) -> Result<(), LexerError<I::Error>> {
    while let Some(ch) = self.input.peek().map_err(LexerError::InputError)? {
        if !ch.is_whitespace() {
            break;
        }
        // Propagate a failed advance: silently ignoring it could leave the same
        // character in place and spin this loop forever.
        self.input.next().map_err(LexerError::InputError)?;
    }
    Ok(())
}
/// Lexes a run of identifier characters and classifies it.
///
/// Characters are taken while `token::is_ident_rest` accepts them. The collected
/// text becomes a [`Token::Keyword`] if it parses as a [`Keyword`], otherwise a
/// [`Token::Ident`].
///
/// # Errors
/// Returns [`LexerError::InputError`] if peeking at or advancing the input fails.
fn lex_ident_or_keyword(&mut self) -> Result<Token, LexerError<I::Error>> {
    let mut buffer = String::new();
    while let Some(ch) = self.input.peek().map_err(LexerError::InputError)? {
        if !token::is_ident_rest(ch) {
            break;
        }
        buffer.push(ch);
        // Report a failed advance instead of discarding it.
        self.input.next().map_err(LexerError::InputError)?;
    }

    Ok(match Keyword::from_str(&buffer) {
        Ok(kw) => Token::Keyword(kw),
        Err(_) => Token::Ident(buffer),
    })
}
/// Lexes an integer literal into a [`Token::LitInt`].
///
/// All consecutive alphanumeric characters are collected (so the literal ends at
/// a `.`, which is not alphanumeric). Text starting with a lowercase `0x` is
/// tried as hexadecimal first; otherwise, or if that fails, it is parsed as decimal.
///
/// # Errors
/// Returns [`LexerError::InputError`] if peeking at or advancing the input fails.
///
/// # Panics
/// Panics via `todo!` when the collected text is not a valid `u64` literal
/// (for example `12ab`, or a value that overflows `u64`); no error variant exists for it yet.
fn lex_number(&mut self) -> Result<Token, LexerError<I::Error>> {
    let mut buffer = String::new();
    while let Some(ch) = self.input.peek().map_err(LexerError::InputError)? {
        if !ch.is_alphanumeric() {
            break;
        }
        buffer.push(ch);
        // Report a failed advance instead of discarding it.
        self.input.next().map_err(LexerError::InputError)?;
    }

    let hex = buffer
        .strip_prefix("0x")
        .and_then(|digits| u64::from_str_radix(digits, 16).ok());
    let value = if let Some(value) = hex {
        value
    } else if let Ok(value) = buffer.parse::<u64>() {
        value
    } else {
        todo!("Invalid number: {:?}", buffer);
    };

    Ok(Token::LitInt(value))
}
/// Lexes a maximal run of operator characters into one [`Token::Operator`].
///
/// Characters are taken while `token::is_operator` accepts them, so adjacent
/// operator characters such as `->` form a single token.
///
/// # Errors
/// Returns [`LexerError::InputError`] if peeking at or advancing the input fails.
fn lex_operator(&mut self) -> Result<Token, LexerError<I::Error>> {
    let mut buffer = String::new();
    while let Some(ch) = self.input.peek().map_err(LexerError::InputError)? {
        if !token::is_operator(ch) {
            break;
        }
        buffer.push(ch);
        // Report a failed advance instead of discarding it.
        self.input.next().map_err(LexerError::InputError)?;
    }
    Ok(Token::Operator(buffer))
}
/// Skips leading whitespace and lexes the next token.
///
/// The first non-whitespace character selects the kind of token: identifier or
/// keyword, integer literal, operator, or single-character punctuation.
/// Returns `Ok(None)` when the input is exhausted.
///
/// # Errors
/// Returns [`LexerError::InputError`] if the underlying input fails.
///
/// # Panics
/// Panics via `todo!` on a character that starts none of the known token kinds.
fn lex_token(&mut self) -> Result<Option<Token>, LexerError<I::Error>> {
    self.skip_whitespace()?;

    let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else {
        return Ok(None);
    };

    let token = if token::is_ident_head(ch) {
        self.lex_ident_or_keyword()?
    } else if ch.is_ascii_digit() {
        self.lex_number()?
    } else if token::is_operator(ch) {
        self.lex_operator()?
    } else if let Ok(punct) = Punct::try_from(ch) {
        // Punctuation is always one character; consume it and report a failed advance.
        self.input.next().map_err(LexerError::InputError)?;
        Token::Punct(punct)
    } else {
        todo!()
    };

    Ok(Some(token))
}
}
impl<I: Input<char>> Input<Token> for Lexer<I> {
    type Error = LexerError<I::Error>;

    /// Returns a clone of the upcoming token without consuming it, lexing it
    /// first if nothing is buffered. Returns `Ok(None)` at end of input.
    fn peek(&mut self) -> Result<Option<Token>, Self::Error> {
        if self.buffer.is_none() {
            self.buffer = self.lex_token()?;
        }
        Ok(self.buffer.clone())
    }

    /// Returns the upcoming token and advances past it.
    ///
    /// A token buffered by an earlier `peek` is handed out directly; otherwise a
    /// fresh token is lexed. The following token is deliberately NOT lexed here:
    /// doing so would let a failure in that later token discard the token being
    /// returned now.
    fn next(&mut self) -> Result<Option<Token>, Self::Error> {
        match self.buffer.take() {
            Some(token) => Ok(Some(token)),
            None => self.lex_token(),
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::{
        input::{Input, SliceInput},
        lexer::token::{Keyword, Token},
    };

    use super::{Lexer, LexerError};

    /// Lexes `s` and returns only its first token.
    fn single_token(s: &str) -> Result<Option<Token>, LexerError<!>> {
        Lexer::new(SliceInput::new(s.chars())).next()
    }

    /// Keywords and identifiers separated by whitespace are told apart.
    #[test]
    fn kw_or_ident() {
        let expected = [
            Token::Ident("a".to_owned()),
            Token::Keyword(Keyword::As),
            Token::Ident("b".to_owned()),
            Token::Keyword(Keyword::Where),
            Token::Ident("c".to_owned()),
        ];

        let mut lex = Lexer::new(SliceInput::new("a as b where c".chars()));
        for token in expected {
            assert_eq!(lex.next().unwrap(), Some(token));
        }
        assert_eq!(lex.next().unwrap(), None);
    }

    /// Decimal (including leading zeros) and `0x`-prefixed literals are accepted.
    #[test]
    fn lit_int() {
        let cases = [
            ("0", 0),
            ("000", 0),
            ("0x0", 0),
            ("0x123", 0x123),
            ("0123", 123),
        ];
        for (source, value) in cases {
            assert_eq!(single_token(source).unwrap(), Some(Token::LitInt(value)));
        }
    }
}
+95
View File
@@ -0,0 +1,95 @@
// Declares a field-less enum whose variants are each paired with a string literal,
// and implements `FromStr` for it.
//
// Input syntax: optional attributes, `<vis> enum Name { Variant => "text", ... }`
// (at least one variant; a trailing comma is allowed).
// The generated `from_str` returns the variant whose text equals the input exactly,
// or `Err(())` when no text matches.
macro_rules! string_enum {
{
$(#[$meta:meta])*
$vis:vis enum $name:ident {
$($discriminant:ident => $text:literal),+ $(,)?
}
} => {
// Re-emit the enum itself with the caller's attributes and visibility.
$(#[$meta])*
$vis enum $name {
$($discriminant),+
}
impl core::str::FromStr for $name {
type Err = ();
fn from_str(s: &str) -> Result<$name, ()> {
match s {
// One arm per declared variant, then the fallback.
$($text => Ok($name::$discriminant)),+,
_ => Err(())
}
}
}
};
}
// Declares a field-less enum whose variants are each paired with a character literal,
// and implements `TryFrom<char>` for it.
//
// Input syntax: optional attributes, `<vis> enum Name { Variant => 'c', ... }`
// (at least one variant; a trailing comma is allowed).
// The generated `try_from` returns the variant whose character equals the input,
// or `Err(())` when no character matches.
macro_rules! char_enum {
{
$(#[$meta:meta])*
$vis:vis enum $name:ident {
$($discriminant:ident => $text:literal),+ $(,)?
}
} => {
// Re-emit the enum itself with the caller's attributes and visibility.
$(#[$meta])*
$vis enum $name {
$($discriminant),+
}
impl core::convert::TryFrom<char> for $name {
type Error = ();
fn try_from(c: char) -> Result<$name, ()> {
match c {
// One arm per declared variant, then the fallback.
$($text => Ok($name::$discriminant)),+,
_ => Err(())
}
}
}
};
}
char_enum! {
/// Single-character punctuation tokens; convertible from `char` via `TryFrom`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Punct {
Comma => ',',
// Bracket pairs: parentheses, braces, square brackets.
LParen => '(',
RParen => ')',
LBrace => '{',
RBrace => '}',
LBracket => '[',
RBracket => ']',
}
}
string_enum! {
/// Reserved words of the language; parsed from their exact spelling via `FromStr`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Keyword {
As => "as",
Where => "where",
Let => "let",
In => "in",
Type => "type"
}
}
/// A lexical token.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
/// A run of one or more operator characters, stored as written (e.g. `->`).
Operator(String),
/// A single punctuation character.
Punct(Punct),
/// An identifier that is not a keyword.
Ident(String),
/// A reserved word.
Keyword(Keyword),
/// An integer literal value.
LitInt(u64),
/// A string literal. NOTE(review): nothing in the lexer code shown here produces this variant yet.
LitString(String),
}
/// Returns `true` if `ch` may begin an identifier: an underscore or any alphabetic character.
pub fn is_ident_head(ch: char) -> bool {
    matches!(ch, '_') || ch.is_alphabetic()
}
/// Returns `true` if `ch` may continue an identifier: an underscore or any alphanumeric character.
pub fn is_ident_rest(ch: char) -> bool {
    matches!(ch, '_') || ch.is_alphanumeric()
}
/// Returns `true` if `ch` is one of the characters operators are built from.
///
/// `|` is part of the set because the type-definition parser separates
/// constructors with a `|` operator token; without it the lexer could not
/// produce that token.
pub fn is_operator(ch: char) -> bool {
    const OPERATOR_CHARS: &str = ":;<>./?=-+!@#$%^&*~|";
    OPERATOR_CHARS.contains(ch)
}
+17 -12
View File
@@ -1,14 +1,19 @@
/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    std::ops::Add::add(left, right)
}
#![feature(never_type, let_chains)]
#[cfg(test)]
mod tests {
use super::*;
/*
#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
}
fact : Num a => a -> a
fact 0 = 1
fact 1 = 1
fact n = f1 * f2
where
f1 = fact (n - 1)
f2 = fact (n - 2)
main : IO ()
main = Debug.log `123 ${fact 13}`
*/
pub mod input;
pub mod lexer;
pub mod parser;
+23
View File
@@ -0,0 +1,23 @@
use crate::{input::Input, lexer::token::Token};
use super::ParserError;
pub fn many1<I, T, F: Fn(&mut I) -> Result<T, ParserError<I::Error>>>(
input: &mut I,
parse: F,
) -> Result<Vec<T>, ParserError<I::Error>>
where
I: Input<Token>,
{
let mut result = vec![];
loop {
let Ok(item) = parse(input) else {
break;
};
result.push(item);
}
Ok(result)
}
+76
View File
@@ -0,0 +1,76 @@
use ast::Node;
use crate::lexer::token::{Punct, Token};
use super::ParserError;
// Parses an atomic expression: an identifier, an integer literal, or a
// parenthesized expression.
//
// A parenthesized expression is returned as exactly one `Node::Parenthesized`,
// no matter how many redundant parentheses surround it: `(a)` and `((a))` give
// the same tree.
//
// Errors: `UnexpectedEof` when no token is left, `Expected(")")` when the
// closing parenthesis is missing, `Expected("expression")` when the token
// cannot start an expression.
def_parser! {
    pub parser parse_atom(input) -> Box<Node> {
        next_expect!(input, token, ParserError::UnexpectedEof);

        match token {
            Token::Ident(name) => Ok(Box::new(Node::Ident(name))),
            Token::LitInt(value) => Ok(Box::new(Node::LitInt(value))),
            Token::Punct(Punct::LParen) => {
                let expr = parse_expr(input)?;
                next_expect!(input, Token::Punct(Punct::RParen), ParserError::Expected(")"));
                if matches!(*expr, Node::Parenthesized(_)) {
                    // Already wrapped once: keep that wrapper instead of stripping
                    // it (which would lose the grouping) or nesting a second one.
                    Ok(expr)
                } else {
                    Ok(Box::new(Node::Parenthesized(expr)))
                }
            },
            // Any other token cannot begin an expression; report it rather than panic.
            _ => Err(ParserError::Expected("expression")),
        }
    }
}
// Extends `lhs` into a binary expression when an operator follows.
//
// If the upcoming token is not an operator, `lhs` is returned unchanged.
// Otherwise the operator is consumed and the whole remaining expression is
// parsed as the right operand, so chains nest to the right and no operator
// precedence is applied.
//
// Errors: failures of the input are reported as `ParserError::LexerError`.
def_parser! {
    pub parser maybe_binary(input, lhs: Box<Node>) -> Box<Node> {
        let Some(Token::Operator(op)) = input.peek().map_err(ParserError::LexerError)? else {
            return Ok(lhs);
        };
        // Consume the operator that was just peeked, reporting a failed advance.
        input.next().map_err(ParserError::LexerError)?;

        let rhs = parse_expr(input)?;

        Ok(Box::new(Node::Binary(op, lhs, rhs)))
    }
}
// Parses an expression: one atom, optionally followed by an operator and a
// further expression.
def_parser! {
    pub parser parse_expr(input) -> Box<Node> {
        let atom = parse_atom(input)?;
        maybe_binary(input, atom)
    }
}
#[cfg(test)]
mod tests {
    use ast::Node;

    use crate::{input::SliceInput, lexer::token::Token, parser::expr::parse_expr};

    /// `123 + a` parses into a single binary node.
    #[test]
    fn binary() {
        let tokens = vec![
            Token::LitInt(123),
            Token::Operator("+".to_owned()),
            Token::Ident("a".to_owned()),
        ];
        let mut input = SliceInput::new(tokens);

        let expected = Node::Binary(
            "+".to_owned(),
            Box::new(Node::LitInt(123)),
            Box::new(Node::Ident("a".to_owned())),
        );

        let parsed = parse_expr(&mut input).unwrap();
        assert_eq!(parsed, Box::new(expected));
    }
}
+42
View File
@@ -0,0 +1,42 @@
use std::fmt::Debug;
// Declares a parser function.
//
// `<vis> parser name(input, extra: Ty, ...) -> Res { body }` expands to a function
// generic over `I: Input<Token>` that takes `input: &mut I` plus the extra
// arguments and returns `Result<Res, ParserError<I::Error>>`; `body` becomes the
// function body.
macro_rules! def_parser {
{
$vis:vis parser $name:ident ($input_name:ident $(, $extra_arg:ident: $extra_arg_ty:ty)*) -> $res_ty:ty $body:block
} => {
$vis fn $name<I>($input_name: &mut I $(, $extra_arg: $extra_arg_ty)*)
-> Result<$res_ty, $crate::parser::ParserError<I::Error>>
where I: $crate::input::Input<$crate::lexer::token::Token>
$body
};
}
// Peeks at the upcoming token of `$lex` and requires it to match `$pat`.
//
// Expands to a `let ... else` statement, so variables bound by `$pat` stay in
// scope after the macro. A failing input is returned as `ParserError::LexerError`
// via `?`; a missing or non-matching token returns `Err($error)`. The token is
// not consumed. Must be used inside a function returning
// `Result<_, ParserError<_>>`.
macro_rules! peek_expect {
($lex:expr, $pat:pat, $error:expr) => {
let Some($pat) = $lex
.peek()
.map_err($crate::parser::ParserError::LexerError)?
else {
return Err($error);
};
};
}
// Like `peek_expect!`, but also consumes the token once it has matched `$pat`.
//
// Variables bound by `$pat` stay in scope after the macro. A missing or
// non-matching token returns `Err($error)` without consuming anything. A failure
// while advancing the input is returned as `ParserError::LexerError`, exactly as
// a failure while peeking is, instead of being silently dropped.
macro_rules! next_expect {
    ($lex:expr, $pat:pat, $error:expr) => {
        peek_expect!($lex, $pat, $error);
        $lex.next()
            .map_err($crate::parser::ParserError::LexerError)?;
    };
}
pub mod combinators;
pub mod expr;
pub mod ty;
pub mod typedef;
/// Errors the parser can report; `E` is the error type of the token input.
#[derive(Debug)]
pub enum ParserError<E: Debug> {
/// The token input itself failed; carries its error.
LexerError(E),
/// A specific construct was required but not found; the text names what was expected.
Expected(&'static str),
/// The input ended where a token was required.
UnexpectedEof,
}
+76
View File
@@ -0,0 +1,76 @@
use ast::Type;
use super::ParserError;
use crate::lexer::token::Token;
// Parses a type: a type name, optionally followed by `->` and another type.
//
// The right-hand side of an arrow is parsed recursively, so `A -> B -> C`
// becomes `Arrow(A, Arrow(B, C))`.
//
// Errors: `Expected("Type name")` when the upcoming token is not an identifier
// (nothing is consumed in that case); failures of the input are reported as
// `ParserError::LexerError` instead of panicking or being ignored.
def_parser! {
    pub parser parse_type(input) -> Box<Type> {
        next_expect!(input, Token::Ident(name), ParserError::Expected("Type name"));

        let lhs = Box::new(Type::Ident(name));

        match input.peek().map_err(ParserError::LexerError)? {
            Some(Token::Operator(op)) if op == "->" => {
                // Consume the arrow, then parse everything to its right.
                input.next().map_err(ParserError::LexerError)?;
                let rhs = parse_type(input)?;
                Ok(Box::new(Type::Arrow(lhs, rhs)))
            }
            _ => Ok(lhs),
        }
    }
}
#[cfg(test)]
mod tests {
    use ast::Type;

    use crate::{input::SliceInput, lexer::token::Token};

    use super::parse_type;

    /// Identifier token with the given name.
    fn ident_tok(name: &str) -> Token {
        Token::Ident(name.to_owned())
    }

    /// The `->` operator token.
    fn arrow_tok() -> Token {
        Token::Operator("->".to_owned())
    }

    /// Boxed named type.
    fn named(name: &str) -> Box<Type> {
        Box::new(Type::Ident(name.to_owned()))
    }

    /// Two adjacent type names are parsed as two separate types.
    #[test]
    fn simple_ident_ty() {
        let mut input = SliceInput::new(vec![ident_tok("A"), ident_tok("B")]);

        for expected in ["A", "B"] {
            let parsed = parse_type(&mut input);
            assert_eq!(parsed.unwrap(), named(expected));
        }
    }

    /// Arrows nest to the right.
    #[test]
    fn arrow_ty() {
        let single = vec![ident_tok("A"), arrow_tok(), ident_tok("B")];
        let parsed = parse_type(&mut SliceInput::new(single));
        assert_eq!(
            parsed.unwrap(),
            Box::new(Type::Arrow(named("A"), named("B")))
        );

        let chained = vec![
            ident_tok("A"),
            arrow_tok(),
            ident_tok("B"),
            arrow_tok(),
            ident_tok("C"),
        ];
        let parsed = parse_type(&mut SliceInput::new(chained));
        let inner = Box::new(Type::Arrow(named("B"), named("C")));
        assert_eq!(parsed.unwrap(), Box::new(Type::Arrow(named("A"), inner)));
    }
}
+114
View File
@@ -0,0 +1,114 @@
use std::collections::HashMap;
use ast::{TypeConstructor, TypeDefinition};
use super::{combinators::many1, ty::parse_type, ParserError};
use crate::lexer::token::{Keyword, Punct, Token};
// Parses one type constructor and returns its name together with its payload.
//
// Accepted form: a constructor name followed by zero or more argument types
// (`A a b c`). A constructor with no arguments is valid even when it is the last
// thing in the input. The record form (`A { x: T, y: U }`) is not implemented
// yet and panics via `todo!`.
//
// Errors: `Expected("Type constructor name")` when the upcoming token is not an
// identifier; failures of the input are reported as `ParserError::LexerError`.
def_parser! {
    parser parse_type_constructor(input) -> (String, TypeConstructor) {
        // A a b c
        // OR
        // A { x: T, y: U }
        next_expect!(input, Token::Ident(name), ParserError::Expected("Type constructor name"));

        let cons = match input.peek().map_err(ParserError::LexerError)? {
            Some(Token::Punct(Punct::LBrace)) => todo!(),
            // Any other token, or the end of input, means positional arguments
            // (possibly none).
            _ => {
                let args = many1(input, parse_type)?;
                TypeConstructor::Variant(args)
            }
        };

        Ok((name, cons))
    }
}
// Parses a type definition: `type Name = Cons1 args... | Cons2 args... | ...`.
//
// Errors: `Expected("type")`, `Expected("name")` or `Expected("=")` when the
// corresponding part of the header is missing or wrong; errors from the
// constructor parser; failures of the input as `ParserError::LexerError`.
//
// Panics: via `todo!` when two constructors share a name, because no error
// variant describes that case yet.
def_parser! {
    pub parser parse_type_definition(input) -> Box<TypeDefinition> {
        // type X =
        next_expect!(input, Token::Keyword(Keyword::Type), ParserError::Expected("type"));
        next_expect!(input, Token::Ident(name), ParserError::Expected("name"));
        next_expect!(input, Token::Operator(op), ParserError::Expected("="));
        if op != "=" {
            // Some other operator was found where `=` is required.
            return Err(ParserError::Expected("="));
        }

        let mut conss = HashMap::new();
        loop {
            let (cons_name, cons) = parse_type_constructor(input)?;
            if conss.insert(cons_name, cons).is_some() {
                todo!();
            }

            // Another constructor follows only after a `|` separator.
            match input.peek().map_err(ParserError::LexerError)? {
                Some(Token::Operator(sep)) if sep == "|" => {
                    input.next().map_err(ParserError::LexerError)?;
                }
                _ => break,
            }
        }

        Ok(Box::new(TypeDefinition {
            name,
            constructors: conss,
        }))
    }
}
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use ast::{Type, TypeConstructor, TypeDefinition};

    use crate::{
        input::SliceInput,
        lexer::token::{Keyword, Token},
        parser::typedef::parse_type_definition,
    };

    /// Identifier token with the given name.
    fn ident(name: &str) -> Token {
        Token::Ident(name.to_owned())
    }

    /// Operator token with the given text.
    fn operator(text: &str) -> Token {
        Token::Operator(text.to_owned())
    }

    /// Positional constructor whose arguments are the named types, in order.
    fn variant(arg_names: &[&str]) -> TypeConstructor {
        let args = arg_names
            .iter()
            .map(|name| Box::new(Type::Ident((*name).to_owned())))
            .collect();
        TypeConstructor::Variant(args)
    }

    /// `type T = A Int Bool | B String` yields two positional constructors.
    #[test]
    fn type_def() {
        let tokens = vec![
            Token::Keyword(Keyword::Type),
            ident("T"),
            operator("="),
            ident("A"),
            ident("Int"),
            ident("Bool"),
            operator("|"),
            ident("B"),
            ident("String"),
        ];

        let mut constructors = HashMap::new();
        constructors.insert("A".to_owned(), variant(&["Int", "Bool"]));
        constructors.insert("B".to_owned(), variant(&["String"]));
        let expected = TypeDefinition {
            name: "T".to_owned(),
            constructors,
        };

        let parsed = parse_type_definition(&mut SliceInput::new(tokens));
        assert_eq!(parsed.unwrap(), Box::new(expected));
    }
}