From 2dfdd61149d3687f572f6dca68b948dea78bc054 Mon Sep 17 00:00:00 2001 From: Mark Poliakov Date: Sun, 17 Dec 2023 23:21:07 +0200 Subject: [PATCH] Basic lexing + ty/typedef/binary parsing --- Cargo.lock | 3 + ast/src/lib.rs | 35 ++++-- frontend/Cargo.toml | 1 + frontend/src/input.rs | 51 ++++++++ frontend/src/lexer/mod.rs | 195 +++++++++++++++++++++++++++++ frontend/src/lexer/token.rs | 95 ++++++++++++++ frontend/src/lib.rs | 29 +++-- frontend/src/parser/combinators.rs | 23 ++++ frontend/src/parser/expr.rs | 76 +++++++++++ frontend/src/parser/mod.rs | 42 +++++++ frontend/src/parser/ty.rs | 76 +++++++++++ frontend/src/parser/typedef.rs | 114 +++++++++++++++++ 12 files changed, 717 insertions(+), 23 deletions(-) create mode 100644 frontend/src/input.rs create mode 100644 frontend/src/lexer/mod.rs create mode 100644 frontend/src/lexer/token.rs create mode 100644 frontend/src/parser/combinators.rs create mode 100644 frontend/src/parser/expr.rs create mode 100644 frontend/src/parser/mod.rs create mode 100644 frontend/src/parser/ty.rs create mode 100644 frontend/src/parser/typedef.rs diff --git a/Cargo.lock b/Cargo.lock index fd9d76f..f3d2512 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,3 +13,6 @@ version = "0.1.0" [[package]] name = "frontend" version = "0.1.0" +dependencies = [ + "ast", +] diff --git a/ast/src/lib.rs b/ast/src/lib.rs index 7d12d9a..26b2792 100644 --- a/ast/src/lib.rs +++ b/ast/src/lib.rs @@ -1,14 +1,27 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right +use std::collections::HashMap; + +#[derive(Debug, PartialEq)] +pub enum Node { + Ident(String), + LitInt(u64), + Binary(String, Box, Box), + Parenthesized(Box), } -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } +#[derive(Debug, PartialEq)] +pub enum Type { + Ident(String), + Arrow(Box, Box), +} + +#[derive(Debug, PartialEq)] +pub struct TypeDefinition { + pub name: String, + pub constructors: HashMap, +} + +#[derive(Debug, PartialEq)] +pub enum TypeConstructor { + Record(HashMap>), + Variant(Vec>), } diff --git a/frontend/Cargo.toml b/frontend/Cargo.toml index 4046d5f..633cf96 100644 --- a/frontend/Cargo.toml +++ b/frontend/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +ast = { path = "../ast" } diff --git a/frontend/src/input.rs b/frontend/src/input.rs new file mode 100644 index 0000000..438b617 --- /dev/null +++ b/frontend/src/input.rs @@ -0,0 +1,51 @@ +use std::{fmt::Debug, iter::Peekable, slice::Iter, str::Chars}; + +pub trait Input { + type Error: Debug; + + fn peek(&mut self) -> Result, Self::Error>; + fn next(&mut self) -> Result, Self::Error>; +} + +pub struct SliceInput> { + it: Peekable, +} + +impl> SliceInput { + pub fn new>(it: R) -> Self { + Self { + it: it.into_iter().peekable(), + } + } +} + +impl> Input for SliceInput { + type Error = !; + + fn peek(&mut self) -> Result, Self::Error> { + Ok(self.it.peek().cloned()) + } + + fn next(&mut self) -> Result, Self::Error> { + Ok(self.it.next()) + } +} + +#[cfg(test)] +mod tests { + use crate::input::{Input, SliceInput}; + + #[test] + fn string_input() { + let s = "a123"; + let mut i = SliceInput::new(s.chars()); + + assert_eq!(i.peek().unwrap().unwrap(), 'a'); + assert_eq!(i.next().unwrap().unwrap(), 'a'); + assert_eq!(i.next().unwrap().unwrap(), '1'); + assert_eq!(i.next().unwrap().unwrap(), '2'); + assert_eq!(i.next().unwrap().unwrap(), '3'); + assert!(i.peek().unwrap().is_none()); + assert!(i.next().unwrap().is_none()); + } +} diff --git a/frontend/src/lexer/mod.rs b/frontend/src/lexer/mod.rs new file mode 100644 index 0000000..bf2d809 --- /dev/null +++ b/frontend/src/lexer/mod.rs @@ -0,0 +1,195 @@ +use std::{fmt::Debug, str::FromStr}; + +use crate::{input::Input, lexer::token::Keyword}; + +use self::token::{Punct, Token}; + +pub mod token; + +#[derive(Debug)] +pub enum LexerError { + InputError(E), +} + +pub struct Lexer> { + input: I, + buffer: Option, +} + +impl> Lexer { + pub fn new(input: I) -> Self { + Self { + input, + buffer: None, + } + } + + fn skip_whitespace(&mut self) -> Result<(), LexerError> { + loop { + let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else { + break; + }; + + if !ch.is_whitespace() { + break; + } + + self.input.next().ok(); + } + + Ok(()) + } + + fn lex_ident_or_keyword(&mut self) -> Result> { + let mut buffer = String::new(); + + loop { + let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else { + break; + }; + + if !token::is_ident_rest(ch) { + break; + } + + buffer.push(ch); + + self.input.next().ok(); + } + + Ok(if let Ok(kw) = Keyword::from_str(&buffer) { + Token::Keyword(kw) + } else { + Token::Ident(buffer) + }) + } + + fn lex_number(&mut self) -> Result> { + let mut buffer = String::new(); + + loop { + let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else { + break; + }; + + if !ch.is_alphanumeric() || ch == '.' { + break; + } + + buffer.push(ch); + self.input.next().ok(); + } + + let value = if let Some(value) = buffer + .strip_prefix("0x") + .and_then(|v| u64::from_str_radix(v, 16).ok()) + { + value + } else if let Ok(value) = u64::from_str_radix(&buffer, 10) { + value + } else { + todo!("Invalid number: {:?}", buffer); + }; + + Ok(Token::LitInt(value)) + } + + fn lex_operator(&mut self) -> Result> { + let mut buffer = String::new(); + + loop { + let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else { + break; + }; + + if !token::is_operator(ch) { + break; + } + + buffer.push(ch); + self.input.next().ok(); + } + + Ok(Token::Operator(buffer)) + } + + fn lex_token(&mut self) -> Result, LexerError> { + self.skip_whitespace()?; + + let Some(ch) = self.input.peek().map_err(LexerError::InputError)? else { + return Ok(None); + }; + + Ok(if token::is_ident_head(ch) { + Some(self.lex_ident_or_keyword()?) + } else if ch.is_digit(10) { + Some(self.lex_number()?) + } else if token::is_operator(ch) { + Some(self.lex_operator()?) + } else if let Ok(punct) = Punct::try_from(ch) { + self.input.next().ok(); + Some(Token::Punct(punct)) + } else { + todo!() + }) + } +} + +impl> Input for Lexer { + type Error = LexerError; + + fn peek(&mut self) -> Result, Self::Error> { + if self.buffer.is_none() { + self.buffer = self.lex_token()?; + } + + Ok(self.buffer.clone()) + } + + fn next(&mut self) -> Result, Self::Error> { + let result = if let Some(token) = self.buffer.as_ref() { + Some(token.clone()) + } else { + self.lex_token()? + }; + self.buffer = self.lex_token()?; + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + input::{Input, SliceInput}, + lexer::token::{Keyword, Token}, + }; + + use super::{Lexer, LexerError}; + + fn single_token(s: &str) -> Result, LexerError> { + let mut lex = Lexer::new(SliceInput::new(s.chars())); + lex.next() + } + + #[test] + fn kw_or_ident() { + let s = "a as b where c"; + let mut lex = Lexer::new(SliceInput::new(s.chars())); + + assert_eq!(lex.next().unwrap(), Some(Token::Ident("a".to_owned()))); + assert_eq!(lex.next().unwrap(), Some(Token::Keyword(Keyword::As))); + assert_eq!(lex.next().unwrap(), Some(Token::Ident("b".to_owned()))); + assert_eq!(lex.next().unwrap(), Some(Token::Keyword(Keyword::Where))); + assert_eq!(lex.next().unwrap(), Some(Token::Ident("c".to_owned()))); + assert_eq!(lex.next().unwrap(), None); + } + + #[test] + fn lit_int() { + assert_eq!(single_token("0").unwrap(), Some(Token::LitInt(0))); + assert_eq!(single_token("000").unwrap(), Some(Token::LitInt(0))); + assert_eq!(single_token("0x0").unwrap(), Some(Token::LitInt(0))); + assert_eq!(single_token("0x123").unwrap(), Some(Token::LitInt(0x123))); + assert_eq!(single_token("0123").unwrap(), Some(Token::LitInt(123))); + } +} diff --git a/frontend/src/lexer/token.rs b/frontend/src/lexer/token.rs new file mode 100644 index 0000000..8d99ad8 --- /dev/null +++ b/frontend/src/lexer/token.rs @@ -0,0 +1,95 @@ +macro_rules! string_enum { + { + $(#[$meta:meta])* + $vis:vis enum $name:ident { + $($discriminant:ident => $text:literal),+ $(,)? + } + } => { + $(#[$meta])* + $vis enum $name { + $($discriminant),+ + } + + impl core::str::FromStr for $name { + type Err = (); + + fn from_str(s: &str) -> Result<$name, ()> { + match s { + $($text => Ok($name::$discriminant)),+, + _ => Err(()) + } + } + } + }; +} + +macro_rules! char_enum { + { + $(#[$meta:meta])* + $vis:vis enum $name:ident { + $($discriminant:ident => $text:literal),+ $(,)? + } + } => { + $(#[$meta])* + $vis enum $name { + $($discriminant),+ + } + + impl core::convert::TryFrom for $name { + type Error = (); + + fn try_from(c: char) -> Result<$name, ()> { + match c { + $($text => Ok($name::$discriminant)),+, + _ => Err(()) + } + } + } + }; +} + +char_enum! { + #[derive(Clone, Debug, PartialEq, Eq)] + pub enum Punct { + Comma => ',', + LParen => '(', + RParen => ')', + LBrace => '{', + RBrace => '}', + LBracket => '[', + RBracket => ']', + } +} + +string_enum! { + #[derive(Clone, Debug, PartialEq, Eq)] + pub enum Keyword { + As => "as", + Where => "where", + Let => "let", + In => "in", + Type => "type" + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Token { + Operator(String), + Punct(Punct), + Ident(String), + Keyword(Keyword), + LitInt(u64), + LitString(String), +} + +pub fn is_ident_head(ch: char) -> bool { + ch.is_alphabetic() || ch == '_' +} + +pub fn is_ident_rest(ch: char) -> bool { + ch.is_alphanumeric() || ch == '_' +} + +pub fn is_operator(ch: char) -> bool { + ":;<>./?=-+!@#$%^&*~".contains(ch) +} diff --git a/frontend/src/lib.rs b/frontend/src/lib.rs index 7d12d9a..2ff1baf 100644 --- a/frontend/src/lib.rs +++ b/frontend/src/lib.rs @@ -1,14 +1,19 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} +#![feature(never_type, let_chains)] -#[cfg(test)] -mod tests { - use super::*; +/* - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} +fact : Num a => a -> a +fact 0 = 1 +fact 1 = 1 +fact n = f1 * f2 +where + f1 = fact (n - 1) + f2 = fact (n - 2) + +main : IO () +main = Debug.log `123 ${fact 13}` + */ + +pub mod input; +pub mod lexer; +pub mod parser; diff --git a/frontend/src/parser/combinators.rs b/frontend/src/parser/combinators.rs new file mode 100644 index 0000000..6263fad --- /dev/null +++ b/frontend/src/parser/combinators.rs @@ -0,0 +1,23 @@ +use crate::{input::Input, lexer::token::Token}; + +use super::ParserError; + +pub fn many1 Result>>( + input: &mut I, + parse: F, +) -> Result, ParserError> +where + I: Input, +{ + let mut result = vec![]; + + loop { + let Ok(item) = parse(input) else { + break; + }; + + result.push(item); + } + + Ok(result) +} diff --git a/frontend/src/parser/expr.rs b/frontend/src/parser/expr.rs new file mode 100644 index 0000000..f0ad926 --- /dev/null +++ b/frontend/src/parser/expr.rs @@ -0,0 +1,76 @@ +use ast::Node; + +use crate::lexer::token::{Punct, Token}; + +use super::ParserError; + +def_parser! { + pub parser parse_atom(input) -> Box { + next_expect!(input, token, ParserError::UnexpectedEof); + + match token { + Token::Ident(name) => Ok(Box::new(Node::Ident(name))), + Token::LitInt(value) => Ok(Box::new(Node::LitInt(value))), + Token::Punct(Punct::LParen) => { + let expr = parse_expr(input)?; + next_expect!(input, Token::Punct(Punct::RParen), ParserError::Expected(")")); + + if let Node::Parenthesized(expr) = *expr { + Ok(expr) + } else { + Ok(Box::new(Node::Parenthesized(expr))) + } + }, + _ => todo!() + } + } +} + +def_parser! { + pub parser maybe_binary(input, lhs: Box) -> Box { + let Some(Token::Operator(op)) = input.peek().map_err(ParserError::LexerError)? else { + return Ok(lhs); + }; + + input.next().ok(); + + let rhs = parse_expr(input)?; + + Ok(Box::new(Node::Binary(op, lhs, rhs))) + } +} + +def_parser! { + pub parser parse_expr(input) -> Box { + let value = parse_atom(input)?; + let value = maybe_binary(input, value)?; + + Ok(value) + } +} + +#[cfg(test)] +mod tests { + use ast::Node; + + use crate::{input::SliceInput, lexer::token::Token, parser::expr::parse_expr}; + + #[test] + fn binary() { + let t = vec![ + Token::LitInt(123), + Token::Operator("+".to_owned()), + Token::Ident("a".to_owned()), + ]; + + let b = parse_expr(&mut SliceInput::new(t)); + assert_eq!( + b.unwrap(), + Box::new(Node::Binary( + "+".to_owned(), + Box::new(Node::LitInt(123)), + Box::new(Node::Ident("a".to_owned())) + )) + ); + } +} diff --git a/frontend/src/parser/mod.rs b/frontend/src/parser/mod.rs new file mode 100644 index 0000000..cfbd988 --- /dev/null +++ b/frontend/src/parser/mod.rs @@ -0,0 +1,42 @@ +use std::fmt::Debug; + +macro_rules! def_parser { + { + $vis:vis parser $name:ident ($input_name:ident $(, $extra_arg:ident: $extra_arg_ty:ty)*) -> $res_ty:ty $body:block + } => { + $vis fn $name($input_name: &mut I $(, $extra_arg: $extra_arg_ty)*) + -> Result<$res_ty, $crate::parser::ParserError> + where I: $crate::input::Input<$crate::lexer::token::Token> + $body + }; +} + +macro_rules! peek_expect { + ($lex:expr, $pat:pat, $error:expr) => { + let Some($pat) = $lex + .peek() + .map_err($crate::parser::ParserError::LexerError)? + else { + return Err($error); + }; + }; +} + +macro_rules! next_expect { + ($lex:expr, $pat:pat, $error:expr) => { + peek_expect!($lex, $pat, $error); + $lex.next().ok(); + }; +} + +pub mod combinators; +pub mod expr; +pub mod ty; +pub mod typedef; + +#[derive(Debug)] +pub enum ParserError { + LexerError(E), + Expected(&'static str), + UnexpectedEof, +} diff --git a/frontend/src/parser/ty.rs b/frontend/src/parser/ty.rs new file mode 100644 index 0000000..0537d1e --- /dev/null +++ b/frontend/src/parser/ty.rs @@ -0,0 +1,76 @@ +use ast::Type; + +use super::ParserError; +use crate::lexer::token::Token; + +def_parser! { + pub parser parse_type(input) -> Box { + next_expect!(input, Token::Ident(name), ParserError::Expected("Type name")); + + let lhs = Box::new(Type::Ident(name)); + + if let Some(Token::Operator(op)) = input.peek().unwrap() && op == "->" { + input.next().ok(); + let rhs = parse_type(input)?; + + Ok(Box::new(Type::Arrow(lhs, rhs))) + } else { + Ok(lhs) + } + } +} + +#[cfg(test)] +mod tests { + use ast::Type; + + use crate::{input::SliceInput, lexer::token::Token}; + + use super::parse_type; + + #[test] + fn simple_ident_ty() { + let t = vec![Token::Ident("A".to_owned()), Token::Ident("B".to_owned())]; + let mut input = SliceInput::new(t); + let r = parse_type(&mut input); + assert_eq!(r.unwrap(), Box::new(Type::Ident("A".to_owned()))); + let r = parse_type(&mut input); + assert_eq!(r.unwrap(), Box::new(Type::Ident("B".to_owned()))); + } + + #[test] + fn arrow_ty() { + let t = vec![ + Token::Ident("A".to_owned()), + Token::Operator("->".to_owned()), + Token::Ident("B".to_owned()), + ]; + let r = parse_type(&mut SliceInput::new(t)); + assert_eq!( + r.unwrap(), + Box::new(Type::Arrow( + Box::new(Type::Ident("A".to_owned())), + Box::new(Type::Ident("B".to_owned())) + )) + ); + + let t = vec![ + Token::Ident("A".to_owned()), + Token::Operator("->".to_owned()), + Token::Ident("B".to_owned()), + Token::Operator("->".to_owned()), + Token::Ident("C".to_owned()), + ]; + let r = parse_type(&mut SliceInput::new(t)); + assert_eq!( + r.unwrap(), + Box::new(Type::Arrow( + Box::new(Type::Ident("A".to_owned())), + Box::new(Type::Arrow( + Box::new(Type::Ident("B".to_owned())), + Box::new(Type::Ident("C".to_owned())), + )) + )) + ); + } +} diff --git a/frontend/src/parser/typedef.rs b/frontend/src/parser/typedef.rs new file mode 100644 index 0000000..9d6b38e --- /dev/null +++ b/frontend/src/parser/typedef.rs @@ -0,0 +1,114 @@ +use std::collections::HashMap; + +use ast::{TypeConstructor, TypeDefinition}; + +use super::{combinators::many1, ty::parse_type, ParserError}; +use crate::lexer::token::{Keyword, Punct, Token}; + +def_parser! { + parser parse_type_constructor(input) -> (String, TypeConstructor) { + // A a b c + // OR + // A { x: T, y: U } + next_expect!(input, Token::Ident(name), ParserError::Expected("Type constructor name")); + + let Some(token) = input.peek().unwrap() else { + todo!(); + }; + + let cons = match token { + Token::Punct(Punct::LBrace) => todo!(), + _ => { + let args = many1(input, parse_type)?; + + TypeConstructor::Variant(args) + } + }; + + Ok((name, cons)) + } +} + +def_parser! { + pub parser parse_type_definition(input) -> Box { + // type X = + + next_expect!(input, Token::Keyword(Keyword::Type), ParserError::Expected("type")); + next_expect!(input, Token::Ident(name), ParserError::Expected("name")); + next_expect!(input, Token::Operator(op), ParserError::Expected("=")); + if op != "=" { + todo!() + } + + let mut conss = HashMap::new(); + + loop { + let (cons_name, cons) = parse_type_constructor(input)?; + + if conss.insert(cons_name, cons).is_some() { + todo!(); + } + + if let Some(Token::Operator(op)) = input.peek().unwrap() && op == "|".to_owned() { + input.next().ok(); + continue; + } else { + break; + } + } + + Ok(Box::new(TypeDefinition { + name, + constructors: conss + })) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use ast::{Type, TypeConstructor, TypeDefinition}; + + use crate::{ + input::SliceInput, + lexer::token::{Keyword, Token}, + parser::typedef::parse_type_definition, + }; + + #[test] + fn type_def() { + let t = vec![ + Token::Keyword(Keyword::Type), + Token::Ident("T".to_owned()), + Token::Operator("=".to_owned()), + Token::Ident("A".to_owned()), + Token::Ident("Int".to_owned()), + Token::Ident("Bool".to_owned()), + Token::Operator("|".to_owned()), + Token::Ident("B".to_owned()), + Token::Ident("String".to_owned()), + ]; + + let b = parse_type_definition(&mut SliceInput::new(t)); + assert_eq!( + b.unwrap(), + Box::new(TypeDefinition { + name: "T".to_owned(), + constructors: HashMap::from_iter([ + ( + "A".to_owned(), + TypeConstructor::Variant(vec![ + Box::new(Type::Ident("Int".to_owned())), + Box::new(Type::Ident("Bool".to_owned())) + ]) + ), + ( + "B".to_owned(), + TypeConstructor::Variant(vec![Box::new(Type::Ident("String".to_owned()))]) + ) + ]), + }) + ); + } +}