// File-listing header (411 lines, 11 KiB, Rust) retained as a comment.

use std::str::FromStr;
use nom::{
branch::alt,
bytes::complete::{is_a, is_not, tag},
character::complete::{alphanumeric1, char, space0},
combinator::{map, recognize, value, verify},
multi::{fold_many1, many0, many1_count},
sequence::{delimited, pair, preceded, separated_pair},
IResult,
};
/// One piece of a [`Word`], as produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum Fragment<'a> {
    /// Unquoted text (only this variant can be promoted to a keyword,
    /// via `Word::as_literal`).
    Literal(&'a str),
    /// Text that appeared inside single or double quotes.
    QuotedLiteral(&'a str),
    /// A `$name` / `${name}` variable reference; holds the name without
    /// the `$` or braces.
    Variable(&'a str),
}
/// A shell word: one or more adjacent fragments. For example `a"b $c"`
/// lexes to `[Literal("a"), QuotedLiteral("b "), Variable("c")]`.
#[derive(Debug, Clone, PartialEq)]
pub struct Word<'a>(pub Vec<Fragment<'a>>);
/// Pull-based token stream over a source string, with one token of
/// lookahead (see `peek`/`next`).
#[derive(Debug)]
pub struct TokenStream<'a> {
    // Unlexed remainder of the source text.
    input: &'a str,
    // Single-token lookahead, filled by `peek`.
    buffer: Option<Token<'a>>,
}
/// A redirect token: either an output redirect (`>`, `out>`, …) or an
/// input redirect (`<path`).
#[derive(Debug, Clone, PartialEq)]
pub enum Redirect<'a> {
    Output(OutputRedirect<'a>),
    /// `<path` — the word is the file to read from.
    Input(Word<'a>),
}
/// Which stream(s) an output redirect applies to; the word is the target.
#[derive(Debug, Clone, PartialEq)]
pub enum OutputRedirect<'a> {
    /// `err>path` / `e>path` — stderr only.
    Err(Word<'a>),
    /// `>path`, `out>path`, `o>path` — stdout only.
    Out(Word<'a>),
    /// `oe>path` / `eo>path` — both stdout and stderr.
    Both(Word<'a>),
}
/// Reserved words, recognized from unquoted single-literal words only
/// (see `lex_maybe_keyword`).
#[derive(Debug, Clone, PartialEq)]
pub enum Keyword {
    If,
    While,
    For,
    Match,
    Let,
}
/// Operator tokens. NOTE(review): `lex_operator` only produces `Eq`,
/// `Assign`, and `Or`; `<` and `>` lex as redirects, so `Gt`/`Lt` are
/// presumably constructed elsewhere (e.g. by a parser) — confirm.
#[derive(Debug, Clone, PartialEq)]
pub enum Operator {
    Eq,
    Gt,
    Lt,
    Or,
    Assign,
}
/// Single-character bracket/brace/paren tokens.
#[derive(Debug, Clone, PartialEq)]
pub enum Punctuation {
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
}
/// Any lexical token produced by `lex_token`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    Word(Word<'a>),
    Redirect(Redirect<'a>),
    Keyword(Keyword),
    Punctuation(Punctuation),
    Operator(Operator),
}
// Shorthand for the error type returned by the `nom` lexer functions below.
type NomError<'a> = nom::Err<nom::error::Error<&'a str>>;
impl<'a> TokenStream<'a> {
    /// Creates a stream over `input` with nothing buffered yet.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            buffer: None,
        }
    }

    /// True once all input has been consumed and no token remains buffered.
    pub fn is_eof(&self) -> bool {
        self.input.is_empty() && self.buffer.is_none()
    }

    /// Lexes the next token from the remaining input, storing a copy in the
    /// lookahead buffer. Returns `Ok(None)` at end of input.
    fn read(&mut self) -> Result<Option<Token<'a>>, NomError<'a>> {
        if self.input.is_empty() {
            self.buffer = None;
            return Ok(None);
        }
        let (rest, token) = lex_token(self.input)?;
        self.input = rest;
        self.buffer = Some(token.clone());
        Ok(Some(token))
    }

    /// Consumes and returns the next token, or `Ok(None)` at end of input.
    ///
    /// Only the token being returned is lexed: a lex error further along in
    /// the input is reported when that position is actually reached, not one
    /// token early. (Previously this pre-lexed the *following* token, so a
    /// valid current token could be lost behind an `Err`.)
    pub fn next(&mut self) -> Result<Option<Token<'a>>, NomError<'a>> {
        // Ensure the current token (if any) is in the buffer, then take it.
        self.peek()?;
        Ok(self.buffer.take())
    }

    /// Returns the next token without consuming it.
    pub fn peek(&mut self) -> Result<Option<Token<'a>>, NomError<'a>> {
        if let Some(buffered) = self.buffer.clone() {
            return Ok(Some(buffered));
        }
        self.read()
    }
}
impl Word<'_> {
    /// Returns the borrowed text when this word is exactly one unquoted
    /// [`Fragment::Literal`]; `None` for quoted, variable, or
    /// multi-fragment words.
    pub fn as_literal(&self) -> Option<&str> {
        match self.0.as_slice() {
            [Fragment::Literal(lit)] => Some(*lit),
            _ => None,
        }
    }
}
impl FromStr for Keyword {
    type Err = ();

    /// Parses one of the reserved words; any other string is `Err(())`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let keyword = match s {
            "if" => Self::If,
            "while" => Self::While,
            "for" => Self::For,
            "match" => Self::Match,
            "let" => Self::Let,
            _ => return Err(()),
        };
        Ok(keyword)
    }
}
/// Lexes a variable-name identifier: one or more alphanumeric runs, `-`,
/// or `_` characters.
fn lex_identifier(i: &str) -> IResult<&str, &str> {
    let ident_chunk = alt((alphanumeric1, is_a("-_")));
    recognize(many1_count(ident_chunk))(i)
}
/// Lexes a filename-like literal: alphanumerics plus `.`, `/`, `-`, `_`,
/// and `:`.
fn lex_filename(i: &str) -> IResult<&str, &str> {
    let path_chunk = alt((alphanumeric1, is_a("./-_:")));
    recognize(many1_count(path_chunk))(i)
}
/// Lexes a braced variable reference `${name}`, yielding `name`.
fn lex_braced_var(i: &str) -> IResult<&str, &str> {
    let open = tag("${");
    let close = char('}');
    delimited(open, lex_identifier, close)(i)
}
/// Lexes an unbraced variable reference `$name`, yielding `name`.
fn lex_unbraced_var(i: &str) -> IResult<&str, &str> {
    let dollar = char('$');
    preceded(dollar, lex_identifier)(i)
}
/// Lexes either variable form; `${…}` is tried before `$…`.
fn lex_var(i: &str) -> IResult<&str, &str> {
    let mut var = alt((lex_braced_var, lex_unbraced_var));
    var(i)
}
/// Lexes a non-empty run of plain text inside double quotes: everything up
/// to a backslash, closing quote, or `$`.
fn lex_dquoted_literal(i: &str) -> IResult<&str, &str> {
    let text = is_not("\\\"$");
    verify(text, |s: &str| !s.is_empty())(i)
}
/// Lexes a double-quoted section into fragments: `$var` references become
/// `Variable`, the remaining text becomes `QuotedLiteral`. An empty string
/// `""` yields no fragments.
fn lex_dquoted(i: &str) -> IResult<&str, Vec<Fragment>> {
    let fragment = alt((
        map(lex_var, Fragment::Variable),
        map(lex_dquoted_literal, Fragment::QuotedLiteral),
    ));
    delimited(char('"'), many0(fragment), char('"'))(i)
}
/// Lexes the raw (possibly empty) text of a single-quoted section:
/// everything up to a backslash or closing quote.
fn lex_squoted_text(i: &str) -> IResult<&str, &str> {
    let chunk = is_not("\\'");
    recognize(many0(chunk))(i)
}
/// Lexes a single-quoted section; no variable substitution is performed.
fn lex_squoted(i: &str) -> IResult<&str, &str> {
    let open = char('\'');
    let close = char('\'');
    delimited(open, lex_squoted_text, close)(i)
}
/// Lexes one unquoted fragment: a variable reference or a filename-like
/// literal.
fn lex_unquoted_fragment(i: &str) -> IResult<&str, Fragment> {
    let var = map(lex_var, Fragment::Variable);
    let literal = map(lex_filename, Fragment::Literal);
    alt((var, literal))(i)
}
/// Lexes a word: one or more adjacent quoted sections, variables, and
/// literals, concatenated into a single fragment list.
fn lex_word(i: &str) -> IResult<&str, Word> {
    let piece = alt((
        lex_dquoted,
        map(lex_squoted, |text| vec![Fragment::QuotedLiteral(text)]),
        map(lex_unquoted_fragment, |frag| vec![frag]),
    ));
    fold_many1(
        piece,
        || Word(Vec::new()),
        |mut word, fragments| {
            word.0.extend(fragments);
            word
        },
    )(i)
}
fn lex_explicit_output_redirect(i: &str) -> IResult<&str, OutputRedirect> {
// out>abcdef
// err>abcdef
// out+err>abcdef
// oe>abcdef
// eo>abcdef
#[derive(Debug, Clone)]
enum Source {
Out,
Err,
Both,
}
map(
separated_pair(
alt((
value(Source::Out, tag("out")),
value(Source::Err, tag("err")),
value(Source::Both, tag("oe")),
value(Source::Both, tag("eo")),
value(Source::Out, char('o')),
value(Source::Err, char('e')),
)),
char('>'),
lex_word,
),
|(source, path)| match source {
Source::Out => OutputRedirect::Out(path),
Source::Err => OutputRedirect::Err(path),
Source::Both => OutputRedirect::Both(path),
},
)(i)
}
/// Lexes an implicit output redirect `>path` (stdout only); whitespace may
/// separate `>` from the target word.
fn lex_implicit_output_redirect(i: &str) -> IResult<&str, OutputRedirect> {
    let arrow = pair(char('>'), space0);
    let target = preceded(arrow, lex_word);
    map(target, OutputRedirect::Out)(i)
}
/// Lexes either output-redirect form (implicit `>` tried first).
fn lex_output_redirect(i: &str) -> IResult<&str, OutputRedirect> {
    let mut redirect = alt((lex_implicit_output_redirect, lex_explicit_output_redirect));
    redirect(i)
}
/// Lexes an input redirect `<path`; whitespace may follow `<`.
fn lex_input_redirect(i: &str) -> IResult<&str, Word> {
    let angle = pair(char('<'), space0);
    preceded(angle, lex_word)(i)
}
/// Lexes any redirect: input (`<`) or output (`>`, `out>`, …).
fn lex_redirect(i: &str) -> IResult<&str, Redirect> {
    let input = map(lex_input_redirect, Redirect::Input);
    let output = map(lex_output_redirect, Redirect::Output);
    alt((input, output))(i)
}
/// Lexes a word and promotes it to a keyword token when it is a single
/// unquoted literal matching a reserved word. Quoted text lexes to
/// `QuotedLiteral` fragments, which `as_literal` rejects, so `"if"` or
/// `'if'` remains an ordinary word.
fn lex_maybe_keyword(i: &str) -> IResult<&str, Token> {
    map(lex_word, |word| {
        match word.as_literal().and_then(|lit| Keyword::from_str(lit).ok()) {
            Some(keyword) => Token::Keyword(keyword),
            None => Token::Word(word),
        }
    })(i)
}
/// Lexes a single brace, paren, or bracket character.
fn lex_punctuation(i: &str) -> IResult<&str, Punctuation> {
    let braces = alt((
        value(Punctuation::LBrace, char('{')),
        value(Punctuation::RBrace, char('}')),
    ));
    let parens = alt((
        value(Punctuation::LParen, char('(')),
        value(Punctuation::RParen, char(')')),
    ));
    let brackets = alt((
        value(Punctuation::LBracket, char('[')),
        value(Punctuation::RBracket, char(']')),
    ));
    alt((braces, parens, brackets))(i)
}
/// Lexes an operator. `==` must be tried before `=` so equality is not
/// truncated to assignment. NOTE(review): `Gt`/`Lt` are never produced
/// here — `<` and `>` lex as redirects first; presumably those variants
/// are constructed elsewhere — confirm.
fn lex_operator(i: &str) -> IResult<&str, Operator> {
    let eq = value(Operator::Eq, tag("=="));
    let assign = value(Operator::Assign, char('='));
    let or = value(Operator::Or, char('|'));
    alt((eq, assign, or))(i)
}
/// Lexes one token, skipping leading whitespace. The alternative order
/// matters: punctuation and redirects are tried before operators and words.
pub fn lex_token(i: &str) -> IResult<&str, Token> {
    let token = alt((
        map(lex_punctuation, Token::Punctuation),
        map(lex_redirect, Token::Redirect),
        map(lex_operator, Token::Operator),
        lex_maybe_keyword,
    ));
    preceded(space0, token)(i)
}
/// Lexes as many tokens as possible. `many0` stops without error at the
/// first position where no token can be lexed (including input that is
/// only trailing whitespace), so callers should check that the returned
/// remainder is empty.
pub fn lex_tokens(i: &str) -> IResult<&str, Vec<Token>> {
    many0(lex_token)(i)
}
#[cfg(test)]
mod tests {
    use std::fmt;
    use nom::IResult;
    use super::{
        lex_filename, lex_tokens, Fragment, Keyword, Operator, OutputRedirect, Redirect, Token,
        Word,
    };

    /// Runs `parser` over `(input, expected output, expected remainder)`
    /// cases, printing a diagnostic and panicking on the first mismatch.
    /// `#[track_caller]` makes the reported location point at the caller.
    #[track_caller]
    fn run_tests<
        'a,
        T: PartialEq + fmt::Debug,
        I: IntoIterator<Item = (&'a str, T, &'a str)>,
        F: Fn(&'a str) -> IResult<&'a str, T>,
    >(
        it: I,
        parser: F,
    ) {
        let location = std::panic::Location::caller();
        for (i, (input, expect, expect_rest)) in it.into_iter().enumerate() {
            // The parser must succeed on every case.
            let (rest, output) = match parser(input) {
                Ok(ok) => ok,
                Err(error) => {
                    eprintln!("Test #{i} in {location:?} failed:");
                    eprintln!("* Input: {input:?}");
                    eprintln!("* Parser returned error: {error}");
                    panic!();
                }
            };
            // The unconsumed remainder must match exactly.
            if rest != expect_rest {
                eprintln!("Test #{i} in {location:?} failed:");
                eprintln!("* Input: {input:?}");
                if expect_rest.is_empty() {
                    eprintln!("* Unexpected trailing characters: {rest:?}");
                } else {
                    eprintln!("* Expected trailing characters: {expect_rest:?}");
                    eprintln!("* Actual trailing characters: {rest:?}");
                }
                panic!();
            }
            // Finally, the parsed value must match.
            if output != expect {
                eprintln!("Test #{i} in {location:?} failed:");
                eprintln!("* Input: {input:?}");
                eprintln!("* Expected output: {expect:?}");
                eprintln!("* Actual output: {output:?}");
                panic!();
            }
        }
    }

    /// `lex_filename` stops at `>` and whitespace.
    #[test]
    fn test_lex_filename() {
        run_tests(
            [
                ("./abc123_a-a/file>other", "./abc123_a-a/file", ">other"),
                ("/a/b/c d e f g", "/a/b/c", " d e f g"),
            ],
            lex_filename,
        )
    }

    /// End-to-end token stream: keyword recognition, fragment
    /// concatenation across quotes/variables, redirects, and operators.
    #[test]
    fn test_lex_tokens() {
        run_tests(
            [
                (
                    " if /a/b/c\" $a b c\"$d efg",
                    vec![
                        Token::Keyword(Keyword::If),
                        Token::Word(Word(vec![
                            Fragment::Literal("/a/b/c"),
                            Fragment::Literal(" "),
                            Fragment::Variable("a"),
                            Fragment::Literal(" b c"),
                            Fragment::Variable("d"),
                        ])),
                        Token::Word(Word(vec![Fragment::Literal("efg")])),
                    ],
                    "",
                ),
                (
                    "\t>$d\"filename\"",
                    vec![Token::Redirect(Redirect::Output(OutputRedirect::Out(
                        Word(vec![Fragment::Variable("d"), Fragment::Literal("filename")]),
                    )))],
                    "",
                ),
                (
                    "| abc",
                    vec![
                        Token::Operator(Operator::Or),
                        Token::Word(Word(vec![Fragment::Literal("abc")])),
                    ],
                    "",
                ),
            ],
            lex_tokens,
        )
    }
}