// File-listing header (411 lines, 11 KiB, Rust) retained as a comment.

use std::str::FromStr;
use nom::{
branch::alt,
bytes::complete::{is_a, is_not, tag},
character::complete::{alphanumeric1, char, space0},
combinator::{map, recognize, value, verify},
multi::{fold_many1, many0, many1_count},
sequence::{delimited, pair, preceded, separated_pair},
IResult,
};
/// One piece of a [`Word`], as produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum Fragment<'a> {
    /// Unquoted text (only this variant can be promoted to a keyword,
    /// via `Word::as_literal`).
    Literal(&'a str),
    /// Text that appeared inside single or double quotes.
    QuotedLiteral(&'a str),
    /// A `$name` / `${name}` variable reference; holds the name without
    /// the `$` or braces.
    Variable(&'a str),
}
/// A shell word: one or more adjacent fragments. For example `a"b $c"`
/// lexes to `[Literal("a"), QuotedLiteral("b "), Variable("c")]`.
#[derive(Debug, Clone, PartialEq)]
pub struct Word<'a>(pub Vec<Fragment<'a>>);
/// Pull-based token stream over a source string, with one token of
/// lookahead (see `peek`/`next`).
#[derive(Debug)]
pub struct TokenStream<'a> {
    // Unlexed remainder of the source text.
    input: &'a str,
    // Single-token lookahead, filled by `peek`.
    buffer: Option<Token<'a>>,
}
/// A redirect token: either an output redirect (`>`, `out>`, …) or an
/// input redirect (`<path`).
#[derive(Debug, Clone, PartialEq)]
pub enum Redirect<'a> {
    Output(OutputRedirect<'a>),
    /// `<path` — the word is the file to read from.
    Input(Word<'a>),
}
/// Which stream(s) an output redirect applies to; the word is the target.
#[derive(Debug, Clone, PartialEq)]
pub enum OutputRedirect<'a> {
    /// `err>path` / `e>path` — stderr only.
    Err(Word<'a>),
    /// `>path`, `out>path`, `o>path` — stdout only.
    Out(Word<'a>),
    /// `oe>path` / `eo>path` — both stdout and stderr.
    Both(Word<'a>),
}
/// Reserved words, recognized from unquoted single-literal words only
/// (see `lex_maybe_keyword`).
#[derive(Debug, Clone, PartialEq)]
pub enum Keyword {
    If,
    While,
    For,
    Match,
    Let,
}
/// Operator tokens. NOTE(review): `lex_operator` only produces `Eq`,
/// `Assign`, and `Or`; `<` and `>` lex as redirects, so `Gt`/`Lt` are
/// presumably constructed elsewhere (e.g. by a parser) — confirm.
#[derive(Debug, Clone, PartialEq)]
pub enum Operator {
    Eq,
    Gt,
    Lt,
    Or,
    Assign,
}
/// Single-character bracket/brace/paren tokens.
#[derive(Debug, Clone, PartialEq)]
pub enum Punctuation {
    LBrace,
    RBrace,
    LParen,
    RParen,
    LBracket,
    RBracket,
}
/// Any lexical token produced by `lex_token`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'a> {
    Word(Word<'a>),
    Redirect(Redirect<'a>),
    Keyword(Keyword),
    Punctuation(Punctuation),
    Operator(Operator),
}
// Shorthand for the error type returned by the `nom` lexer functions below.
type NomError<'a> = nom::Err<nom::error::Error<&'a str>>;
impl<'a> TokenStream<'a> {
    /// Creates a stream over `input` with nothing buffered yet.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            buffer: None,
        }
    }

    /// True once all input has been consumed and no token remains buffered.
    pub fn is_eof(&self) -> bool {
        self.input.is_empty() && self.buffer.is_none()
    }

    /// Lexes the next token from the remaining input, storing a copy in the
    /// lookahead buffer. Returns `Ok(None)` at end of input.
    fn read(&mut self) -> Result<Option<Token<'a>>, NomError<'a>> {
        if self.input.is_empty() {
            self.buffer = None;
            return Ok(None);
        }
        let (rest, token) = lex_token(self.input)?;
        self.input = rest;
        self.buffer = Some(token.clone());
        Ok(Some(token))
    }

    /// Consumes and returns the next token, or `Ok(None)` at end of input.
    ///
    /// Only the token being returned is lexed: a lex error further along in
    /// the input is reported when that position is actually reached, not one
    /// token early. (Previously this pre-lexed the *following* token, so a
    /// valid current token could be lost behind an `Err`.)
    pub fn next(&mut self) -> Result<Option<Token<'a>>, NomError<'a>> {
        // Ensure the current token (if any) is in the buffer, then take it.
        self.peek()?;
        Ok(self.buffer.take())
    }

    /// Returns the next token without consuming it.
    pub fn peek(&mut self) -> Result<Option<Token<'a>>, NomError<'a>> {
        if let Some(buffered) = self.buffer.clone() {
            return Ok(Some(buffered));
        }
        self.read()
    }
}
impl Word<'_> {
    /// Returns the borrowed text when this word is exactly one unquoted
    /// [`Fragment::Literal`]; `None` for quoted, variable, or
    /// multi-fragment words.
    pub fn as_literal(&self) -> Option<&str> {
        match self.0.as_slice() {
            [Fragment::Literal(lit)] => Some(*lit),
            _ => None,
        }
    }
}
impl FromStr for Keyword {
    type Err = ();

    /// Parses one of the reserved words; any other string is `Err(())`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let keyword = match s {
            "if" => Self::If,
            "while" => Self::While,
            "for" => Self::For,
            "match" => Self::Match,
            "let" => Self::Let,
            _ => return Err(()),
        };
        Ok(keyword)
    }
}
/// Lexes a variable-name identifier: one or more alphanumeric runs, `-`,
/// or `_` characters.
fn lex_identifier(i: &str) -> IResult<&str, &str> {
    let ident_chunk = alt((alphanumeric1, is_a("-_")));
    recognize(many1_count(ident_chunk))(i)
}
/// Lexes a filename-like literal: alphanumerics plus `.`, `/`, `-`, `_`,
/// and `:`.
fn lex_filename(i: &str) -> IResult<&str, &str> {
    let path_chunk = alt((alphanumeric1, is_a("./-_:")));
    recognize(many1_count(path_chunk))(i)
}
/// Lexes a braced variable reference `${name}`, yielding `name`.
fn lex_braced_var(i: &str) -> IResult<&str, &str> {
    let open = tag("${");
    let close = char('}');
    delimited(open, lex_identifier, close)(i)
}
/// Lexes an unbraced variable reference `$name`, yielding `name`.
fn lex_unbraced_var(i: &str) -> IResult<&str, &str> {
    let dollar = char('$');
    preceded(dollar, lex_identifier)(i)
}
/// Lexes either variable form; `${…}` is tried before `$…`.
fn lex_var(i: &str) -> IResult<&str, &str> {
    let mut var = alt((lex_braced_var, lex_unbraced_var));
    var(i)
}
/// Lexes a non-empty run of plain text inside double quotes: everything up
/// to a backslash, closing quote, or `$`.
fn lex_dquoted_literal(i: &str) -> IResult<&str, &str> {
    let text = is_not("\\\"$");
    verify(text, |s: &str| !s.is_empty())(i)
}
/// Lexes a double-quoted section into fragments: `$var` references become
/// `Variable`, the remaining text becomes `QuotedLiteral`. An empty string
/// `""` yields no fragments.
fn lex_dquoted(i: &str) -> IResult<&str, Vec<Fragment>> {
    let fragment = alt((
        map(lex_var, Fragment::Variable),
        map(lex_dquoted_literal, Fragment::QuotedLiteral),
    ));
    delimited(char('"'), many0(fragment), char('"'))(i)
}
/// Lexes the raw (possibly empty) text of a single-quoted section:
/// everything up to a backslash or closing quote.
fn lex_squoted_text(i: &str) -> IResult<&str, &str> {
    let chunk = is_not("\\'");
    recognize(many0(chunk))(i)
}
/// Lexes a single-quoted section; no variable substitution is performed.
fn lex_squoted(i: &str) -> IResult<&str, &str> {
    let open = char('\'');
    let close = char('\'');
    delimited(open, lex_squoted_text, close)(i)
}
/// Lexes one unquoted fragment: a variable reference or a filename-like
/// literal.
fn lex_unquoted_fragment(i: &str) -> IResult<&str, Fragment> {
    let var = map(lex_var, Fragment::Variable);
    let literal = map(lex_filename, Fragment::Literal);
    alt((var, literal))(i)
}
/// Lexes a word: one or more adjacent quoted sections, variables, and
/// literals, concatenated into a single fragment list.
fn lex_word(i: &str) -> IResult<&str, Word> {
    let piece = alt((
        lex_dquoted,
        map(lex_squoted, |text| vec![Fragment::QuotedLiteral(text)]),
        map(lex_unquoted_fragment, |frag| vec![frag]),
    ));
    fold_many1(
        piece,
        || Word(Vec::new()),
        |mut word, fragments| {
            word.0.extend(fragments);
            word
        },
    )(i)
}
fn lex_explicit_output_redirect(i: &str) -> IResult<&str, OutputRedirect> {
// out>abcdef
// err>abcdef
// out+err>abcdef
// oe>abcdef
// eo>abcdef
#[derive(Debug, Clone)]
enum Source {
Out,
Err,
Both,
}
map(
separated_pair(
alt((
value(Source::Out, tag("out")),
value(Source::Err, tag("err")),
value(Source::Both, tag("oe")),
value(Source::Both, tag("eo")),
value(Source::Out, char('o')),
value(Source::Err, char('e')),
)),
char('>'),
lex_word,
),
|(source, path)| match source {
Source::Out => OutputRedirect::Out(path),
Source::Err => OutputRedirect::Err(path),
Source::Both => OutputRedirect::Both(path),
},
)(i)
}
/// Lexes an implicit output redirect `>path` (stdout only); whitespace may
/// separate `>` from the target word.
fn lex_implicit_output_redirect(i: &str) -> IResult<&str, OutputRedirect> {
    let arrow = pair(char('>'), space0);
    let target = preceded(arrow, lex_word);
    map(target, OutputRedirect::Out)(i)
}
/// Lexes either output-redirect form (implicit `>` tried first).
fn lex_output_redirect(i: &str) -> IResult<&str, OutputRedirect> {
    let mut redirect = alt((lex_implicit_output_redirect, lex_explicit_output_redirect));
    redirect(i)
}
/// Lexes an input redirect `<path`; whitespace may follow `<`.
fn lex_input_redirect(i: &str) -> IResult<&str, Word> {
    let angle = pair(char('<'), space0);
    preceded(angle, lex_word)(i)
}
/// Lexes any redirect: input (`<`) or output (`>`, `out>`, …).
fn lex_redirect(i: &str) -> IResult<&str, Redirect> {
    let input = map(lex_input_redirect, Redirect::Input);
    let output = map(lex_output_redirect, Redirect::Output);
    alt((input, output))(i)
}
/// Lexes a word and promotes it to a keyword token when it is a single
/// unquoted literal matching a reserved word. Quoted text lexes to
/// `QuotedLiteral` fragments, which `as_literal` rejects, so `"if"` or
/// `'if'` remains an ordinary word.
fn lex_maybe_keyword(i: &str) -> IResult<&str, Token> {
    map(lex_word, |word| {
        match word.as_literal().and_then(|lit| Keyword::from_str(lit).ok()) {
            Some(keyword) => Token::Keyword(keyword),
            None => Token::Word(word),
        }
    })(i)
}
/// Lexes a single brace, paren, or bracket character.
fn lex_punctuation(i: &str) -> IResult<&str, Punctuation> {
    let braces = alt((
        value(Punctuation::LBrace, char('{')),
        value(Punctuation::RBrace, char('}')),
    ));
    let parens = alt((
        value(Punctuation::LParen, char('(')),
        value(Punctuation::RParen, char(')')),
    ));
    let brackets = alt((
        value(Punctuation::LBracket, char('[')),
        value(Punctuation::RBracket, char(']')),
    ));
    alt((braces, parens, brackets))(i)
}
/// Lexes an operator. `==` must be tried before `=` so equality is not
/// truncated to assignment. NOTE(review): `Gt`/`Lt` are never produced
/// here — `<` and `>` lex as redirects first; presumably those variants
/// are constructed elsewhere — confirm.
fn lex_operator(i: &str) -> IResult<&str, Operator> {
    let eq = value(Operator::Eq, tag("=="));
    let assign = value(Operator::Assign, char('='));
    let or = value(Operator::Or, char('|'));
    alt((eq, assign, or))(i)
}
/// Lexes one token, skipping leading whitespace. The alternative order
/// matters: punctuation and redirects are tried before operators and words.
pub fn lex_token(i: &str) -> IResult<&str, Token> {
    let token = alt((
        map(lex_punctuation, Token::Punctuation),
        map(lex_redirect, Token::Redirect),
        map(lex_operator, Token::Operator),
        lex_maybe_keyword,
    ));
    preceded(space0, token)(i)
}
/// Lexes as many tokens as possible. `many0` stops without error at the
/// first position where no token can be lexed (including input that is
/// only trailing whitespace), so callers should check that the returned
/// remainder is empty.
pub fn lex_tokens(i: &str) -> IResult<&str, Vec<Token>> {
    many0(lex_token)(i)
}
#[cfg(test)]
mod tests {
    use std::fmt;
    use nom::IResult;
    use super::{
        lex_filename, lex_tokens, Fragment, Keyword, Operator, OutputRedirect, Redirect, Token,
        Word,
    };

    /// Runs `parser` over `(input, expected output, expected remainder)`
    /// cases, printing a diagnostic and panicking on the first mismatch.
    /// `#[track_caller]` makes the reported location point at the caller.
    #[track_caller]
    fn run_tests<
        'a,
        T: PartialEq + fmt::Debug,
        I: IntoIterator<Item = (&'a str, T, &'a str)>,
        F: Fn(&'a str) -> IResult<&'a str, T>,
    >(
        it: I,
        parser: F,
    ) {
        let location = std::panic::Location::caller();
        for (i, (input, expect, expect_rest)) in it.into_iter().enumerate() {
            // The parser must succeed on every case.
            let (rest, output) = match parser(input) {
                Ok(ok) => ok,
                Err(error) => {
                    eprintln!("Test #{i} in {location:?} failed:");
                    eprintln!("* Input: {input:?}");
                    eprintln!("* Parser returned error: {error}");
                    panic!();
                }
            };
            // The unconsumed remainder must match exactly.
            if rest != expect_rest {
                eprintln!("Test #{i} in {location:?} failed:");
                eprintln!("* Input: {input:?}");
                if expect_rest.is_empty() {
                    eprintln!("* Unexpected trailing characters: {rest:?}");
                } else {
                    eprintln!("* Expected trailing characters: {expect_rest:?}");
                    eprintln!("* Actual trailing characters: {rest:?}");
                }
                panic!();
            }
            // Finally, the parsed value must match.
            if output != expect {
                eprintln!("Test #{i} in {location:?} failed:");
                eprintln!("* Input: {input:?}");
                eprintln!("* Expected output: {expect:?}");
                eprintln!("* Actual output: {output:?}");
                panic!();
            }
        }
    }

    /// `lex_filename` stops at `>` and whitespace.
    #[test]
    fn test_lex_filename() {
        run_tests(
            [
                ("./abc123_a-a/file>other", "./abc123_a-a/file", ">other"),
                ("/a/b/c d e f g", "/a/b/c", " d e f g"),
            ],
            lex_filename,
        )
    }

    /// End-to-end token stream: keyword recognition, fragment
    /// concatenation across quotes/variables, redirects, and operators.
    #[test]
    fn test_lex_tokens() {
        run_tests(
            [
                (
                    " if /a/b/c\" $a b c\"$d efg",
                    vec![
                        Token::Keyword(Keyword::If),
                        Token::Word(Word(vec![
                            Fragment::Literal("/a/b/c"),
                            Fragment::Literal(" "),
                            Fragment::Variable("a"),
                            Fragment::Literal(" b c"),
                            Fragment::Variable("d"),
                        ])),
                        Token::Word(Word(vec![Fragment::Literal("efg")])),
                    ],
                    "",
                ),
                (
                    "\t>$d\"filename\"",
                    vec![Token::Redirect(Redirect::Output(OutputRedirect::Out(
                        Word(vec![Fragment::Variable("d"), Fragment::Literal("filename")]),
                    )))],
                    "",
                ),
                (
                    "| abc",
                    vec![
                        Token::Operator(Operator::Or),
                        Token::Word(Word(vec![Fragment::Literal("abc")])),
                    ],
                    "",
                ),
            ],
            lex_tokens,
        )
    }
}