Initial commit

3 years ago · 3a3b0ce99a
8 changed files with 581 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 target/
 .vscode/
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,5 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 [[package]]
 name = "flex"
 version = "0.0.1"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,9 @@
 [package]
 name = "flex"
 version = "0.0.1"
 authors = ["Garrit Franke <garritfranke@gmail.com>"]
 edition = "2018"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@ -0,0 +1,78 @@
 use std::str::Chars;
 /// Peekable iterator over a char sequence.
 ///
 /// The next character can be peeked via the `first` method,
 /// and the position can be shifted forward via the `bump` method.
 pub(crate) struct Cursor<'a> {
    /// Byte length of the input the cursor was created with.
    initial_len: usize,
    /// Characters that have not been consumed yet.
    chars: Chars<'a>,
    /// Most recently consumed character.
    ///
    /// Only present in builds with debug assertions: every initialisation,
    /// read and write of this field is guarded by `#[cfg(debug_assertions)]`,
    /// so the declaration needs the same guard or release builds fail with a
    /// "missing field" error in the constructor.
    #[cfg(debug_assertions)]
    prev: char,
 }
 /// Sentinel character returned when peeking past the end of the input; also
 /// used as the initial "previous" character of a freshly created cursor.
 pub(crate) const EOF_CHAR: char = '\0';
 impl<'a> Cursor<'a> {
    /// Builds a cursor positioned at the very start of `input`.
    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
        let initial_len = input.len();
        let chars = input.chars();
        Cursor {
            initial_len,
            chars,
            #[cfg(debug_assertions)]
            prev: EOF_CHAR,
        }
    }
    /// Intended for debug assertions only.
    /// Yields the most recently consumed character; builds without debug
    /// assertions always yield '\0'.
    pub(crate) fn prev(&self) -> char {
        #[cfg(debug_assertions)]
        {
            self.prev
        }
        #[cfg(not(debug_assertions))]
        {
            EOF_CHAR
        }
    }
    /// Looks `n` characters ahead of the current position without consuming.
    /// Yields `EOF_CHAR` when fewer than `n + 1` characters remain. Because the
    /// input itself may contain that character, use `is_eof` to detect the
    /// real end of input.
    fn nth_char(&self, n: usize) -> char {
        match self.chars().nth(n) {
            Some(c) => c,
            None => EOF_CHAR,
        }
    }
    /// Peeks at the upcoming character without consuming it.
    pub(crate) fn first(&self) -> char {
        self.nth_char(0)
    }
    /// True once every character of the input has been consumed.
    pub(crate) fn is_eof(&self) -> bool {
        let remaining = self.chars.as_str();
        remaining.is_empty()
    }
    /// Number of bytes of the input consumed so far.
    pub(crate) fn len_consumed(&self) -> usize {
        let remaining = self.chars.as_str().len();
        self.initial_len - remaining
    }
    /// Independent `Chars` iterator over the characters not yet consumed.
    pub(crate) fn chars(&self) -> Chars<'a> {
        self.chars.clone()
    }
    /// Consumes one character and returns it, or `None` at the end of input.
    pub(crate) fn bump(&mut self) -> Option<char> {
        let eaten = self.chars.next();
        #[cfg(debug_assertions)]
        {
            // Only a real character replaces the remembered one.
            if let Some(c) = eaten {
                self.prev = c;
            }
        }
        eaten
    }
 }
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -0,0 +1,269 @@
 pub(crate) mod cursor;
 use self::TokenKind::*;
 use cursor::Cursor;
 #[cfg(test)]
 mod tests;
 /// A single lexeme produced by the lexer.
 #[derive(Debug, PartialEq, Eq)]
 pub struct Token {
    /// Category of the lexeme.
    pub kind: TokenKind,
    /// Length of the lexeme in bytes; `tokenize` advances its input slice by
    /// this amount.
    pub len: usize,
    /// Source text covered by the token.
    pub raw: String,
 }
 impl Token {
    fn new(kind: TokenKind, len: usize, raw: String) -> Token {
        Token { kind, len, raw }
    }
 }
 /// Enum representing common lexeme types.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum TokenKind {
    /// A run of one or more space characters. Tabs and newlines are not
    /// included; they are reported as `Tab` and `CarriageReturn`.
    Whitespace,
    /// A literal value; `kind` tells which sort of literal it is.
    Literal {
        kind: LiteralKind,
    },
    /// A keyword such as 'if' or 'else', or any other identifier; `kind`
    /// distinguishes the recognised keywords from everything else.
    Identifier {
        kind: IdentifierKind,
    },
    /// A line comment introduced by "//", e.g. `// Lorem Ipsum`
    Comment,
    /// "+"
    Plus,
    /// "-"
    Minus,
    /// "*"
    Star,
    /// "/"
    Slash,
    /// ":"
    Colon,
    /// "="
    Equals,
    /// "=="
    DeepEquals,
    /// "<"
    SmallerThen,
    /// ">"
    LargerThen,
    /// "(" (opening parenthesis)
    BraceOpen,
    /// ")" (closing parenthesis)
    BraceClose,
    /// "["
    SquareBraceOpen,
    /// "]"
    SquareBraceClose,
    /// "\t"
    Tab,
    /// "\n" (despite the name, this is emitted for a line feed, not for "\r")
    CarriageReturn,
    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
 }
 /// The sorts of literal the lexer recognises.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
    /// Integer literal: starts with a decimal digit and continues with digits
    /// or underscores.
    Int,
    /// String literal, opened by a double or a single quote.
    Str,
 }
 /// Classification of an identifier: one of the recognised keywords, or
 /// `Unknown` for any other name.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum IdentifierKind {
    /// "let"
    Let,
    /// "if"
    If,
    /// "else"
    Else,
    /// "fn"
    Function,
    /// "true" or "false"
    Boolean,
    /// Any identifier that is not one of the keywords above.
    Unknown,
 }
 /// Splits `input` into the full list of tokens it contains, in source order.
 /// An empty input yields an empty list.
 pub fn tokenize(mut input: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    while !input.is_empty() {
        let token = first_token(input);
        // Step past the bytes covered by the token that was just read.
        input = &input[token.len..];
        tokens.push(token);
    }
    tokens
 }
 /// Lexes exactly one token from the start of `input`.
 /// `input` must not be empty.
 pub fn first_token(input: &str) -> Token {
    debug_assert!(!input.is_empty());
    let mut cursor = Cursor::new(input);
    cursor.advance_token()
 }
 /// True if `c` is the space character. No other character counts as
 /// whitespace here.
 pub fn is_whitespace(c: char) -> bool {
    c == ' '
 }
 /// True if `c` may begin an identifier: an ASCII letter or an underscore.
 /// This is narrower than the identifier grammar in the
 /// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html),
 /// which also admits non-ASCII letters.
 pub fn is_id_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
 }
 /// True if `c` may appear after the first character of an identifier: an
 /// ASCII letter, an ASCII digit or an underscore.
 /// This is narrower than the identifier grammar in the
 /// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html),
 /// which also admits non-ASCII characters.
 pub fn is_id_continue(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_'
 }
 impl Cursor<'_> {
    /// Parses a token from the input string.
    fn advance_token(&mut self) -> Token {
        // Original chars used to identify the token later on
        let original_chars = self.chars();
        let first_char = self.bump().unwrap();
        let token_kind = match first_char {
            c if is_whitespace(c) => self.whitespace(),
            '0'..='9' => {
                let kind = self.number();
                TokenKind::Literal { kind }
            }
            '"' | '\'' => {
                let kind = self.string();
                TokenKind::Literal { kind }
            }
            '+' => Plus,
            '-' => Minus,
            '*' => Star,
            '/' => match self.first() {
                '/' => self.comment(),
                _ => Slash,
            },
            '=' => match self.first() {
                '=' => DeepEquals,
                _ => Equals,
            },
            ':' => Colon,
            '<' => SmallerThen,
            '>' => LargerThen,
            '(' => BraceOpen,
            ')' => BraceClose,
            '[' => SquareBraceOpen,
            ']' => SquareBraceClose,
            c if is_id_start(c) => {
                let kind = self.identifier(c);
                Identifier { kind }
            }
            '\n' => CarriageReturn,
            '\t' => Tab,
            _ => Unknown,
        };
        let len = self.len_consumed();
        let mut raw = original_chars.collect::<String>();
        // Cut the original tokens to the length of the token
        raw.truncate(len);
        Token::new(token_kind, len, raw)
    }
    /// Eats symbols while predicate returns true or until the end of file is reached.
    /// Returns amount of eaten symbols.
    fn eat_while<F>(&mut self, mut predicate: F) -> usize
    where
        F: FnMut(char) -> bool,
    {
        let mut eaten: usize = 0;
        while predicate(self.first()) && !self.is_eof() {
            eaten += 1;
            self.bump();
        }
        eaten
    }
    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
        self.eat_while(is_whitespace);
        Whitespace
    }
    fn number(&mut self) -> LiteralKind {
        self.eat_digits();
        LiteralKind::Int
    }
    fn string(&mut self) -> LiteralKind {
        self.eat_string();
        LiteralKind::Str
    }
    fn identifier(&mut self, first_char: char) -> IdentifierKind {
        let mut original: String = self.chars().collect::<String>();
        let len = self.eat_while(is_id_continue);
        // Cut original "rest"-character stream to length of token
        // and prepend first character, because it has been eaten beforehand
        original.truncate(len);
        original = format!("{}{}", first_char, original);
        match original {
            c if c == "if" => IdentifierKind::If,
            c if c == "else" => IdentifierKind::Else,
            c if c == "fn" => IdentifierKind::Function,
            c if c == "true" || c == "false" => IdentifierKind::Boolean,
            c if c == "let" => IdentifierKind::Let,
            _ => IdentifierKind::Unknown,
        }
    }
    fn comment(&mut self) -> TokenKind {
        // FIXME: Might lead to a bug, if End of file is encountered
        while self.first() != '\n' {
            self.bump();
        }
        TokenKind::Comment
    }
    fn eat_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
            match self.first() {
                '_' => {
                    self.bump();
                }
                '0'..='9' => {
                    has_digits = true;
                    self.bump();
                }
                _ => break,
            }
        }
        has_digits
    }
    fn eat_string(&mut self) {
        // FIXME: double quoted strings could probably be ended by single quoted, and vice versa.
        // Possible fix: Pass the token of the string beginning down to this method and check against it.
        loop {
            match self.first() {
                '"' | '\'' => break,
                _ => self.bump(),
            };
        }
        // Eat last quote
        self.bump();
    }
 }
--- a/src/lexer/tests.rs
+++ b/src/lexer/tests.rs
@ -0,0 +1,173 @@
 #[cfg(test)]
 mod tests {
    use crate::lexer::*;
    /// Literals, whitespace and an operator are each reported as their own token.
    #[test]
    fn test_basic_tokenizing() {
        let mut tokens = tokenize("1 = 2").into_iter();
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "1".to_owned()
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Whitespace,
                raw: " ".to_owned()
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Equals,
                raw: "=".to_owned()
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Whitespace,
                raw: " ".to_owned()
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "2".to_owned()
            }
        );
    }
    /// Tokens are split correctly even when nothing separates them.
    #[test]
    fn test_tokenizing_without_whitespace() {
        let mut tokens = tokenize("1=2").into_iter();
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "1".to_owned()
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Equals,
                raw: "=".to_owned()
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "2".to_owned()
            }
        );
    }
    /// Both boolean keywords are classified as `IdentifierKind::Boolean`.
    #[test]
    fn test_booleans() {
        let mut tokens = tokenize("true false").into_iter();
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 4,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Boolean
                },
                raw: "true".to_owned()
            }
        );
        // `nth(1)` skips the whitespace token between the two keywords.
        assert_eq!(
            tokens.nth(1).unwrap(),
            Token {
                len: 5,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Boolean
                },
                raw: "false".to_owned()
            }
        );
    }
    /// The `fn` keyword is classified as `IdentifierKind::Function`.
    #[test]
    fn test_functions() {
        let mut tokens = tokenize("fn fib n:").into_iter();
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 2,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Function
                },
                raw: "fn".to_owned()
            }
        );
    }
    /// A `//` line comment becomes a single `Comment` token that ends before
    /// the newline. The lexer only recognises `//` as a comment marker, so
    /// that is what the input must use.
    #[test]
    fn test_comments() {
        let mut tokens = tokenize(
            "
        // foo
        fn fib n:
        ",
        )
        .into_iter()
        .filter(|t| {
            t.kind != TokenKind::Whitespace
                && t.kind != TokenKind::Tab
                && t.kind != TokenKind::CarriageReturn
        });
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 6,
                kind: TokenKind::Comment,
                raw: "// foo".to_owned(),
            }
        );
        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 2,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Function
                },
                raw: "fn".to_owned(),
            }
        );
    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,9 @@
 mod lexer;
 mod parser;
 /// Lexes a fixed sample program and prints the resulting tokens.
 fn main() {
    // The parser is not wired in yet, so only the token stream is shown.
    let source = "let x = 2";
    let tokens = lexer::tokenize(source);
    println!("{:?}", tokens);
 }
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@ -0,0 +1,36 @@
 use crate::lexer::Token;
 /// Holds a token stream together with the parser's position in it.
 pub struct Parser {
    /// Source of the tokens that have not been read yet.
    tokens: Box<dyn Iterator<Item = Token>>,
    /// Token most recently pulled from `tokens`; `None` before the first call
    /// to `next` and once the stream is exhausted.
    current: Option<Token>,
    /// NOTE(review): initialised to 0 and never read in this file — presumably
    /// meant to track block nesting; confirm once parsing is implemented.
    indentation_level: usize,
 }
 impl Parser {
    pub(crate) fn new(tokens: impl Iterator<Item = Token> + 'static) -> Self {
        Parser {
            tokens: Box::new(tokens),
            current: None,
            indentation_level: 0,
        }
    }
    fn next(&mut self) {
        self.current = self.tokens.next();
    }
 }
 /// Result type of `parse`. It is a unit struct and carries no data yet.
 #[derive(Debug)]
 pub struct AST;
 /// Runs the parser over `tokens` and returns the resulting tree.
 /// Currently a stub: it reads a single token and returns an empty `AST`.
 pub fn parse(tokens: impl Iterator<Item = Token> + 'static) -> AST {
    let mut parser = Parser::new(tokens);
    // Only one token is read for now; nothing is built from it.
    parser.next();
    AST {}
 }