From 3a3b0ce99a7ebc34fa9638375fe4e34eae9b8bd4 Mon Sep 17 00:00:00 2001
From: Garrit Franke
Date: Wed, 2 Dec 2020 22:32:36 +0100
Subject: [PATCH] Initial commit

---
 .gitignore          |   2 +
 Cargo.lock          |   5 +
 Cargo.toml          |   9 ++
 src/lexer/cursor.rs |  79 +++++++++++++
 src/lexer/mod.rs    | 268 ++++++++++++++++++++++++++++++++++++++++++++
 src/lexer/tests.rs  | 173 ++++++++++++++++++++++++++++
 src/main.rs         |   9 ++
 src/parser/mod.rs   |  37 ++++++
 8 files changed, 582 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/lexer/cursor.rs
 create mode 100644 src/lexer/mod.rs
 create mode 100644 src/lexer/tests.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parser/mod.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..64ee209
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+target/
+.vscode/
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..eef2fba
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,5 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+[[package]]
+name = "flex"
+version = "0.0.1"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..7733efe
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "flex"
+version = "0.0.1"
+authors = ["Garrit Franke "]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
new file mode 100644
index 0000000..69a2de0
--- /dev/null
+++ b/src/lexer/cursor.rs
@@ -0,0 +1,79 @@
+use std::str::Chars;
+
+/// Peekable iterator over a char sequence.
+///
+/// Next characters can be peeked via the `first` method,
+/// and the position can be shifted forward via the `bump` method.
+pub(crate) struct Cursor<'a> {
+    initial_len: usize,
+    chars: Chars<'a>,
+    #[cfg(debug_assertions)]
+    prev: char,
+}
+
+pub(crate) const EOF_CHAR: char = '\0';
+
+impl<'a> Cursor<'a> {
+    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+        Cursor {
+            initial_len: input.len(),
+            chars: input.chars(),
+            #[cfg(debug_assertions)]
+            prev: EOF_CHAR,
+        }
+    }
+
+    /// For debug assertions only.
+    /// Returns the last eaten symbol (or `EOF_CHAR` in release builds).
+    pub(crate) fn prev(&self) -> char {
+        #[cfg(debug_assertions)]
+        {
+            self.prev
+        }
+
+        #[cfg(not(debug_assertions))]
+        {
+            EOF_CHAR
+        }
+    }
+
+    /// Returns the nth character relative to the current cursor position.
+    /// If the requested position doesn't exist, `EOF_CHAR` is returned.
+    /// However, getting `EOF_CHAR` doesn't always mean the actual end of file;
+    /// it should be checked with the `is_eof` method.
+    fn nth_char(&self, n: usize) -> char {
+        self.chars().nth(n).unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the next symbol from the input stream without consuming it.
+    pub(crate) fn first(&self) -> char {
+        self.nth_char(0)
+    }
+
+    /// Checks if there is nothing more to consume.
+    pub(crate) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Returns the number of already consumed symbols.
+    pub(crate) fn len_consumed(&self) -> usize {
+        self.initial_len - self.chars.as_str().len()
+    }
+
+    /// Returns a `Chars` iterator over the remaining characters.
+    pub(crate) fn chars(&self) -> Chars<'a> {
+        self.chars.clone()
+    }
+
+    /// Moves to the next character.
+    pub(crate) fn bump(&mut self) -> Option<char> {
+        let c = self.chars.next()?;
+
+        #[cfg(debug_assertions)]
+        {
+            self.prev = c;
+        }
+
+        Some(c)
+    }
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 0000000..1590f9a
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,268 @@
+pub(crate) mod cursor;
+
+use self::TokenKind::*;
+use cursor::Cursor;
+
+#[cfg(test)]
+mod tests;
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub len: usize,
+    pub raw: String,
+}
+
+impl Token {
+    fn new(kind: TokenKind, len: usize, raw: String) -> Token {
+        Token { kind, len, raw }
+    }
+}
+
+/// Enum representing common lexeme types.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum TokenKind {
+    /// Any whitespace character sequence.
+    Whitespace,
+    Literal {
+        kind: LiteralKind,
+    },
+    /// Identifiers and keywords such as 'if' or 'else'
+    Identifier {
+        kind: IdentifierKind,
+    },
+    /// // Lorem Ipsum
+    Comment,
+    /// "+"
+    Plus,
+    /// "-"
+    Minus,
+    /// "*"
+    Star,
+    /// "/"
+    Slash,
+    /// ":"
+    Colon,
+    /// "="
+    Equals,
+    /// "=="
+    DeepEquals,
+    /// "<"
+    SmallerThan,
+    /// ">"
+    LargerThan,
+    /// "("
+    BraceOpen,
+    /// ")"
+    BraceClose,
+    /// "["
+    SquareBraceOpen,
+    /// "]"
+    SquareBraceClose,
+    /// "\t"
+    Tab,
+    /// "\n"
+    CarriageReturn,
+    /// Unknown token, not expected by the lexer, e.g. "№"
+    Unknown,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+    Int,
+    Str,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum IdentifierKind {
+    Let,
+    If,
+    Else,
+    Function,
+    Boolean,
+    Unknown,
+}
+
+/// Creates an iterator that produces tokens from the input string.
+pub fn tokenize(mut input: &str) -> Vec<Token> {
+    std::iter::from_fn(move || {
+        if input.is_empty() {
+            return None;
+        }
+        let token = first_token(input);
+        input = &input[token.len..];
+        Some(token)
+    })
+    .collect()
+}
+
+/// Parses the first token from the provided input string.
+pub fn first_token(input: &str) -> Token {
+    debug_assert!(!input.is_empty());
+    Cursor::new(input).advance_token()
+}
+
+pub fn is_whitespace(c: char) -> bool {
+    c == ' '
+}
+
+/// True if `c` is valid as a first character of an identifier.
+/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
+/// a formal definition of a valid identifier name.
+pub fn is_id_start(c: char) -> bool {
+    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'
+}
+
+/// True if `c` is valid as a non-first character of an identifier.
+/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
+/// a formal definition of a valid identifier name.
+pub fn is_id_continue(c: char) -> bool {
+    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || c == '_'
+}
+
+impl Cursor<'_> {
+    /// Parses a token from the input string.
+    fn advance_token(&mut self) -> Token {
+        // Original chars used to identify the token later on
+        let original_chars = self.chars();
+        let first_char = self.bump().unwrap();
+        let token_kind = match first_char {
+            c if is_whitespace(c) => self.whitespace(),
+            '0'..='9' => {
+                let kind = self.number();
+
+                TokenKind::Literal { kind }
+            }
+            '"' | '\'' => {
+                let kind = self.string(first_char);
+
+                TokenKind::Literal { kind }
+            }
+            '+' => Plus,
+            '-' => Minus,
+            '*' => Star,
+            '/' => match self.first() {
+                '/' => self.comment(),
+                _ => Slash,
+            },
+            '=' => match self.first() {
+                '=' => {
+                    // Consume the second '=' so that "==" becomes a single token
+                    self.bump();
+                    DeepEquals
+                }
+                _ => Equals,
+            },
+            ':' => Colon,
+            '<' => SmallerThan,
+            '>' => LargerThan,
+            '(' => BraceOpen,
+            ')' => BraceClose,
+            '[' => SquareBraceOpen,
+            ']' => SquareBraceClose,
+            c if is_id_start(c) => {
+                let kind = self.identifier(c);
+
+                Identifier { kind }
+            }
+            '\n' => CarriageReturn,
+            '\t' => Tab,
+            _ => Unknown,
+        };
+
+        let len = self.len_consumed();
+        let mut raw = original_chars.collect::<String>();
+        // Cut the original chars down to the length of the token
+        raw.truncate(len);
+        Token::new(token_kind, len, raw)
+    }
+
+    /// Eats symbols while the predicate returns true or until the end of file is reached.
+    /// Returns the number of eaten symbols.
+    fn eat_while<F>(&mut self, mut predicate: F) -> usize
+    where
+        F: FnMut(char) -> bool,
+    {
+        let mut eaten: usize = 0;
+        while predicate(self.first()) && !self.is_eof() {
+            eaten += 1;
+            self.bump();
+        }
+
+        eaten
+    }
+
+    fn whitespace(&mut self) -> TokenKind {
+        debug_assert!(is_whitespace(self.prev()));
+        self.eat_while(is_whitespace);
+        Whitespace
+    }
+
+    fn number(&mut self) -> LiteralKind {
+        self.eat_digits();
+        LiteralKind::Int
+    }
+
+    fn string(&mut self, quote: char) -> LiteralKind {
+        self.eat_string(quote);
+
+        LiteralKind::Str
+    }
+
+    fn identifier(&mut self, first_char: char) -> IdentifierKind {
+        let mut original: String = self.chars().collect::<String>();
+        let len = self.eat_while(is_id_continue);
+
+        // Cut the original "rest"-character stream to the length of the token
+        // and prepend the first character, because it has been eaten beforehand
+        original.truncate(len);
+        original = format!("{}{}", first_char, original);
+
+        match original.as_str() {
+            "if" => IdentifierKind::If,
+            "else" => IdentifierKind::Else,
+            "fn" => IdentifierKind::Function,
+            "true" | "false" => IdentifierKind::Boolean,
+            "let" => IdentifierKind::Let,
+            _ => IdentifierKind::Unknown,
+        }
+    }
+
+    fn comment(&mut self) -> TokenKind {
+        // Consume until the end of the line or the end of the file is reached
+        while self.first() != '\n' && !self.is_eof() {
+            self.bump();
+        }
+
+        TokenKind::Comment
+    }
+
+    fn eat_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.first() {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    fn eat_string(&mut self, quote: char) {
+        // Consume until the matching quote kind is found, so that a double quoted
+        // string can't be terminated by a single quote and vice versa. Stopping at
+        // the end of the file also avoids looping forever on unterminated strings.
+        while self.first() != quote && !self.is_eof() {
+            self.bump();
+        }
+
+        // Eat the closing quote
+        self.bump();
+    }
+}
diff --git a/src/lexer/tests.rs b/src/lexer/tests.rs
new file mode 100644
index 0000000..46016e1
--- /dev/null
+++ b/src/lexer/tests.rs
@@ -0,0 +1,173 @@
+#[cfg(test)]
+mod tests {
+    use crate::lexer::*;
+
+    #[test]
+    fn test_basic_tokenizing() {
+        let mut tokens = tokenize("1 = 2").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "1".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Whitespace,
+                raw: " ".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Equals,
+                raw: "=".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Whitespace,
+                raw: " ".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "2".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_tokenizing_without_whitespace() {
+        let mut tokens = tokenize("1=2").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "1".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Equals,
+                raw: "=".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "2".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_booleans() {
+        let mut tokens = tokenize("true false").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 4,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Boolean
+                },
+                raw: "true".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(1).unwrap(),
+            Token {
+                len: 5,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Boolean
+                },
+                raw: "false".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_functions() {
+        let mut tokens = tokenize("fn fib n:").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 2,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Function
+                },
+                raw: "fn".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_comments() {
+        let mut tokens = tokenize(
+            "
+            // foo
+            fn fib n:
+            ",
+        )
+        .into_iter()
+        .filter(|t| {
+            t.kind != TokenKind::Whitespace
+                && t.kind != TokenKind::Tab
+                && t.kind != TokenKind::CarriageReturn
+        });
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 6,
+                kind: TokenKind::Comment,
+                raw: "// foo".to_owned(),
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 2,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Function
+                },
+                raw: "fn".to_owned(),
+            }
+        );
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..add3a58
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,9 @@
+mod lexer;
+mod parser;
+
+fn main() {
+    let tokens = lexer::tokenize("let x = 2");
+    // let ast = parser::parse(tokens.into_iter());
+
+    println!("{:?}", tokens)
+}
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 0000000..3b06319
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,37 @@
+use crate::lexer::Token;
+
+pub struct Parser {
+    tokens: Box<dyn Iterator<Item = Token>>,
+    current: Option<Token>,
+    indentation_level: usize,
+}
+
+impl Parser {
+    pub(crate) fn new(tokens: impl Iterator<Item = Token> + 'static) -> Self {
+        Parser {
+            tokens: Box::new(tokens),
+            current: None,
+            indentation_level: 0,
+        }
+    }
+
+    fn next(&mut self) {
+        self.current = self.tokens.next();
+    }
+}
+
+#[derive(Debug)]
+pub struct AST;
+
+pub fn parse(tokens: impl Iterator<Item = Token> + 'static) -> AST {
+    let mut parser = Parser::new(tokens);
+    let ast = AST {};
+
+    loop {
+        // TODO: Drive the parser over the whole token stream; consume one token for now.
+        parser.next();
+        break;
+    }
+
+    ast
+}
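
For readers who want to try the patch, here is a minimal driver sketch. It is not part of the commit; it assumes the crate layout above (a `src/main.rs` next to `src/lexer/`) and uses only the public items the patch introduces: `lexer::tokenize`, `Token`, and `TokenKind`.

    mod lexer;

    // Illustrative only: print every meaningful token of a small program.
    fn main() {
        for token in lexer::tokenize("let x = 2")
            .into_iter()
            .filter(|t| t.kind != lexer::TokenKind::Whitespace)
        {
            println!("{:?} len={} raw={:?}", token.kind, token.len, token.raw);
        }
    }

    // Expected kinds, in order:
    //   Identifier { kind: Let }      ("let")
    //   Identifier { kind: Unknown }  ("x")
    //   Equals                        ("=")
    //   Literal { kind: Int }         ("2")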
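One behavior worth calling out: `eat_string` receives the opening quote and only stops on the matching kind, so a double-quoted string containing an apostrophe lexes as a single `Str` literal. A quick check, in the same setting as the sketch above:

    let tokens = lexer::tokenize("\"it's fine\"");
    assert_eq!(tokens[0].raw, "\"it's fine\"");
    assert_eq!(tokens[0].len, 11);
    assert_eq!(
        tokens[0].kind,
        lexer::TokenKind::Literal {
            kind: lexer::LiteralKind::Str
        }
    );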