From 3a3b0ce99a7ebc34fa9638375fe4e34eae9b8bd4 Mon Sep 17 00:00:00 2001
From: Garrit Franke
Date: Wed, 2 Dec 2020 22:32:36 +0100
Subject: [PATCH] Initial commit

---
 .gitignore          |   2 +
 Cargo.lock          |   5 +
 Cargo.toml          |   9 ++
 src/lexer/cursor.rs |  79 +++++++++++++
 src/lexer/mod.rs    | 268 ++++++++++++++++++++++++++++++++++++++++++++
 src/lexer/tests.rs  | 173 ++++++++++++++++++++++++++++
 src/main.rs         |   9 ++
 src/parser/mod.rs   |  37 ++++++
 8 files changed, 582 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/lexer/cursor.rs
 create mode 100644 src/lexer/mod.rs
 create mode 100644 src/lexer/tests.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parser/mod.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..64ee209
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+target/
+.vscode/
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..eef2fba
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,5 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+[[package]]
+name = "flex"
+version = "0.0.1"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..7733efe
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "flex"
+version = "0.0.1"
+authors = ["Garrit Franke "]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
new file mode 100644
index 0000000..69a2de0
--- /dev/null
+++ b/src/lexer/cursor.rs
@@ -0,0 +1,79 @@
+use std::str::Chars;
+
+/// Peekable iterator over a char sequence.
+///
+/// Next characters can be peeked via the `first` method,
+/// and the position can be shifted forward via the `bump` method.
+pub(crate) struct Cursor<'a> {
+    initial_len: usize,
+    chars: Chars<'a>,
+    #[cfg(debug_assertions)]
+    prev: char,
+}
+
+pub(crate) const EOF_CHAR: char = '\0';
+
+impl<'a> Cursor<'a> {
+    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+        Cursor {
+            initial_len: input.len(),
+            chars: input.chars(),
+            #[cfg(debug_assertions)]
+            prev: EOF_CHAR,
+        }
+    }
+
+    /// For debug assertions only.
+    /// Returns the last eaten symbol (or `EOF_CHAR` in release builds).
+    pub(crate) fn prev(&self) -> char {
+        #[cfg(debug_assertions)]
+        {
+            self.prev
+        }
+
+        #[cfg(not(debug_assertions))]
+        {
+            EOF_CHAR
+        }
+    }
+
+    /// Returns the nth character relative to the current cursor position.
+    /// If the requested position doesn't exist, `EOF_CHAR` is returned.
+    /// However, getting `EOF_CHAR` doesn't always mean the actual end of file;
+    /// it should be checked with the `is_eof` method.
+    fn nth_char(&self, n: usize) -> char {
+        self.chars().nth(n).unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the next symbol from the input stream without consuming it.
+    pub(crate) fn first(&self) -> char {
+        self.nth_char(0)
+    }
+
+    /// Checks if there is nothing more to consume.
+    pub(crate) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Returns the number of already consumed symbols.
+    pub(crate) fn len_consumed(&self) -> usize {
+        self.initial_len - self.chars.as_str().len()
+    }
+
+    /// Returns a `Chars` iterator over the remaining characters.
+    pub(crate) fn chars(&self) -> Chars<'a> {
+        self.chars.clone()
+    }
+
+    /// Moves to the next character.
+    pub(crate) fn bump(&mut self) -> Option<char> {
+        let c = self.chars.next()?;
+
+        #[cfg(debug_assertions)]
+        {
+            self.prev = c;
+        }
+
+        Some(c)
+    }
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 0000000..1590f9a
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,268 @@
+pub(crate) mod cursor;
+
+use self::TokenKind::*;
+use cursor::Cursor;
+
+#[cfg(test)]
+mod tests;
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub len: usize,
+    pub raw: String,
+}
+
+impl Token {
+    fn new(kind: TokenKind, len: usize, raw: String) -> Token {
+        Token { kind, len, raw }
+    }
+}
+
+/// Enum representing common lexeme types.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum TokenKind {
+    /// Any whitespace character sequence.
+    Whitespace,
+    Literal {
+        kind: LiteralKind,
+    },
+    /// Identifiers and keywords such as 'if' or 'else'
+    Identifier {
+        kind: IdentifierKind,
+    },
+    /// // Lorem Ipsum
+    Comment,
+    /// "+"
+    Plus,
+    /// "-"
+    Minus,
+    /// "*"
+    Star,
+    /// "/"
+    Slash,
+    /// ":"
+    Colon,
+    /// "="
+    Equals,
+    /// "=="
+    DeepEquals,
+    /// "<"
+    SmallerThan,
+    /// ">"
+    LargerThan,
+    /// "("
+    BraceOpen,
+    /// ")"
+    BraceClose,
+    /// "["
+    SquareBraceOpen,
+    /// "]"
+    SquareBraceClose,
+    /// "\t"
+    Tab,
+    /// "\n"
+    CarriageReturn,
+    /// Unknown token, not expected by the lexer, e.g. "№"
+    Unknown,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+    Int,
+    Str,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum IdentifierKind {
+    Let,
+    If,
+    Else,
+    Function,
+    Boolean,
+    Unknown,
+}
+
+/// Creates an iterator that produces tokens from the input string.
+pub fn tokenize(mut input: &str) -> Vec<Token> {
+    std::iter::from_fn(move || {
+        if input.is_empty() {
+            return None;
+        }
+        let token = first_token(input);
+        input = &input[token.len..];
+        Some(token)
+    })
+    .collect()
+}
+
+/// Parses the first token from the provided input string.
+pub fn first_token(input: &str) -> Token {
+    debug_assert!(!input.is_empty());
+    Cursor::new(input).advance_token()
+}
+
+pub fn is_whitespace(c: char) -> bool {
+    c == ' '
+}
+
+/// True if `c` is valid as a first character of an identifier.
+/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
+/// a formal definition of a valid identifier name.
+pub fn is_id_start(c: char) -> bool {
+    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'
+}
+
+/// True if `c` is valid as a non-first character of an identifier.
+/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
+/// a formal definition of a valid identifier name.
+pub fn is_id_continue(c: char) -> bool {
+    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9') || c == '_'
+}
+
+impl Cursor<'_> {
+    /// Parses a token from the input string.
+    fn advance_token(&mut self) -> Token {
+        // Original chars used to identify the token later on
+        let original_chars = self.chars();
+        let first_char = self.bump().unwrap();
+        let token_kind = match first_char {
+            c if is_whitespace(c) => self.whitespace(),
+            '0'..='9' => {
+                let kind = self.number();
+
+                TokenKind::Literal { kind }
+            }
+            '"' | '\'' => {
+                let kind = self.string(first_char);
+
+                TokenKind::Literal { kind }
+            }
+            '+' => Plus,
+            '-' => Minus,
+            '*' => Star,
+            '/' => match self.first() {
+                '/' => self.comment(),
+                _ => Slash,
+            },
+            '=' => match self.first() {
+                '=' => {
+                    // Consume the second '=' so that "==" becomes a single token
+                    self.bump();
+                    DeepEquals
+                }
+                _ => Equals,
+            },
+            ':' => Colon,
+            '<' => SmallerThan,
+            '>' => LargerThan,
+            '(' => BraceOpen,
+            ')' => BraceClose,
+            '[' => SquareBraceOpen,
+            ']' => SquareBraceClose,
+            c if is_id_start(c) => {
+                let kind = self.identifier(c);
+
+                Identifier { kind }
+            }
+            '\n' => CarriageReturn,
+            '\t' => Tab,
+            _ => Unknown,
+        };
+
+        let len = self.len_consumed();
+        let mut raw = original_chars.collect::<String>();
+        // Cut the original chars down to the length of the token
+        raw.truncate(len);
+        Token::new(token_kind, len, raw)
+    }
+
+    /// Eats symbols while the predicate returns true or until the end of file is reached.
+    /// Returns the number of eaten symbols.
+    fn eat_while<F>(&mut self, mut predicate: F) -> usize
+    where
+        F: FnMut(char) -> bool,
+    {
+        let mut eaten: usize = 0;
+        while predicate(self.first()) && !self.is_eof() {
+            eaten += 1;
+            self.bump();
+        }
+
+        eaten
+    }
+
+    fn whitespace(&mut self) -> TokenKind {
+        debug_assert!(is_whitespace(self.prev()));
+        self.eat_while(is_whitespace);
+        Whitespace
+    }
+
+    fn number(&mut self) -> LiteralKind {
+        self.eat_digits();
+        LiteralKind::Int
+    }
+
+    fn string(&mut self, quote: char) -> LiteralKind {
+        self.eat_string(quote);
+
+        LiteralKind::Str
+    }
+
+    fn identifier(&mut self, first_char: char) -> IdentifierKind {
+        let mut original: String = self.chars().collect::<String>();
+        let len = self.eat_while(is_id_continue);
+
+        // Cut the original "rest"-character stream to the length of the token
+        // and prepend the first character, because it has been eaten beforehand
+        original.truncate(len);
+        original = format!("{}{}", first_char, original);
+
+        match original.as_str() {
+            "if" => IdentifierKind::If,
+            "else" => IdentifierKind::Else,
+            "fn" => IdentifierKind::Function,
+            "true" | "false" => IdentifierKind::Boolean,
+            "let" => IdentifierKind::Let,
+            _ => IdentifierKind::Unknown,
+        }
+    }
+
+    fn comment(&mut self) -> TokenKind {
+        // Consume until the end of the line or the end of the file is reached
+        while self.first() != '\n' && !self.is_eof() {
+            self.bump();
+        }
+
+        TokenKind::Comment
+    }
+
+    fn eat_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.first() {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    fn eat_string(&mut self, quote: char) {
+        // Consume until the matching quote kind is found, so that a double quoted
+        // string can't be terminated by a single quote and vice versa. Stopping at
+        // the end of the file also avoids looping forever on unterminated strings.
+        while self.first() != quote && !self.is_eof() {
+            self.bump();
+        }
+
+        // Eat the closing quote
+        self.bump();
+    }
+}
diff --git a/src/lexer/tests.rs b/src/lexer/tests.rs
new file mode 100644
index 0000000..46016e1
--- /dev/null
+++ b/src/lexer/tests.rs
@@ -0,0 +1,173 @@
+#[cfg(test)]
+mod tests {
+    use crate::lexer::*;
+
+    #[test]
+    fn test_basic_tokenizing() {
+        let mut tokens = tokenize("1 = 2").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "1".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Whitespace,
+                raw: " ".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Equals,
+                raw: "=".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Whitespace,
+                raw: " ".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "2".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_tokenizing_without_whitespace() {
+        let mut tokens = tokenize("1=2").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "1".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Equals,
+                raw: "=".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 1,
+                kind: TokenKind::Literal {
+                    kind: LiteralKind::Int
+                },
+                raw: "2".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_booleans() {
+        let mut tokens = tokenize("true false").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 4,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Boolean
+                },
+                raw: "true".to_owned()
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(1).unwrap(),
+            Token {
+                len: 5,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Boolean
+                },
+                raw: "false".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_functions() {
+        let mut tokens = tokenize("fn fib n:").into_iter();
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 2,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Function
+                },
+                raw: "fn".to_owned()
+            }
+        );
+    }
+
+    #[test]
+    fn test_comments() {
+        let mut tokens = tokenize(
+            "
+            // foo
+            fn fib n:
+            ",
+        )
+        .into_iter()
+        .filter(|t| {
+            t.kind != TokenKind::Whitespace
+                && t.kind != TokenKind::Tab
+                && t.kind != TokenKind::CarriageReturn
+        });
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 6,
+                kind: TokenKind::Comment,
+                raw: "// foo".to_owned(),
+            }
+        );
+
+        assert_eq!(
+            tokens.nth(0).unwrap(),
+            Token {
+                len: 2,
+                kind: TokenKind::Identifier {
+                    kind: IdentifierKind::Function
+                },
+                raw: "fn".to_owned(),
+            }
+        );
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..add3a58
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,9 @@
+mod lexer;
+mod parser;
+
+fn main() {
+    let tokens = lexer::tokenize("let x = 2");
+    // let ast = parser::parse(tokens.into_iter());
+
+    println!("{:?}", tokens)
+}
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
new file mode 100644
index 0000000..3b06319
--- /dev/null
+++ b/src/parser/mod.rs
@@ -0,0 +1,37 @@
+use crate::lexer::Token;
+
+pub struct Parser {
+    tokens: Box<dyn Iterator<Item = Token>>,
+    current: Option<Token>,
+    indentation_level: usize,
+}
+
+impl Parser {
+    pub(crate) fn new(tokens: impl Iterator<Item = Token> + 'static) -> Self {
+        Parser {
+            tokens: Box::new(tokens),
+            current: None,
+            indentation_level: 0,
+        }
+    }
+
+    fn next(&mut self) {
+        self.current = self.tokens.next();
+    }
+}
+
+#[derive(Debug)]
+pub struct AST;
+
+pub fn parse(tokens: impl Iterator<Item = Token> + 'static) -> AST {
+    let mut parser = Parser::new(tokens);
+    let ast = AST {};
+
+    loop {
+        // TODO: Drive the parser over the whole token stream; consume one token for now.
+        parser.next();
+        break;
+    }
+
+    ast
+}
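
For readers who want to try the patch, here is a minimal driver sketch. It is not part of the commit; it assumes the crate layout above (a `src/main.rs` next to `src/lexer/`) and uses only the public items the patch introduces: `lexer::tokenize`, `Token`, and `TokenKind`.

    mod lexer;

    // Illustrative only: print every meaningful token of a small program.
    fn main() {
        for token in lexer::tokenize("let x = 2")
            .into_iter()
            .filter(|t| t.kind != lexer::TokenKind::Whitespace)
        {
            println!("{:?} len={} raw={:?}", token.kind, token.len, token.raw);
        }
    }

    // Expected kinds, in order:
    //   Identifier { kind: Let }      ("let")
    //   Identifier { kind: Unknown }  ("x")
    //   Equals                        ("=")
    //   Literal { kind: Int }         ("2")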
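One behavior worth calling out: `eat_string` receives the opening quote and only stops on the matching kind, so a double-quoted string containing an apostrophe lexes as a single `Str` literal. A quick check, in the same setting as the sketch above:

    let tokens = lexer::tokenize("\"it's fine\"");
    assert_eq!(tokens[0].raw, "\"it's fine\"");
    assert_eq!(tokens[0].len, 11);
    assert_eq!(
        tokens[0].kind,
        lexer::TokenKind::Literal {
            kind: lexer::LiteralKind::Str
        }
    );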