sabre/src/lexer/mod.rs

/**
 * Copyright 2020 Garrit Franke
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
pub(crate) mod cursor;

use self::TokenKind::*;
use cursor::Cursor;

#[cfg(test)]
mod tests;

/// A single lexeme produced by the lexer.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Token {
    /// Classification of the lexeme.
    pub kind: TokenKind,
    /// Length of the lexeme as reported by the cursor; `tokenize` uses it to
    /// slice the consumed text off the front of the remaining input.
    pub len: usize,
    /// The source text this token was lexed from.
    pub raw: String,
    /// Cursor position captured after the token has been consumed.
    pub pos: Position,
}

impl Token {
    fn new(kind: TokenKind, len: usize, raw: String, pos: Position) -> Token {
        Token {
            kind,
            len,
            raw,
            pos,
        }
    }
}

/// Location information attached to every [`Token`].
///
/// `tokenize` starts from `line: 1`, `offset: 0` and `raw: usize::MAX`; the
/// fields are advanced by the cursor, which lives outside this file.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct Position {
    /// Line number; tokenization starts counting at 1.
    pub line: usize,
    /// NOTE(review): presumably the column within the current line — confirm
    /// against the cursor implementation.
    pub offset: usize,
    /// NOTE(review): presumably an absolute index into the input; it starts at
    /// `usize::MAX`, which suggests a wrapping increment on the first
    /// character — confirm against the cursor implementation.
    pub raw: usize,
}

/// The kinds of lexemes the lexer can produce.
///
/// Variant order is significant for the derived `PartialOrd`/`Ord`
/// implementations, so new variants should be added with care.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    /// Any sequence of whitespace characters.
    Whitespace,
    /// An identifier; the payload is the identifier's text.
    Identifier(String),
    /// A literal; the payload tells which kind of literal was lexed.
    Literal(Value),
    /// Keywords such as 'if' or 'else'
    Keyword(Keyword),
    /// A line comment, e.g. "// Lorem Ipsum"
    Comment,
    /// "+"
    Plus,
    /// "-"
    Minus,
    /// "*"
    Star,
    /// "/"
    Slash,
    /// "%"
    Percent,
    /// ":"
    Colon,
    /// ";"
    SemiColon,
    /// "!"
    Exclamation,
    /// ","
    Comma,
    /// "="
    Assign,
    /// "=="
    Equals,
    /// "<"
    LessThan,
    /// "<="
    LessThanOrEqual,
    /// ">"
    GreaterThan,
    /// ">="
    GreaterThanOrEqual,
    /// "!="
    NotEqual,
    /// "&&"
    And,
    /// "||"
    Or,
    /// "("
    BraceOpen,
    /// ")"
    BraceClose,
    /// "["
    SquareBraceOpen,
    /// "]"
    SquareBraceClose,
    /// "{"
    CurlyBracesOpen,
    /// "}"
    CurlyBracesClose,
    /// "\t"
    ///
    /// Tab characters are lexed as `Whitespace`, because `is_whitespace`
    /// accepts them.
    Tab,
    /// "\n" (a line feed, despite the variant name)
    ///
    /// Newlines are lexed as `Whitespace`, because `is_whitespace` accepts
    /// them.
    CarriageReturn,
    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
}

/// The kind of literal carried by `TokenKind::Literal`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Value {
    /// Integer literal: a leading digit followed by digits and `_` separators.
    Int,
    /// String literal, introduced by a `"` or `'` quote.
    Str,
}

/// Reserved words of the language, carried by `TokenKind::Keyword`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Keyword {
    /// "let"
    Let,
    /// "if"
    If,
    /// "else"
    Else,
    /// "return"
    Return,
    /// "while"
    While,
    /// Loop-exit keyword (presumably "break").
    Break,
    /// Loop-skip keyword (presumably "continue").
    Continue,
    /// "fn"
    Function,
    /// "true" or "false"
    Boolean,
    /// The word is not a keyword; the lexer turns such words into identifiers.
    Unknown,
}

/// Splits the whole input string into a vector of tokens.
///
/// Tokens are lexed one after another from the front of `input`; after each
/// token the consumed prefix (`token.len`) is sliced off. Every character ends
/// up in some token, including whitespace and comments.
pub fn tokenize(mut input: &str) -> Vec<Token> {
    // Shared position state, handed to the cursor for every token.
    // NOTE(review): `raw` starts at `usize::MAX`, presumably so that the first
    // increment wraps around to 0 — confirm against the cursor implementation.
    let mut pos = Position {
        line: 1,
        offset: 0,
        raw: usize::MAX,
    };

    let mut tokens = Vec::new();
    while !input.is_empty() {
        let token = first_token(input, &mut pos);
        input = &input[token.len..];
        tokens.push(token);
    }
    tokens
}

/// Lexes exactly one token from the start of `input`.
///
/// `pos` is the running position state shared across calls; it is handed to
/// the cursor. `input` must not be empty (checked in debug builds).
pub fn first_token(input: &str, pos: &mut Position) -> Token {
    debug_assert!(!input.is_empty());
    let mut cursor = Cursor::new(input, pos);
    cursor.advance_token()
}

/// True if `c` is treated as whitespace by the lexer.
///
/// Accepts space, line feed, carriage return, tab and the non-breaking space
/// (`U+00A0`), which is tolerated so that text pasted from rich editors still
/// lexes.
pub fn is_whitespace(c: char) -> bool {
    // This predicate runs several times per input character, so it must stay
    // free of side effects such as debug printing.
    match c {
        ' ' | '\n' | '\r' | '\t' | '\u{00A0}' => true,
        _ => false,
    }
}

/// True if `c` may begin an identifier.
///
/// Only ASCII letters and the underscore qualify; unlike Rust's own
/// identifier rules, non-ASCII characters are rejected.
pub fn is_id_start(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}

/// True if `c` may appear after the first character of an identifier.
///
/// Only ASCII letters, ASCII digits and the underscore qualify; unlike Rust's
/// own identifier rules, non-ASCII characters are rejected.
pub fn is_id_continue(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}

impl Cursor<'_> {
    /// Parses a token from the input string.
    fn advance_token(&mut self) -> Token {
        // Original chars used to identify the token later on
        let original_chars = self.chars();
        // FIXME: Identical value, since it will be used twice and is not clonable later
        let original_chars2 = self.chars();
        let first_char = self.bump().unwrap();
        let token_kind = match first_char {
            c if is_whitespace(c) => self.whitespace(),
            '0'..='9' => self.number(),
            '"' | '\'' => self.string(),
            '+' => Plus,
            '-' => Minus,
            '*' => Star,
            '%' => Percent,
            '/' => match self.first() {
                '/' => {
                    self.bump();
                    self.comment()
                }
                _ => Slash,
            },
            '=' => match self.first() {
                '=' => {
                    self.bump();
                    Equals
                }
                _ => Assign,
            },
            ':' => Colon,
            ';' => SemiColon,
            ',' => Comma,
            '<' => match self.first() {
                '=' => {
                    self.bump();
                    LessThanOrEqual
                }
                _ => LessThan,
            },
            '>' => match self.first() {
                '=' => {
                    self.bump();
                    GreaterThanOrEqual
                }
                _ => GreaterThan,
            },
            '&' => match self.first() {
                '&' => {
                    self.bump();
                    And
                }
                _ => Unknown,
            },
            '|' => match self.first() {
                '|' => {
                    self.bump();
                    Or
                }
                _ => Unknown,
            },
            '!' => match self.first() {
                '=' => {
                    self.bump();
                    NotEqual
                }
                _ => Exclamation,
            },
            '(' => BraceOpen,
            ')' => BraceClose,
            '[' => SquareBraceOpen,
            ']' => SquareBraceClose,
            '{' => CurlyBracesOpen,
            '}' => CurlyBracesClose,
            c if is_id_start(c) => {
                let kind = self.identifier(c);
                if kind == Keyword::Unknown {
                    let mut ch: String = original_chars.collect();
                    ch.truncate(self.len_consumed());
                    TokenKind::Identifier(ch)
                } else {
                    TokenKind::Keyword(kind)
                }
            }
            '\n' => CarriageReturn,
            '\t' => Tab,
            _ => Unknown,
        };

        let len = self.len_consumed();
        let mut raw = original_chars2.collect::<String>();
        // Cut the original tokens to the length of the token
        raw.truncate(len);
        let position = self.pos();
        let token = Token::new(token_kind, len, raw, position);

        token
    }

    /// Eats symbols while predicate returns true or until the end of file is reached.
    /// Returns amount of eaten symbols.
    fn eat_while<F>(&mut self, mut predicate: F) -> usize
    where
        F: FnMut(char) -> bool,
    {
        let mut eaten: usize = 0;
        while predicate(self.first()) && !self.is_eof() {
            eaten += 1;
            self.bump();
        }

        eaten
    }

    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
        self.eat_while(is_whitespace);
        Whitespace
    }

    fn number(&mut self) -> TokenKind {
        self.eat_digits();
        TokenKind::Literal(Value::Int)
    }

    fn string(&mut self) -> TokenKind {
        self.eat_string();

        TokenKind::Literal(Value::Str)
    }

    fn identifier(&mut self, first_char: char) -> Keyword {
        let mut original: String = self.chars().collect::<String>();
        let len = self.eat_while(is_id_continue);

        // Cut original "rest"-character stream to length of token
        // and prepend first character, because it has been eaten beforehand
        original.truncate(len);
        original = format!("{}{}", first_char, original);

        match original {
            c if c == "if" => Keyword::If,
            c if c == "else" => Keyword::Else,
            c if c == "fn" => Keyword::Function,
            c if c == "true" || c == "false" => Keyword::Boolean,
            c if c == "let" => Keyword::Let,
            c if c == "return" => Keyword::Return,
            c if c == "while" => Keyword::While,
            _ => Keyword::Unknown,
        }
    }

    fn comment(&mut self) -> TokenKind {
        // FIXME: Might lead to a bug, if End of file is encountered
        while self.first() != '\n' {
            self.bump();
        }

        TokenKind::Comment
    }

    fn eat_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
            match self.first() {
                '_' => {
                    self.bump();
                }
                '0'..='9' => {
                    has_digits = true;
                    self.bump();
                }
                _ => break,
            }
        }
        has_digits
    }

    fn eat_string(&mut self) {
        // FIXME: double quoted strings could probably be ended by single quoted, and vice versa.
        // Possible fix: Pass the token of the string beginning down to this method and check against it.
        loop {
            match self.first() {
                '"' | '\'' => break,
                '\n' => panic!(
                    "String does not end on same line. At {}:{}",
                    self.pos().line,
                    self.pos().offset
                ),
                _ => self.bump(),
            };
        }

        // Eat last quote
        self.bump();
    }
}
Add copyright notices 3 years ago			`/**`
			`* Copyright 2020 Garrit Franke`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* https://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`
Initial commit 3 years ago			`pub(crate) mod cursor;`

			`use self::TokenKind::*;`
			`use cursor::Cursor;`

			`#[cfg(test)]`
			`mod tests;`

Parse functions 3 years ago			`#[derive(Debug, PartialEq, Eq, Clone)]`
Initial commit 3 years ago			`pub struct Token {`
			`pub kind: TokenKind,`
			`pub len: usize,`
			`pub raw: String,`
Add token positions 3 years ago			`pub pos: Position,`
Initial commit 3 years ago			`}`

			`impl Token {`
Add token positions 3 years ago			`fn new(kind: TokenKind, len: usize, raw: String, pos: Position) -> Token {`
Add token positions 3 years ago			`Token {`
			`kind,`
			`len,`
			`raw,`
			`pos,`
			`}`
Initial commit 3 years ago			`}`
			`}`

Add token positions 3 years ago			`#[derive(Debug, PartialEq, Eq, Clone, Copy)]`
			`pub struct Position {`
			`pub line: usize,`
			`pub offset: usize,`
			`pub raw: usize,`
			`}`

Initial commit 3 years ago			`/// Enum representing common lexeme types.`
Refactor TokenType 3 years ago			`#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]`
Initial commit 3 years ago			`pub enum TokenKind {`
			`/// Any whitespace characters sequence.`
			`Whitespace,`
Refactor TokenType 3 years ago			`Identifier(String),`
			`Literal(Value),`
Initial commit 3 years ago			`/// Keywords such as 'if' or 'else'`
Refactor TokenType 3 years ago			`Keyword(Keyword),`
Initial commit 3 years ago			`/// // Lorem Ipsum`
			`Comment,`
			`/// "+"`
			`Plus,`
			`/// "-"`
			`Minus,`
			`/// "*"`
			`Star,`
			`/// "/"`
			`Slash,`
Add copyright notices 3 years ago			`/// "%"`
			`Percent,`
Initial commit 3 years ago			`/// ":"`
			`Colon,`
Refactor TokenType 3 years ago			`/// ";"`
			`SemiColon,`
Add remaining comparison operators 3 years ago			`/// "!"`
			`Exclamation,`
Add function arguments 3 years ago			`/// ","`
			`Comma,`
Initial commit 3 years ago			`/// "="`
Add variable declarations 3 years ago			`Assign,`
Initial commit 3 years ago			`/// "=="`
Add variable declarations 3 years ago			`Equals,`
Initial commit 3 years ago			`/// "<"`
Add math operations 3 years ago			`LessThan,`
Add remaining comparison operators 3 years ago			`/// "<="`
			`LessThanOrEqual,`
Initial commit 3 years ago			`/// ">"`
Add math operations 3 years ago			`GreaterThan,`
Add remaining comparison operators 3 years ago			`/// ">="`
			`GreaterThanOrEqual,`
			`/// "!="`
			`NotEqual,`
Add copyright notices 3 years ago			`/// &&`
			`And,`
			`/// "\|\|"`
			`Or,`
Initial commit 3 years ago			`/// "("`
			`BraceOpen,`
			`/// ")"`
			`BraceClose,`
			`/// "["`
			`SquareBraceOpen,`
			`/// "]"`
			`SquareBraceClose,`
Add curly braces 3 years ago			`/// "{"`
			`CurlyBracesOpen,`
			`/// "}"`
			`CurlyBracesClose,`
Initial commit 3 years ago			`/// "\t"`
			`Tab,`
			`/// "\n"`
			`CarriageReturn,`
			`/// Unknown token, not expected by the lexer, e.g. "№"`
			`Unknown,`
			`}`

			`#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]`
Refactor TokenType 3 years ago			`pub enum Value {`
Initial commit 3 years ago			`Int,`
			`Str,`
			`}`

			`#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]`
Refactor TokenType 3 years ago			`pub enum Keyword {`
Initial commit 3 years ago			`Let,`
			`If,`
			`Else,`
Implement return statement 3 years ago			`Return,`
Add while loop 3 years ago			`While,`
			`Break,`
			`Continue,`
Revert "Change function declaration syntax" This reverts commit b21c781a646c7da6eec0a3a4787f6c8eb86a7ea5. 3 years ago			`Function,`
Initial commit 3 years ago			`Boolean,`
			`Unknown,`
			`}`

			`/// Creates an iterator that produces tokens from the input string.`
			`pub fn tokenize(mut input: &str) -> Vec<Token> {`
Add token positions 3 years ago			`let mut pos = Position {`
			`raw: usize::MAX,`
			`line: 1,`
			`offset: 0,`
			`};`
Initial commit 3 years ago			`std::iter::from_fn(move \|\| {`
			`if input.is_empty() {`
			`return None;`
			`}`
Fix warnings 3 years ago			`let token = first_token(input, &mut pos);`
Initial commit 3 years ago			`input = &input[token.len..];`
			`Some(token)`
			`})`
			`.collect()`
			`}`

			`/// Parses the first token from the provided input string.`
Fix warnings 3 years ago			`pub fn first_token(input: &str, pos: &mut Position) -> Token {`
Initial commit 3 years ago			`debug_assert!(!input.is_empty());`
Fix warnings 3 years ago			`Cursor::new(input, pos).advance_token()`
Initial commit 3 years ago			`}`

			`pub fn is_whitespace(c: char) -> bool {`
			`match c {`
Fix function parsing 3 years ago			`' ' \| '\n' \| '\r' \| '\t' => true,`
Add optional types for declares 3 years ago			`'\u{00A0}' => {`
			`dbg!("Non-standard unicode character found: '\u{00A0}'");`
			`true`
			`}`
Initial commit 3 years ago			`_ => false,`
			`}`
			`}`

			/// True if `c` is valid as a first character of an identifier.
			`/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for`
			`/// a formal definition of valid identifier name.`
			`pub fn is_id_start(c: char) -> bool {`
			`('a' <= c && c <= 'z') \|\| ('A' <= c && c <= 'Z') \|\| c == '_'`
			`}`

			/// True if `c` is valid as a non-first character of an identifier.
			`/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for`
			`/// a formal definition of valid identifier name.`
			`pub fn is_id_continue(c: char) -> bool {`
			`('a' <= c && c <= 'z') \|\| ('A' <= c && c <= 'Z') \|\| ('0' <= c && c <= '9') \|\| c == '_'`
			`}`

			`impl Cursor<'_> {`
			`/// Parses a token from the input string.`
			`fn advance_token(&mut self) -> Token {`
			`// Original chars used to identify the token later on`
			`let original_chars = self.chars();`
Refactor TokenType 3 years ago			`// FIXME: Identical value, since it will be used twice and is not clonable later`
			`let original_chars2 = self.chars();`
Initial commit 3 years ago			`let first_char = self.bump().unwrap();`
			`let token_kind = match first_char {`
			`c if is_whitespace(c) => self.whitespace(),`
Refactor TokenType 3 years ago			`'0'..='9' => self.number(),`
			`'"' \| '\'' => self.string(),`
Initial commit 3 years ago			`'+' => Plus,`
			`'-' => Minus,`
			`'*' => Star,`
Add copyright notices 3 years ago			`'%' => Percent,`
Initial commit 3 years ago			`'/' => match self.first() {`
Add greeter example 3 years ago			`'/' => {`
			`self.bump();`
			`self.comment()`
			`}`
Initial commit 3 years ago			`_ => Slash,`
			`},`
			`'=' => match self.first() {`
Add greeter example 3 years ago			`'=' => {`
			`self.bump();`
			`Equals`
			`}`
Add variable declarations 3 years ago			`_ => Assign,`
Initial commit 3 years ago			`},`
Revert "Change function declaration syntax" This reverts commit b21c781a646c7da6eec0a3a4787f6c8eb86a7ea5. 3 years ago			`':' => Colon,`
Refactor TokenType 3 years ago			`';' => SemiColon,`
Tokenize Comma 3 years ago			`',' => Comma,`
Add remaining comparison operators 3 years ago			`'<' => match self.first() {`
			`'=' => {`
			`self.bump();`
			`LessThanOrEqual`
			`}`
			`_ => LessThan,`
			`},`
			`'>' => match self.first() {`
			`'=' => {`
			`self.bump();`
			`GreaterThanOrEqual`
			`}`
			`_ => GreaterThan,`
			`},`
Add copyright notices 3 years ago			`'&' => match self.first() {`
			`'&' => {`
			`self.bump();`
			`And`
			`}`
			`_ => Unknown,`
			`},`
			`'\|' => match self.first() {`
			`'\|' => {`
			`self.bump();`
			`Or`
			`}`
			`_ => Unknown,`
			`},`
Add remaining comparison operators 3 years ago			`'!' => match self.first() {`
			`'=' => {`
			`self.bump();`
			`NotEqual`
			`}`
			`_ => Exclamation,`
			`},`
Initial commit 3 years ago			`'(' => BraceOpen,`
			`')' => BraceClose,`
			`'[' => SquareBraceOpen,`
			`']' => SquareBraceClose,`
Add curly braces 3 years ago			`'{' => CurlyBracesOpen,`
			`'}' => CurlyBracesClose,`
Initial commit 3 years ago			`c if is_id_start(c) => {`
			`let kind = self.identifier(c);`
Refactor TokenType 3 years ago			`if kind == Keyword::Unknown {`
			`let mut ch: String = original_chars.collect();`
			`ch.truncate(self.len_consumed());`
			`TokenKind::Identifier(ch)`
Parse functions 3 years ago			`} else {`
Refactor TokenType 3 years ago			`TokenKind::Keyword(kind)`
Parse functions 3 years ago			`}`
Initial commit 3 years ago			`}`
			`'\n' => CarriageReturn,`
			`'\t' => Tab,`
			`_ => Unknown,`
			`};`

			`let len = self.len_consumed();`
Refactor TokenType 3 years ago			`let mut raw = original_chars2.collect::<String>();`
Initial commit 3 years ago			`// Cut the original tokens to the length of the token`
			`raw.truncate(len);`
Add token positions 3 years ago			`let position = self.pos();`
			`let token = Token::new(token_kind, len, raw, position);`

			`token`
Initial commit 3 years ago			`}`

			`/// Eats symbols while predicate returns true or until the end of file is reached.`
			`/// Returns amount of eaten symbols.`
			`fn eat_while<F>(&mut self, mut predicate: F) -> usize`
			`where`
			`F: FnMut(char) -> bool,`
			`{`
			`let mut eaten: usize = 0;`
			`while predicate(self.first()) && !self.is_eof() {`
			`eaten += 1;`
			`self.bump();`
			`}`

			`eaten`
			`}`

			`fn whitespace(&mut self) -> TokenKind {`
			`debug_assert!(is_whitespace(self.prev()));`
			`self.eat_while(is_whitespace);`
			`Whitespace`
			`}`

Refactor TokenType 3 years ago			`fn number(&mut self) -> TokenKind {`
Initial commit 3 years ago			`self.eat_digits();`
Refactor TokenType 3 years ago			`TokenKind::Literal(Value::Int)`
Initial commit 3 years ago			`}`

Refactor TokenType 3 years ago			`fn string(&mut self) -> TokenKind {`
Initial commit 3 years ago			`self.eat_string();`

Refactor TokenType 3 years ago			`TokenKind::Literal(Value::Str)`
Initial commit 3 years ago			`}`

Refactor TokenType 3 years ago			`fn identifier(&mut self, first_char: char) -> Keyword {`
Initial commit 3 years ago			`let mut original: String = self.chars().collect::<String>();`
			`let len = self.eat_while(is_id_continue);`

			`// Cut original "rest"-character stream to length of token`
			`// and prepend first character, because it has been eaten beforehand`
			`original.truncate(len);`
			`original = format!("{}{}", first_char, original);`

			`match original {`
Refactor TokenType 3 years ago			`c if c == "if" => Keyword::If,`
			`c if c == "else" => Keyword::Else,`
Revert "Change function declaration syntax" This reverts commit b21c781a646c7da6eec0a3a4787f6c8eb86a7ea5. 3 years ago			`c if c == "fn" => Keyword::Function,`
Refactor TokenType 3 years ago			`c if c == "true" \|\| c == "false" => Keyword::Boolean,`
			`c if c == "let" => Keyword::Let,`
Implement return statement 3 years ago			`c if c == "return" => Keyword::Return,`
Add while loop 3 years ago			`c if c == "while" => Keyword::While,`
Refactor TokenType 3 years ago			`_ => Keyword::Unknown,`
Initial commit 3 years ago			`}`
			`}`

			`fn comment(&mut self) -> TokenKind {`
			`// FIXME: Might lead to a bug, if End of file is encountered`
			`while self.first() != '\n' {`
			`self.bump();`
			`}`

			`TokenKind::Comment`
			`}`

			`fn eat_digits(&mut self) -> bool {`
			`let mut has_digits = false;`
			`loop {`
			`match self.first() {`
			`'_' => {`
			`self.bump();`
			`}`
			`'0'..='9' => {`
			`has_digits = true;`
			`self.bump();`
			`}`
			`_ => break,`
			`}`
			`}`
			`has_digits`
			`}`

			`fn eat_string(&mut self) {`
			`// FIXME: double quoted strings could probably be ended by single quoted, and vice versa.`
			`// Possible fix: Pass the token of the string beginning down to this method and check against it.`
			`loop {`
			`match self.first() {`
			`'"' \| '\'' => break,`
Fix infinite loop when parsing strings 3 years ago			`'\n' => panic!(`
			`"String does not end on same line. At {}:{}",`
			`self.pos().line,`
			`self.pos().offset`
			`),`
Initial commit 3 years ago			`_ => self.bump(),`
			`};`
			`}`

			`// Eat last quote`
			`self.bump();`
			`}`
			`}`