You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

383 lines
9.9 KiB

/**
* Copyright 2020 Garrit Franke
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
3 years ago
pub(crate) mod cursor;
use self::TokenKind::*;
use cursor::Cursor;
#[cfg(test)]
mod tests;
3 years ago
#[derive(Debug, PartialEq, Eq, Clone)]
3 years ago
pub struct Token {
pub kind: TokenKind,
pub len: usize,
pub raw: String,
pub pos: Position,
3 years ago
}
impl Token {
    /// Bundles the given kind, length, raw text and position into a [`Token`].
    fn new(kind: TokenKind, len: usize, raw: String, pos: Position) -> Self {
        Self {
            kind,
            len,
            raw,
            pos,
        }
    }
}
/// A position in the lexer input.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct Position {
/// Line number; `tokenize` starts it at 1.
pub line: usize,
/// Printed after the line as `line:offset` in lexer panic messages; `tokenize` starts it at 0.
/// NOTE(review): presumably the column within the current line — confirm against the cursor.
pub offset: usize,
/// `tokenize` seeds this with `usize::MAX`.
/// NOTE(review): presumably the absolute position in the input, with `usize::MAX` meaning
/// "nothing consumed yet" — confirm against the cursor.
pub raw: usize,
}
3 years ago
/// Enum representing common lexeme types.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
3 years ago
pub enum TokenKind {
/// Any whitespace characters sequence.
Whitespace,
Identifier(String),
Literal(Value),
3 years ago
/// Keywords such as 'if' or 'else'
Keyword(Keyword),
3 years ago
/// // Lorem Ipsum
Comment,
/// "+"
Plus,
/// "-"
Minus,
/// "*"
Star,
/// "/"
Slash,
/// "%"
Percent,
3 years ago
/// ":"
Colon,
/// ";"
SemiColon,
/// "!"
Exclamation,
/// ","
Comma,
3 years ago
/// "="
Assign,
3 years ago
/// "=="
Equals,
3 years ago
/// "<"
LessThan,
/// "<="
LessThanOrEqual,
3 years ago
/// ">"
GreaterThan,
/// ">="
GreaterThanOrEqual,
/// "!="
NotEqual,
/// &&
And,
/// "||"
Or,
3 years ago
/// "("
BraceOpen,
/// ")"
BraceClose,
/// "["
SquareBraceOpen,
/// "]"
SquareBraceClose,
3 years ago
/// "{"
CurlyBracesOpen,
/// "}"
CurlyBracesClose,
3 years ago
/// "\t"
Tab,
/// "\n"
CarriageReturn,
/// Unknown token, not expected by the lexer, e.g. "№"
Unknown,
}
/// The kind of a literal token.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Value {
    /// A number literal: a token that starts with a decimal digit.
    Int,
    /// A string literal: a token that starts with a quote character.
    Str,
}
/// Reserved words of the language.
///
/// The variant order is significant because `PartialOrd`/`Ord` are derived.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Keyword {
    /// "let"
    Let,
    /// "if"
    If,
    /// "else"
    Else,
    /// "return"
    Return,
    /// "while"
    While,
    /// "break" (NOTE(review): spelling assumed from the variant name — confirm against the lexer)
    Break,
    /// "continue" (NOTE(review): spelling assumed from the variant name — confirm against the lexer)
    Continue,
    /// "fn"
    Function,
    /// "true" or "false"
    Boolean,
    /// Not a keyword at all; the lexer turns such words into identifiers.
    Unknown,
}
/// Creates an iterator that produces tokens from the input string.
pub fn tokenize(mut input: &str) -> Vec<Token> {
let mut pos = Position {
raw: usize::MAX,
line: 1,
offset: 0,
};
3 years ago
std::iter::from_fn(move || {
if input.is_empty() {
return None;
}
3 years ago
let token = first_token(input, &mut pos);
3 years ago
input = &input[token.len..];
Some(token)
})
.collect()
}
/// Parses the first token from the provided input string.
3 years ago
pub fn first_token(input: &str, pos: &mut Position) -> Token {
3 years ago
debug_assert!(!input.is_empty());
3 years ago
Cursor::new(input, pos).advance_token()
3 years ago
}
/// True if `c` is treated as whitespace by the lexer.
///
/// Besides space, line feed, carriage return and tab, the no-break space
/// (U+00A0) is accepted as well; a note is written to stderr whenever it is seen.
pub fn is_whitespace(c: char) -> bool {
    match c {
        ' ' | '\n' | '\r' | '\t' => true,
        '\u{00A0}' => {
            // Accepted, but worth pointing out: it is easily mistaken for a plain space.
            eprintln!("Non-standard unicode character found: '\u{00A0}'");
            true
        }
        _ => false,
    }
}
/// True if `c` may begin an identifier.
///
/// Only ASCII letters and the underscore qualify.
pub fn is_id_start(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}
/// True if `c` may appear in an identifier after its first character.
///
/// Only ASCII letters, ASCII digits and the underscore qualify.
pub fn is_id_continue(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric()
}
impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
// Original chars used to identify the token later on
let original_chars = self.chars();
// FIXME: Identical value, since it will be used twice and is not clonable later
let original_chars2 = self.chars();
3 years ago
let first_char = self.bump().unwrap();
let token_kind = match first_char {
c if is_whitespace(c) => self.whitespace(),
'0'..='9' => self.number(),
'"' | '\'' => self.string(),
3 years ago
'+' => Plus,
'-' => Minus,
'*' => Star,
'%' => Percent,
3 years ago
'/' => match self.first() {
'/' => {
self.bump();
self.comment()
}
3 years ago
_ => Slash,
},
'=' => match self.first() {
'=' => {
self.bump();
Equals
}
_ => Assign,
3 years ago
},
':' => Colon,
';' => SemiColon,
3 years ago
',' => Comma,
'<' => match self.first() {
'=' => {
self.bump();
LessThanOrEqual
}
_ => LessThan,
},
'>' => match self.first() {
'=' => {
self.bump();
GreaterThanOrEqual
}
_ => GreaterThan,
},
'&' => match self.first() {
'&' => {
self.bump();
And
}
_ => Unknown,
},
'|' => match self.first() {
'|' => {
self.bump();
Or
}
_ => Unknown,
},
'!' => match self.first() {
'=' => {
self.bump();
NotEqual
}
_ => Exclamation,
},
3 years ago
'(' => BraceOpen,
')' => BraceClose,
'[' => SquareBraceOpen,
']' => SquareBraceClose,
3 years ago
'{' => CurlyBracesOpen,
'}' => CurlyBracesClose,
3 years ago
c if is_id_start(c) => {
let kind = self.identifier(c);
if kind == Keyword::Unknown {
let mut ch: String = original_chars.collect();
ch.truncate(self.len_consumed());
TokenKind::Identifier(ch)
3 years ago
} else {
TokenKind::Keyword(kind)
3 years ago
}
3 years ago
}
'\n' => CarriageReturn,
'\t' => Tab,
_ => Unknown,
};
let len = self.len_consumed();
let mut raw = original_chars2.collect::<String>();
3 years ago
// Cut the original tokens to the length of the token
raw.truncate(len);
let position = self.pos();
let token = Token::new(token_kind, len, raw, position);
token
3 years ago
}
/// Eats symbols while predicate returns true or until the end of file is reached.
/// Returns amount of eaten symbols.
fn eat_while<F>(&mut self, mut predicate: F) -> usize
where
F: FnMut(char) -> bool,
{
let mut eaten: usize = 0;
while predicate(self.first()) && !self.is_eof() {
eaten += 1;
self.bump();
}
eaten
}
fn whitespace(&mut self) -> TokenKind {
debug_assert!(is_whitespace(self.prev()));
self.eat_while(is_whitespace);
Whitespace
}
fn number(&mut self) -> TokenKind {
3 years ago
self.eat_digits();
TokenKind::Literal(Value::Int)
3 years ago
}
fn string(&mut self) -> TokenKind {
3 years ago
self.eat_string();
TokenKind::Literal(Value::Str)
3 years ago
}
fn identifier(&mut self, first_char: char) -> Keyword {
3 years ago
let mut original: String = self.chars().collect::<String>();
let len = self.eat_while(is_id_continue);
// Cut original "rest"-character stream to length of token
// and prepend first character, because it has been eaten beforehand
original.truncate(len);
original = format!("{}{}", first_char, original);
match original {
c if c == "if" => Keyword::If,
c if c == "else" => Keyword::Else,
c if c == "fn" => Keyword::Function,
c if c == "true" || c == "false" => Keyword::Boolean,
c if c == "let" => Keyword::Let,
c if c == "return" => Keyword::Return,
3 years ago
c if c == "while" => Keyword::While,
_ => Keyword::Unknown,
3 years ago
}
}
fn comment(&mut self) -> TokenKind {
// FIXME: Might lead to a bug, if End of file is encountered
while self.first() != '\n' {
self.bump();
}
TokenKind::Comment
}
fn eat_digits(&mut self) -> bool {
let mut has_digits = false;
loop {
match self.first() {
'_' => {
self.bump();
}
'0'..='9' => {
has_digits = true;
self.bump();
}
_ => break,
}
}
has_digits
}
fn eat_string(&mut self) {
// FIXME: double quoted strings could probably be ended by single quoted, and vice versa.
// Possible fix: Pass the token of the string beginning down to this method and check against it.
loop {
match self.first() {
'"' | '\'' => break,
'\n' => panic!(
"String does not end on same line. At {}:{}",
self.pos().line,
self.pos().offset
),
3 years ago
_ => self.bump(),
};
}
// Eat last quote
self.bump();
}
}