mirror of https://git.sr.ht/~garritfra/sabre
Garrit Franke
3 years ago
commit
3a3b0ce99a
8 changed files with 581 additions and 0 deletions
@ -0,0 +1,5 @@
|
||||
# This file is automatically @generated by Cargo. |
||||
# It is not intended for manual editing. |
||||
[[package]] |
||||
name = "flex" |
||||
version = "0.0.1" |
@ -0,0 +1,9 @@
|
||||
[package] |
||||
name = "flex" |
||||
version = "0.0.1" |
||||
authors = ["Garrit Franke <garritfranke@gmail.com>"] |
||||
edition = "2018" |
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html |
||||
|
||||
[dependencies] |
@ -0,0 +1,78 @@
|
||||
use std::str::Chars; |
||||
|
||||
/// Peekable iterator over a char sequence.
///
/// Next characters can be peeked via `nth_char` method,
/// and position can be shifted forward via `bump` method.
pub(crate) struct Cursor<'a> {
    // Byte length of the original input; `len_consumed` subtracts the
    // remaining byte length from this to report progress.
    initial_len: usize,
    // Iterator over the not-yet-consumed characters of the input.
    chars: Chars<'a>,
    // Last eaten character. NOTE(review): the field is declared
    // unconditionally here, but `new()` initializes it only under
    // `#[cfg(debug_assertions)]` — release builds fail to compile.
    prev: char,
}
||||
|
||||
/// Sentinel returned by the peek methods when there is no character left;
/// not a guarantee of end-of-input — check `is_eof` for that.
pub(crate) const EOF_CHAR: char = '\0';
||||
|
||||
impl<'a> Cursor<'a> { |
||||
pub(crate) fn new(input: &'a str) -> Cursor<'a> { |
||||
Cursor { |
||||
initial_len: input.len(), |
||||
chars: input.chars(), |
||||
#[cfg(debug_assertions)] |
||||
prev: EOF_CHAR, |
||||
} |
||||
} |
||||
|
||||
/// For debug assertions only
|
||||
/// Returns the last eaten symbol (or '\0' in release builds).
|
||||
pub(crate) fn prev(&self) -> char { |
||||
#[cfg(debug_assertions)] |
||||
{ |
||||
self.prev |
||||
} |
||||
|
||||
#[cfg(not(debug_assertions))] |
||||
{ |
||||
'\0' |
||||
} |
||||
} |
||||
|
||||
/// Returns nth character relative to the current cursor position.
|
||||
/// If requested position doesn't exist, `EOF_CHAR` is returned.
|
||||
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
|
||||
/// it should be checked with `is_eof` method.
|
||||
fn nth_char(&self, n: usize) -> char { |
||||
self.chars().nth(n).unwrap_or(EOF_CHAR) |
||||
} |
||||
|
||||
/// Peeks the next symbol from the input stream without consuming it.
|
||||
pub(crate) fn first(&self) -> char { |
||||
self.nth_char(0) |
||||
} |
||||
|
||||
/// Checks if there is nothing more to consume.
|
||||
pub(crate) fn is_eof(&self) -> bool { |
||||
self.chars.as_str().is_empty() |
||||
} |
||||
|
||||
/// Returns amount of already consumed symbols.
|
||||
pub(crate) fn len_consumed(&self) -> usize { |
||||
self.initial_len - self.chars.as_str().len() |
||||
} |
||||
|
||||
/// Returns a `Chars` iterator over the remaining characters.
|
||||
pub(crate) fn chars(&self) -> Chars<'a> { |
||||
self.chars.clone() |
||||
} |
||||
|
||||
/// Moves to the next character.
|
||||
pub(crate) fn bump(&mut self) -> Option<char> { |
||||
let c = self.chars.next()?; |
||||
|
||||
#[cfg(debug_assertions)] |
||||
{ |
||||
self.prev = c; |
||||
} |
||||
|
||||
Some(c) |
||||
} |
||||
} |
@ -0,0 +1,269 @@
|
||||
pub(crate) mod cursor; |
||||
|
||||
use self::TokenKind::*; |
||||
use cursor::Cursor; |
||||
|
||||
#[cfg(test)] |
||||
mod tests; |
||||
|
||||
/// A single lexeme produced by the lexer.
#[derive(Debug, PartialEq, Eq)]
pub struct Token {
    // What kind of lexeme this is.
    pub kind: TokenKind,
    // Length of the token in bytes of the source input
    // (computed from `Cursor::len_consumed`).
    pub len: usize,
    // The raw source text this token was lexed from.
    pub raw: String,
}

impl Token {
    /// Bundles a kind, byte length and raw text into a `Token`.
    fn new(kind: TokenKind, len: usize, raw: String) -> Token {
        Token { kind, len, raw }
    }
}
||||
|
||||
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    /// Any whitespace characters sequence.
    Whitespace,
    /// A literal value, e.g. `42` or `"hello"`.
    Literal {
        kind: LiteralKind,
    },
    /// Keywords such as 'if' or 'else'
    Identifier {
        kind: IdentifierKind,
    },
    /// // Lorem Ipsum
    Comment,
    /// "+"
    Plus,
    /// "-"
    Minus,
    /// "*"
    Star,
    /// "/"
    Slash,
    /// ":"
    Colon,
    /// "="
    Equals,
    /// "=="
    DeepEquals,
    /// "<"
    // NOTE(review): name is a typo for "SmallerThan"; renaming would break callers.
    SmallerThen,
    /// ">"
    // NOTE(review): name is a typo for "LargerThan"; renaming would break callers.
    LargerThen,
    /// "("
    BraceOpen,
    /// ")"
    BraceClose,
    /// "["
    SquareBraceOpen,
    /// "]"
    SquareBraceClose,
    /// "\t"
    Tab,
    /// "\n"
    // NOTE(review): '\n' is a line feed, not a carriage return ('\r');
    // the name is misleading but part of the public API.
    CarriageReturn,
    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
}
||||
|
||||
/// The concrete type carried by a `TokenKind::Literal`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// Integer literal, e.g. `42` (with `_` accepted as a digit separator).
    Int,
    /// String literal delimited by `"` or `'`.
    Str,
}
||||
|
||||
/// Distinguishes recognized keywords from plain identifiers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum IdentifierKind {
    /// The `let` keyword.
    Let,
    /// The `if` keyword.
    If,
    /// The `else` keyword.
    Else,
    /// The `fn` keyword.
    Function,
    /// The `true` or `false` keyword.
    Boolean,
    /// Any identifier that is not a keyword.
    Unknown,
}
||||
|
||||
/// Creates an iterator that produces tokens from the input string.
|
||||
pub fn tokenize(mut input: &str) -> Vec<Token> { |
||||
std::iter::from_fn(move || { |
||||
if input.is_empty() { |
||||
return None; |
||||
} |
||||
let token = first_token(input); |
||||
input = &input[token.len..]; |
||||
Some(token) |
||||
}) |
||||
.collect() |
||||
} |
||||
|
||||
/// Parses the first token from the provided input string.
|
||||
pub fn first_token(input: &str) -> Token { |
||||
debug_assert!(!input.is_empty()); |
||||
Cursor::new(input).advance_token() |
||||
} |
||||
|
||||
/// True if `c` counts as whitespace for this lexer: a plain space only.
/// Tabs and newlines are deliberately excluded — they are lexed as their
/// own token kinds (`Tab`, `CarriageReturn`).
pub fn is_whitespace(c: char) -> bool {
    c == ' '
}
||||
|
||||
/// True if `c` is valid as a first character of an identifier:
/// an ASCII letter or an underscore.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
||||
|
||||
/// True if `c` is valid as a non-first character of an identifier:
/// an ASCII letter, an ASCII digit, or an underscore.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_'
}
||||
|
||||
impl Cursor<'_> { |
||||
/// Parses a token from the input string.
|
||||
fn advance_token(&mut self) -> Token { |
||||
// Original chars used to identify the token later on
|
||||
let original_chars = self.chars(); |
||||
let first_char = self.bump().unwrap(); |
||||
let token_kind = match first_char { |
||||
c if is_whitespace(c) => self.whitespace(), |
||||
'0'..='9' => { |
||||
let kind = self.number(); |
||||
|
||||
TokenKind::Literal { kind } |
||||
} |
||||
'"' | '\'' => { |
||||
let kind = self.string(); |
||||
|
||||
TokenKind::Literal { kind } |
||||
} |
||||
'+' => Plus, |
||||
'-' => Minus, |
||||
'*' => Star, |
||||
'/' => match self.first() { |
||||
'/' => self.comment(), |
||||
_ => Slash, |
||||
}, |
||||
'=' => match self.first() { |
||||
'=' => DeepEquals, |
||||
_ => Equals, |
||||
}, |
||||
':' => Colon, |
||||
'<' => SmallerThen, |
||||
'>' => LargerThen, |
||||
'(' => BraceOpen, |
||||
')' => BraceClose, |
||||
'[' => SquareBraceOpen, |
||||
']' => SquareBraceClose, |
||||
c if is_id_start(c) => { |
||||
let kind = self.identifier(c); |
||||
|
||||
Identifier { kind } |
||||
} |
||||
'\n' => CarriageReturn, |
||||
'\t' => Tab, |
||||
_ => Unknown, |
||||
}; |
||||
|
||||
let len = self.len_consumed(); |
||||
let mut raw = original_chars.collect::<String>(); |
||||
// Cut the original tokens to the length of the token
|
||||
raw.truncate(len); |
||||
Token::new(token_kind, len, raw) |
||||
} |
||||
|
||||
/// Eats symbols while predicate returns true or until the end of file is reached.
|
||||
/// Returns amount of eaten symbols.
|
||||
fn eat_while<F>(&mut self, mut predicate: F) -> usize |
||||
where |
||||
F: FnMut(char) -> bool, |
||||
{ |
||||
let mut eaten: usize = 0; |
||||
while predicate(self.first()) && !self.is_eof() { |
||||
eaten += 1; |
||||
self.bump(); |
||||
} |
||||
|
||||
eaten |
||||
} |
||||
|
||||
fn whitespace(&mut self) -> TokenKind { |
||||
debug_assert!(is_whitespace(self.prev())); |
||||
self.eat_while(is_whitespace); |
||||
Whitespace |
||||
} |
||||
|
||||
fn number(&mut self) -> LiteralKind { |
||||
self.eat_digits(); |
||||
LiteralKind::Int |
||||
} |
||||
|
||||
fn string(&mut self) -> LiteralKind { |
||||
self.eat_string(); |
||||
|
||||
LiteralKind::Str |
||||
} |
||||
|
||||
fn identifier(&mut self, first_char: char) -> IdentifierKind { |
||||
let mut original: String = self.chars().collect::<String>(); |
||||
let len = self.eat_while(is_id_continue); |
||||
|
||||
// Cut original "rest"-character stream to length of token
|
||||
// and prepend first character, because it has been eaten beforehand
|
||||
original.truncate(len); |
||||
original = format!("{}{}", first_char, original); |
||||
|
||||
match original { |
||||
c if c == "if" => IdentifierKind::If, |
||||
c if c == "else" => IdentifierKind::Else, |
||||
c if c == "fn" => IdentifierKind::Function, |
||||
c if c == "true" || c == "false" => IdentifierKind::Boolean, |
||||
c if c == "let" => IdentifierKind::Let, |
||||
_ => IdentifierKind::Unknown, |
||||
} |
||||
} |
||||
|
||||
fn comment(&mut self) -> TokenKind { |
||||
// FIXME: Might lead to a bug, if End of file is encountered
|
||||
while self.first() != '\n' { |
||||
self.bump(); |
||||
} |
||||
|
||||
TokenKind::Comment |
||||
} |
||||
|
||||
fn eat_digits(&mut self) -> bool { |
||||
let mut has_digits = false; |
||||
loop { |
||||
match self.first() { |
||||
'_' => { |
||||
self.bump(); |
||||
} |
||||
'0'..='9' => { |
||||
has_digits = true; |
||||
self.bump(); |
||||
} |
||||
_ => break, |
||||
} |
||||
} |
||||
has_digits |
||||
} |
||||
|
||||
fn eat_string(&mut self) { |
||||
// FIXME: double quoted strings could probably be ended by single quoted, and vice versa.
|
||||
// Possible fix: Pass the token of the string beginning down to this method and check against it.
|
||||
loop { |
||||
match self.first() { |
||||
'"' | '\'' => break, |
||||
_ => self.bump(), |
||||
}; |
||||
} |
||||
|
||||
// Eat last quote
|
||||
self.bump(); |
||||
} |
||||
} |
@ -0,0 +1,173 @@
|
||||
#[cfg(test)]
mod tests {
    use crate::lexer::*;

    #[test]
    fn test_basic_tokenizing() {
        let mut tokens = tokenize("1 = 2").into_iter();

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "1".to_owned()
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Whitespace,
                raw: " ".to_owned()
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Equals,
                raw: "=".to_owned()
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Whitespace,
                raw: " ".to_owned()
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "2".to_owned()
            }
        );
    }

    #[test]
    fn test_tokenizing_without_whitespace() {
        let mut tokens = tokenize("1=2").into_iter();

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "1".to_owned()
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Equals,
                raw: "=".to_owned()
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "2".to_owned()
            }
        );
    }

    #[test]
    fn test_booleans() {
        let mut tokens = tokenize("true false").into_iter();

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 4,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Boolean
                },
                raw: "true".to_owned()
            }
        );

        // nth(1) skips the whitespace token between the two booleans.
        assert_eq!(
            tokens.nth(1).unwrap(),
            Token {
                len: 5,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Boolean
                },
                raw: "false".to_owned()
            }
        );
    }

    #[test]
    fn test_functions() {
        let mut tokens = tokenize("fn fib n:").into_iter();

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 2,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Function
                },
                raw: "fn".to_owned()
            }
        );
    }

    #[test]
    fn test_comments() {
        // The lexer starts a comment on "//" (see `Cursor::advance_token`);
        // the previous "--" input was lexed as two Minus tokens and could
        // never satisfy the Comment assertion below.
        let mut tokens = tokenize(
            "
// foo
fn fib n:
",
        )
        .into_iter()
        .filter(|t| {
            t.kind != TokenKind::Whitespace
                && t.kind != TokenKind::Tab
                && t.kind != TokenKind::CarriageReturn
        });

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 6,
                kind: TokenKind::Comment,
                raw: "// foo".to_owned(),
            }
        );

        assert_eq!(
            tokens.nth(0).unwrap(),
            Token {
                len: 2,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Function
                },
                raw: "fn".to_owned(),
            }
        );
    }
}
@ -0,0 +1,9 @@
|
||||
mod lexer; |
||||
mod parser; |
||||
|
||||
fn main() { |
||||
let tokens = lexer::tokenize(&"let x = 2"); |
||||
// let ast = parser::parse(tokens.into_iter());
|
||||
|
||||
println!("{:?}", tokens) |
||||
} |
@ -0,0 +1,36 @@
|
||||
use crate::lexer::Token; |
||||
|
||||
/// Streaming parser state over the lexer's token output.
pub struct Parser {
    // Remaining tokens, consumed one at a time by `next`.
    tokens: Box<dyn Iterator<Item = Token>>,
    // The most recently consumed token, `None` before the first `next`
    // call and after the stream is exhausted.
    current: Option<Token>,
    // Current block indentation depth; not read anywhere in the code
    // visible here — presumably reserved for indentation-based blocks.
    indentation_level: usize,
}
||||
|
||||
impl Parser { |
||||
pub(crate) fn new(tokens: impl Iterator<Item = Token> + 'static) -> Self { |
||||
Parser { |
||||
tokens: Box::new(tokens), |
||||
current: None, |
||||
indentation_level: 0, |
||||
} |
||||
} |
||||
|
||||
fn next(&mut self) { |
||||
self.current = self.tokens.next(); |
||||
} |
||||
} |
||||
|
||||
/// Placeholder abstract syntax tree; carries no data yet.
#[derive(Debug)]
pub struct AST;
||||
|
||||
pub fn parse(tokens: impl Iterator<Item = Token> + 'static) -> AST { |
||||
let mut parser = Parser::new(tokens); |
||||
let ast = AST {}; |
||||
|
||||
loop { |
||||
parser.next(); |
||||
break; |
||||
} |
||||
|
||||
ast |
||||
} |
Loading…
Reference in new issue