Browse Source

Initial commit

github-actions
Garrit Franke 12 months ago
commit
3a3b0ce99a
  1. 2
      .gitignore
  2. 5
      Cargo.lock
  3. 9
      Cargo.toml
  4. 78
      src/lexer/cursor.rs
  5. 269
      src/lexer/mod.rs
  6. 173
      src/lexer/tests.rs
  7. 9
      src/main.rs
  8. 36
      src/parser/mod.rs

2
.gitignore

@@ -0,0 +1,2 @@
target/
.vscode/

5
Cargo.lock

@@ -0,0 +1,5 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "flex"
version = "0.0.1"

9
Cargo.toml

@@ -0,0 +1,9 @@
[package]
name = "flex"
version = "0.0.1"
authors = ["Garrit Franke <garritfranke@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

78
src/lexer/cursor.rs

@@ -0,0 +1,78 @@
use std::str::Chars;
/// Peekable iterator over a char sequence.
///
/// Next characters can be peeked via `nth_char` method,
/// and position can be shifted forward via `bump` method.
pub(crate) struct Cursor<'a> {
    /// Byte length of the original input; used to compute consumed length.
    initial_len: usize,
    /// Iterator over the remaining (unconsumed) characters.
    chars: Chars<'a>,
    /// The last character eaten by `bump` (`EOF_CHAR` before the first bump).
    prev: char,
}

/// Sentinel returned when peeking past the end of the input.
pub(crate) const EOF_CHAR: char = '\0';

impl<'a> Cursor<'a> {
    /// Creates a cursor positioned at the start of `input`.
    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
        Cursor {
            initial_len: input.len(),
            chars: input.chars(),
            // BUG FIX: `prev` was previously initialized only under
            // `#[cfg(debug_assertions)]`, but the field itself is not
            // cfg-gated, so release builds failed to compile with a
            // "missing field `prev`" error. Initialize it unconditionally.
            prev: EOF_CHAR,
        }
    }

    /// Returns the last eaten symbol (`EOF_CHAR` if nothing was eaten yet).
    pub(crate) fn prev(&self) -> char {
        self.prev
    }

    /// Returns nth character relative to the current cursor position.
    /// If requested position doesn't exist, `EOF_CHAR` is returned.
    /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
    /// it should be checked with `is_eof` method.
    fn nth_char(&self, n: usize) -> char {
        self.chars().nth(n).unwrap_or(EOF_CHAR)
    }

    /// Peeks the next symbol from the input stream without consuming it.
    pub(crate) fn first(&self) -> char {
        self.nth_char(0)
    }

    /// Checks if there is nothing more to consume.
    pub(crate) fn is_eof(&self) -> bool {
        self.chars.as_str().is_empty()
    }

    /// Returns the amount of input already consumed, in bytes.
    pub(crate) fn len_consumed(&self) -> usize {
        self.initial_len - self.chars.as_str().len()
    }

    /// Returns a `Chars` iterator over the remaining characters.
    pub(crate) fn chars(&self) -> Chars<'a> {
        self.chars.clone()
    }

    /// Moves to the next character, recording it as `prev`.
    pub(crate) fn bump(&mut self) -> Option<char> {
        let c = self.chars.next()?;
        self.prev = c;
        Some(c)
    }
}

269
src/lexer/mod.rs

@@ -0,0 +1,269 @@
pub(crate) mod cursor;
use self::TokenKind::*;
use cursor::Cursor;
#[cfg(test)]
mod tests;
/// A single lexeme produced by the lexer.
#[derive(Debug, PartialEq, Eq)]
pub struct Token {
    /// The category of this token.
    pub kind: TokenKind,
    /// Length of the token's source text in bytes.
    pub len: usize,
    /// The exact source text this token was lexed from.
    pub raw: String,
}

impl Token {
    /// Bundles a kind, a byte length, and the raw source text into a `Token`.
    fn new(kind: TokenKind, len: usize, raw: String) -> Token {
        Token { kind, len, raw }
    }
}
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    /// Any whitespace characters sequence.
    Whitespace,
    /// A literal value, e.g. an integer or string.
    Literal {
        kind: LiteralKind,
    },
    /// An identifier; `kind` distinguishes keywords such as 'if' or 'else'
    /// from plain (`Unknown`) identifiers.
    Identifier {
        kind: IdentifierKind,
    },
    /// // Lorem Ipsum
    Comment,
    /// "+"
    Plus,
    /// "-"
    Minus,
    /// "*"
    Star,
    /// "/"
    Slash,
    /// ":"
    Colon,
    /// "="
    Equals,
    /// "=="
    DeepEquals,
    /// "<"
    // NOTE(review): misspelled — should be "SmallerThan". Renaming would
    // break callers, so the name is kept as-is.
    SmallerThen,
    /// ">"
    // NOTE(review): misspelled — should be "LargerThan" (see above).
    LargerThen,
    /// "("
    BraceOpen,
    /// ")"
    BraceClose,
    /// "["
    SquareBraceOpen,
    /// "]"
    SquareBraceClose,
    /// "\t"
    Tab,
    /// "\n"
    // NOTE(review): '\n' is a line feed, not a carriage return ('\r');
    // the variant name is kept for compatibility.
    CarriageReturn,
    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
}
/// The concrete type of a `TokenKind::Literal`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// Integer literal, e.g. `42` (underscores are accepted as separators).
    Int,
    /// String literal delimited by single or double quotes.
    Str,
}
/// Distinguishes recognized keywords from plain identifiers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum IdentifierKind {
    /// The `let` keyword.
    Let,
    /// The `if` keyword.
    If,
    /// The `else` keyword.
    Else,
    /// The `fn` keyword.
    Function,
    /// The `true` or `false` literal keywords.
    Boolean,
    /// Any identifier that is not a recognized keyword.
    Unknown,
}
/// Splits the entire input string into a vector of tokens.
///
/// Each token records how many bytes it consumed, so the remaining input
/// is re-sliced after every token until nothing is left.
pub fn tokenize(input: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut rest = input;
    while !rest.is_empty() {
        let token = first_token(rest);
        rest = &rest[token.len..];
        tokens.push(token);
    }
    tokens
}
/// Lexes the leading token of `input`.
///
/// # Panics
/// Panics in debug builds if `input` is empty.
pub fn first_token(input: &str) -> Token {
    debug_assert!(!input.is_empty());
    let mut cursor = Cursor::new(input);
    cursor.advance_token()
}
/// True if `c` is a plain space character.
///
/// Tabs and newlines are deliberately *not* treated as whitespace here:
/// the lexer emits dedicated `Tab` and `CarriageReturn` tokens for them.
pub fn is_whitespace(c: char) -> bool {
    // A single-arm `match` returning true/false is the
    // `match_like_matches_macro` anti-idiom; a comparison says the same.
    c == ' '
}
/// True if `c` is valid as a first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_start(c: char) -> bool {
    // Equivalent to the original manual 'a'..='z' / 'A'..='Z' range checks,
    // expressed with the standard-library predicate.
    c.is_ascii_alphabetic() || c == '_'
}
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    // Equivalent to the original manual ASCII range checks
    // ('a'..='z', 'A'..='Z', '0'..='9'), via the stdlib predicate.
    c.is_ascii_alphanumeric() || c == '_'
}
impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
// Original chars used to identify the token later on
let original_chars = self.chars();
let first_char = self.bump().unwrap();
let token_kind = match first_char {
c if is_whitespace(c) => self.whitespace(),
'0'..='9' => {
let kind = self.number();
TokenKind::Literal { kind }
}
'"' | '\'' => {
let kind = self.string();
TokenKind::Literal { kind }
}
'+' => Plus,
'-' => Minus,
'*' => Star,
'/' => match self.first() {
'/' => self.comment(),
_ => Slash,
},
'=' => match self.first() {
'=' => DeepEquals,
_ => Equals,
},
':' => Colon,
'<' => SmallerThen,
'>' => LargerThen,
'(' => BraceOpen,
')' => BraceClose,
'[' => SquareBraceOpen,
']' => SquareBraceClose,
c if is_id_start(c) => {
let kind = self.identifier(c);
Identifier { kind }
}
'\n' => CarriageReturn,
'\t' => Tab,
_ => Unknown,
};
let len = self.len_consumed();
let mut raw = original_chars.collect::<String>();
// Cut the original tokens to the length of the token
raw.truncate(len);
Token::new(token_kind, len, raw)
}
/// Eats symbols while predicate returns true or until the end of file is reached.
/// Returns amount of eaten symbols.
fn eat_while<F>(&mut self, mut predicate: F) -> usize
where
F: FnMut(char) -> bool,
{
let mut eaten: usize = 0;
while predicate(self.first()) && !self.is_eof() {
eaten += 1;
self.bump();
}
eaten
}
fn whitespace(&mut self) -> TokenKind {
debug_assert!(is_whitespace(self.prev()));
self.eat_while(is_whitespace);
Whitespace
}
fn number(&mut self) -> LiteralKind {
self.eat_digits();
LiteralKind::Int
}
fn string(&mut self) -> LiteralKind {
self.eat_string();
LiteralKind::Str
}
fn identifier(&mut self, first_char: char) -> IdentifierKind {
let mut original: String = self.chars().collect::<String>();
let len = self.eat_while(is_id_continue);
// Cut original "rest"-character stream to length of token
// and prepend first character, because it has been eaten beforehand
original.truncate(len);
original = format!("{}{}", first_char, original);
match original {
c if c == "if" => IdentifierKind::If,
c if c == "else" => IdentifierKind::Else,
c if c == "fn" => IdentifierKind::Function,
c if c == "true" || c == "false" => IdentifierKind::Boolean,
c if c == "let" => IdentifierKind::Let,
_ => IdentifierKind::Unknown,
}
}
fn comment(&mut self) -> TokenKind {
// FIXME: Might lead to a bug, if End of file is encountered
while self.first() != '\n' {
self.bump();
}
TokenKind::Comment
}
fn eat_digits(&mut self) -> bool {
let mut has_digits = false;
loop {
match self.first() {
'_' => {
self.bump();
}
'0'..='9' => {
has_digits = true;
self.bump();
}
_ => break,
}
}
has_digits
}
fn eat_string(&mut self) {
// FIXME: double quoted strings could probably be ended by single quoted, and vice versa.
// Possible fix: Pass the token of the string beginning down to this method and check against it.
loop {
match self.first() {
'"' | '\'' => break,
_ => self.bump(),
};
}
// Eat last quote
self.bump();
}
}

173
src/lexer/tests.rs

@@ -0,0 +1,173 @@
// NOTE(review): this file is already included via `#[cfg(test)] mod tests;`
// in lexer/mod.rs, so the inner `#[cfg(test)] mod tests` wrapper is
// redundant (tests end up at `lexer::tests::tests`); kept for compatibility.
#[cfg(test)]
mod tests {
    use crate::lexer::*;

    /// Tokenizing a simple expression with surrounding whitespace.
    #[test]
    fn test_basic_tokenizing() {
        let mut tokens = tokenize("1 = 2").into_iter();
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "1".to_owned()
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Whitespace,
                raw: " ".to_owned()
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Equals,
                raw: "=".to_owned()
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Whitespace,
                raw: " ".to_owned()
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "2".to_owned()
            }
        );
    }

    /// Adjacent tokens must lex correctly without separating whitespace.
    #[test]
    fn test_tokenizing_without_whitespace() {
        let mut tokens = tokenize("1=2").into_iter();
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "1".to_owned()
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Equals,
                raw: "=".to_owned()
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 1,
                kind: TokenKind::Literal {
                    kind: LiteralKind::Int
                },
                raw: "2".to_owned()
            }
        );
    }

    /// `true` and `false` lex as Boolean keyword identifiers.
    #[test]
    fn test_booleans() {
        let mut tokens = tokenize("true false").into_iter();
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 4,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Boolean
                },
                raw: "true".to_owned()
            }
        );
        // `nth(1)` skips the whitespace token between the two booleans.
        assert_eq!(
            tokens.nth(1).unwrap(),
            Token {
                len: 5,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Boolean
                },
                raw: "false".to_owned()
            }
        );
    }

    /// `fn` lexes as the Function keyword identifier.
    #[test]
    fn test_functions() {
        let mut tokens = tokenize("fn fib n:").into_iter();
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 2,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Function
                },
                raw: "fn".to_owned()
            }
        );
    }

    /// Line comments are emitted as a single Comment token.
    #[test]
    fn test_comments() {
        // FIX: the lexer recognizes `//` line comments only — '-' lexes as
        // Minus — so the fixture must use `//` rather than the old `--`,
        // which could never produce a Comment token.
        let mut tokens = tokenize(
            "
            // foo
            fn fib n:
            ",
        )
        .into_iter()
        .filter(|t| {
            t.kind != TokenKind::Whitespace
                && t.kind != TokenKind::Tab
                && t.kind != TokenKind::CarriageReturn
        });
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 6,
                kind: TokenKind::Comment,
                raw: "// foo".to_owned(),
            }
        );
        assert_eq!(
            tokens.next().unwrap(),
            Token {
                len: 2,
                kind: TokenKind::Identifier {
                    kind: IdentifierKind::Function
                },
                raw: "fn".to_owned(),
            }
        );
    }
}

9
src/main.rs

@@ -0,0 +1,9 @@
mod lexer;
mod parser;
/// Entry point: lexes a sample program and prints the token stream.
fn main() {
    // A plain `&str` literal suffices; the old `&"let x = 2"` created a
    // needless `&&str` that only compiled via deref coercion.
    let tokens = lexer::tokenize("let x = 2");
    // TODO: feed the tokens into the parser once it is implemented:
    // let ast = parser::parse(tokens.into_iter());
    println!("{:?}", tokens)
}

36
src/parser/mod.rs

@@ -0,0 +1,36 @@
use crate::lexer::Token;
/// Stateful wrapper around a token stream, tracking the token currently
/// under inspection and the indentation depth.
pub struct Parser {
    /// Remaining tokens to consume (boxed to erase the iterator type).
    tokens: Box<dyn Iterator<Item = Token>>,
    /// The most recently consumed token, if any.
    current: Option<Token>,
    /// Current block indentation depth — not yet read anywhere in this file.
    indentation_level: usize,
}
impl Parser {
pub(crate) fn new(tokens: impl Iterator<Item = Token> + 'static) -> Self {
Parser {
tokens: Box::new(tokens),
current: None,
indentation_level: 0,
}
}
fn next(&mut self) {
self.current = self.tokens.next();
}
}
/// Placeholder for the abstract syntax tree produced by `parse`.
#[derive(Debug)]
pub struct AST;
/// Parses a token stream into an AST.
///
/// NOTE(review): parsing is not implemented yet — this stub advances the
/// parser once and returns an empty AST.
pub fn parse(tokens: impl Iterator<Item = Token> + 'static) -> AST {
    let mut parser = Parser::new(tokens);
    // The old `loop { parser.next(); break; }` ran exactly one iteration
    // (clippy `never_loop`); keep the single advance without the dead loop.
    parser.next();
    AST {}
}
Loading…
Cancel
Save