
Add token positions

Garrit Franke committed 3 years ago
commit 565b0bd9f6
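
Threads a `Position` (line, offset, and raw index) through the lexer and into every `Token`, then uses it in the parser so error messages can quote the offending source line.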

examples/hello_world.sb | 2
src/lexer/cursor.rs     | 27
src/lexer/mod.rs        | 30
src/lexer/tests.rs      | 81
src/main.rs             | 3
src/parser/mod.rs       | 37
src/parser/tests.rs     | 15
src/util/mod.rs         | 1
src/util/string_util.rs | 17

examples/hello_world.sb (2 changes)

@@ -1,5 +1,3 @@
// This is a comment
fn main() {
    let x = "Hello World";
    return x;

src/lexer/cursor.rs (27 changes)

@@ -1,3 +1,4 @@
+use crate::lexer::Position;
 use std::str::Chars;

 /// Peekable iterator over a char sequence.
@@ -6,6 +7,7 @@ use std::str::Chars;
 /// and position can be shifted forward via `bump` method.
 pub(crate) struct Cursor<'a> {
     initial_length: usize,
+    pos: &'a mut Position,
     len: usize,
     chars: Chars<'a>,
     prev: char,
@@ -14,13 +16,18 @@ pub(crate) struct Cursor<'a> {
 pub(crate) const EOF_CHAR: char = '\0';

 impl<'a> Cursor<'a> {
-    pub(crate) fn new(input: &'a str, initial_len: usize) -> Cursor<'a> {
+    pub(crate) fn new(
+        input: &'a str,
+        initial_len: usize,
+        position: &'a mut Position,
+    ) -> Cursor<'a> {
         Cursor {
             initial_length: initial_len,
             len: input.len(),
             chars: input.chars(),
             #[cfg(debug_assertions)]
             prev: EOF_CHAR,
+            pos: position,
         }
     }
@@ -66,13 +73,27 @@ impl<'a> Cursor<'a> {
         self.chars.clone()
     }

-    pub(crate) fn pos(&self) -> usize {
-        self.initial_length - self.len
+    pub(crate) fn pos(&self) -> Position {
+        let mut p = self.pos.clone();
+        p
     }

     /// Moves to the next character.
     pub(crate) fn bump(&mut self) -> Option<char> {
         let c = self.chars.next()?;

+        // If first token, the position should be set to 0
+        match self.pos.raw {
+            usize::MAX => self.pos.raw = 0,
+            _ => {
+                self.pos.raw += 1;
+                self.pos.offset += 1;
+            }
+        }
+
+        if c == '\n' {
+            self.pos.line += 1;
+            self.pos.offset = 0;
+        }
+
         #[cfg(debug_assertions)]
         {
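
The bookkeeping in `bump` above is the core of the commit: `raw` starts at `usize::MAX` as a before-the-first-character sentinel, so the first call wraps it to 0 without touching `offset`; every later call advances both, and a newline bumps `line` and resets `offset`. A minimal standalone sketch of those counting rules (not part of the commit; `advance` is a hypothetical helper):

struct Position {
    line: usize,
    offset: usize,
    raw: usize,
}

// Mirrors the counting Cursor::bump performs for each consumed character.
fn advance(pos: &mut Position, c: char) {
    match pos.raw {
        // First character: wrap the sentinel to 0 and leave offset alone.
        usize::MAX => pos.raw = 0,
        _ => {
            pos.raw += 1;
            pos.offset += 1;
        }
    }
    if c == '\n' {
        pos.line += 1;
        pos.offset = 0;
    }
}

fn main() {
    let mut pos = Position { line: 1, offset: 0, raw: usize::MAX };
    for c in "a\nb".chars() {
        advance(&mut pos, c);
    }
    // 'b' is the third char (raw 2), on line 2, one column past the newline.
    assert_eq!((pos.line, pos.offset, pos.raw), (2, 1, 2));
}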

src/lexer/mod.rs (30 changes)

@@ -11,11 +11,11 @@ pub struct Token {
     pub kind: TokenKind,
     pub len: usize,
     pub raw: String,
-    pub pos: usize,
+    pub pos: Position,
 }

 impl Token {
-    fn new(kind: TokenKind, len: usize, raw: String, pos: usize) -> Token {
+    fn new(kind: TokenKind, len: usize, raw: String, pos: Position) -> Token {
         Token {
             kind,
             len,
@@ -25,6 +25,13 @@ impl Token {
     }
 }

+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct Position {
+    pub line: usize,
+    pub offset: usize,
+    pub raw: usize,
+}
+
 /// Enum representing common lexeme types.
 #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum TokenKind {
@@ -95,12 +102,17 @@ pub enum Keyword {
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(mut input: &str) -> Vec<Token> {
-    let mut initial_length = input.len();
+    let initial_length = input.len();
+    let mut pos = Position {
+        raw: usize::MAX,
+        line: 1,
+        offset: 0,
+    };
     std::iter::from_fn(move || {
         if input.is_empty() {
             return None;
         }
-        let token = first_token(input, initial_length);
+        let token = first_token(input, initial_length, &mut pos);
         input = &input[token.len..];
         Some(token)
     })
@@ -108,9 +120,9 @@ pub fn tokenize(mut input: &str) -> Vec<Token> {
 }

 /// Parses the first token from the provided input string.
-pub fn first_token(input: &str, initial_len: usize) -> Token {
+pub fn first_token(input: &str, initial_len: usize, pos: &mut Position) -> Token {
     debug_assert!(!input.is_empty());
-    Cursor::new(input, initial_len).advance_token()
+    Cursor::new(input, initial_len, pos).advance_token()
 }

 pub fn is_whitespace(c: char) -> bool {
@@ -186,7 +198,11 @@ impl Cursor<'_> {
         let mut raw = original_chars2.collect::<String>();
         // Cut the original tokens to the length of the token
         raw.truncate(len);
-        Token::new(token_kind, len, raw, self.pos())
+        let position = self.pos();
+        let token = Token::new(token_kind, len, raw, position);
+        dbg!(&token);
+        token
     }

     /// Eats symbols while predicate returns true or until the end of file is reached.
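
With a structured `Position` on every `Token`, downstream consumers can report human-readable coordinates instead of a bare byte index. A minimal usage sketch, assuming the `lexer` module layout above:

for token in lexer::tokenize("let x = 1;") {
    println!(
        "{:?} at line {}, offset {} (raw index {})",
        token.kind, token.pos.line, token.pos.offset, token.pos.raw
    );
}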

src/lexer/tests.rs (81 changes)

@@ -13,7 +13,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
                 raw: "1".to_owned(),
-                pos: 0
+                pos: Position {
+                    raw: 0,
+                    line: 1,
+                    offset: 0
+                }
             }
         );
@@ -23,7 +27,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Whitespace,
                 raw: " ".to_owned(),
-                pos: 1
+                pos: Position {
+                    raw: 1,
+                    line: 1,
+                    offset: 1
+                }
             }
         );
@@ -33,7 +41,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Assign,
                 raw: "=".to_owned(),
-                pos: 2
+                pos: Position {
+                    raw: 2,
+                    line: 1,
+                    offset: 2
+                }
             }
         );
@@ -43,7 +55,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Whitespace,
                 raw: " ".to_owned(),
-                pos: 3
+                pos: Position {
+                    raw: 3,
+                    line: 1,
+                    offset: 3
+                }
             }
         );
@@ -53,7 +69,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
                 raw: "2".to_owned(),
-                pos: 4
+                pos: Position {
+                    raw: 4,
+                    line: 1,
+                    offset: 4
+                }
             }
         );
     }
@@ -68,7 +88,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
                 raw: "1".to_owned(),
-                pos: 0
+                pos: Position {
+                    raw: 0,
+                    line: 1,
+                    offset: 0
+                }
             }
         );
@@ -78,7 +102,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Assign,
                 raw: "=".to_owned(),
-                pos: 1
+                pos: Position {
+                    raw: 1,
+                    line: 1,
+                    offset: 1
+                }
             }
         );
@@ -88,7 +116,11 @@ mod tests {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
                 raw: "2".to_owned(),
-                pos: 2
+                pos: Position {
+                    raw: 2,
+                    line: 1,
+                    offset: 2
+                }
             }
         );
     }
@@ -103,7 +135,11 @@ mod tests {
                 len: 4,
                 kind: TokenKind::Keyword(Keyword::Boolean),
                 raw: "true".to_owned(),
-                pos: 0
+                pos: Position {
+                    raw: 3,
+                    line: 1,
+                    offset: 3
+                }
             }
         );
@@ -113,7 +149,11 @@ mod tests {
                 len: 5,
                 kind: TokenKind::Keyword(Keyword::Boolean),
                 raw: "false".to_owned(),
-                pos: 5
+                pos: Position {
+                    raw: 9,
+                    line: 1,
+                    offset: 9
+                }
             }
         );
     }
@@ -128,7 +168,11 @@ mod tests {
                 len: 2,
                 kind: TokenKind::Keyword(Keyword::Function),
                 raw: "fn".to_owned(),
-                pos: 0
+                pos: Position {
+                    raw: 1,
+                    line: 1,
+                    offset: 1
+                }
             }
         );
     }
@@ -136,8 +180,7 @@ mod tests {
     #[test]
     fn test_comments() {
         let mut tokens = tokenize(
-            "
-// foo
+            "// foo
 fn fib() {}
 ",
         )
@@ -154,7 +197,11 @@ fn fib() {}
                 len: 6,
                 kind: TokenKind::Comment,
                 raw: "// foo".to_owned(),
-                pos: 1
+                pos: Position {
+                    raw: 5,
+                    line: 1,
+                    offset: 5
+                }
             }
         );
@@ -164,7 +211,11 @@ fn fib() {}
                 len: 2,
                 kind: TokenKind::Keyword(Keyword::Function),
                 raw: "fn".to_owned(),
-                pos: 8
+                pos: Position {
+                    raw: 8,
+                    line: 2,
+                    offset: 2
+                }
             }
         );
     }
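
One detail these expectations make explicit: `pos` is sampled after the cursor has consumed the lexeme, so it points at a token's last character rather than its first ("true" carries raw 3, "false" raw 9). A quick check, assuming the same imports as the tests above:

let token = tokenize("true").remove(0);
assert_eq!(token.pos.raw, 3); // last consumed character, not the token start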

src/main.rs (3 changes)

@@ -3,6 +3,7 @@ use std::io::Read;
 mod lexer;
 mod parser;
+mod util;

 fn main() -> std::io::Result<()> {
     let mut file = File::open("examples/hello_world.sb")?;
@@ -12,7 +13,7 @@ fn main() -> std::io::Result<()> {
     let tokens = lexer::tokenize(&contents);
     // let ast = parser::parse(tokens.into_iter());
-    let program = parser::parse(tokens).unwrap();
+    let program = parser::parse(tokens, Some(contents));
     println!("{:#?}", program);

     Ok(())

src/parser/mod.rs (37 changes)

@@ -1,6 +1,7 @@
 use crate::lexer::Keyword;
 use crate::lexer::{Token, TokenKind, Value};
 use crate::parser::node_type::*;
+use crate::util::string_util::highlight_position_in_file;
 use std::iter::Peekable;
 use std::vec::IntoIter;
@@ -12,10 +13,11 @@ mod tests;
 pub struct Parser {
     tokens: Peekable<IntoIter<Token>>,
     peeked: Vec<Token>,
+    raw: Option<String>,
 }

 impl Parser {
-    pub fn new(tokens: Vec<Token>) -> Parser {
+    pub fn new(tokens: Vec<Token>, raw: Option<String>) -> Parser {
         // FIXME: Filter without collecting?
         let tokens_without_whitespace: Vec<Token> = tokens
             .into_iter()
@@ -24,6 +26,7 @@ impl Parser {
         Parser {
             tokens: tokens_without_whitespace.into_iter().peekable(),
             peeked: vec![],
+            raw: raw,
         }
     }
@@ -71,10 +74,8 @@ impl Parser {
     fn match_token(&mut self, token_kind: TokenKind) -> Result<Token, String> {
         match self.next() {
             Some(token) if token.kind == token_kind => Ok(token),
-            other => Err(format!(
-                "Token {:?} not found, found {:?}",
-                token_kind, other
-            )),
+            Some(other) => Err(self.make_error(token_kind, other)),
+            None => Err("Token expected".to_string()),
         }
     }
@@ -91,9 +92,9 @@ impl Parser {
     fn match_keyword(&mut self, keyword: Keyword) -> Result<(), String> {
         let token = self.next_token();
-        match token.kind {
-            TokenKind::Keyword(k) if k == keyword => Ok(()),
-            other => Err(format!("Expected SemiColon, found {:?}", other)),
+        match &token.kind {
+            TokenKind::Keyword(ref k) if k == &keyword => Ok(()),
+            _ => Err(self.make_error(TokenKind::SemiColon, token)),
         }
     }
@@ -103,6 +104,18 @@ impl Parser {
             other => Err(format!("Expected Identifier, found {:?}", other)),
         }
     }
+
+    fn make_error(&mut self, token_kind: TokenKind, other: Token) -> String {
+        match &self.raw {
+            Some(raw_file) => format!(
+                "Token {:?} not found, found {:?}\n{:?}",
+                token_kind,
+                other,
+                highlight_position_in_file(raw_file.to_string(), other.to_owned().pos)
+            ),
+            None => format!("Token {:?} not found, found {:?}", token_kind, other),
+        }
+    }
 }

 impl Parser {
@@ -148,7 +161,7 @@ impl Parser {
     fn parse_statement(&mut self) -> Result<Statement, String> {
         let token = self.next_token();
         dbg!(&token);
-        match token.kind {
+        match &token.kind {
             TokenKind::Keyword(Keyword::Let) => {
                 let state = self.parse_declare();
                 self.match_token(TokenKind::SemiColon)?;
@@ -161,7 +174,7 @@ impl Parser {
                 Ok(state)
             }
-            other => Err(format!("Expected Statement, found {:?}", other)),
+            _ => Err(self.make_error(TokenKind::Unknown, token)),
         }
     }
@@ -202,8 +215,8 @@ impl Parser {
     }
 }

-pub fn parse(tokens: Vec<Token>) -> Result<node_type::Program, String> {
-    let mut parser = Parser::new(tokens);
+pub fn parse(tokens: Vec<Token>, raw: Option<String>) -> Result<node_type::Program, String> {
+    let mut parser = Parser::new(tokens, raw);
     parser.parse()
 }
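
The payoff of threading `raw` through `Parser` sits in `make_error`: when the source text is available, a mismatch can quote the offending line instead of only naming token kinds. A sketch of that error path (hypothetical input, not from the commit):

let source = "fn main() { let x = 1 }".to_string(); // missing semicolon
let tokens = lexer::tokenize(&source);
match parser::parse(tokens, Some(source)) {
    Ok(program) => println!("{:#?}", program),
    // The Err string now embeds the source text located via Token.pos.
    Err(message) => eprintln!("{}", message),
}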

src/parser/tests.rs (15 changes)

@@ -3,8 +3,9 @@ use crate::parser::*;
 #[test]
 fn test_parse_empty_function() {
-    let tokens = tokenize("fn main() {}");
-    let tree = parse(tokens);
+    let raw = "fn main() {}";
+    let tokens = tokenize(raw);
+    let tree = parse(tokens, Some(raw.to_string()));
     assert!(tree.is_ok())
 }
@@ -16,7 +17,7 @@ fn test_parse_function_with_return() {
     }
     ";
     let tokens = tokenize(raw);
-    let tree = parse(tokens);
+    let tree = parse(tokens, Some(raw.to_string()));
     assert!(tree.is_ok())
 }
@@ -28,7 +29,7 @@ fn test_parse_missing_semicolon() {
     }
     ";
     let tokens = tokenize(raw);
-    let tree = parse(tokens);
+    let tree = parse(tokens, Some(raw.to_string()));
     assert!(tree.is_err())
 }
@@ -38,7 +39,7 @@ fn test_parse_no_function_context() {
     let x = 1;
     ";
     let tokens = tokenize(raw);
-    let tree = parse(tokens);
+    let tree = parse(tokens, Some(raw.to_string()));
     assert!(tree.is_err())
 }
@@ -56,7 +57,7 @@ fn test_parse_multiple_functions() {
     }
     ";
     let tokens = tokenize(raw);
-    let tree = parse(tokens);
+    let tree = parse(tokens, Some(raw.to_string()));
     assert!(tree.is_ok())
 }
@@ -69,6 +70,6 @@ fn test_parse_variable_declaration() {
     }
     ";
     let tokens = tokenize(raw);
-    let tree = parse(tokens);
+    let tree = parse(tokens, Some(raw.to_string()));
     assert!(tree.is_ok())
 }

src/util/mod.rs (1 change)

@@ -0,0 +1 @@
+pub mod string_util;

src/util/string_util.rs (17 changes)

@@ -0,0 +1,17 @@
+use crate::lexer::Position;
+
+pub fn highlight_position_in_file(input: String, position: Position) -> String {
+    // TODO: Chain without collecting in between
+    input
+        .chars()
+        .skip(position.raw)
+        .take_while(|c| c != &'\n')
+        .collect::<String>()
+        .chars()
+        .rev()
+        .take_while(|c| c != &'\n')
+        .collect::<String>()
+        .chars()
+        .rev()
+        .collect::<String>()
+}
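
As committed, this returns the text from `position.raw` to the end of that line; the trailing rev/take_while/rev pass is a no-op, since the first `take_while` already stopped before any '\n' (it presumably was meant to also capture the start of the line). A small usage sketch with assumed values:

use crate::lexer::Position;

let source = "let x = 1;\nlet y = ;".to_string();
let position = Position { raw: 19, line: 2, offset: 8 };
// Prints ";": everything from raw index 19 up to the end of line 2.
println!("{}", highlight_position_in_file(source, position));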