From b0fe2b0474bb174cb55a9319e3ccd5f3e354d1b8 Mon Sep 17 00:00:00 2001
From: Garrit Franke
Date: Fri, 4 Dec 2020 11:57:46 +0100
Subject: [PATCH] Add token positions

---
 src/lexer/cursor.rs | 14 ++++++++++----
 src/lexer/mod.rs    | 19 +++++++++++++------
 src/lexer/tests.rs  | 42 ++++++++++++++++++++++++++++--------------
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/src/lexer/cursor.rs b/src/lexer/cursor.rs
index 69a2de0..feb9301 100644
--- a/src/lexer/cursor.rs
+++ b/src/lexer/cursor.rs
@@ -5,7 +5,8 @@ use std::str::Chars;
 /// Next characters can be peeked via `nth_char` method,
 /// and position can be shifted forward via `bump` method.
 pub(crate) struct Cursor<'a> {
-    initial_len: usize,
+    initial_length: usize,
+    len: usize,
     chars: Chars<'a>,
     prev: char,
 }
@@ -13,9 +14,10 @@ pub(crate) const EOF_CHAR: char = '\0';
 
 impl<'a> Cursor<'a> {
-    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+    pub(crate) fn new(input: &'a str, initial_len: usize) -> Cursor<'a> {
         Cursor {
-            initial_len: input.len(),
+            initial_length: initial_len,
+            len: input.len(),
             chars: input.chars(),
             #[cfg(debug_assertions)]
             prev: EOF_CHAR,
         }
@@ -56,7 +58,7 @@ impl<'a> Cursor<'a> {
 
     /// Returns amount of already consumed symbols.
     pub(crate) fn len_consumed(&self) -> usize {
-        self.initial_len - self.chars.as_str().len()
+        self.len - self.chars.as_str().len()
     }
 
     /// Returns a `Chars` iterator over the remaining characters.
@@ -64,6 +66,10 @@ impl<'a> Cursor<'a> {
         self.chars.clone()
     }
 
+    pub(crate) fn pos(&self) -> usize {
+        self.initial_length - self.len
+    }
+
     /// Moves to the next character.
     pub(crate) fn bump(&mut self) -> Option<char> {
         let c = self.chars.next()?;
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 7e2c695..df4ce27 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -11,11 +11,17 @@ pub struct Token {
     pub kind: TokenKind,
     pub len: usize,
     pub raw: String,
+    pub pos: usize,
 }
 
 impl Token {
-    fn new(kind: TokenKind, len: usize, raw: String) -> Token {
-        Token { kind, len, raw }
+    fn new(kind: TokenKind, len: usize, raw: String, pos: usize) -> Token {
+        Token {
+            kind,
+            len,
+            raw,
+            pos,
+        }
     }
 }
 
@@ -89,11 +95,12 @@ pub enum Keyword {
 
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(mut input: &str) -> Vec<Token> {
+    let mut initial_length = input.len();
     std::iter::from_fn(move || {
         if input.is_empty() {
             return None;
         }
-        let token = first_token(input);
+        let token = first_token(input, initial_length);
         input = &input[token.len..];
         Some(token)
     })
@@ -101,9 +108,9 @@ pub fn tokenize(mut input: &str) -> Vec<Token> {
 }
 
 /// Parses the first token from the provided input string.
-pub fn first_token(input: &str) -> Token {
+pub fn first_token(input: &str, initial_len: usize) -> Token {
     debug_assert!(!input.is_empty());
-    Cursor::new(input).advance_token()
+    Cursor::new(input, initial_len).advance_token()
 }
 
 pub fn is_whitespace(c: char) -> bool {
@@ -179,7 +186,7 @@ impl Cursor<'_> {
         let mut raw = original_chars2.collect::<String>();
         // Cut the original tokens to the length of the token
         raw.truncate(len);
-        Token::new(token_kind, len, raw)
+        Token::new(token_kind, len, raw, self.pos())
     }
 
     /// Eats symbols while predicate returns true or until the end of file is reached.
diff --git a/src/lexer/tests.rs b/src/lexer/tests.rs
index 584096b..ef6a1a3 100644
--- a/src/lexer/tests.rs
+++ b/src/lexer/tests.rs
@@ -4,14 +4,16 @@ mod tests {
 
     #[test]
     fn test_basic_tokenizing() {
-        let mut tokens = tokenize("1 = 2").into_iter();
+        let raw = tokenize("1 = 2");
+        let mut tokens = raw.into_iter();
 
         assert_eq!(
             tokens.nth(0).unwrap(),
             Token {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
-                raw: "1".to_owned()
+                raw: "1".to_owned(),
+                pos: 0
             }
         );
 
@@ -20,7 +22,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Whitespace,
-                raw: " ".to_owned()
+                raw: " ".to_owned(),
+                pos: 1
             }
         );
 
@@ -29,7 +32,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Assign,
-                raw: "=".to_owned()
+                raw: "=".to_owned(),
+                pos: 2
             }
         );
 
@@ -38,7 +42,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Whitespace,
-                raw: " ".to_owned()
+                raw: " ".to_owned(),
+                pos: 3
             }
         );
 
@@ -47,7 +52,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
-                raw: "2".to_owned()
+                raw: "2".to_owned(),
+                pos: 4
             }
         );
     }
@@ -61,7 +67,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
-                raw: "1".to_owned()
+                raw: "1".to_owned(),
+                pos: 0
             }
        );
 
@@ -70,7 +77,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Assign,
-                raw: "=".to_owned()
+                raw: "=".to_owned(),
+                pos: 1
             }
         );
 
@@ -79,7 +87,8 @@
             Token {
                 len: 1,
                 kind: TokenKind::Literal(Value::Int),
-                raw: "2".to_owned()
+                raw: "2".to_owned(),
+                pos: 2
             }
         );
     }
@@ -93,7 +102,8 @@
             Token {
                 len: 4,
                 kind: TokenKind::Keyword(Keyword::Boolean),
-                raw: "true".to_owned()
+                raw: "true".to_owned(),
+                pos: 0
             }
         );
 
@@ -102,7 +112,8 @@
             Token {
                 len: 5,
                 kind: TokenKind::Keyword(Keyword::Boolean),
-                raw: "false".to_owned()
+                raw: "false".to_owned(),
+                pos: 5
             }
         );
     }
@@ -116,7 +127,8 @@
             Token {
                 len: 2,
                 kind: TokenKind::Keyword(Keyword::Function),
-                raw: "fn".to_owned()
+                raw: "fn".to_owned(),
+                pos: 0
             }
         );
     }
@@ -125,8 +137,8 @@
     fn test_comments() {
         let mut tokens = tokenize(
             "
-        // foo
-        fn fib() {}
+// foo
+fn fib() {}
         ",
         )
         .into_iter()
@@ -142,6 +154,7 @@
                 len: 6,
                 kind: TokenKind::Comment,
                 raw: "// foo".to_owned(),
+                pos: 1
             }
         );
 
@@ -151,6 +164,7 @@
                 len: 2,
                 kind: TokenKind::Keyword(Keyword::Function),
                 raw: "fn".to_owned(),
+                pos: 8
             }
         );
     }
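
Usage note, not part of the patch: a minimal sketch of how the new `pos`
field might be consumed downstream, for example to turn a token's byte
offset into a line/column pair for error messages. Only `tokenize` and
`Token::pos` come from the patch above; `line_col`, `source`, and the
module path are hypothetical names for illustration:

    use crate::lexer::tokenize;

    /// Hypothetical helper: convert a byte offset into a 1-based
    /// (line, column) pair by scanning the source text up to `pos`.
    fn line_col(source: &str, pos: usize) -> (usize, usize) {
        let before = &source[..pos];
        // Lines are counted by the newlines preceding `pos`; the column
        // is the distance to the previous newline (or to the start).
        let line = before.matches('\n').count() + 1;
        let col = pos - before.rfind('\n').map_or(0, |i| i + 1) + 1;
        (line, col)
    }

    fn main() {
        let source = "1 = 2";
        for token in tokenize(source) {
            // `token.pos` is the byte offset where the token starts,
            // i.e. initial_length - len at the time the cursor was made.
            let (line, col) = line_col(source, token.pos);
            println!("{:?} at {}:{}", token.raw, line, col);
        }
    }

Since `pos` is a byte offset, this helper slices at token boundaries,
which are always valid char boundaries in the input string.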