Browse Source

UTF-8 (#47)

* lex: check for unicode whitespace

* lex: add unicode support

* changelog: add unicode entry
pull/51/head
Garrit Franke 3 years ago committed by GitHub
parent
commit
4930da315d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      CHANGELOG.md
  2. 32
      Cargo.lock
  3. 3
      Cargo.toml
  4. 52
      src/lexer/mod.rs
  5. 2
      src/main.rs
  6. 2
      tests/main.sb
  7. 3
      tests/structs.sb
  8. 21
      tests/unicode.sb

1
CHANGELOG.md

@ -13,6 +13,7 @@
- Support for `_` character in integers (E.g. `1_000_000`)
- Parser errors have been improved in consistency and readability
- Compile to stdout by using the `-o -` flag
- Proper support for UTF-8
**Fixes**

32
Cargo.lock generated

@ -2,9 +2,9 @@
# It is not intended for manual editing.
[[package]]
name = "aho-corasick"
version = "0.7.15"
version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
dependencies = [
"memchr",
]
@ -23,6 +23,8 @@ name = "antimony-lang"
version = "0.6.0"
dependencies = [
"inkwell",
"lazy_static",
"regex",
"rust-embed",
"structopt",
]
@ -166,9 +168,9 @@ dependencies = [
[[package]]
name = "memchr"
version = "2.3.4"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
[[package]]
name = "once_cell"
@ -251,21 +253,20 @@ checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
[[package]]
name = "regex"
version = "1.4.3"
version = "1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
name = "regex-syntax"
version = "0.6.22"
version = "0.6.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
[[package]]
name = "rust-embed"
@ -385,15 +386,6 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "thread_local"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd"
dependencies = [
"once_cell",
]
[[package]]
name = "unicode-segmentation"
version = "1.7.1"
@ -408,9 +400,9 @@ checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
[[package]]
name = "unicode-xid"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
[[package]]
name = "vec_map"

3
Cargo.toml

@ -21,4 +21,5 @@ llvm = ["inkwell"]
structopt = "0.3.21"
rust-embed = "5.7.0"
inkwell = { version = "0.1.0-beta.2", features = ["llvm10-0"], optional = true }
regex = "1.5.4"
lazy_static = "1.4.0"

52
src/lexer/mod.rs

@ -17,6 +17,8 @@ pub(crate) mod cursor;
use self::TokenKind::*;
use cursor::Cursor;
use lazy_static::lazy_static;
use regex::Regex;
#[cfg(test)]
mod tests;
@ -178,24 +180,48 @@ pub fn first_token(input: &str, pos: &mut Position) -> Token {
}
/// True if `c` is treated as whitespace by the lexer.
///
/// The accepted set is the one listed in the Rust reference
/// (<https://doc.rust-lang.org/reference/whitespace.html>): the usual ASCII
/// whitespace characters, NEXT LINE, the two bidi marks and the dedicated
/// Unicode line/paragraph separators.
///
/// Note that NO-BREAK SPACE (`U+00A0`) is *not* part of this set.
pub fn is_whitespace(c: char) -> bool {
    matches!(
        c,
        // Usual ASCII suspects
        '\u{0009}' // \t
        | '\u{000A}' // \n
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // \r
        | '\u{0020}' // space
        // NEXT LINE from latin1
        | '\u{0085}'
        // Bidi markers
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK
        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
    )
}
/// True if `c` is a valid first character of an identifier.
///
/// A valid first character is an underscore or any Unicode letter (general
/// category `L`, i.e. the `\pL` class of the `regex` crate).
///
/// See [Antimony specification](https://antimony-lang.github.io/antimony/developers/specification.html#identifiers) for
/// a formal definition of valid identifier name.
pub fn is_id_start(c: char) -> bool {
    lazy_static! {
        static ref ID_START: Regex = Regex::new(r"[\pL_]").unwrap();
    }

    // Fast path: within ASCII, `[\pL_]` is exactly `[A-Za-z_]`, so the regex
    // engine is only needed for non-ASCII input.
    if c.is_ascii() {
        return c.is_ascii_alphabetic() || c == '_';
    }

    // Encode into a stack buffer instead of allocating a `String` per call.
    let mut buf = [0u8; 4];
    ID_START.is_match(c.encode_utf8(&mut buf))
}
/// True if `c` is a valid continuation (non-first character) of an identifier.
///
/// A valid continuation is an underscore, any Unicode letter (general
/// category `L`) or any Unicode decimal digit (general category `Nd`).
///
/// See [Antimony specification](https://antimony-lang.github.io/antimony/developers/specification.html#identifiers) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    lazy_static! {
        static ref ID_CONTINUE: Regex = Regex::new(r"[\pL\p{Nd}_]").unwrap();
    }

    // Fast path: within ASCII, `[\pL\p{Nd}_]` is exactly `[A-Za-z0-9_]`, so
    // the regex engine is only needed for non-ASCII input.
    if c.is_ascii() {
        return c.is_ascii_alphanumeric() || c == '_';
    }

    // Encode into a stack buffer instead of allocating a `String` per call.
    let mut buf = [0u8; 4];
    ID_CONTINUE.is_match(c.encode_utf8(&mut buf))
}
impl Cursor<'_> {
@ -330,7 +356,7 @@ impl Cursor<'_> {
{
let mut eaten: usize = 0;
while predicate(self.first()) && !self.is_eof() {
eaten += 1;
eaten += self.first().len_utf8();
self.bump();
}

2
src/main.rs

@ -1,3 +1,5 @@
extern crate lazy_static;
extern crate regex;
/**
* Copyright 2020 Garrit Franke
*

2
tests/main.sb

@ -10,6 +10,7 @@ import "imports"
import "numbers"
import "structs"
import "types"
import "unicode"
fn main() {
@ -21,6 +22,7 @@ fn main() {
numbers_main()
structs_main()
types_main()
unicode_main()
log_test_stage("Done!")
}

3
tests/structs.sb

@ -1,4 +1,4 @@
fn functions_main() {
fn structs_main() {
log_test_stage("Testing structs")
test_initialization()
test_simple_field_access()
@ -34,6 +34,7 @@ fn user_stub() {
}
fn test_initialization() {
println("test_initialization")
let foo = new User {
username: "Foo Bar"
first_name: "Bar"

21
tests/unicode.sb

@ -0,0 +1,21 @@
// Entry point of the unicode test suite: announces the stage, then runs
// the string test followed by the identifier test.
fn unicode_main() {
log_test_stage("Testing unicode")
test_unicode_strings()
test_unicode_identifiers()
}
// Binds a string literal containing non-ASCII (Greek) characters to an
// ASCII-named variable and prints it.
fn test_unicode_strings() {
println("Test unicode strings")
let alpha_omega = "αβ"
println(alpha_omega)
}
// Declares variables whose names consist of non-ASCII letters (Greek and
// CJK) and prints one of them.
// NOTE(review): `αβ` is declared but never read; only `世界` is printed.
fn test_unicode_identifiers() {
println("Test unicode identifiers")
let αβ = "αβ"
let 世界 = "世界"
println(世界)
}
Loading…
Cancel
Save