From 62c2faa33a3aadd2abc7a70330a2fd76858e6611 Mon Sep 17 00:00:00 2001 From: Robert Bendun Date: Mon, 2 May 2022 19:24:29 +0200 Subject: [PATCH] Parsing symbols --- src/lexer.cc | 15 +++++++++++++-- src/musique.hh | 12 ++++++++++++ src/tests/lex.cc | 12 ++++++++++++ src/unicode.cc | 7 +++++++ 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/lexer.cc b/src/lexer.cc index a943c88..33e1801 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -80,13 +80,24 @@ auto Lexer::next_token() -> Result break; } - if (unicode::is_letter(peek())) { - assert(false && "symbols are not implemented yet"); + if (unicode::is_identifier(peek(), unicode::First_Character::No)) { + goto symbol_lexing; } return { Token::Type::Chord, finish(), token_location }; } + using namespace std::placeholders; + if (consume_if(std::bind(unicode::is_identifier, _1, unicode::First_Character::Yes))) { + symbol_lexing: + for (auto predicate = std::bind(unicode::is_identifier, _1, unicode::First_Character::No); + consume_if(predicate); + ) { + } + + return { Token::Type::Symbol, finish(), token_location }; + } + return errors::unrecognized_character(peek(), token_location); } diff --git a/src/musique.hh b/src/musique.hh index 00d57e4..31e126b 100644 --- a/src/musique.hh +++ b/src/musique.hh @@ -111,9 +111,21 @@ namespace unicode constexpr u32 Max_Bytes = 4; } + // is_digit returns true if `digit` is an ASCII digit bool is_digit(u32 digit); + + // is_space returns true if `space` is an ASCII blank character bool is_space(u32 space); + + // is_letter returns true if `letter` is considered a letter by Unicode bool is_letter(u32 letter); + + // is_identifier returns true if `letter` is a valid character for an identifier. 
+ // + // It's modified by the is_first_character flag, which decides whether some character classes + // (like digits, which are only allowed NOT at the front of the identifier) are accepted + enum class First_Character : bool { Yes = true, No = false }; + bool is_identifier(u32 letter, First_Character is_first_character); } namespace utf8 diff --git a/src/tests/lex.cc b/src/tests/lex.cc index 4cfed50..aaae51a 100644 --- a/src/tests/lex.cc +++ b/src/tests/lex.cc @@ -98,4 +98,16 @@ suite lexer_test = [] { expect_token_type_and_value(Token::Type::Chord, "f1'2'3'5'7'"); expect_token_type_and_value(Token::Type::Chord, "b1,2,5,7,"); }; + + "Symbol literals"_test = [] { + expect_token_type_and_value(Token::Type::Symbol, "i"); + expect_token_type_and_value(Token::Type::Symbol, "i2"); + expect_token_type_and_value(Token::Type::Symbol, "example"); + expect_token_type_and_value(Token::Type::Symbol, "d1envelope"); + expect_token_type_and_value(Token::Type::Symbol, "kebab-case"); + expect_token_type_and_value(Token::Type::Symbol, "snake_case"); + expect_token_type_and_value(Token::Type::Symbol, "camelCase"); + expect_token_type_and_value(Token::Type::Symbol, "PascalCase"); + expect_token_type_and_value(Token::Type::Symbol, "haskell'"); + }; }; diff --git a/src/unicode.cc b/src/unicode.cc index 82b851d..f91aa9c 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -103,3 +103,10 @@ bool unicode::is_letter(u32 letter) // TODO Unicode letters handling return std::isalpha(letter); } + +bool unicode::is_identifier(u32 letter, unicode::First_Character is_first_character) +{ + return (unicode::is_letter(letter) || letter == '_') + || (!bool(is_first_character) && ( + letter == '-' || letter == '_' || letter == '\'' || unicode::is_digit(letter))); +}