From fd4db02e8dad2120ef767736905ea3dea9fa7b1d Mon Sep 17 00:00:00 2001 From: Robert Bendun Date: Wed, 27 Apr 2022 14:37:21 +0200 Subject: [PATCH] Lexer skips whitespace --- src/lexer.cc | 72 +++++++++++++++++++++++++++++++++--------------- src/musique.hh | 41 +++++++++++++++++++++++---- src/tests/lex.cc | 4 ++- src/unicode.cc | 19 +++++++++++++ 4 files changed, 108 insertions(+), 28 deletions(-) diff --git a/src/lexer.cc b/src/lexer.cc index a6bf301..e80e92c 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -2,52 +2,65 @@ auto Lexer::next_token() -> Result { - auto current = source; + while (consume_if(unicode::is_space)) { + } + start(); - auto c = next_rune(); - - if (c == 0) + if (peek() == 0) { return errors::End_Of_File; + } - switch (c) { - case '(': return { Token::Type::Open_Paren, current.substr(0, 1) }; - case ')': return { Token::Type::Close_Paren, current.substr(0, 1) }; - case '[': return { Token::Type::Open_Block, current.substr(0, 1) }; - case ']': return { Token::Type::Close_Block, current.substr(0, 1) }; - case '|': return { Token::Type::Variable_Separator, current.substr(0, 1) }; + switch (peek()) { + case '(': consume(); return { Token::Type::Open_Paren, finish() }; + case ')': consume(); return { Token::Type::Close_Paren, finish() }; + case '[': consume(); return { Token::Type::Open_Block, finish() }; + case ']': consume(); return { Token::Type::Close_Block, finish() }; + case '|': consume(); return { Token::Type::Variable_Separator, finish() }; } // Number literals like .75 - if (c == '.') { - while ((c = next_rune()) && std::isdigit(c)) {} - if (source.data() - current.data() != 1) - return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) }; + if (peek() == '.') { + consume(); + while (consume_if(unicode::is_digit)) {} + if (token_length != 1) + return { Token::Type::Numeric, finish() }; } - if (std::isdigit(c)) { - while ((c = next_rune()) && std::isdigit(c)) {} - if (c == '.') { + if (consume_if(unicode::is_digit)) { + while (consume_if(unicode::is_digit)) {} + if (peek() == '.') { + consume(); bool looped = false; - while ((c = next_rune()) && std::isdigit(c)) { looped = true; } + while (consume_if(unicode::is_digit)) { looped = true; } if (not looped) { // If '.' is not followed by any digits, then '.' is not part of numeric literals // and only part before it is considered valid rewind(); } } - return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) }; + return { Token::Type::Numeric, finish() }; } - - return {}; + return errors::Unrecognized_Character; } -auto Lexer::next_rune() -> u32 +auto Lexer::peek() const -> u32 +{ + if (not source.empty()) { + if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) { + return rune; + } + } + return 0; +} + +auto Lexer::consume() -> u32 { if (not source.empty()) { if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) { last_rune_length = remaining.data() - source.data(); source = remaining; + token_length += last_rune_length; return rune; } } @@ -57,6 +70,21 @@ auto Lexer::next_rune() -> u32 void Lexer::rewind() { source = { source.data() - last_rune_length, source.size() + last_rune_length }; + token_length -= last_rune_length; +} + +void Lexer::start() +{ + token_start = source.data(); + token_length = 0; +} + +std::string_view Lexer::finish() +{ + std::string_view result { token_start, token_length }; + token_start = nullptr; + token_length = 0; + return result; } std::ostream& operator<<(std::ostream& os, Token const&) diff --git a/src/musique.hh b/src/musique.hh index dfcab89..8572258 100644 --- a/src/musique.hh +++ b/src/musique.hh @@ -18,11 +18,14 @@ using i64 = std::int64_t; using usize = std::size_t; using isize = std::ptrdiff_t; +#define Fun(Function) ([](T&& ...args) { return (Function)(std::forward(args)...); }) + namespace errors { enum Type { - End_Of_File + End_Of_File, + Unrecognized_Character }; } @@ -60,11 +63,22 @@ std::ostream& operator<<(std::ostream& os, Error const& err); *std::move(try_value); \ }) +namespace unicode +{ + inline namespace special_runes + { + constexpr u32 Rune_Error = 0xfffd; + constexpr u32 Rune_Self = 0x80; + constexpr u32 Max_Bytes = 4; + } + + bool is_digit(u32 digit); + bool is_space(u32 space); +} + namespace utf8 { - constexpr u32 Rune_Error = 0xfffd; - constexpr u32 Rune_Self = 0x80; - constexpr u32 Max_Bytes = 4; + using namespace unicode::special_runes; // Decodes rune and returns remaining string auto decode(std::string_view str) -> std::pair; @@ -109,6 +123,9 @@ struct Lexer // Used for rewinding u32 last_rune_length = 0; + char const* token_start = nullptr; + usize token_length = 0; + // Determine location of tokens to produce nice errors std::string_view source_name = ""; unsigned column = 1, row = 1; @@ -116,8 +133,22 @@ struct Lexer auto next_token() -> Result; // Finds next rune in source - auto next_rune() -> u32; + auto peek() const -> u32; + + // Finds next rune in source and returns it, advancing the string + auto consume() -> u32; + + inline auto consume_if(auto test) -> u32 + { + return test(peek()) && (consume(), true); + } // Goes back last rune void rewind(); + + // Marks begin of token + void start(); + + // Marks end of token and returns it's matching source + std::string_view finish(); }; diff --git a/src/tests/lex.cc b/src/tests/lex.cc index a6e50e9..617d78d 100644 --- a/src/tests/lex.cc +++ b/src/tests/lex.cc @@ -29,7 +29,7 @@ static void expect_token_type_and_value( auto result = lexer.next_token(); expect(result.has_value() >> fatal, sl) << "have not parsed any tokens"; expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected"; - expect(eq(result->source, expected)) << "tokenized source is not equal to original"; + expect(eq(result->source, expected), sl) << "tokenized source is not equal to original"; } static void expect_token_type_and_value( @@ -64,5 +64,7 @@ suite lexer_test = [] { expect_token_type_and_value(Token::Type::Numeric, "0.75"); expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789"); expect_token_type_and_value(Token::Type::Numeric, "123.", "123"); + expect_token_type_and_value(Token::Type::Numeric, " 1 ", "1"); + expect_token_type_and_value(Token::Type::Numeric, " 123 ", "123"); }; }; diff --git a/src/unicode.cc b/src/unicode.cc index 9bd847c..ae102e4 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -45,3 +45,22 @@ auto utf8::decode(std::string_view s) -> std::pair return { result, s }; } + +bool unicode::is_digit(u32 digit) +{ + return digit >= '0' && digit <= '9'; +} + +bool unicode::is_space(u32 space) +{ + switch (space) { + case ' ': + case '\t': + case '\n': + case '\f': + case '\r': + case '\v': + return true; + } + return false; +}