From e833cd65231bf9e8e7bc7e5b55cc7cf5e7f8df62 Mon Sep 17 00:00:00 2001
From: Robert Bendun
Date: Wed, 27 Apr 2022 13:48:50 +0200
Subject: [PATCH] Lexer: number literals, one rune tokens

---
 Makefile         |  3 ++-
 src/lexer.cc     | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/musique.hh   | 36 +++++++++++++++++++++++++-
 src/tests/lex.cc | 15 ++++++++++-
 src/unicode.cc   | 52 +++++++++++++++++++++++++++++++++++++
 5 files changed, 170 insertions(+), 3 deletions(-)
 create mode 100644 src/unicode.cc

diff --git a/Makefile b/Makefile
index 87b974a..6832998 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,8 @@ CXXFLAGS=-std=c++20 -Wall -Wextra -O2 -Werror=switch
 CPPFLAGS=-Ilib/expected/ -Ilib/ut/ -Isrc/
 
 Obj=bin/lexer.o \
-	bin/errors.o
+	bin/errors.o \
+	bin/unicode.o
 
 all: bin/musique bin/unit-tests
 
diff --git a/src/lexer.cc b/src/lexer.cc
index 3311889..a6bf301 100644
--- a/src/lexer.cc
+++ b/src/lexer.cc
@@ -2,9 +2,76 @@
 
 auto Lexer::next_token() -> Result<Token>
 {
+	auto current = source;
+
+	auto c = next_rune();
+
+	if (c == 0)
+		return errors::End_Of_File;
+
+	switch (c) {
+	case '(': return { Token::Type::Open_Paren, current.substr(0, 1) };
+	case ')': return { Token::Type::Close_Paren, current.substr(0, 1) };
+	case '[': return { Token::Type::Open_Block, current.substr(0, 1) };
+	case ']': return { Token::Type::Close_Block, current.substr(0, 1) };
+	case '|': return { Token::Type::Variable_Separator, current.substr(0, 1) };
+	}
+
+	// std::isdigit is undefined for values not representable as unsigned char,
+	// so runes are compared with the ASCII digit range directly
+	auto const is_digit = [](u32 rune) { return rune >= '0' && rune <= '9'; };
+
+	// Consumes a run of digits and leaves the first non-digit rune unread.
+	// Returns true if at least one digit was consumed
+	auto const consume_digits = [&] {
+		bool consumed = false;
+		while (is_digit(next_rune())) { consumed = true; }
+		rewind();
+		return consumed;
+	};
+
+	if (c == '.') {
+		// Number literals like .75
+		if (consume_digits())
+			return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) };
+	} else if (is_digit(c)) {
+		consume_digits();
+		// If '.' is not followed by any digits, then '.' is not part of numeric literals
+		// and only part before it is considered valid
+		auto const before_dot = source;
+		if (next_rune() != '.') {
+			rewind();
+		} else if (not consume_digits()) {
+			source = before_dot;
+		}
+		return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) };
+	}
+
+	return {};
 
 }
 
+auto Lexer::next_rune() -> u32
+{
+	// Reset first, so that rewind() is a no-op when nothing gets consumed below
+	last_rune_length = 0;
+	if (not source.empty()) {
+		if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
+			last_rune_length = remaining.data() - source.data();
+			source = remaining;
+			return rune;
+		}
+	}
+	return 0;
+}
+
+void Lexer::rewind()
+{
+	source = { source.data() - last_rune_length, source.size() + last_rune_length };
+	// Only one rune can be taken back, so a repeated call must not move any further
+	last_rune_length = 0;
+}
+
 std::ostream& operator<<(std::ostream& os, Token const&)
 {
 	os << "Token";
diff --git a/src/musique.hh b/src/musique.hh
index 6314d16..dfcab89 100644
--- a/src/musique.hh
+++ b/src/musique.hh
@@ -15,6 +15,8 @@ using i16 = std::int16_t;
 using i32 = std::int32_t;
 using i64 = std::int64_t;
 
+using usize = std::size_t;
+using isize = std::ptrdiff_t;
 
 namespace errors
 {
@@ -33,7 +35,20 @@ struct Error
 };
 
 template<typename T>
-using Result = tl::expected<T, Error>;
+struct Result : tl::expected<T, Error>
+{
+	constexpr Result() = default;
+
+	constexpr Result(errors::Type error) : tl::expected<T, Error>(tl::unexpected(Error { error }))
+	{
+	}
+
+	template<typename ...Args>
+	constexpr Result(Args&& ...args)
+		: tl::expected<T, Error>( T{ std::forward<Args>(args)... } )
+	{
+	}
+};
 
 std::ostream& operator<<(std::ostream& os, Error const& err);
 
@@ -45,6 +60,16 @@ std::ostream& operator<<(std::ostream& os, Error const& err);
 		*std::move(try_value); \
 	})
 
+namespace utf8
+{
+	constexpr u32 Rune_Error = 0xfffd;
+	constexpr u32 Rune_Self = 0x80;
+	constexpr u32 Max_Bytes = 4;
+
+	// Decodes first rune of str. Returns it with the rest of str, or Rune_Error if str is empty or not valid UTF-8
+	auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
+}
+
 struct Token
 {
 	enum class Type
@@ -81,9 +106,18 @@ struct Lexer
 	// Source that is beeing lexed
 	std::string_view source;
 
+	// Byte length of rune consumed by last next_rune(), 0 if none. Used for rewinding
+	u32 last_rune_length = 0;
+
 	// Determine location of tokens to produce nice errors
 	std::string_view source_name = "";
 	unsigned column = 1, row = 1;
 
 	auto next_token() -> Result<Token>;
+
+	// Finds next rune in source
+	auto next_rune() -> u32;
+
+	// Goes back by the rune read in last next_rune(). Only one rune can be taken back
+	void rewind();
 };
diff --git a/src/tests/lex.cc b/src/tests/lex.cc
index 636c455..a6e50e9 100644
--- a/src/tests/lex.cc
+++ b/src/tests/lex.cc
@@ -22,15 +22,25 @@ static void expect_token_type(
 static void expect_token_type_and_value(
 	Token::Type expected_type,
 	std::string_view source,
+	std::string_view expected,
 	reflection::source_location const& sl = reflection::source_location::current())
 {
 	Lexer lexer{source};
 	auto result = lexer.next_token();
 	expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
 	expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
-	expect(eq(result->source, source)) << "tokenized source is not equal to original";
+	expect(eq(result->source, expected)) << "tokenized source is not equal to original";
 }
 
+static void expect_token_type_and_value(
+	Token::Type expected_type,
+	std::string_view source,
+	reflection::source_location const& sl = reflection::source_location::current())
+{
+	expect_token_type_and_value(expected_type, source, source, sl);
+}
+
+
 suite lexer_test = [] {
 	"Empty file"_test = [] {
 		Lexer lexer{""};
@@ -53,5 +63,8 @@ suite lexer_test = [] {
 		expect_token_type_and_value(Token::Type::Numeric, ".75");
 		expect_token_type_and_value(Token::Type::Numeric, "0.75");
 		expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
+		expect_token_type_and_value(Token::Type::Numeric, "123.", "123");
+		expect_token_type_and_value(Token::Type::Numeric, "123)", "123");
+		expect_token_type_and_value(Token::Type::Numeric, "1.5]", "1.5");
 	};
 };
diff --git a/src/unicode.cc b/src/unicode.cc
new file mode 100644
index 0000000..9bd847c
--- /dev/null
+++ b/src/unicode.cc
@@ -0,0 +1,52 @@
+#include "musique.hh"
+
+auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
+{
+	static constexpr std::array payloads {
+		0b0111'1111, 0b0001'1111, 0b0000'1111, 0b0000'0111
+	};
+
+	static constexpr std::array patterns {
+		0b0000'0000, 0b1100'0000, 0b1110'0000, 0b1111'0000
+	};
+
+	constexpr auto payload_cont = 0b0011'1111;
+	constexpr auto pattern_cont = 0b1000'0000;
+
+	if (s.empty()) {
+		return { utf8::Rune_Error, s };
+	}
+
+	// char may be signed, so bytes >= 0x80 would be sign-extended by integer promotion
+	// and never match the patterns below. Read them through unsigned char instead
+	auto const byte = [&s] { return int(static_cast<unsigned char>(s.front())); };
+
+	usize length = 0;
+
+	for (auto i = 0u; i < payloads.size(); ++i) {
+		if ((byte() & ~payloads[i]) == patterns[i]) {
+			length = i+1;
+			break;
+		}
+	}
+
+	if (length == 0 || s.size() < length) {
+		return { utf8::Rune_Error, s };
+	}
+
+	u32 result = byte() & payloads[length-1];
+
+	while (--length > 0) {
+		s.remove_prefix(1);
+		// Bytes after the first one must be continuation bytes (10xx'xxxx)
+		if ((byte() & ~payload_cont) != pattern_cont)
+			return { utf8::Rune_Error, s };
+
+		result <<= 6;
+		result |= u32(byte() & payload_cont);
+	}
+
+	s.remove_prefix(1);
+
+	return { result, s };
+}