diff --git a/src/errors.cc b/src/errors.cc index 7971abe..3dffd26 100644 --- a/src/errors.cc +++ b/src/errors.cc @@ -5,7 +5,24 @@ bool Error::operator==(errors::Type type) return this->type == type; } -std::ostream& operator<<(std::ostream& os, Error const&) +std::ostream& operator<<(std::ostream& os, Error const& err) { - return os << "generic error"; + if (err.location) { + os << *err.location; + } else { + os << "musique"; + } + + os << ": error: "; + + switch (err.type) { + case errors::End_Of_File: + return os << "end of file\n"; + + case errors::Unrecognized_Character: + return os << "unrecognized charater 0x" << std::hex << err.invalid_character + << "(char: '" << utf8::Print(err.invalid_character) << "')\n"; + } + + return os << "unrecognized error type\n"; } diff --git a/src/lexer.cc b/src/lexer.cc index df5b719..5eb3321 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -2,8 +2,7 @@ auto Lexer::next_token() -> Result { - while (consume_if(unicode::is_space)) { - } + while (consume_if(unicode::is_space)) {} start(); if (peek() == 0) { diff --git a/src/musique.hh b/src/musique.hh index 852fd14..6af9eab 100644 --- a/src/musique.hh +++ b/src/musique.hh @@ -2,9 +2,11 @@ #include #include +#include #include #include #include +#include using u8 = std::uint8_t; using u16 = std::uint16_t; @@ -53,7 +55,8 @@ std::ostream& operator<<(std::ostream& os, Location const& location); struct Error { errors::Type type; - Error *child = nullptr; + std::optional location = std::nullopt; + u32 invalid_character = 0; bool operator==(errors::Type); }; @@ -102,9 +105,14 @@ namespace utf8 using namespace unicode::special_runes; // Decodes rune and returns remaining string - auto decode(std::string_view str) -> std::pair; + auto decode(std::string_view s) -> std::pair; + auto length(std::string_view s) -> usize; + + struct Print { u32 rune; }; } +std::ostream& operator<<(std::ostream& os, utf8::Print const& print); + struct Token { enum class Type diff --git a/src/tests/unicode.cc b/src/tests/unicode.cc new file mode 100644 index 0000000..2c670ed --- /dev/null +++ b/src/tests/unicode.cc @@ -0,0 +1,21 @@ +#include +#include + +using namespace boost::ut; +using namespace std::string_view_literals; + +suite utf8_test = [] { + "UTF-8 Character length"_test = [] { + expect(utf8::length(" ") == 1_u); + expect(utf8::length("ą") == 2_u); + expect(utf8::length("\u2705") == 3_u); + expect(utf8::length("\U000132d1") == 4_u); + }; + + "UTF-8 Character decoding"_test = [] { + expect(eq(utf8::decode(" ").first, 0x20u)); + expect(eq(utf8::decode("ą").first, 0x105u)); + expect(eq(utf8::decode("\u2705").first, 0x2705u)); + expect(eq(utf8::decode("\U000132d1").first, 0x132d1u)); + }; +}; diff --git a/src/unicode.cc b/src/unicode.cc index ae102e4..3c6dcaa 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -1,30 +1,35 @@ #include "musique.hh" +static constexpr std::array payloads { + 0b0111'1111, 0b0001'1111, 0b0000'1111, 0b0000'0111 +}; + +static constexpr std::array patterns { + 0b0000'0000, 0b1100'0000, 0b1110'0000, 0b1111'0000 +}; + +constexpr auto payload_cont = 0b0011'1111; +constexpr auto pattern_cont = 0b1000'0000; + +auto utf8::length(std::string_view s) -> usize +{ + if (not s.empty()) { + for (auto i = 0u; i < payloads.size(); ++i) { + if ((u8(s.front()) & ~payloads[i]) == patterns[i]) { + return i+1; + } + } + } + return 0; +} + auto utf8::decode(std::string_view s) -> std::pair { - static constexpr std::array payloads { - 0b0111'1111, 0b0001'1111, 0b0000'1111, 0b0000'0111 - }; - - static constexpr std::array patterns { - 0b0000'0000, 0b1100'0000, 0b1110'0000, 0b1111'0000 - }; - - constexpr auto payload_cont = 0b0011'1111; - constexpr auto pattern_cont = 0b1000'0000; - if (s.empty()) { return { utf8::Rune_Error, s }; } - usize length = 0; - - for (auto i = 0u; i < payloads.size(); ++i) { - if ((s.front() & ~payloads[i]) == patterns[i]) { - length = i+1; - break; - } - } + usize length = utf8::length(s); if (length == 0 || s.size() < length) { return { utf8::Rune_Error, s }; @@ -34,9 +39,9 @@ auto utf8::decode(std::string_view s) -> std::pair while (--length > 0) { s.remove_prefix(1); - if ((s.front() & ~payload_cont) == pattern_cont) + if ((s.front() & ~payload_cont) == pattern_cont) { return { utf8::Rune_Error, s }; - + } result <<= 6; result |= u32(s.front() & payload_cont); } @@ -46,6 +51,34 @@ auto utf8::decode(std::string_view s) -> std::pair return { result, s }; } +std::ostream& operator<<(std::ostream& os, utf8::Print const& print) +{ + auto r = print.rune; + std::array buffer; + unsigned length = 0; + + if (r <= 0x7f) { + buffer[0] = r; + length = 1; + } else if (r <= 0x07'ff) { + buffer[0] = ((r >> 6) & 0x1f) | 0xc0; + buffer[1] = ((r >> 0) & 0x3f) | 0x80; + length = 2; + } else if (r <= 0xff'ff) { + buffer[0] = ((r >> 12) & 0x0f) | 0xe0; + buffer[1] = ((r >> 6) & 0x3f) | 0x80; + buffer[2] = ((r >> 0) & 0x3f) | 0x80; + length = 3; + } else { + buffer[0] = ((r >> 18) & 0x07) | 0xf0; + buffer[1] = ((r >> 12) & 0x3f) | 0x80; + buffer[2] = ((r >> 6) & 0x3f) | 0x80; + buffer[3] = ((r >> 0) & 0x3f) | 0x80; + length = 4; + } + return os.write((char const*)buffer.data(), length); +} + bool unicode::is_digit(u32 digit) { return digit >= '0' && digit <= '9';