unicode support & location in errors
This commit is contained in:
parent
8d0507e341
commit
43223e685e
@ -5,7 +5,24 @@ bool Error::operator==(errors::Type type)
|
|||||||
return this->type == type;
|
return this->type == type;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, Error const&)
|
std::ostream& operator<<(std::ostream& os, Error const& err)
|
||||||
{
|
{
|
||||||
return os << "generic error";
|
if (err.location) {
|
||||||
|
os << *err.location;
|
||||||
|
} else {
|
||||||
|
os << "musique";
|
||||||
|
}
|
||||||
|
|
||||||
|
os << ": error: ";
|
||||||
|
|
||||||
|
switch (err.type) {
|
||||||
|
case errors::End_Of_File:
|
||||||
|
return os << "end of file\n";
|
||||||
|
|
||||||
|
case errors::Unrecognized_Character:
|
||||||
|
return os << "unrecognized charater 0x" << std::hex << err.invalid_character
|
||||||
|
<< "(char: '" << utf8::Print(err.invalid_character) << "')\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
return os << "unrecognized error type\n";
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,7 @@
|
|||||||
|
|
||||||
auto Lexer::next_token() -> Result<Token>
|
auto Lexer::next_token() -> Result<Token>
|
||||||
{
|
{
|
||||||
while (consume_if(unicode::is_space)) {
|
while (consume_if(unicode::is_space)) {}
|
||||||
}
|
|
||||||
start();
|
start();
|
||||||
|
|
||||||
if (peek() == 0) {
|
if (peek() == 0) {
|
||||||
|
@ -2,9 +2,11 @@
|
|||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <optional>
|
||||||
#include <ostream>
|
#include <ostream>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <tl/expected.hpp>
|
#include <tl/expected.hpp>
|
||||||
|
#include <variant>
|
||||||
|
|
||||||
using u8 = std::uint8_t;
|
using u8 = std::uint8_t;
|
||||||
using u16 = std::uint16_t;
|
using u16 = std::uint16_t;
|
||||||
@ -53,7 +55,8 @@ std::ostream& operator<<(std::ostream& os, Location const& location);
|
|||||||
struct Error
|
struct Error
|
||||||
{
|
{
|
||||||
errors::Type type;
|
errors::Type type;
|
||||||
Error *child = nullptr;
|
std::optional<Location> location = std::nullopt;
|
||||||
|
u32 invalid_character = 0;
|
||||||
|
|
||||||
bool operator==(errors::Type);
|
bool operator==(errors::Type);
|
||||||
};
|
};
|
||||||
@ -102,9 +105,14 @@ namespace utf8
|
|||||||
using namespace unicode::special_runes;
|
using namespace unicode::special_runes;
|
||||||
|
|
||||||
// Decodes rune and returns remaining string
|
// Decodes rune and returns remaining string
|
||||||
auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
|
auto decode(std::string_view s) -> std::pair<u32, std::string_view>;
|
||||||
|
auto length(std::string_view s) -> usize;
|
||||||
|
|
||||||
|
// Wrapper that marks a code point for output as text: the operator<< overload
// taking utf8::Print writes `rune` to the stream as UTF-8 encoded bytes.
struct Print { u32 rune; };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& os, utf8::Print const& print);
|
||||||
|
|
||||||
struct Token
|
struct Token
|
||||||
{
|
{
|
||||||
enum class Type
|
enum class Type
|
||||||
|
21
src/tests/unicode.cc
Normal file
21
src/tests/unicode.cc
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
#include <boost/ut.hpp>
|
||||||
|
#include <musique.hh>
|
||||||
|
|
||||||
|
using namespace boost::ut;
|
||||||
|
using namespace std::string_view_literals;
|
||||||
|
|
||||||
|
// Unit tests for the UTF-8 helpers: one sample per encoded length (1 to 4
// bytes) plus the inputs for which utf8::length reports 0.
// Non-ASCII samples are written as universal-character escapes so the test
// data does not depend on how this source file itself is encoded.
suite utf8_test = [] {
    "UTF-8 Character length"_test = [] {
        expect(utf8::length(" ") == 1_u);
        expect(utf8::length("\u0105") == 2_u);
        expect(utf8::length("\u2705") == 3_u);
        expect(utf8::length("\U000132d1") == 4_u);

        // Nothing to measure: empty input.
        expect(utf8::length("") == 0_u);
        // A continuation byte (0b10xx'xxxx) cannot start a sequence.
        expect(utf8::length("\x80") == 0_u);
    };

    "UTF-8 Character decoding"_test = [] {
        expect(eq(utf8::decode(" ").first, 0x20u));
        expect(eq(utf8::decode("\u0105").first, 0x105u));
        expect(eq(utf8::decode("\u2705").first, 0x2705u));
        expect(eq(utf8::decode("\U000132d1").first, 0x132d1u));
    };
};
|
@ -1,30 +1,35 @@
|
|||||||
#include "musique.hh"
|
#include "musique.hh"
|
||||||
|
|
||||||
|
static constexpr std::array<u8, 4> payloads {
|
||||||
|
0b0111'1111, 0b0001'1111, 0b0000'1111, 0b0000'0111
|
||||||
|
};
|
||||||
|
|
||||||
|
static constexpr std::array<u8, 4> patterns {
|
||||||
|
0b0000'0000, 0b1100'0000, 0b1110'0000, 0b1111'0000
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr auto payload_cont = 0b0011'1111;
|
||||||
|
constexpr auto pattern_cont = 0b1000'0000;
|
||||||
|
|
||||||
|
// Reports how many bytes the UTF-8 sequence at the front of `s` occupies,
// judged from its leading byte alone (the following bytes are not inspected).
//
// Returns 1..4 when the first byte matches a leading-byte pattern, and 0 when
// `s` is empty or the first byte matches none of them.
auto utf8::length(std::string_view s) -> usize
{
    if (s.empty()) {
        return 0;
    }

    auto const lead = u8(s.front());
    for (auto index = 0u; index < payloads.size(); ++index) {
        // Bits outside the payload mask are the length marker.
        auto const marker_bits = lead & ~payloads[index];
        if (marker_bits == patterns[index]) {
            return index + 1;
        }
    }

    return 0;
}
|
||||||
|
|
||||||
auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
|
auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
|
||||||
{
|
{
|
||||||
static constexpr std::array<u8, 4> payloads {
|
|
||||||
0b0111'1111, 0b0001'1111, 0b0000'1111, 0b0000'0111
|
|
||||||
};
|
|
||||||
|
|
||||||
static constexpr std::array<u8, 4> patterns {
|
|
||||||
0b0000'0000, 0b1100'0000, 0b1110'0000, 0b1111'0000
|
|
||||||
};
|
|
||||||
|
|
||||||
constexpr auto payload_cont = 0b0011'1111;
|
|
||||||
constexpr auto pattern_cont = 0b1000'0000;
|
|
||||||
|
|
||||||
if (s.empty()) {
|
if (s.empty()) {
|
||||||
return { utf8::Rune_Error, s };
|
return { utf8::Rune_Error, s };
|
||||||
}
|
}
|
||||||
|
|
||||||
usize length = 0;
|
usize length = utf8::length(s);
|
||||||
|
|
||||||
for (auto i = 0u; i < payloads.size(); ++i) {
|
|
||||||
if ((s.front() & ~payloads[i]) == patterns[i]) {
|
|
||||||
length = i+1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (length == 0 || s.size() < length) {
|
if (length == 0 || s.size() < length) {
|
||||||
return { utf8::Rune_Error, s };
|
return { utf8::Rune_Error, s };
|
||||||
@ -34,9 +39,9 @@ auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
|
|||||||
|
|
||||||
while (--length > 0) {
|
while (--length > 0) {
|
||||||
s.remove_prefix(1);
|
s.remove_prefix(1);
|
||||||
if ((s.front() & ~payload_cont) == pattern_cont)
|
if ((s.front() & ~payload_cont) == pattern_cont) {
|
||||||
return { utf8::Rune_Error, s };
|
return { utf8::Rune_Error, s };
|
||||||
|
}
|
||||||
result <<= 6;
|
result <<= 6;
|
||||||
result |= u32(s.front() & payload_cont);
|
result |= u32(s.front() & payload_cont);
|
||||||
}
|
}
|
||||||
@ -46,6 +51,34 @@ auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
|
|||||||
return { result, s };
|
return { result, s };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::ostream& operator<<(std::ostream& os, utf8::Print const& print)
|
||||||
|
{
|
||||||
|
auto r = print.rune;
|
||||||
|
std::array<u8, utf8::Max_Bytes> buffer;
|
||||||
|
unsigned length = 0;
|
||||||
|
|
||||||
|
if (r <= 0x7f) {
|
||||||
|
buffer[0] = r;
|
||||||
|
length = 1;
|
||||||
|
} else if (r <= 0x07'ff) {
|
||||||
|
buffer[0] = ((r >> 6) & 0x1f) | 0xc0;
|
||||||
|
buffer[1] = ((r >> 0) & 0x3f) | 0x80;
|
||||||
|
length = 2;
|
||||||
|
} else if (r <= 0xff'ff) {
|
||||||
|
buffer[0] = ((r >> 12) & 0x0f) | 0xe0;
|
||||||
|
buffer[1] = ((r >> 6) & 0x3f) | 0x80;
|
||||||
|
buffer[2] = ((r >> 0) & 0x3f) | 0x80;
|
||||||
|
length = 3;
|
||||||
|
} else {
|
||||||
|
buffer[0] = ((r >> 18) & 0x07) | 0xf0;
|
||||||
|
buffer[1] = ((r >> 12) & 0x3f) | 0x80;
|
||||||
|
buffer[2] = ((r >> 6) & 0x3f) | 0x80;
|
||||||
|
buffer[3] = ((r >> 0) & 0x3f) | 0x80;
|
||||||
|
length = 4;
|
||||||
|
}
|
||||||
|
return os.write((char const*)buffer.data(), length);
|
||||||
|
}
|
||||||
|
|
||||||
bool unicode::is_digit(u32 digit)
|
bool unicode::is_digit(u32 digit)
|
||||||
{
|
{
|
||||||
return digit >= '0' && digit <= '9';
|
return digit >= '0' && digit <= '9';
|
||||||
|
Loading…
Reference in New Issue
Block a user