Lexer: number literals, one rune tokens
This commit is contained in:
parent
562ca6597e
commit
e833cd6523
3
Makefile
3
Makefile
@ -3,7 +3,8 @@ CXXFLAGS=-std=c++20 -Wall -Wextra -O2 -Werror=switch
|
|||||||
CPPFLAGS=-Ilib/expected/ -Ilib/ut/ -Isrc/
|
CPPFLAGS=-Ilib/expected/ -Ilib/ut/ -Isrc/
|
||||||
|
|
||||||
Obj=bin/lexer.o \
|
Obj=bin/lexer.o \
|
||||||
bin/errors.o
|
bin/errors.o \
|
||||||
|
bin/unicode.o
|
||||||
|
|
||||||
all: bin/musique bin/unit-tests
|
all: bin/musique bin/unit-tests
|
||||||
|
|
||||||
|
54
src/lexer.cc
54
src/lexer.cc
@ -2,9 +2,63 @@
|
|||||||
|
|
||||||
// Lexes and returns the next token found at the beginning of `source`.
//
// Recognised so far:
//   - one rune tokens: ( ) [ ] |
//   - numeric literals: digits, digits.digits and .digits
//
// Returns errors::End_Of_File when next_rune() yields no rune. For any other
// input a default constructed result is returned (the rune stays consumed).
//
// The rune that terminates a numeric literal is never consumed, so it remains
// available for the following token. A '.' that is not followed by a digit is
// not part of the literal: "123." yields "123" and leaves "." in `source`.
auto Lexer::next_token() -> Result<Token>
{
	auto const start = source;

	auto const first = next_rune();
	if (first == 0)
		return errors::End_Of_File;

	switch (first) {
	case '(': return { Token::Type::Open_Paren,         start.substr(0, 1) };
	case ')': return { Token::Type::Close_Paren,        start.substr(0, 1) };
	case '[': return { Token::Type::Open_Block,         start.substr(0, 1) };
	case ']': return { Token::Type::Close_Block,        start.substr(0, 1) };
	case '|': return { Token::Type::Variable_Separator, start.substr(0, 1) };
	}

	// Runes can be arbitrary code points while std::isdigit is only defined for
	// values representable as unsigned char, so ASCII digits are tested by hand
	auto const is_digit = [](u32 rune) { return rune >= u32('0') && rune <= u32('9'); };

	// Consumes a run of digits and reports whether at least one was consumed.
	// The rune that ends the run is put back by restoring `source`; this also
	// works when next_rune() fails and consumes nothing.
	auto const consume_digits = [&] {
		bool consumed = false;
		for (;;) {
			auto const before = source;
			if (not is_digit(next_rune())) {
				source = before;
				return consumed;
			}
			consumed = true;
		}
	};

	// Text consumed since the beginning of this token
	auto const lexed = [&] {
		return start.substr(0, static_cast<usize>(source.data() - start.data()));
	};

	// Number literals like .75
	if (first == '.') {
		if (consume_digits())
			return { Token::Type::Numeric, lexed() };
		// Lone '.' is not a number; handled like any other unknown rune below
	}

	// Number literals like 75 or 0.75
	if (is_digit(first)) {
		consume_digits();

		// Fractional part is accepted only when '.' is followed by at least one digit,
		// otherwise '.' is given back and only the part before it is considered valid
		auto const before_dot = source;
		if (next_rune() != '.' || not consume_digits()) {
			source = before_dot;
		}
		return { Token::Type::Numeric, lexed() };
	}

	return {};
}
|
||||||
|
|
||||||
|
// Consumes one rune from the beginning of `source` and returns it.
//
// Returns 0 without consuming anything when `source` is empty or when
// utf8::decode reports utf8::Rune_Error. In that case `last_rune_length`
// keeps the value set by the previous successful call.
auto Lexer::next_rune() -> u32
{
	if (source.empty())
		return 0;

	auto const [rune, rest] = utf8::decode(source);
	if (rune == utf8::Rune_Error)
		return 0;

	// Remember how many bytes were taken so rewind() can give them back
	last_rune_length = rest.data() - source.data();
	source = rest;
	return rune;
}
|
||||||
|
|
||||||
|
void Lexer::rewind()
|
||||||
|
{
|
||||||
|
source = { source.data() - last_rune_length, source.size() + last_rune_length };
|
||||||
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, Token const&)
|
std::ostream& operator<<(std::ostream& os, Token const&)
|
||||||
{
|
{
|
||||||
os << "Token";
|
os << "Token";
|
||||||
|
@ -15,6 +15,8 @@ using i16 = std::int16_t;
|
|||||||
using i32 = std::int32_t;
|
using i32 = std::int32_t;
|
||||||
using i64 = std::int64_t;
|
using i64 = std::int64_t;
|
||||||
|
|
||||||
|
using usize = std::size_t;
|
||||||
|
using isize = std::ptrdiff_t;
|
||||||
|
|
||||||
namespace errors
|
namespace errors
|
||||||
{
|
{
|
||||||
@ -33,7 +35,20 @@ struct Error
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
using Result = tl::expected<T, Error>;
|
struct Result : tl::expected<T, Error>
|
||||||
|
{
|
||||||
|
constexpr Result() = default;
|
||||||
|
|
||||||
|
constexpr Result(errors::Type error) : tl::expected<T, Error>(tl::unexpected(Error { error }))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename ...Args>
|
||||||
|
constexpr Result(Args&& ...args)
|
||||||
|
: tl::expected<T, Error>( T{ std::forward<Args>(args)... } )
|
||||||
|
{
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, Error const& err);
|
std::ostream& operator<<(std::ostream& os, Error const& err);
|
||||||
|
|
||||||
@ -45,6 +60,16 @@ std::ostream& operator<<(std::ostream& os, Error const& err);
|
|||||||
*std::move(try_value); \
|
*std::move(try_value); \
|
||||||
})
|
})
|
||||||
|
|
||||||
|
namespace utf8
|
||||||
|
{
|
||||||
|
constexpr u32 Rune_Error = 0xfffd;
|
||||||
|
constexpr u32 Rune_Self = 0x80;
|
||||||
|
constexpr u32 Max_Bytes = 4;
|
||||||
|
|
||||||
|
// Decodes rune and returns remaining string
|
||||||
|
auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
|
||||||
|
}
|
||||||
|
|
||||||
struct Token
|
struct Token
|
||||||
{
|
{
|
||||||
enum class Type
|
enum class Type
|
||||||
@ -81,9 +106,18 @@ struct Lexer
|
|||||||
// Source that is being lexed
|
// Source that is being lexed
|
||||||
std::string_view source;
|
std::string_view source;
|
||||||
|
|
||||||
|
// Used for rewinding
|
||||||
|
u32 last_rune_length = 0;
|
||||||
|
|
||||||
// Determine location of tokens to produce nice errors
|
// Determine location of tokens to produce nice errors
|
||||||
std::string_view source_name = "<unnamed>";
|
std::string_view source_name = "<unnamed>";
|
||||||
unsigned column = 1, row = 1;
|
unsigned column = 1, row = 1;
|
||||||
|
|
||||||
auto next_token() -> Result<Token>;
|
auto next_token() -> Result<Token>;
|
||||||
|
|
||||||
|
// Finds next rune in source
|
||||||
|
auto next_rune() -> u32;
|
||||||
|
|
||||||
|
// Goes back last rune
|
||||||
|
void rewind();
|
||||||
};
|
};
|
||||||
|
@ -22,15 +22,25 @@ static void expect_token_type(
|
|||||||
static void expect_token_type_and_value(
|
static void expect_token_type_and_value(
|
||||||
Token::Type expected_type,
|
Token::Type expected_type,
|
||||||
std::string_view source,
|
std::string_view source,
|
||||||
|
std::string_view expected,
|
||||||
reflection::source_location const& sl = reflection::source_location::current())
|
reflection::source_location const& sl = reflection::source_location::current())
|
||||||
{
|
{
|
||||||
Lexer lexer{source};
|
Lexer lexer{source};
|
||||||
auto result = lexer.next_token();
|
auto result = lexer.next_token();
|
||||||
expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
|
expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
|
||||||
expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
|
expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
|
||||||
expect(eq(result->source, source)) << "tokenized source is not equal to original";
|
expect(eq(result->source, expected)) << "tokenized source is not equal to original";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Overload for the common case where the whole `source` is expected to be
// lexed as a single token: the expected token text is `source` itself.
static void expect_token_type_and_value(
	Token::Type expected_type,
	std::string_view source,
	reflection::source_location const& sl = reflection::source_location::current())
{
	auto const whole_source = source;
	expect_token_type_and_value(expected_type, source, whole_source, sl);
}
|
||||||
|
|
||||||
|
|
||||||
suite lexer_test = [] {
|
suite lexer_test = [] {
|
||||||
"Empty file"_test = [] {
|
"Empty file"_test = [] {
|
||||||
Lexer lexer{""};
|
Lexer lexer{""};
|
||||||
@ -53,5 +63,6 @@ suite lexer_test = [] {
|
|||||||
expect_token_type_and_value(Token::Type::Numeric, ".75");
|
expect_token_type_and_value(Token::Type::Numeric, ".75");
|
||||||
expect_token_type_and_value(Token::Type::Numeric, "0.75");
|
expect_token_type_and_value(Token::Type::Numeric, "0.75");
|
||||||
expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
|
expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
|
||||||
|
expect_token_type_and_value(Token::Type::Numeric, "123.", "123");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
47
src/unicode.cc
Normal file
47
src/unicode.cc
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
#include "musique.hh"
|
||||||
|
|
||||||
|
// Decodes the first UTF-8 encoded rune of `s`.
//
// Returns the decoded rune and the part of `s` that follows it.
// On failure (empty input, invalid leading byte, truncated sequence or a byte
// that is not a valid continuation byte) returns utf8::Rune_Error together
// with the unmodified input.
//
// NOTE: overlong encodings, surrogates and values above U+10FFFF are not rejected.
auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
{
	// Masks selecting payload bits of the leading byte, indexed by (sequence length - 1)
	static constexpr std::array<u8, 4> payloads {
		0b0111'1111, 0b0001'1111, 0b0000'1111, 0b0000'0111
	};

	// Expected marker bits of the leading byte, indexed by (sequence length - 1)
	static constexpr std::array<u8, 4> patterns {
		0b0000'0000, 0b1100'0000, 0b1110'0000, 0b1111'0000
	};

	// Continuation bytes have the form 10xx'xxxx
	constexpr u8 payload_cont = 0b0011'1111;
	constexpr u8 pattern_cont = 0b1000'0000;

	// Marker bits of `byte`, that is everything outside of `payload_mask`.
	// Works on u8: plain char may be signed and sign extension of bytes >= 0x80
	// would make the comparison with the patterns above always fail.
	auto const marker_of = [](u8 byte, u8 payload_mask) {
		return static_cast<u8>(byte & ~payload_mask);
	};

	if (s.empty()) {
		return { utf8::Rune_Error, s };
	}

	auto const lead = static_cast<u8>(s.front());

	// Sequence length is determined by the marker bits of the leading byte
	usize length = 0;
	for (usize i = 0; i < payloads.size(); ++i) {
		if (marker_of(lead, payloads[i]) == patterns[i]) {
			length = i + 1;
			break;
		}
	}

	if (length == 0 || s.size() < length) {
		return { utf8::Rune_Error, s };
	}

	u32 result = lead & payloads[length - 1];

	for (usize i = 1; i < length; ++i) {
		auto const byte = static_cast<u8>(s[i]);

		// Every byte after the leading one must be a continuation byte
		if (marker_of(byte, payload_cont) != pattern_cont) {
			return { utf8::Rune_Error, s };
		}

		result <<= 6;
		result |= u32(byte & payload_cont);
	}

	return { result, s.substr(length) };
}
|
Loading…
Reference in New Issue
Block a user