Lexer skips whitespace

This commit is contained in:
Robert Bendun 2022-04-27 14:37:21 +02:00
parent e833cd6523
commit fd4db02e8d
4 changed files with 108 additions and 28 deletions

View File

@ -2,52 +2,65 @@
auto Lexer::next_token() -> Result<Token> auto Lexer::next_token() -> Result<Token>
{ {
auto current = source; while (consume_if(unicode::is_space)) {
}
start();
auto c = next_rune(); if (peek() == 0) {
if (c == 0)
return errors::End_Of_File; return errors::End_Of_File;
}
switch (c) { switch (peek()) {
case '(': return { Token::Type::Open_Paren, current.substr(0, 1) }; case '(': consume(); return { Token::Type::Open_Paren, finish() };
case ')': return { Token::Type::Close_Paren, current.substr(0, 1) }; case ')': consume(); return { Token::Type::Close_Paren, finish() };
case '[': return { Token::Type::Open_Block, current.substr(0, 1) }; case '[': consume(); return { Token::Type::Open_Block, finish() };
case ']': return { Token::Type::Close_Block, current.substr(0, 1) }; case ']': consume(); return { Token::Type::Close_Block, finish() };
case '|': return { Token::Type::Variable_Separator, current.substr(0, 1) }; case '|': consume(); return { Token::Type::Variable_Separator, finish() };
} }
// Number literals like .75 // Number literals like .75
if (c == '.') { if (peek() == '.') {
while ((c = next_rune()) && std::isdigit(c)) {} consume();
if (source.data() - current.data() != 1) while (consume_if(unicode::is_digit)) {}
return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) }; if (token_length != 1)
return { Token::Type::Numeric, finish() };
} }
if (std::isdigit(c)) { if (consume_if(unicode::is_digit)) {
while ((c = next_rune()) && std::isdigit(c)) {} while (consume_if(unicode::is_digit)) {}
if (c == '.') { if (peek() == '.') {
consume();
bool looped = false; bool looped = false;
while ((c = next_rune()) && std::isdigit(c)) { looped = true; } while (consume_if(unicode::is_digit)) { looped = true; }
if (not looped) { if (not looped) {
// If '.' is not followed by any digits, then '.' is not part of numeric literals // If '.' is not followed by any digits, then '.' is not part of numeric literals
// and only part before it is considered valid // and only part before it is considered valid
rewind(); rewind();
} }
} }
return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) }; return { Token::Type::Numeric, finish() };
} }
return errors::Unrecognized_Character;
return {};
} }
auto Lexer::next_rune() -> u32 auto Lexer::peek() const -> u32
{
if (not source.empty()) {
if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
return rune;
}
}
return 0;
}
auto Lexer::consume() -> u32
{ {
if (not source.empty()) { if (not source.empty()) {
if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) { if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
last_rune_length = remaining.data() - source.data(); last_rune_length = remaining.data() - source.data();
source = remaining; source = remaining;
token_length += last_rune_length;
return rune; return rune;
} }
} }
@ -57,6 +70,21 @@ auto Lexer::next_rune() -> u32
void Lexer::rewind() void Lexer::rewind()
{ {
source = { source.data() - last_rune_length, source.size() + last_rune_length }; source = { source.data() - last_rune_length, source.size() + last_rune_length };
token_length -= last_rune_length;
}
// Begins a new token at the current read position: no bytes have been
// consumed for it yet, and its text starts at the first unread byte of `source`.
void Lexer::start()
{
    token_length = 0;
    token_start = source.data();
}
std::string_view Lexer::finish()
{
std::string_view result { token_start, token_length };
token_start = nullptr;
token_length = 0;
return result;
} }
std::ostream& operator<<(std::ostream& os, Token const&) std::ostream& operator<<(std::ostream& os, Token const&)

View File

@ -18,11 +18,14 @@ using i64 = std::int64_t;
using usize = std::size_t; using usize = std::size_t;
using isize = std::ptrdiff_t; using isize = std::ptrdiff_t;
// Wraps `Function` in a captureless generic lambda that perfectly forwards all of its
// arguments to `Function` and returns the result. The lambda has no trailing return type,
// so the result is returned by value (plain `auto` deduction).
// NOTE(review): presumably meant for passing overloaded or templated function names as a
// single callable object - confirm against call sites.
#define Fun(Function) ([]<typename ...T>(T&& ...args) { return (Function)(std::forward<T>(args)...); })
namespace errors namespace errors
{ {
enum Type enum Type
{ {
End_Of_File End_Of_File,
Unrecognized_Character
}; };
} }
@ -60,11 +63,22 @@ std::ostream& operator<<(std::ostream& os, Error const& err);
*std::move(try_value); \ *std::move(try_value); \
}) })
// Rune-level constants and classification predicates.
namespace unicode
{
// Inline namespace: its members are also reachable directly as unicode::Rune_Error etc.,
// and the group can be imported as a whole with a using-directive.
inline namespace special_runes
{
// U+FFFD, the Unicode replacement character.
// NOTE(review): presumably the value a decoder reports for invalid input - confirm in utf8::decode.
constexpr u32 Rune_Error = 0xfffd;
// 0x80, the first value outside the ASCII range.
// NOTE(review): presumably runes below this value are encoded as a single byte - confirm at use sites.
constexpr u32 Rune_Self = 0x80;
// NOTE(review): presumably the maximum number of bytes in one encoded rune - confirm at use sites.
constexpr u32 Max_Bytes = 4;
}
// Predicate over a single rune; by its name, tests whether `digit` is a digit (definition is not in this header).
bool is_digit(u32 digit);
// Predicate over a single rune; by its name, tests whether `space` is whitespace (definition is not in this header).
bool is_space(u32 space);
}
namespace utf8 namespace utf8
{ {
constexpr u32 Rune_Error = 0xfffd; using namespace unicode::special_runes;
constexpr u32 Rune_Self = 0x80;
constexpr u32 Max_Bytes = 4;
// Decodes rune and returns remaining string // Decodes rune and returns remaining string
auto decode(std::string_view str) -> std::pair<u32, std::string_view>; auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
@ -109,6 +123,9 @@ struct Lexer
// Used for rewinding // Used for rewinding
u32 last_rune_length = 0; u32 last_rune_length = 0;
char const* token_start = nullptr;
usize token_length = 0;
// Determine location of tokens to produce nice errors // Determine location of tokens to produce nice errors
std::string_view source_name = "<unnamed>"; std::string_view source_name = "<unnamed>";
unsigned column = 1, row = 1; unsigned column = 1, row = 1;
@ -116,8 +133,22 @@ struct Lexer
auto next_token() -> Result<Token>; auto next_token() -> Result<Token>;
// Finds next rune in source // Finds next rune in source
auto next_rune() -> u32; auto peek() const -> u32;
// Finds next rune in source and returns it, advancing the string
auto consume() -> u32;
inline auto consume_if(auto test) -> u32
{
return test(peek()) && (consume(), true);
}
// Goes back last rune // Goes back last rune
void rewind(); void rewind();
// Marks the beginning of a token
void start();
// Marks the end of a token and returns its matching source
std::string_view finish();
}; };

View File

@ -29,7 +29,7 @@ static void expect_token_type_and_value(
auto result = lexer.next_token(); auto result = lexer.next_token();
expect(result.has_value() >> fatal, sl) << "have not parsed any tokens"; expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected"; expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
expect(eq(result->source, expected)) << "tokenized source is not equal to original"; expect(eq(result->source, expected), sl) << "tokenized source is not equal to original";
} }
static void expect_token_type_and_value( static void expect_token_type_and_value(
@ -64,5 +64,7 @@ suite lexer_test = [] {
expect_token_type_and_value(Token::Type::Numeric, "0.75"); expect_token_type_and_value(Token::Type::Numeric, "0.75");
expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789"); expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
expect_token_type_and_value(Token::Type::Numeric, "123.", "123"); expect_token_type_and_value(Token::Type::Numeric, "123.", "123");
expect_token_type_and_value(Token::Type::Numeric, " 1 ", "1");
expect_token_type_and_value(Token::Type::Numeric, " 123 ", "123");
}; };
}; };

View File

@ -45,3 +45,22 @@ auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
return { result, s }; return { result, s };
} }
// Reports whether `digit` is one of the ASCII decimal digits '0' through '9'.
bool unicode::is_digit(u32 digit)
{
    bool const outside_range = digit < '0' || digit > '9';
    return not outside_range;
}
// Reports whether `space` is one of six whitespace characters: space, horizontal tab,
// line feed, form feed, carriage return or vertical tab. Every other rune yields false.
bool unicode::is_space(u32 space)
{
    return space == ' '
        || space == '\t'
        || space == '\n'
        || space == '\f'
        || space == '\r'
        || space == '\v';
}