Lexer skips whitespace
This commit is contained in:
parent
e833cd6523
commit
fd4db02e8d
72
src/lexer.cc
72
src/lexer.cc
@ -2,52 +2,65 @@
|
|||||||
|
|
||||||
// Scans and returns the next token found in `source`.
//
// Leading whitespace (as classified by unicode::is_space) is skipped before the token
// is started, so it never ends up in the returned token's source text.
//
// Results:
//   - errors::End_Of_File            when peek() yields 0 once whitespace has been skipped
//   - a single character token       for each of ( ) [ ] |
//   - Token::Type::Numeric           for literals shaped like 123, 0.75 or .75
//   - errors::Unrecognized_Character for everything else; the offending character is
//                                    left unconsumed in `source`
auto Lexer::next_token() -> Result<Token>
{
	while (consume_if(unicode::is_space)) {
	}
	start();

	auto const first = peek();

	if (first == 0) {
		return errors::End_Of_File;
	}

	switch (first) {
	case '(': consume(); return { Token::Type::Open_Paren,         finish() };
	case ')': consume(); return { Token::Type::Close_Paren,        finish() };
	case '[': consume(); return { Token::Type::Open_Block,         finish() };
	case ']': consume(); return { Token::Type::Close_Block,        finish() };
	case '|': consume(); return { Token::Type::Variable_Separator, finish() };
	}

	// Number literals like .75
	if (first == '.') {
		consume();
		bool has_fraction = false;
		while (consume_if(unicode::is_digit)) { has_fraction = true; }
		if (has_fraction) {
			return { Token::Type::Numeric, finish() };
		}
		// A lone '.' is not a numeric literal. Give it back, so that the failure reported
		// below leaves the lexer on the offending character, the same way it does for
		// every other unrecognized character.
		rewind();
	}

	// Number literals like 123 or 0.75
	if (consume_if(unicode::is_digit)) {
		while (consume_if(unicode::is_digit)) {}
		if (peek() == '.') {
			consume();
			bool looped = false;
			while (consume_if(unicode::is_digit)) { looped = true; }
			if (not looped) {
				// If '.' is not followed by any digits, then '.' is not part of numeric literals
				// and only part before it is considered valid
				rewind();
			}
		}
		return { Token::Type::Numeric, finish() };
	}

	return errors::Unrecognized_Character;
}
|
||||||
|
|
||||||
auto Lexer::next_rune() -> u32
|
// Looks at the rune at the front of `source` without advancing past it.
// Yields 0 when `source` is empty and also when utf8::decode reports utf8::Rune_Error.
auto Lexer::peek() const -> u32
{
	if (source.empty()) {
		return 0;
	}

	auto const rune = utf8::decode(source).first;
	return rune == utf8::Rune_Error ? 0 : rune;
}
|
||||||
|
|
||||||
|
auto Lexer::consume() -> u32
|
||||||
{
|
{
|
||||||
if (not source.empty()) {
|
if (not source.empty()) {
|
||||||
if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
|
if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
|
||||||
last_rune_length = remaining.data() - source.data();
|
last_rune_length = remaining.data() - source.data();
|
||||||
source = remaining;
|
source = remaining;
|
||||||
|
token_length += last_rune_length;
|
||||||
return rune;
|
return rune;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -57,6 +70,21 @@ auto Lexer::next_rune() -> u32
|
|||||||
void Lexer::rewind()
|
void Lexer::rewind()
|
||||||
{
|
{
|
||||||
source = { source.data() - last_rune_length, source.size() + last_rune_length };
|
source = { source.data() - last_rune_length, source.size() + last_rune_length };
|
||||||
|
token_length -= last_rune_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Lexer::start()
|
||||||
|
{
|
||||||
|
token_start = source.data();
|
||||||
|
token_length = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string_view Lexer::finish()
|
||||||
|
{
|
||||||
|
std::string_view result { token_start, token_length };
|
||||||
|
token_start = nullptr;
|
||||||
|
token_length = 0;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, Token const&)
|
std::ostream& operator<<(std::ostream& os, Token const&)
|
||||||
|
@ -18,11 +18,14 @@ using i64 = std::int64_t;
|
|||||||
using usize = std::size_t;
|
using usize = std::size_t;
|
||||||
using isize = std::ptrdiff_t;
|
using isize = std::ptrdiff_t;
|
||||||
|
|
||||||
|
// Wraps `Function` in a generic lambda that perfectly forwards every argument to it.
// The lambda has no trailing return type, so the result of the call is returned by value.
// NOTE(review): presumably meant for passing overloaded or templated functions around
// as a single callable object - confirm against the call sites.
#define Fun(Function) ([]<typename ...T>(T&& ...args) { return (Function)(std::forward<T>(args)...); })
|
||||||
|
|
||||||
// Kinds of errors. Lexer::next_token() reports both of the values listed here.
namespace errors
{
	enum Type
	{
		// No more input: only whitespace (or nothing at all) was left in the source
		End_Of_File,

		// The next character cannot begin any known token
		Unrecognized_Character
	};
}
|
||||||
|
|
||||||
@ -60,11 +63,22 @@ std::ostream& operator<<(std::ostream& os, Error const& err);
|
|||||||
*std::move(try_value); \
|
*std::move(try_value); \
|
||||||
})
|
})
|
||||||
|
|
||||||
namespace utf8
|
namespace unicode
|
||||||
|
{
|
||||||
|
inline namespace special_runes
|
||||||
{
|
{
|
||||||
constexpr u32 Rune_Error = 0xfffd;
|
constexpr u32 Rune_Error = 0xfffd;
|
||||||
constexpr u32 Rune_Self = 0x80;
|
constexpr u32 Rune_Self = 0x80;
|
||||||
constexpr u32 Max_Bytes = 4;
|
constexpr u32 Max_Bytes = 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_digit(u32 digit);
|
||||||
|
bool is_space(u32 space);
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace utf8
|
||||||
|
{
|
||||||
|
using namespace unicode::special_runes;
|
||||||
|
|
||||||
// Decodes rune and returns remaining string
|
// Decodes rune and returns remaining string
|
||||||
auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
|
auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
|
||||||
@ -109,6 +123,9 @@ struct Lexer
|
|||||||
// Used for rewinding
|
// Used for rewinding
|
||||||
u32 last_rune_length = 0;
|
u32 last_rune_length = 0;
|
||||||
|
|
||||||
|
char const* token_start = nullptr;
|
||||||
|
usize token_length = 0;
|
||||||
|
|
||||||
// Determine location of tokens to produce nice errors
|
// Determine location of tokens to produce nice errors
|
||||||
std::string_view source_name = "<unnamed>";
|
std::string_view source_name = "<unnamed>";
|
||||||
unsigned column = 1, row = 1;
|
unsigned column = 1, row = 1;
|
||||||
@ -116,8 +133,22 @@ struct Lexer
|
|||||||
auto next_token() -> Result<Token>;
|
auto next_token() -> Result<Token>;
|
||||||
|
|
||||||
// Finds next rune in source
|
// Finds next rune in source
|
||||||
auto next_rune() -> u32;
|
auto peek() const -> u32;
|
||||||
|
|
||||||
|
// Finds next rune in source and returns it, advancing the string
|
||||||
|
auto consume() -> u32;
|
||||||
|
|
||||||
|
// Consumes the next rune only when `test` accepts it.
// `test` is called with the result of peek(); returns 1 when the rune was consumed
// and 0 when it was left in place.
inline auto consume_if(auto test) -> u32
{
	if (not test(peek())) {
		return 0;
	}
	consume();
	return 1;
}
|
||||||
|
|
||||||
// Goes back last rune
|
// Goes back last rune
|
||||||
void rewind();
|
void rewind();
|
||||||
|
|
||||||
|
// Marks begin of token
|
||||||
|
void start();
|
||||||
|
|
||||||
|
// Marks end of token and returns its matching source
|
||||||
|
std::string_view finish();
|
||||||
};
|
};
|
||||||
|
@ -29,7 +29,7 @@ static void expect_token_type_and_value(
|
|||||||
auto result = lexer.next_token();
|
auto result = lexer.next_token();
|
||||||
expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
|
expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
|
||||||
expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
|
expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
|
||||||
expect(eq(result->source, expected)) << "tokenized source is not equal to original";
|
expect(eq(result->source, expected), sl) << "tokenized source is not equal to original";
|
||||||
}
|
}
|
||||||
|
|
||||||
static void expect_token_type_and_value(
|
static void expect_token_type_and_value(
|
||||||
@ -64,5 +64,7 @@ suite lexer_test = [] {
|
|||||||
expect_token_type_and_value(Token::Type::Numeric, "0.75");
|
expect_token_type_and_value(Token::Type::Numeric, "0.75");
|
||||||
expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
|
expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
|
||||||
expect_token_type_and_value(Token::Type::Numeric, "123.", "123");
|
expect_token_type_and_value(Token::Type::Numeric, "123.", "123");
|
||||||
|
expect_token_type_and_value(Token::Type::Numeric, " 1 ", "1");
|
||||||
|
expect_token_type_and_value(Token::Type::Numeric, " 123 ", "123");
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
@ -45,3 +45,22 @@ auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>
|
|||||||
|
|
||||||
return { result, s };
|
return { result, s };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tells whether `digit` is one of the decimal digits '0' through '9'.
// Digits from other scripts are not accepted.
bool unicode::is_digit(u32 digit)
{
	constexpr u32 lowest = '0';
	constexpr u32 highest = '9';
	return lowest <= digit and digit <= highest;
}
|
||||||
|
|
||||||
|
// Tells whether `space` is one of the six whitespace characters listed below.
// No other rune is treated as whitespace.
bool unicode::is_space(u32 space)
{
	constexpr char blanks[] = { ' ', '\t', '\n', '\f', '\r', '\v' };

	for (char const blank : blanks) {
		if (space == static_cast<u32>(blank)) {
			return true;
		}
	}
	return false;
}
|
||||||
|
Loading…
Reference in New Issue
Block a user