Lexer skips whitespace

2022-04-27 14:37:21 +02:00 · 2022-04-27 14:37:21 +02:00 · fd4db02e8d
commit fd4db02e8d
parent e833cd6523
4 changed files with 108 additions and 28 deletions
--- a/src/lexer.cc
+++ b/src/lexer.cc
@@ -2,52 +2,65 @@

 auto Lexer::next_token() -> Result<Token>
 {
-	auto current = source;
+	while (consume_if(unicode::is_space)) {
+	}
+	start();

-	auto c = next_rune();
-
-	if (c == 0)
+	if (peek() == 0) {
 		return errors::End_Of_File;
+	}

-	switch (c) {
-	case '(': return { Token::Type::Open_Paren,         current.substr(0, 1) };
-	case ')': return { Token::Type::Close_Paren,        current.substr(0, 1) };
-	case '[': return { Token::Type::Open_Block,         current.substr(0, 1) };
-	case ']': return { Token::Type::Close_Block,        current.substr(0, 1) };
-	case '|': return { Token::Type::Variable_Separator, current.substr(0, 1) };
+	switch (peek()) {
+	case '(': consume(); return { Token::Type::Open_Paren,         finish() };
+	case ')': consume(); return { Token::Type::Close_Paren,        finish() };
+	case '[': consume(); return { Token::Type::Open_Block,         finish() };
+	case ']': consume(); return { Token::Type::Close_Block,        finish() };
+	case '|': consume(); return { Token::Type::Variable_Separator, finish() };
 	}

 	// Number literals like .75
-	if (c == '.') {
-		while ((c = next_rune()) && std::isdigit(c)) {}
-		if (source.data() - current.data() != 1)
-			return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) };
+	if (peek() == '.') {
+		consume();
+		while (consume_if(unicode::is_digit)) {}
+		if (token_length != 1)
+			return { Token::Type::Numeric, finish() };
 	}

-	if (std::isdigit(c)) {
-		while ((c = next_rune()) && std::isdigit(c)) {}
-		if (c == '.') {
+	if (consume_if(unicode::is_digit)) {
+		while (consume_if(unicode::is_digit)) {}
+		if (peek() == '.') {
+			consume();
 			bool looped = false;
-			while ((c = next_rune()) && std::isdigit(c)) { looped = true; }
+			while (consume_if(unicode::is_digit)) { looped = true; }
 			if (not looped) {
 				// If '.' is not followed by any digits, then '.' is not part of numeric literals
 				// and only part before it is considered valid
 				rewind();
 			}
 		}
-		return { Token::Type::Numeric, current.substr(0, source.data() - current.data()) };
+		return { Token::Type::Numeric, finish() };
 	}

-
-	return {};
+	return errors::Unrecognized_Character;
 }

-auto Lexer::next_rune() -> u32
+auto Lexer::peek() const -> u32 // Returns the next rune of `source` without advancing; 0 when there is none to return
+{
+	if (not source.empty()) {
+		if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) { // `remaining` is ignored: peeking must leave `source` untouched
+			return rune;
+		}
+	}
+	return 0; // reached when `source` is empty or decoding produced Rune_Error
+}
+
+auto Lexer::consume() -> u32
 {
 	if (not source.empty()) {
 		if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
 			last_rune_length = remaining.data() - source.data();
 			source = remaining;
+			token_length += last_rune_length;
 			return rune;
 		}
 	}
@@ -57,6 +70,21 @@ auto Lexer::next_rune() -> u32
 void Lexer::rewind()
 {
 	source = { source.data() - last_rune_length, source.size() + last_rune_length };
+	token_length -= last_rune_length;
+}
+
+void Lexer::start() // Marks the current position in `source` as the beginning of a new token
+{
+	token_start = source.data(); // remember where the token's bytes begin
+	token_length = 0; // nothing has been consumed for this token yet
+}
+
+std::string_view Lexer::finish() // Returns the text recorded since token_start/token_length were last set, then resets both
+{
+	std::string_view result { token_start, token_length }; // non-owning view of token_length bytes starting at token_start
+	token_start = nullptr; // reset bookkeeping so no token is marked as in progress
+	token_length = 0;
+	return result;
 }

 std::ostream& operator<<(std::ostream& os, Token const&)
--- a/src/musique.hh
+++ b/src/musique.hh
@@ -18,11 +18,14 @@ using i64 = std::int64_t;
 using usize = std::size_t;
 using isize = std::ptrdiff_t;

+#define Fun(Function) ([]<typename ...T>(T&& ...args) { return (Function)(std::forward<T>(args)...); })
+
 namespace errors
 {
 	enum Type
 	{
-		End_Of_File
+		End_Of_File,
+		Unrecognized_Character
 	};
 }

@@ -60,11 +63,22 @@ std::ostream& operator<<(std::ostream& os, Error const& err);
 	*std::move(try_value); \
 	})

-namespace utf8
+namespace unicode
+{
+	inline namespace special_runes
 	{
 		constexpr u32 Rune_Error = 0xfffd;
 		constexpr u32 Rune_Self  = 0x80;
 		constexpr u32 Max_Bytes  = 4;
+	}
+
+	bool is_digit(u32 digit);
+	bool is_space(u32 space);
+}
+
+namespace utf8
+{
+	using namespace unicode::special_runes;

 	// Decodes rune and returns remaining string
 	auto decode(std::string_view str) -> std::pair<u32, std::string_view>;
@@ -109,6 +123,9 @@ struct Lexer
 	// Used for rewinding
 	u32 last_rune_length = 0;

+	char const* token_start = nullptr;
+	usize token_length = 0;
+
 	// Determine location of tokens to produce nice errors
 	std::string_view source_name = "<unnamed>";
 	unsigned column = 1, row = 1;
@@ -116,8 +133,22 @@ struct Lexer
 	auto next_token() -> Result<Token>;

 	// Finds next rune in source
-	auto next_rune() -> u32;
+	auto peek() const -> u32;
+
+	// Finds next rune in source and returns it, advancing the string
+	auto consume() -> u32;
+
+	inline auto consume_if(auto test) -> u32 // Consumes the next rune only if `test` accepts it; yields 1 when a rune was consumed, 0 otherwise
+	{
+		return test(peek()) && (consume(), true); // short-circuit: consume() runs only after the predicate passed; NOTE(review): result is a bool converted to u32 — confirm whether `bool` was intended
+	}

 	// Goes back last rune
 	void rewind();
+
+	// Marks begin of token
+	void start();
+
+	// Marks end of token and returns its matching source
+	std::string_view finish();
 };
--- a/src/tests/lex.cc
+++ b/src/tests/lex.cc
@@ -29,7 +29,7 @@ static void expect_token_type_and_value(
 	auto result = lexer.next_token();
 	expect(result.has_value() >> fatal, sl) << "have not parsed any tokens";
 	expect(eq(under(result->type), under(expected_type)), sl) << "different token type then expected";
-	expect(eq(result->source, expected)) << "tokenized source is not equal to original";
+	expect(eq(result->source, expected), sl) << "tokenized source is not equal to original";
 }

 static void expect_token_type_and_value(
@@ -64,5 +64,7 @@ suite lexer_test = [] {
 		expect_token_type_and_value(Token::Type::Numeric, "0.75");
 		expect_token_type_and_value(Token::Type::Numeric, "123456789.123456789");
 		expect_token_type_and_value(Token::Type::Numeric, "123.", "123");
+		expect_token_type_and_value(Token::Type::Numeric, " 1   ", "1");
+		expect_token_type_and_value(Token::Type::Numeric, " 123   ", "123");
 	};
 };
--- a/src/unicode.cc
+++ b/src/unicode.cc
@@ -45,3 +45,22 @@ auto utf8::decode(std::string_view s) -> std::pair<u32, std::string_view>

 	return { result, s };
 }
+
+bool unicode::is_digit(u32 digit) // True only for the ASCII decimal digits '0' through '9'
+{
+	return digit >= '0' && digit <= '9'; // plain range check on the code point; no other digit characters are accepted
+}
+
+bool unicode::is_space(u32 space) // True for the six ASCII whitespace characters listed below, false for every other rune
+{
+	switch (space) {
+	case ' ': // space
+	case '\t': // horizontal tab
+	case '\n': // line feed
+	case '\f': // form feed
+	case '\r': // carriage return
+	case '\v': // vertical tab
+		return true;
+	}
+	return false; // anything not listed above is not treated as whitespace
+}