#include <musique.hh>

#include <iomanip>

constexpr std::string_view Notes_Symbols = "abcdefgh";
constexpr std::string_view Valid_Operator_Chars =
    "+-*/:%" // arithmetic
    "|&^"    // logic & bit operations
    "<>=!"   // comparisons
    "."      // indexing
    ;

constexpr auto Keywords = std::array {
    "false"sv,
    "nil"sv,
    "true"sv,
    "var"sv
};

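// Skips runs of whitespace and comments: `#!` line comments (e.g. a shebang
// line), `--` line comments, and `---` ... `---` multiline comments. Repeats
// until nothing more can be skipped.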
void Lexer::skip_whitespace_and_comments()
{
    for (;;) {
        bool done_something = false;

        while (consume_if(unicode::is_space)) {
            done_something = true;
        }

        // #! line comments
        if (consume_if('#', '!')) {
            done_something = true;
            while (peek() && peek() != '\n') {
                consume();
            }
        }

        // -- line and multiline comments
        if (consume_if('-', '-')) {
            done_something = true;
            if (consume_if('-')) {
                // multiline: skip until a closing run of at least three '-' characters,
                // stopping at end of input so an unterminated comment cannot loop forever
                unsigned count = 0;
                while (count < 3 && peek() != 0) if (consume_if('-')) {
                    ++count;
                } else {
                    consume();
                    count = 0;
                }
                while (consume_if('-')) {}
            } else {
                // single line
                while (peek() && peek() != '\n') {
                    consume();
                }
            }
        }

        if (not done_something)
            break;
    }
}

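// Produces the next token, or an error for end of input or an unrecognized
// character. A rough usage sketch (assuming Result<Token> has an expected-like
// interface and that a lexer can be set up with the source text to scan;
// make_lexer is a hypothetical helper):
//
//   Lexer lexer = make_lexer("c# 42 + foo");
//   while (auto token = lexer.next_token()) {
//       std::cout << *token << '\n'; // e.g. {CHORD, "c#", <location>}
//   }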
auto Lexer::next_token() -> Result<Token>
{
    skip_whitespace_and_comments();
    start();

    if (peek() == 0) {
        return errors::End_Of_File;
    }

    switch (peek()) {
    case '(': consume(); return Token { Token::Type::Open_Paren, finish(), token_location };
    case ')': consume(); return Token { Token::Type::Close_Paren, finish(), token_location };
    case '[': consume(); return Token { Token::Type::Open_Block, finish(), token_location };
    case ']': consume(); return Token { Token::Type::Close_Block, finish(), token_location };
    case ';': consume(); return Token { Token::Type::Expression_Separator, finish(), token_location };

    case '|':
        consume();
        // `|` may be part of an operator like `||`, so check what follows: if the next
        // character is an operator character, this `|` starts an operator sequence and we
        // fall through to operator lexing below. As an exception, a following `=` still
        // yields a parameter separator, so that `|foo|=0` works.
        if (Valid_Operator_Chars.find(peek()) == std::string_view::npos || peek() == '=')
            return Token { Token::Type::Parameter_Separator, finish(), token_location };
    }

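    // Numeric literals: digits with an optional fractional part. E.g. "3.14" is a
    // single Numeric token, while "3." lexes as Numeric "3" and leaves the dot for
    // the next token (where it becomes an Operator).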
    if (consume_if(unicode::is_digit)) {
        while (consume_if(unicode::is_digit)) {}
        if (peek() == '.') {
            consume();
            bool looped = false;
            while (consume_if(unicode::is_digit)) { looped = true; }
            if (not looped) {
                // If '.' is not followed by any digit, it is not part of the numeric
                // literal and only the part before it is considered valid
                rewind();
            }
        }
        return Token { Token::Type::Numeric, finish(), token_location };
    }

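    // E.g. "c#,,2" is a single Chord token, while "cmajor" starts like a note but
    // falls through to symbol lexing at the first identifier character.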
    // lex chord declaration
    if (consume_if(Notes_Symbols)) {
        // Allow `c#`
        consume_if('#');

        // During lexing any of the following sequences is allowed:
        //   c,,,,,,,,,,,,,,,,
        //   c1,,,,2,3212
        //   c1234'''''
        while (consume_if(",'") || consume_if(unicode::is_digit)) {}

        // If we encounter a character that is not part of a chord declaration,
        // then we have a symbol, not a chord declaration
        if (unicode::is_identifier(peek(), unicode::First_Character::No)) {
            goto symbol_lexing;
        }

        return Token { Token::Type::Chord, finish(), token_location };
    }

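    // Identifiers become Symbol tokens, entries from Keywords become Keyword tokens,
    // and a lone "v" is treated as an Operator. E.g. "true" -> Keyword, "vel" -> Symbol,
    // "v" -> Operator.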
    using namespace std::placeholders;
    if (consume_if(std::bind(unicode::is_identifier, _1, unicode::First_Character::Yes))) {
    symbol_lexing:
        for (auto predicate = std::bind(unicode::is_identifier, _1, unicode::First_Character::No);
            consume_if(predicate);
        ) {
        }

        Token t = { Token::Type::Symbol, finish(), token_location };
        if (std::find(Keywords.begin(), Keywords.end(), t.source) != Keywords.end()) {
            t.type = Token::Type::Keyword;
        } else if (t.source == "v") {
            t.type = Token::Type::Operator;
        }
        return t;
    }

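    // Runs of operator characters (see Valid_Operator_Chars), e.g. "+", "<=", "&&",
    // lex as a single Operator token.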
    if (consume_if(Valid_Operator_Chars)) {
        while (consume_if(Valid_Operator_Chars)) {}
        return Token { Token::Type::Operator, finish(), token_location };
    }

    return errors::unrecognized_character(peek(), token_location);
}

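// Returns the next rune without consuming it; 0 means end of input (or an
// invalid UTF-8 sequence).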
auto Lexer::peek() const -> u32
{
    if (not source.empty()) {
        if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
            return rune;
        }
    }
    return 0;
}

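// Decodes, consumes, and returns the next rune, remembering its byte length so
// that rewind() can undo exactly one consume; returns 0 at end of input or on
// invalid UTF-8.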
auto Lexer::consume() -> u32
{
    prev_location = location;
    if (not source.empty()) {
        if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
            last_rune_length = remaining.data() - source.data();
            source = remaining;
            token_length += last_rune_length;
            location.advance(rune);
            return rune;
        }
    }
    return 0;
}

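// Consumes the next rune only when `test` matches it. `test` may be a callable
// predicate over runes, a single character, a C string used as a set of
// characters, or any other range of characters.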
auto Lexer::consume_if(auto test) -> bool
{
    bool condition;
    if constexpr (requires { test(peek()) && true; }) {
        condition = test(peek());
    } else if constexpr (std::is_integral_v<decltype(test)>) {
        condition = (u32(test) == peek());
    } else if constexpr (std::is_convertible_v<decltype(test), char const*>) {
        auto const end = test + std::strlen(test);
        condition = std::find(test, end, peek()) != end;
    } else {
        condition = std::find(std::begin(test), std::end(test), peek()) != std::end(test);
    }
    return condition && (consume(), true);
}

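// Consumes `first` followed by `second`; when `second` does not match, the
// already consumed `first` is rewound, so on failure nothing is consumed.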
auto Lexer::consume_if(auto first, auto second) -> bool
{
    if (consume_if(first)) {
        if (consume_if(second)) {
            return true;
        } else {
            rewind();
        }
    }
    return false;
}

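// Undoes the most recent consume(). Only one rune of lookbehind is tracked, so
// rewind() cannot be called twice in a row.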
void Lexer::rewind()
{
    assert(last_rune_length != 0, "cannot rewind to a non-existent rune");
    source = { source.data() - last_rune_length, source.size() + last_rune_length };
    token_length -= last_rune_length;
    location = prev_location;
    last_rune_length = 0;
}

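// Marks the current position as the beginning of a new token.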
void Lexer::start()
{
    token_start = source.data();
    token_length = 0;
    token_location = location;
}

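// Ends the token started with start() and returns its source text (the bytes
// consumed since start()).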
std::string_view Lexer::finish()
{
    std::string_view result { token_start, token_length };
    token_start = nullptr;
    token_length = 0;
    return result;
}

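// Prints a token as {TYPE, "source", location}.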
std::ostream& operator<<(std::ostream& os, Token const& token)
{
    return os << '{' << token.type << ", " << std::quoted(token.source) << ", " << token.location << '}';
}

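// Prints a human-readable name of a token type.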
std::ostream& operator<<(std::ostream& os, Token::Type type)
{
    switch (type) {
    case Token::Type::Chord: return os << "CHORD";
    case Token::Type::Close_Block: return os << "CLOSE BLOCK";
    case Token::Type::Close_Paren: return os << "CLOSE PAREN";
    case Token::Type::Expression_Separator: return os << "EXPRESSION SEPARATOR";
    case Token::Type::Keyword: return os << "KEYWORD";
    case Token::Type::Numeric: return os << "NUMERIC";
    case Token::Type::Open_Block: return os << "OPEN BLOCK";
    case Token::Type::Open_Paren: return os << "OPEN PAREN";
    case Token::Type::Operator: return os << "OPERATOR";
    case Token::Type::Parameter_Separator: return os << "PARAMETER SEPARATOR";
    case Token::Type::Symbol: return os << "SYMBOL";
    }
    unreachable();
}