
248 lines
6.6 KiB

#include <musique.hh>
#include <iomanip>
// Letters that may begin a chord literal; Lexer::next_token tests the current
// rune for membership in this set via consume_if(Notes_Symbols).
// The set previously read "abcedefgh" (duplicated 'e'); membership is unchanged.
constexpr std::string_view Notes_Symbols = "abcdefgh";

// Every character that may appear inside an operator token. The adjacent
// string literals are concatenated by the compiler into a single set.
constexpr std::string_view Valid_Operator_Chars =
	"+-*/:%" // arithmetic
	"|&^"    // logic & bit operations
	"<>=!"   // comparisons
	"."      // indexing
	;
// Keyword table: Lexer::next_token re-tags a lexed symbol as
// Token::Type::Keyword when its text equals one of these entries.
// NOTE(review): the initializer entries and the closing `};` are not visible
// in this view - confirm them against the full file.
constexpr auto Keywords = std::array {
// Advances the lexer past any run of whitespace and comments.
// Each pass of the outer loop tries, in order: whitespace, `#!` line comments
// and `--` / `---` comments, recording in `done_something` whether anything
// was skipped during that pass.
// NOTE(review): several statements (loop bodies, the counter increment, the
// statement guarded by the final `if`) are not visible in this view; the
// comments below describe only what the visible lines establish.
void Lexer::skip_whitespace_and_comments()
for (;;) {
bool done_something = false;
// Whitespace: consume runes for as long as unicode::is_space accepts them.
while (consume_if(unicode::is_space)) {
done_something = true;
// #! line comments
// The comment extends to the next '\n' or to end of input (peek() == 0).
if (consume_if('#', '!')) {
done_something = true;
while (peek() && peek() != '\n') {
// -- line and multiline comments
if (consume_if('-', '-')) {
done_something = true;
// A third '-' directly after `--` selects the multiline form.
if (consume_if('-')) {
// multiline
// `count` is reset to 0 whenever the current rune is not '-', and the scan
// stops once it reaches 3.
// NOTE(review): presumably it counts consecutive '-' so that `---` closes the
// comment - the increment is not visible here, confirm.
unsigned count = 0;
while (count < 3) if (consume_if('-')) {
} else {
count = 0;
// Swallow any further '-' that directly follow.
while (consume_if('-')) {}
} else {
// single line
// Same termination rule as `#!`: stop at '\n' or end of input.
while (peek() && peek() != '\n') {
// NOTE(review): presumably leaves the outer loop when a full pass skipped
// nothing - the guarded statement is not visible here.
if (not done_something)
// Lexes one token from the current position and returns it, or an error.
// Returns errors::End_Of_File when peek() yields 0, and
// errors::unrecognized_character(...) when no rule below accepts the rune.
// Every produced Token carries its type, the text returned by finish() and
// `token_location`.
// NOTE(review): some statements (e.g. the label targeted by
// `goto symbol_lexing`) are not visible in this view.
auto Lexer::next_token() -> Result<Token>
if (peek() == 0) {
return errors::End_Of_File;
// Single-character punctuation tokens.
switch (peek()) {
case '(': consume(); return Token { Token::Type::Open_Paren, finish(), token_location };
case ')': consume(); return Token { Token::Type::Close_Paren, finish(), token_location };
case '[': consume(); return Token { Token::Type::Open_Block, finish(), token_location };
case ']': consume(); return Token { Token::Type::Close_Block, finish(), token_location };
case ';': consume(); return Token { Token::Type::Expression_Separator, finish(), token_location };
case '|':
// `|` may be part of operator, like `||`. So we need to check what follows. If next char
// is operator, then this character is part of operator sequence.
// Additionally we explicitly allow for `|foo|=0` here
if (Valid_Operator_Chars.find(peek()) == std::string_view::npos || peek() == '=')
return Token { Token::Type::Parameter_Separator, finish(), token_location };
// Numeric literal: one or more digits, optionally followed by '.' and digits.
if (consume_if(unicode::is_digit)) {
while (consume_if(unicode::is_digit)) {}
if (peek() == '.') {
// `looped` records whether at least one fractional digit was consumed.
bool looped = false;
while (consume_if(unicode::is_digit)) { looped = true; }
if (not looped) {
// If '.' is not followed by any digits, then '.' is not part of numeric literals
// and only part before it is considered valid
return Token { Token::Type::Numeric, finish(), token_location };
// lex chord declaration
if (consume_if(Notes_Symbols)) {
// Allow `c#`
// Any of the following sequences are allowed
// c,,,,,,,,,,,,,,,,
// c1,,,,2,3212
// c1234'''''
// during lexing
while (consume_if(",'") || consume_if(unicode::is_digit)) {}
// If we encounter any letter that is not part of chord declaration,
// then we have symbol, not chord declaration
if (unicode::is_identifier(peek(), unicode::First_Character::No)) {
goto symbol_lexing;
return Token { Token::Type::Chord, finish(), token_location };
// Symbols: a rune accepted as a first identifier character, followed by runes
// accepted as non-first identifier characters.
using namespace std::placeholders;
if (consume_if(std::bind(unicode::is_identifier, _1, unicode::First_Character::Yes))) {
for (auto predicate = std::bind(unicode::is_identifier, _1, unicode::First_Character::No);
) {
Token t = { Token::Type::Symbol, finish(), token_location };
// Re-tag the symbol: text found in Keywords becomes a Keyword, and the text
// "v" becomes an Operator.
if (std::find(Keywords.begin(), Keywords.end(), t.source) != Keywords.end()) {
t.type = Token::Type::Keyword;
} else if (t.source == "v") {
t.type = Token::Type::Operator;
return t;
// Operators: any non-empty run of characters from Valid_Operator_Chars.
if (consume_if(Valid_Operator_Chars)) {
while (consume_if(Valid_Operator_Chars)) {}
return Token { Token::Type::Operator, finish(), token_location };
// No rule matched the current rune.
return errors::unrecognized_character(peek(), token_location);
// Returns the next rune decoded from `source` without consuming it.
// Returns 0 when `source` is empty or when utf8::decode reports
// utf8::Rune_Error, so callers use 0 as the "nothing to read" value.
auto Lexer::peek() const -> u32
if (not source.empty()) {
if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
return rune;
return 0;
// Decodes one rune from `source`, advances `source` to the remainder after it
// and returns the rune; returns 0 when `source` is empty or decoding fails.
// `location` is copied into `prev_location` first (rewind() restores from it),
// and `token_length` grows by `last_rune_length`.
auto Lexer::consume() -> u32
prev_location = location;
if (not source.empty()) {
if (auto [rune, remaining] = utf8::decode(source); rune != utf8::Rune_Error) {
// NOTE(review): the right-hand side below appears truncated in this view;
// presumably it is the byte length of the decoded rune - confirm.
last_rune_length = -;
source = remaining;
token_length += last_rune_length;
return rune;
return 0;
// Consumes the next rune only if it satisfies `test`; returns whether it did.
// `test` is interpreted at compile time, in this order:
//   1. a callable: matches when test(peek()) is true;
//   2. an integral value: matches when it equals peek();
//   3. something convertible to `char const*`: matches when peek() equals one
//      of the characters of that NUL-terminated string;
//   4. anything else: treated as a range and searched with std::begin/std::end.
auto Lexer::consume_if(auto test) -> bool
bool condition;
if constexpr (requires { test(peek()) && true; }) {
condition = test(peek());
} else if constexpr (std::is_integral_v<decltype(test)>) {
condition = (u32(test) == peek());
} else if constexpr (std::is_convertible_v<decltype(test), char const*>) {
auto const end = test + std::strlen(test);
condition = std::find(test, end, peek()) != end;
} else {
condition = std::find(std::begin(test), std::end(test), peek()) != std::end(test);
// Short-circuit: consume() runs only when the condition holds; the comma
// expression then yields true.
return condition && (consume(), true);
// Two-rune variant: returns true when `first` and then `second` are matched by
// consecutive single-argument consume_if calls, false otherwise.
// NOTE(review): when `first` matches but `second` does not, no visible line
// restores the rune already consumed - check whether the full file calls
// rewind() on that path.
auto Lexer::consume_if(auto first, auto second) -> bool
if (consume_if(first)) {
if (consume_if(second)) {
return true;
} else {
return false;
// Undoes the effect of the last consumed rune: `source` grows back by
// `last_rune_length`, `token_length` shrinks by the same amount and `location`
// is restored from `prev_location`.
// `last_rune_length` is then cleared, so a second rewind without a consume in
// between trips the assertion.
void Lexer::rewind()
assert(last_rune_length != 0, "cannot rewind to not existing rune");
// NOTE(review): the first initializer below appears truncated in this view;
// presumably it is a pointer `last_rune_length` bytes before the current start
// of `source` - confirm.
source = { - last_rune_length, source.size() + last_rune_length };
token_length -= last_rune_length;
location = prev_location;
last_rune_length = 0;
// Begins a new token: resets `token_length` to 0 and records the current
// `location` as `token_location`.
void Lexer::start()
// NOTE(review): the right-hand side below appears truncated in this view;
// presumably it is the current start of `source` - confirm.
token_start =;
token_length = 0;
token_location = location;
// Ends the current token and returns its text: a view of `token_length` bytes
// beginning at `token_start`.
// The token bookkeeping is reset afterwards (`token_start` to nullptr,
// `token_length` to 0).
std::string_view Lexer::finish()
std::string_view result { token_start, token_length };
token_start = nullptr;
token_length = 0;
return result;
// Streams a token as `{TYPE, "source", location}`.
// The source text is written through std::quoted, i.e. delimited by quotes
// with embedded quotes escaped.
std::ostream& operator<<(std::ostream& os, Token const& token)
return os << '{' << token.type << ", " << std::quoted(token.source) << ", " << token.location << '}';
// Streams the upper-case, human-readable name of a token type.
// NOTE(review): the end of the switch and of the function is not visible in
// this view - confirm what happens for a value matching no case.
std::ostream& operator<<(std::ostream& os, Token::Type type)
switch (type) {
case Token::Type::Chord: return os << "CHORD";
case Token::Type::Close_Block: return os << "CLOSE BLOCK";
case Token::Type::Close_Paren: return os << "CLOSE PAREN";
case Token::Type::Expression_Separator: return os << "EXPRESSION SEPARATOR";
case Token::Type::Keyword: return os << "KEYWORD";
case Token::Type::Numeric: return os << "NUMERIC";
case Token::Type::Open_Block: return os << "OPEN BLOCK";
case Token::Type::Open_Paren: return os << "OPEN PAREN";
case Token::Type::Operator: return os << "OPERATOR";
case Token::Type::Parameter_Separator: return os << "PARAMETER SEPARATOR";
case Token::Type::Symbol: return os << "SYMBOL";