From eda85108a25771b607a04324a5f5e3d8322129aa Mon Sep 17 00:00:00 2001 From: Robert Bendun Date: Sat, 18 Mar 2023 21:35:41 +0100 Subject: [PATCH] More normalization --- Makefile | 2 +- columns.user.tsv | 2 +- tsv2json/tsv2json.cc | 169 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 136 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index 5ee5075..888942c 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,6 @@ csv2tsv/csv2tsv: csv2tsv/csv2tsv.go cd csv2tsv; go build tsv2json/tsv2json: tsv2json/tsv2json.cc - g++ -std=c++20 -O3 -Wall -Wextra -o $@ $< + g++ -std=c++20 -O0 -Wall -Wextra -o $@ $< -ggdb .PHONY: clean diff --git a/columns.user.tsv b/columns.user.tsv index 2a7888b..5a24490 100644 --- a/columns.user.tsv +++ b/columns.user.tsv @@ -18,7 +18,7 @@ y bool 16 githubRepo URL of the official GitHub repo for the project if it hoste y bool 17 website URL of the official homepage for the language project. y bool 18 wikipedia URL of the entity on Wikipedia, if and only if it has a page dedicated to it. y sep(" && ").lower 19 originCommunity In what community(ies) did the language first originate? -y unless("various").lower.sep(" and ") 20 country What country was the language first developed in? +y lower.unless("various").sep(" and ") 20 country What country was the language first developed in? n Type 21 centralPackageRepositoryCount If you've searched for a CPM for this language and can't find one, set 0 as the count. n Type 22 reference A link to more info about this entity. You can add raw links and then auto "upgrade" them using some of the importer code. n Type 23 hopl The matching language on Diarmuid Pigott's Online Historical Encyclopaedia of Programming Languages site (https://hopl.info/) diff --git a/tsv2json/tsv2json.cc b/tsv2json/tsv2json.cc index 74833fb..72994ac 100644 --- a/tsv2json/tsv2json.cc +++ b/tsv2json/tsv2json.cc @@ -90,7 +90,7 @@ struct Expression Type type; std::string_view symbol; - std::vector sub; + std::vector sub{}; Expression(std::string_view symbol) : type{Type::Symbol} @@ -105,9 +105,6 @@ struct Expression { } - Expression(Expression const&) = default; - Expression(Expression &&) = default; - friend std::ostream& operator<<(std::ostream& os, Expression const& expr) { switch (expr.type) { @@ -195,13 +192,29 @@ struct Value { enum class Type { - String, + Null, + Bool, Number, + String, + Vector, }; - Type type; - std::string string; - double number; + Type type = Type::Null; + bool boolean = false; + std::string string{}; + double number = 0; + std::vector vector{}; + + explicit Value() + : type(Type::Null) + { + } + + explicit Value(bool b) + : type(Type::Bool) + , boolean(b) + { + } explicit Value(std::string_view s) : type(Type::String) @@ -214,26 +227,67 @@ struct Value , number(number) { } + + explicit Value(std::vector vector) + : type(Type::Vector) + , vector(std::move(vector)) + { + } }; -using Builtin = Value(*)(Value, std::optional); -using Env = std::unordered_map; +Json& operator+=(Json& json, Value const& value) +{ + switch (value.type) { + break; case Value::Type::Null: json = nullptr; + break; case Value::Type::Bool: json = value.boolean; + break; case Value::Type::String: json = value.string; + break; case Value::Type::Number: json = value.number; + break; case Value::Type::Vector: + { + auto _array = json.array(); + for (auto const& element : value.vector) json += element; + } + } + return json; +} -Value eval(std::span expressions, Value value, Env const& env) +struct Builtin +{ + std::string_view name; + Value(*handler)(Value, std::optional); + bool accepts_vector = false; +}; + +using Env = std::vector; + +Value eval(std::vector const& expressions, Value value, Env const& env) { for (auto const& expr : expressions) { - auto builtin = env.find(std::string(expr.symbol)); + auto builtin = std::find_if(env.begin(), env.end(), [expr](Builtin const& b) { return b.name == expr.symbol; }); if (builtin == env.end()) { std::cerr << "[ERROR] Unknown builtin: " << expr.symbol << '\n'; std::exit(1); } - switch (expr.type) { - break; case Expression::Symbol: - value = builtin->second(std::move(value), std::nullopt); - break; case Expression::Call: - assert(expr.sub.size() == 1); - assert(expr.sub.front().type == Expression::Symbol); - value = builtin->second(std::move(value), expr.sub[0].symbol); + if (!builtin->accepts_vector && value.type == Value::Type::Vector) { + for (auto &element : value.vector) { + switch (expr.type) { + break; case Expression::Symbol: + element = builtin->handler(std::move(element), std::nullopt); + break; case Expression::Call: + assert(expr.sub.size() == 1); + assert(expr.sub.front().type == Expression::Symbol); + element = builtin->handler(std::move(element), expr.sub[0].symbol); + } + } + } else { + switch (expr.type) { + break; case Expression::Symbol: + value = builtin->handler(std::move(value), std::nullopt); + break; case Expression::Call: + assert(expr.sub.size() == 1); + assert(expr.sub.front().type == Expression::Symbol); + value = builtin->handler(std::move(value), expr.sub[0].symbol); + } } } return value; @@ -244,13 +298,14 @@ struct Column Column(std::string_view name, std::string_view normalization_expression) : name(name) , expression_source(normalization_expression) - , expression(parse_normalization_expression(normalization_expression)) + , expression() { + expression = parse_normalization_expression(normalization_expression); } inline Value normalize(std::string_view source, Env const& env) { - return eval(std::span(expression), Value(source), env); + return eval(expression, Value(source), env); } std::string_view name; @@ -277,18 +332,17 @@ int main(int argc, char** argv) for (std::string_view line : split::iterator(source, '\n')) { auto tsv_it = split::iterator(line, '\t'); - if (tsv_it == split::sentinel{} || *tsv_it++ != "y") { - continue; - } + if (tsv_it == split::sentinel{} || !(*tsv_it++).starts_with("y")) { continue; } + auto const type = *tsv_it++; if (tsv_it == split::sentinel{}) continue; [[maybe_unused]] auto const _column_number = *tsv_it++; if (tsv_it == split::sentinel{}) continue; - auto const name = *tsv_it++; if (tsv_it == split::sentinel{}) continue; + auto const name = *tsv_it++; columns.emplace_back(name, type); } Env env = { - std::pair { "lower", +[](Value v, std::optional) -> Value { + Builtin { "lower", +[](Value v, std::optional) -> Value { assert(v.type == Value::Type::String); // FIXME Proper UTF-8 lowercase // However, manual inspection of used TSV files prooved that there aren't any non-ascii uppercase letters @@ -300,15 +354,63 @@ int main(int argc, char** argv) return v; }}, - std::pair { "int", +[](Value v, std::optional) -> Value { + Builtin { "int", +[](Value v, std::optional) -> Value { assert(v.type == Value::Type::String); long long int n; std::cout.flush(); + if (v.string.empty()) { + return Value{}; + } auto [p, ec] = std::from_chars(v.string.data(), v.string.data() + v.string.size(), n); - assert(ec == std::errc{}); + if (ec != std::errc{}) { + return Value{}; + } + + return Value(double(n)); }}, + + Builtin { "bool", +[](Value v, std::optional) -> Value { + assert(v.type == Value::Type::String); + return Value(v.string.empty()); + }}, + + Builtin { "str", +[](Value v, std::optional) -> Value { + assert(v.type == Value::Type::String); + return v; + }}, + + Builtin { "sep", +[](Value v, std::optional by) -> Value { + assert(by && "sep requires parameter by which it can split"); + assert(v.type == Value::Type::String && "only string can be splitted"); + + std::vector separated; + std::string_view source = v.string; + for (;;) if (auto split_point = source.find(*by); split_point != std::string_view::npos) { + separated.emplace_back(source.substr(0, split_point)); + source.remove_prefix(split_point + by->size()); + } else { + break; + } + + if (source.size()) { + separated.emplace_back(source); + } + return Value(std::move(separated)); + }}, + + Builtin { + .name = "unless", + .handler = +[](Value v, std::optional needle) -> Value { + assert(v.type == Value::Type::String); + assert(needle && "Unless requires string to search for"); + if (v.string.find(*needle) == std::string::npos) + return v; + return Value(""); + }, + .accepts_vector = false + } }; @@ -316,19 +418,16 @@ int main(int argc, char** argv) bool passed_header = false; Json json; auto _array = json.array(); - for (std::string line; std::getline(std::cin, line); ) { + for (std::string line; std::getline(std::cin, line);) { if (!passed_header) { passed_header = true; continue; } auto _object = json.object(); auto tsv = split::iterator(line, '\t'); - for (auto i = 0u; tsv != split::sentinel{}; ++i, ++tsv) { - auto normalized = columns[i].normalize(*tsv, env); - switch (normalized.type) { - break; case Value::Type::String: json.key(columns[i].name) = normalized.string; - break; case Value::Type::Number: json.key(columns[i].name) = double(normalized.number); - } + for (auto column = 0u; tsv != split::sentinel{}; ++column, ++tsv) { + assert(column < columns.size()); + json.key(columns[column].name) += columns[column].normalize(*tsv, env); } break; }