tsv2json: first normalization algorithms working
This commit is contained in:
parent
4b89e6d350
commit
7c3a79b7f7
3
.gitignore
vendored
3
.gitignore
vendored
@ -3,3 +3,6 @@ columns.pruned.tsv
|
||||
columns.original.tsv
|
||||
*.csv
|
||||
csv2tsv/csv2tsv
|
||||
.cache
|
||||
compile_commands.json
|
||||
tsv2json/tsv2json
|
||||
|
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +1,6 @@
|
||||
[submodule "aitech-ium"]
|
||||
path = aitech-ium
|
||||
url = https://git.wmi.amu.edu.pl/AITech/aitech-ium.git
|
||||
[submodule "tsv2json/json"]
|
||||
path = tsv2json/json
|
||||
url = https://github.com/RobertBendun/ImmidiateJSON.git
|
||||
|
3
Makefile
3
Makefile
@ -30,4 +30,7 @@ clean:
|
||||
csv2tsv/csv2tsv: csv2tsv/csv2tsv.go
|
||||
cd csv2tsv; go build
|
||||
|
||||
tsv2json/tsv2json: tsv2json/tsv2json.cc
|
||||
g++ -std=c++20 -O3 -Wall -Wextra -o $@ $<
|
||||
|
||||
.PHONY: clean
|
||||
|
@ -2,7 +2,7 @@ Keep Type Index Column Description
|
||||
y lower 1 title The official title of the language
|
||||
y int 2 appeared What year was the language publicly released and/or announced?
|
||||
y lower 3 type Which category in PLDB's subjective ontology does this entity fit into.
|
||||
y int 4 pldbId computed
|
||||
y lower 4 pldbId computed
|
||||
y int 5 rank computed
|
||||
y int 6 languageRank computed
|
||||
y int 7 factCount computed
|
||||
@ -13,12 +13,12 @@ y int 11 paperCount computed
|
||||
y int 12 numberOfUsers computed
|
||||
y int 13 numberOfJobs computed
|
||||
y int 14 githubBigQuery.repos How many repos for this language are listed in Google's BigQuery Public GitHub Dataset snapshot.
|
||||
y lower(sep("\band\b")) 15 creators Name(s) of the original creators of the language delimited by " and "
|
||||
y sep(" and ").lower 15 creators Name(s) of the original creators of the language delimited by " and "
|
||||
y bool 16 githubRepo URL of the official GitHub repo for the project if it hosted there.
|
||||
y bool 17 website URL of the official homepage for the language project.
|
||||
y bool 18 wikipedia URL of the entity on Wikipedia, if and only if it has a page dedicated to it.
|
||||
y lower(sep("\b&&\b")) 19 originCommunity In what community(ies) did the language first originate?
|
||||
y lower(skip("various",sep("\band\b"))) 20 country What country was the language first developed in?
|
||||
y sep(" && ").lower 19 originCommunity In what community(ies) did the language first originate?
|
||||
y unless("various").lower.sep(" and ") 20 country What country was the language first developed in?
|
||||
n Type 21 centralPackageRepositoryCount If you've searched for a CPM for this language and can't find one, set 0 as the count.
|
||||
n Type 22 reference A link to more info about this entity. You can add raw links and then auto "upgrade" them using some of the importer code.
|
||||
n Type 23 hopl The matching language on Diarmuid Pigott's Online Historical Encyclopaedia of Programming Languages site (https://hopl.info/)
|
||||
@ -85,9 +85,9 @@ y bool 83 features.hasMultiLineComments
|
||||
n Type 84 rijuRepl A link to try this language on riju.codes
|
||||
n Type 85 githubLanguage.codemirror_mime_type A String name of the file mime type used for highlighting whenever a file is edited. This should match the `mime` associated with the mode from https://git.io/f4SoQ
|
||||
n Type 86 githubLanguage.codemirror_mode A String name of the CodeMirror Mode used for highlighting whenever a file is edited. This must match a mode from https://git.io/vi9Fx
|
||||
y sep(' ') 87 fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
|
||||
y sep(" ") 87 fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
|
||||
n Type 88 tiobe Tiobe maintains a well known ranking of programming languages here: https://www.tiobe.com/tiobe-index/
|
||||
y sep(' ') 89 related What languages are related? This serves as a catch all, and it is better to use a more specific relationship node such as "supersetOf".
|
||||
y sep(" ") 89 related What languages are related? This serves as a catch all, and it is better to use a more specific relationship node such as "supersetOf".
|
||||
y str 90 multiLineCommentTokens A comment with a start delimiter and end token (which can be the same) that can span multiple lines.
|
||||
n Type 91 aka Another name for the language. Entries can have multiple aka lines.
|
||||
y bool 92 features.hasIntegers
|
||||
@ -95,7 +95,7 @@ n Type 93 helloWorldCollection Hello world written in this language from http://
|
||||
n Type 94 githubLanguage.aliases An Array of additional aliases (implicitly includes name.downcase).
|
||||
y bool 95 features.hasFloats
|
||||
n Type 96 tryItOnline A link to try this language on https://tio.run
|
||||
y sep(' ') 97 writtenIn What language(s) is the main implementation written in?
|
||||
y sep(" ") 97 writtenIn What language(s) is the main implementation written in?
|
||||
y bool 98 features.hasBooleans
|
||||
n Type 99 keywords What are all the keywords in this language?
|
||||
n Type 100 indeedJobs How many job descriptions match this query for this language on indeed.com?
|
||||
@ -139,10 +139,10 @@ n Type 137 tiobe.currentRank What is the current Tiobe rank of this language?
|
||||
y bool 138 features.hasWhileLoops
|
||||
n Type 139 forLanguages Which languages is this repository for?
|
||||
n Type 140 packageCount How many packages are in the repository? A package is some code with a name and a namespace, shipped as an atomic unit, with an owner(s).
|
||||
y sep(' ') 141 supersetOf Is this language a superset of another? If you specify this link then the superset language will inherit all features of subset language.
|
||||
y sep(" ") 141 supersetOf Is this language a superset of another? If you specify this link then the superset language will inherit all features of subset language.
|
||||
n Type 142 indeedJobs.2017
|
||||
y bool 143 features.hasBinaryNumbers
|
||||
y sep(' ') 144 influencedBy What languages influenced this one?
|
||||
y sep(" ") 144 influencedBy What languages influenced this one?
|
||||
y bool 145 features.hasOperatorOverloading
|
||||
y bool 146 features.hasImports
|
||||
y bool 147 features.hasFunctions
|
||||
@ -198,7 +198,7 @@ n Type 196 demoVideo Provide a url of a demo video of the language.
|
||||
n Type 197 isPublicDomain Is it public domain?
|
||||
y bool 198 features.hasMultilineStrings
|
||||
y bool 199 features.hasVariableSubstitutionSyntax Do you use different syntax when assigning versus referencing a variable?
|
||||
y sep(' ') 200 subsetOf Is this language a subset of another?
|
||||
y sep(" ") 200 subsetOf Is this language a subset of another?
|
||||
n Type 201 firstAnnouncement A url announcing the creation or release of a new language
|
||||
n Type 202 packageInstallCount How many packages have been downloaded?
|
||||
y bool 203 features.canWriteToDisk
|
||||
@ -273,7 +273,7 @@ y bool 271 features.hasDestructuring
|
||||
y bool 272 features.hasGenerators
|
||||
y bool 273 features.hasDynamicProperties
|
||||
y bool 274 features.hasExpressions
|
||||
y sep(' ') 275 forkOf What language is this language a fork of?
|
||||
y sep(" ") 275 forkOf What language is this language a fork of?
|
||||
n Type 276 inputLanguages Which language(s) does this take as input? For compilers, what languages does this compile compile?
|
||||
n Type 277 redditDiscussion A link to a related discussion on reddit.
|
||||
y bool 278 features.hasTryCatch
|
||||
@ -347,7 +347,7 @@ n Type 345 ebook Link to a free eBook about this. Only include if the eBook is o
|
||||
y bool 346 features.hasExports
|
||||
y bool 347 features.hasZippers
|
||||
y bool 348 features.hasMonads
|
||||
y sep(' ') 349 extensionOf What language is this language an extension of?
|
||||
y sep(" ") 349 extensionOf What language is this language an extension of?
|
||||
n Type 350 zulip Link to official (or popular unofficial) Zulip for language development.
|
||||
y bool 351 features.hasImplicitArguments
|
||||
y bool 352 features.hasDynamicTyping
|
||||
|
Can't render this file because it contains an unexpected character in line 16 and column 7.
|
1
tsv2json/json
Submodule
1
tsv2json/json
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 822784d0ebce101249e38d928ce69033e30455f4
|
338
tsv2json/tsv2json.cc
Normal file
338
tsv2json/tsv2json.cc
Normal file
@ -0,0 +1,338 @@
|
||||
#define IMM_JSON_IMPLEMENTATION
#include "json/imm_json.hh"
#include <algorithm>
#include <cassert>
#include <cctype>
#include <charconv>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <system_error>
#include <unordered_map>
#include <utility>
#include <vector>
|
||||
|
||||
namespace split
{
	// Marks the end of a split sequence; compare an `iterator` against it.
	struct sentinel {};

	// Single-pass iterator over the `delim`-separated cells of a string.
	//
	// Cells are yielded as views into the text given to the constructor, so that
	// text must outlive the iterator and every view obtained from it.
	// The object is also its own range: `begin()` returns a copy and `end()` a
	// `sentinel`, which makes `for (auto cell : split::iterator(text, '\t'))` work.
	//
	// Empty cells in front of or between delimiters are yielded as empty views.
	// Iteration stops as soon as no input is left, so empty input yields nothing
	// and an empty cell after a trailing delimiter is not reported.
	struct iterator
	{
		using difference_type = std::ptrdiff_t;
		using value_type = std::string_view;
		using iterator_category = std::input_iterator_tag;
		using pointer = void;
		// operator* returns by value, so the reference type is the view itself
		using reference = std::string_view;

		// source: text to split (anything implicitly convertible to a view)
		// delim:  character separating the cells
		explicit iterator(std::string_view source, char delim)
			: source{source}
			, delim{delim}
		{
			++*this; // Compute first cell
		}

		inline iterator begin() const
		{
			return *this;
		}

		inline sentinel end() const
		{
			return sentinel{};
		}

		inline bool operator==(sentinel) const
		{
			return reached_end;
		}

		inline bool operator!=(sentinel) const
		{
			return !reached_end;
		}

		// Reversed forms, so that both operand orders work regardless of
		// whether the compiler synthesises them from operator==
		friend inline bool operator==(sentinel, iterator const& it)
		{
			return it.reached_end;
		}

		friend inline bool operator!=(sentinel, iterator const& it)
		{
			return !it.reached_end;
		}

		// Moves to the next cell; once the input is used up the iterator
		// becomes equal to `sentinel` and stays that way.
		inline iterator& operator++()
		{
			if (source.empty()) {
				reached_end = true;
				return *this;
			}

			if (auto const cell_end = source.find(delim); cell_end != std::string_view::npos) {
				current = source.substr(0, cell_end);
				source.remove_prefix(cell_end + 1);
			} else {
				// Last cell: everything that is left
				current = source;
				source = {};
			}
			return *this;
		}

		inline iterator operator++(int)
		{
			auto copy = *this;
			++*this;
			return copy;
		}

		// The current cell; only meaningful while the iterator != sentinel
		inline std::string_view operator*() const
		{
			return current;
		}

		std::string_view current; // cell returned by operator*
		std::string_view source;  // input not yet consumed
		char delim;
		bool reached_end = false;
	};
}
|
||||
|
||||
// Node of a parsed normalization expression.
//
// A `Symbol` node carries only `symbol` (a bare name or the contents of a
// string literal). A `Call` node carries the callee name in `symbol` and its
// single argument as the only element of `sub`.
// `symbol` is a view: the text it refers to must outlive the node.
struct Expression
{
	enum Type
	{
		Symbol,
		Call,
	};

	Type type;
	std::string_view symbol;
	std::vector<Expression> sub;

	// Builds a Symbol node
	Expression(std::string_view symbol)
		: type{Type::Symbol}
		, symbol{symbol}
	{
	}

	// Builds a Call node `name(arg)`
	Expression(std::string_view name, Expression &&arg)
		: type{Type::Call}
		, symbol{name}
	{
		// push_back moves the argument; brace-initialising `sub` would go
		// through std::initializer_list and copy it instead
		sub.push_back(std::move(arg));
	}

	// Copy/move construction and assignment are left to the compiler, so the
	// type is assignable as well as copyable and movable.

	// Prints a Symbol as a quoted string and a Call as `name(argument)`
	friend std::ostream& operator<<(std::ostream& os, Expression const& expr)
	{
		switch (expr.type) {
		case Type::Symbol:
			os << std::quoted(expr.symbol);
			break;
		case Type::Call:
			os << expr.symbol << '(';
			if (!expr.sub.empty()) {
				os << expr.sub.front();
			}
			os << ')';
			break;
		}
		return os;
	}
};
|
||||
|
||||
std::vector<Expression> parse_normalization_expression(std::string_view &source)
|
||||
{
|
||||
auto const skip_ws = [&] {
|
||||
if (auto after_ws = source.find_first_not_of(" \t"); after_ws != std::string_view::npos) {
|
||||
source.remove_prefix(after_ws);
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<Expression> sequence;
|
||||
|
||||
for (;;) {
|
||||
std::string_view symbol;
|
||||
skip_ws();
|
||||
|
||||
// FIXME String escaping
|
||||
if (source.starts_with('"')) {
|
||||
source.remove_prefix(1);
|
||||
auto const string_end = source.find('"');
|
||||
if (string_end == std::string_view::npos) {
|
||||
std::cerr << "[ERROR] Failed to parse '" << source << "': expected end of string\n";
|
||||
std::exit(2);
|
||||
}
|
||||
|
||||
std::string_view symbol{source.begin(), string_end};
|
||||
source.remove_prefix(string_end+1);
|
||||
skip_ws();
|
||||
sequence.emplace_back(symbol);
|
||||
goto next;
|
||||
}
|
||||
|
||||
{
|
||||
// Don't use islower since it uses locale (slow)
|
||||
auto const symbol_end = std::find_if_not(source.begin(), source.end(), [](char c) { return c >= 'a' && c <= 'z'; });
|
||||
if (symbol_end == source.begin()) {
|
||||
std::cerr << "[ERROR] Failed to parse '" << source << "': expected symbol\n";
|
||||
std::exit(2);
|
||||
}
|
||||
|
||||
symbol = std::string_view{source.begin(), symbol_end};
|
||||
source.remove_prefix(symbol.size());
|
||||
skip_ws();
|
||||
}
|
||||
|
||||
if (source.empty()) {
|
||||
sequence.emplace_back(symbol);
|
||||
goto next;
|
||||
}
|
||||
|
||||
if (source.starts_with("(")) {
|
||||
source.remove_prefix(1);
|
||||
// FIXME Should separate expression sequence and expression argument parsing
|
||||
sequence.emplace_back(symbol, std::move(parse_normalization_expression(source).front()));
|
||||
skip_ws();
|
||||
if (!source.starts_with(")")) {
|
||||
std::cerr << "[ERROR] Failed to parse '" << source << "': expected closing bracket\n";
|
||||
std::exit(2);
|
||||
}
|
||||
source.remove_prefix(1);
|
||||
goto next;
|
||||
}
|
||||
|
||||
next:
|
||||
skip_ws();
|
||||
if (source.starts_with('.')) {
|
||||
source.remove_prefix(1);
|
||||
continue;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return sequence;
|
||||
}
|
||||
|
||||
// Value flowing through the normalization builtins: either a string or a
// number. `type` tells which of `string` / `number` carries the payload; the
// other member keeps its default (empty string / 0).
struct Value
{
	enum class Type
	{
		String,
		Number,
	};

	Type type;
	std::string string;
	// Initialised so that copying a String value never reads an indeterminate double
	double number = 0;

	// Builds a String value holding a copy of `s`
	explicit Value(std::string_view s)
		: type(Type::String)
		, string(s)
	{
	}

	// Builds a Number value
	explicit Value(double number)
		: type(Type::Number)
		, number(number)
	{
	}
};
|
||||
|
||||
using Builtin = Value(*)(Value, std::optional<std::string_view>);
|
||||
using Env = std::unordered_map<std::string, Builtin>;
|
||||
|
||||
Value eval(std::span<Expression> expressions, Value value, Env const& env)
|
||||
{
|
||||
for (auto const& expr : expressions) {
|
||||
auto builtin = env.find(std::string(expr.symbol));
|
||||
if (builtin == env.end()) {
|
||||
std::cerr << "[ERROR] Unknown builtin: " << expr.symbol << '\n';
|
||||
std::exit(1);
|
||||
}
|
||||
switch (expr.type) {
|
||||
break; case Expression::Symbol:
|
||||
value = builtin->second(std::move(value), std::nullopt);
|
||||
break; case Expression::Call:
|
||||
assert(expr.sub.size() == 1);
|
||||
assert(expr.sub.front().type == Expression::Symbol);
|
||||
value = builtin->second(std::move(value), expr.sub[0].symbol);
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
struct Column
|
||||
{
|
||||
Column(std::string_view name, std::string_view normalization_expression)
|
||||
: name(name)
|
||||
, expression_source(normalization_expression)
|
||||
, expression(parse_normalization_expression(normalization_expression))
|
||||
{
|
||||
}
|
||||
|
||||
inline Value normalize(std::string_view source, Env const& env)
|
||||
{
|
||||
return eval(std::span(expression), Value(source), env);
|
||||
}
|
||||
|
||||
std::string_view name;
|
||||
std::string_view expression_source;
|
||||
std::vector<Expression> expression;
|
||||
};
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
if (argc != 2) {
|
||||
std::cerr << "usage: " << argv[0] << " <columns.tsv>\n";
|
||||
std::cerr << " convert tsv file from TSV using definitions from columns.tsv\n";
|
||||
}
|
||||
|
||||
std::ifstream columns_file(argv[1]);
|
||||
static std::string source{std::istreambuf_iterator<char>(columns_file), {}};
|
||||
|
||||
if (auto it = std::next(split::iterator(source, '\t'), 1); it == split::sentinel{} || *it != "Type") {
|
||||
std::cerr << "[ERROR] Expected Type description in column 2\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<Column> columns;
|
||||
|
||||
for (std::string_view line : split::iterator(source, '\n')) {
|
||||
auto tsv_it = split::iterator(line, '\t');
|
||||
if (tsv_it == split::sentinel{} || *tsv_it++ != "y") {
|
||||
continue;
|
||||
}
|
||||
auto const type = *tsv_it++; if (tsv_it == split::sentinel{}) continue;
|
||||
[[maybe_unused]] auto const _column_number = *tsv_it++; if (tsv_it == split::sentinel{}) continue;
|
||||
auto const name = *tsv_it++; if (tsv_it == split::sentinel{}) continue;
|
||||
|
||||
columns.emplace_back(name, type);
|
||||
}
|
||||
|
||||
Env env = {
|
||||
std::pair<std::string, Builtin> { "lower", +[](Value v, std::optional<std::string_view>) -> Value {
|
||||
assert(v.type == Value::Type::String);
|
||||
// FIXME Proper UTF-8 lowercase
|
||||
// However, manual inspection of used TSV files prooved that there aren't any non-ascii uppercase letters
|
||||
for (char &c : v.string) {
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
c = c - 'A' + 'a';
|
||||
}
|
||||
}
|
||||
return v;
|
||||
}},
|
||||
|
||||
std::pair<std::string, Builtin> { "int", +[](Value v, std::optional<std::string_view>) -> Value {
|
||||
assert(v.type == Value::Type::String);
|
||||
|
||||
long long int n;
|
||||
std::cout.flush();
|
||||
auto [p, ec] = std::from_chars(v.string.data(), v.string.data() + v.string.size(), n);
|
||||
assert(ec == std::errc{});
|
||||
return Value(double(n));
|
||||
}},
|
||||
};
|
||||
|
||||
|
||||
{
|
||||
bool passed_header = false;
|
||||
Json json;
|
||||
auto _array = json.array();
|
||||
for (std::string line; std::getline(std::cin, line); ) {
|
||||
if (!passed_header) {
|
||||
passed_header = true;
|
||||
continue;
|
||||
}
|
||||
auto _object = json.object();
|
||||
auto tsv = split::iterator(line, '\t');
|
||||
for (auto i = 0u; tsv != split::sentinel{}; ++i, ++tsv) {
|
||||
auto normalized = columns[i].normalize(*tsv, env);
|
||||
switch (normalized.type) {
|
||||
break; case Value::Type::String: json.key(columns[i].name) = normalized.string;
|
||||
break; case Value::Type::Number: json.key(columns[i].name) = double(normalized.number);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user