clean slate

This commit is contained in:
Robert Bendun 2023-03-20 22:02:50 +01:00
parent eda85108a2
commit 1d0329af4d
6 changed files with 0 additions and 829 deletions

3
.gitmodules vendored
View File

@ -1,6 +1,3 @@
[submodule "aitech-ium"] [submodule "aitech-ium"]
path = aitech-ium path = aitech-ium
url = https://git.wmi.amu.edu.pl/AITech/aitech-ium.git url = https://git.wmi.amu.edu.pl/AITech/aitech-ium.git
[submodule "tsv2json/json"]
path = tsv2json/json
url = https://github.com/RobertBendun/ImmidiateJSON.git

View File

@ -1,36 +1,3 @@
# Keep only the columns marked with 'y' in columns.user.tsv
languages.pruned.tsv: languages.original.tsv columns.user.tsv
bash -c "cut -f`grep '^y' columns.user.tsv | cut -f3 | paste -sd ','` $< >$@"
# Let the user mark which columns they are interested in
columns.user.tsv: columns.pruned.tsv
if [ -f $@ ]; then touch $@; else awk 'NR==1{ printf("Keep\tType\t"); print } NR>1{printf("n\tstr\t"); print}' $< >$@; fi
# Prune columns that are not needed to create specification
columns.pruned.tsv: columns.original.tsv
cut --complement -f3,4,5,7,8,9,10 $< >$@
# Change data to TSV format since it is easier to process using standard UNIX tools
%.tsv: %.csv
csv2tsv/csv2tsv <$< >$@
# Check while downloading that file is as expected.
# Otherwise automatic filter mechanism wouldn't work.
# If hashes differ then user of this repo must migrate columns.user.tsv to a new format
columns.original.csv:
wget 'https://pldb.com/columns.csv' -O $@
sha256sum -c checksums.sha256
languages.original.csv: columns.original.csv
wget 'https://pldb.com/languages.csv' -O $@
clean:
rm -f languages.*.tsv languages.*.csv columns.original.tsv columns.*.csv columns.pruned.tsv
csv2tsv/csv2tsv: csv2tsv/csv2tsv.go csv2tsv/csv2tsv: csv2tsv/csv2tsv.go
cd csv2tsv; go build cd csv2tsv; go build
tsv2json/tsv2json: tsv2json/tsv2json.cc
g++ -std=c++20 -O0 -Wall -Wextra -o $@ $< -ggdb
.PHONY: clean

View File

@ -1 +0,0 @@
27afba98520ff7c408492cab9ea8789db223ae8c134cb0cd6dfbebf7c7132100 columns.original.csv

View File

@ -1,354 +0,0 @@
Keep Type Index Column Description
y lower 1 title The official title of the language
y int 2 appeared What year was the language publicly released and/or announced?
y lower 3 type Which category in PLDB's subjective ontology does this entity fit into.
y lower 4 pldbId computed
y int 5 rank computed
y int 6 languageRank computed
y int 7 factCount computed
y int 8 lastActivity computed
y int 9 exampleCount computed
y int 10 bookCount computed
y int 11 paperCount computed
y int 12 numberOfUsers computed
y int 13 numberOfJobs computed
y int 14 githubBigQuery.repos How many repos for this language are listed in Google's BigQuery Public GitHub Dataset snapshot.
y sep(" and ").lower 15 creators Name(s) of the original creators of the language delimited by " and "
y bool 16 githubRepo URL of the official GitHub repo for the project if it hosted there.
y bool 17 website URL of the official homepage for the language project.
y bool 18 wikipedia URL of the entity on Wikipedia, if and only if it has a page dedicated to it.
y sep(" && ").lower 19 originCommunity In what community(ies) did the language first originate?
y lower.unless("various").sep(" and ") 20 country What country was the language first developed in?
n Type 21 centralPackageRepositoryCount If you've searched for a CPM for this language and can't find one, set 0 as the count.
n Type 22 reference A link to more info about this entity. You can add raw links and then auto "upgrade" them using some of the importer code.
n Type 23 hopl The matching language on Diarmuid Pigott's Online Historical Encyclopaedia of Programming Languages site (https://hopl.info/)
n Type 24 wikipedia.dailyPageViews How many page views per day does this Wikipedia page get? Useful as a signal for rankings. Available via WP api.
y int 25 wikipedia.backlinksCount How many pages on WP link to this page?
n Type 26 wikipedia.summary What is the text summary of the language from the Wikipedia page?
n Type 27 wikipedia.pageId Waht is the internal ID for this entity on WP?
y int 28 wikipedia.appeared When does Wikipedia claim this entity first appeared?
n Type 29 wikipedia.created When was the *Wikipedia page* for this entity created?
n Type 30 wikipedia.revisionCount How many revisions does this page have?
n Type 31 wikipedia.related What languages does Wikipedia have as related?
n Type 32 fileType What is the file encoding for programs in this language?
n Type 33 isbndb Books about this language from ISBNdb.
y int 34 githubRepo.stars How many stars of the repo?
y int 35 githubRepo.forks How many forks of the repo?
y int 36 githubRepo.updated What year was the last commit made?
y int 37 githubRepo.subscribers How many subscribers to the repo?
n Type 38 githubRepo.created When was the *Github repo* for this entity created?
n Type 39 githubRepo.description Description of the repo on GitHub.
n Type 40 description Description of the repo on GitHub.
n Type 41 githubRepo.issues How many isses on the repo?
n Type 42 domainName If the project website is on its own domain.
y int 43 githubRepo.firstCommit What year the first commit made in this git repo?
n Type 44 semanticScholar Papers about this language from Semantic Scholar.
y bool 45 features.hasComments
n Type 46 domainName.registered When was this domain first registered?
n Type 47 isOpenSource Is it an open source project?
y bool 48 features.hasSemanticIndentation
y bool 49 features.hasLineComments
n Type 50 githubLanguage GitHub has a set of supported languages as defined here: https://raw.githubusercontent.com/github/linguist/master/lib/linguist/languages.yml
n Type 51 githubLanguage.tm_scope The TextMate scope that represents this programming language. This should match one of the scopes listed in the grammars.yml file. Use "none" if there is no grammar for this language.
n Type 52 githubLanguage.type Either data, programming, markup, prose, or nil.
n Type 53 githubLanguage.ace_mode A String name of the Ace Mode used for highlighting whenever a file is edited. This must match one of the filenames in http://git.io/3XO_Cg. Use "text" if a mode does not exist.
n Type 54 githubLanguage.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
n Type 55 numberOfRepos computed
n Type 56 githubLanguage.repos How many repos for this language does GitHub report?
n Type 57 lineCommentToken Defined as a token that can be placed anywhere on a line and starts a comment that cannot be stopped except by a line break character or end of file.
n Type 58 githubLanguage.trendingProjectsCount How many trending repos for this language does GitHub report?
n Type 59 domainName.awisRank.2022
n Type 60 leachim6.filepath
n Type 61 leachim6 A link to this language in leachim6's hello-world project.
n Type 62 githubBigQuery Google BigQuery Public Datasets has a dataset with info on GitHub repos: https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code
n Type 63 githubBigQuery.users
n Type 64 linguistGrammarRepo Linguist is a library used by GitHub to syntax highlight files on GitHub via a grammar. The list of languages supported by Linguist and the grammar package used for each language is listed here: https://github.com/github/linguist/blob/master/vendor/README.md. If Linguist has support for a language, it will have a repo on GitHub. Given a language is supported by Linguist, that is a good indication it has at least 200 unique :user/:repo repositories, according to their docs.
n Type 65 linguistGrammarRepo.commitCount How many commits in this repo?
n Type 66 linguistGrammarRepo.committerCount How many people have made commits in this repo?
n Type 67 linguistGrammarRepo.lastCommit What year was the last commit made?
n Type 68 linguistGrammarRepo.firstCommit What year was the first commit made?
n Type 69 wordRank Some creators use a common English word as their language's name. For these we note how common the word is, where "the" is 1.
n Type 70 leachim6.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
n Type 71 linguistGrammarRepo.sampleCount How many language samples in this repo?
y bool 72 features.hasStrings
n Type 73 pygmentsHighlighter.filename
n Type 74 pygmentsHighlighter A link to a Pygments syntax highlighting class for this language (https://pygments.org/)
n Type 75 standsFor If the language name is an acronym what does/did it stand for?
n Type 76 stringToken What token(s) is used to delimite a string?
n Type 77 documentation Link to the official documentation for a language.
n Type 78 rosettaCode A link to this language on Rosetta Code - http://www.rosettacode.org/
n Type 79 pygmentsHighlighter.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
y bool 80 features.hasPrintDebugging
n Type 81 printToken What token(s) is used to print a message?
n Type 82 twitter Official Twitter handle of the entity, if any.
y bool 83 features.hasMultiLineComments
n Type 84 rijuRepl A link to try this language on riju.codes
n Type 85 githubLanguage.codemirror_mime_type A String name of the file mime type used for highlighting whenever a file is edited. This should match the `mime` associated with the mode from https://git.io/f4SoQ
n Type 86 githubLanguage.codemirror_mode A String name of the CodeMirror Mode used for highlighting whenever a file is edited. This must match a mode from https://git.io/vi9Fx
y sep(" ") 87 fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
n Type 88 tiobe Tiobe maintains a well known ranking of programming languages here: https://www.tiobe.com/tiobe-index/
y sep(" ") 89 related What languages are related? This serves as a catch all, and it is better to use a more specific relationship node such as "supersetOf".
y str 90 multiLineCommentTokens A comment with a start delimiter and end token (which can be the same) that can span multiple lines.
n Type 91 aka Another name for the language. Entries can have multiple aka lines.
y bool 92 features.hasIntegers
n Type 93 helloWorldCollection Hello world written in this language from http://helloworldcollection.de/
n Type 94 githubLanguage.aliases An Array of additional aliases (implicitly includes name.downcase).
y bool 95 features.hasFloats
n Type 96 tryItOnline A link to try this language on https://tio.run
y sep(" ") 97 writtenIn What language(s) is the main implementation written in?
y bool 98 features.hasBooleans
n Type 99 keywords What are all the keywords in this language?
n Type 100 indeedJobs How many job descriptions match this query for this language on indeed.com?
n Type 101 wikipedia.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
y bool 102 features.hasHexadecimals
n Type 103 projectEuler.memberCount.2022
n Type 104 projectEuler Is this language one of the ones listed on https://projecteuler.net/?
n Type 105 booleanTokens What token(s) is used for true and false?
n Type 106 visualParadigm Is this a visual programming thing? Sometimes called "no code" or "low code"?
n Type 107 domainName.awisRank.2017
n Type 108 projectEuler.memberCount.2019
n Type 109 webRepl An online repl for the project.
n Type 110 subreddit.memberCount.2022
n Type 111 subreddit Url of a subreddit(s) for this language.
n Type 112 codeMirror A link to a CodeMirror syntax highlighting package for this language (https://github.com/codemirror/codemirror5/tree/master/mode/LANGUAGE)
y bool 113 features.hasCaseInsensitiveIdentifiers
n Type 114 monaco A link to a Monaco syntax highlighting package for this language.
y bool 115 features.hasConditionals
n Type 116 jupyterKernel A link to a Jupyter Kernel for this language.
n Type 117 githubLanguage.interpreters An Array of associated interpreters
n Type 118 quineRelay The Quine Relay project (https://github.com/mame/quine-relay).
n Type 119 compilesTo Which language(s) does this language primarily compile to?
n Type 120 ubuntuPackage The name of an Ubuntu package for the language from https://packages.ubuntu.com/.
n Type 121 indeedJobs.2022
n Type 122 packageRepository URL to the package repository for this language.
n Type 123 antlr A link to the ANTLR grammar for this language (https://github.com/antlr/grammars-v4/tree/master/LANGUAGE)
n Type 124 officialBlogUrl URL to the official blog for this language.
n Type 125 meetup.groupCount
n Type 126 meetup.memberCount
n Type 127 meetup Some languages have active meetup groups on Meetup.com
n Type 128 linkedInSkill.2018
n Type 129 linkedInSkill How many people list this skill on LinkedIn?
n Type 130 languageServerProtocolProject A link to a project implementing LSP for this language.
n Type 131 githubLanguage.filenames Filenames commonly associated with the language.
y bool 132 features.hasOctals
n Type 133 releaseNotesUrl URL to the release notes for this language.
n Type 134 languageServerProtocolProject.writtenIn What language(s) is the main implementation written in?
y bool 135 features.hasAssignment
n Type 136 faqPageUrl URL to the frequently asked questions for this language.
n Type 137 tiobe.currentRank What is the current Tiobe rank of this language?
y bool 138 features.hasWhileLoops
n Type 139 forLanguages Which languages is this repository for?
n Type 140 packageCount How many packages are in the repository? A package is some code with a name and a namespace, shipped as an atomic unit, with an owner(s).
y sep(" ") 141 supersetOf Is this language a superset of another? If you specify this link then the superset language will inherit all features of subset language.
n Type 142 indeedJobs.2017
y bool 143 features.hasBinaryNumbers
y sep(" ") 144 influencedBy What languages influenced this one?
y bool 145 features.hasOperatorOverloading
y bool 146 features.hasImports
y bool 147 features.hasFunctions
n Type 148 githubLanguage.group Name of the parent language. Languages in a group are counted in the statistics as the parent language.
n Type 149 rijuRepl.description Description of the repo on GitHub.
n Type 150 subreddit.memberCount.2017
n Type 151 rijuRepl.gitRepo URL of the official git repo for the language project if not hosted on GitHub or GitLab or Sourcehut.
n Type 152 rijuRepl.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).
n Type 153 stackOverflowSurvey.2021.percentageUsing What percentage of survey respondents report using this language?
n Type 154 stackOverflowSurvey.2021.fans How many developers reported wanting to learn this language.
n Type 155 stackOverflowSurvey.2021.medianSalary Median salary reported by developers using this language.
n Type 156 stackOverflowSurvey.2021.users How many developers reported using this language.
n Type 157 downloadPageUrl URL to the download page for this language.
n Type 158 assignmentToken What token(s) is used for assignment to an identifier?
n Type 159 compilerExplorer This language's name on https://godbolt.org
y bool 160 features.hasMacros
y bool 161 features.hasClasses
n Type 162 replit A link to try this language on replit.com
n Type 163 rijuRepl.website URL of the official homepage for the language project.
n Type 164 pypl This language's id on https://pypl.github.io
n Type 165 emailList Link to the mailing list for a language.
y bool 166 features.hasTypeInference
y bool 167 features.isCaseSensitive
y bool 168 features.hasSwitch
y bool 169 features.hasConstants
y bool 170 features.hasGarbageCollection
n Type 171 spec Link to the official spec for a language.
y bool 172 features.hasExceptions
y bool 173 features.hasPointers
y bool 174 features.hasDirectives
y bool 175 features.hasAccessModifiers
n Type 176 eventsPageUrl URL to the events pages of this language.
n Type 177 cheatSheetUrl A link to a cheat sheet for this language.
y bool 178 features.hasLists
y bool 179 features.hasInheritance
n Type 180 esolang A link to this language on https://esolangs.org/
n bool 181 features.hasMultipleInheritance
y bool 182 features.hasConstructors
n Type 183 nativeLanguage Nearly all programming languages are written in English, but some aren't. Set this field for the ones that are not.
y bool 184 features.hasRegularExpressionsSyntaxSugar
n Type 185 screenshot For visual languages, a picture is worth a thousand words. Provide the URL to the screenshot in the form: https://pldb.com/screenshots/[pldbId].png
n Type 186 githubLanguage.wrap Boolean wrap to enable line wrapping (default: false)
y bool 187 features.isLisp Is this in the Lisp family of languages?
y bool 188 features.hasTernaryOperators
y bool 189 features.hasScientificNotation
n Type 190 versions.2022 A release year and version. Perhaps in the future we could get more specific to month or even day.
y bool 191 features.hasMessagePassing
n Type 192 gdbSupport Is the language supported by the GNU Debugger?
y bool 193 features.hasEnums
n Type 194 announcementMethod How was the language first announced?
n Type 195 gitlabRepo URL of the official GitLab repo for the language project.
n Type 196 demoVideo Provide a url of a demo video of the language.
n Type 197 isPublicDomain Is it public domain?
y bool 198 features.hasMultilineStrings
y bool 199 features.hasVariableSubstitutionSyntax Do you use different syntax when assigning versus referencing a variable?
y sep(" ") 200 subsetOf Is this language a subset of another?
n Type 201 firstAnnouncement A url announcing the creation or release of a new language
n Type 202 packageInstallCount How many packages have been downloaded?
y bool 203 features.canWriteToDisk
y bool 204 features.hasBitWiseOperators
y bool 205 features.hasZeroBasedNumbering
n Type 206 oldName What is the old name of this language?
y bool 207 features.hasStaticTyping
y bool 208 features.hasUnitsOfMeasure
y bool 209 features.hasIncrementAndDecrementOperators
y bool 210 features.hasSingleDispatch
y bool 211 features.hasHomoiconicity
n Type 212 runsOnVm What virtual machine(s) does this language run on?
y bool 213 features.hasHereDocs
y bool 214 features.hasFixedPoint
y bool 215 features.hasNamespaces
y bool 216 features.hasThreads
y bool 217 features.hasModules
n Type 218 gitRepo URL of the official git repo for the language project if not hosted on GitHub or GitLab or Sourcehut.
y bool 219 features.hasPatternMatching
y bool 220 features.hasGotos
n Type 221 annualReportsUrl URL to the annual reports for this language.
y bool 222 features.hasFunctionComposition
y bool 223 features.hasFunctionOverloading
y bool 224 features.hasAsyncAwait
y bool 225 features.hasIterators
y bool 226 features.hasExplicitTypeCasting
y bool 227 features.hasStructs
y bool 228 features.hasMultipleDispatch
y bool 229 features.hasInterfaces
y bool 230 features.hasGenerics
y bool 231 features.hasForEachLoops
y bool 232 features.hasMaps
y bool 233 features.hasPipes
y bool 234 features.hasMixins
y bool 235 features.canDoShebang
y bool 236 features.hasVariadicFunctions
y bool 237 features.hasManualMemoryManagement
y bool 238 features.hasTemplates
y bool 239 features.hasInfixNotation
y bool 240 features.hasPolymorphism
y bool 241 features.hasPartialApplication
y bool 242 features.hasAssertStatements
n Type 243 sourcehutRepo URL of the official sourcehut repo for the project.
y bool 244 features.hasForLoops
n Type 245 renamedTo What is the new name of this language?
y bool 246 features.hasDocComments Is there a standard mini language written in comments for documenting code?
y bool 247 features.hasUnicodeIdentifiers
y bool 248 features.hasDependentTypes
n Type 249 conference Some languages have a recurring conference(s) focused on that specific language.
y bool 250 features.hasDuckTyping
y bool 251 features.hasDefaultParameters
y bool 252 features.hasAnonymousFunctions
y bool 253 features.hasMagicGettersAndSetters
n Type 254 packageAuthors How many people contribute packages to this cpm?
n Type 255 photo For notations, a picture is worth a thousand words. Provide a photo in the form: https://pldb.com/photos/[pldbId].png
n Type 256 successorOf Was this language launched as the successor of another?
y bool 257 features.hasBuiltInRegex
y bool 258 features.hasNull
y bool 259 features.hasUnaryOperators
y bool 260 features.hasUserDefinedOperators
y bool 261 features.hasBreak
y bool 262 features.hasContinue
n Type 263 includeToken What token(s) is used for including another file?
y bool 264 features.hasUnionTypes
y bool 265 features.hasSingleTypeArrays Has an array data structure that only can hold items of the same type.
y bool 266 features.hasTypedHoles
y bool 267 features.hasReservedWords Does a concept of reserved words exists? For example, not being able to use certain keywords as variable names.
y bool 268 features.hasRangeOperators
y bool 269 features.hasDisposeBlocks
y bool 270 features.hasSymbolTables
y bool 271 features.hasDestructuring
y bool 272 features.hasGenerators
y bool 273 features.hasDynamicProperties
y bool 274 features.hasExpressions
y sep(" ") 275 forkOf What language is this language a fork of?
n Type 276 inputLanguages Which language(s) does this take as input? For compilers, what languages does this compile compile?
n Type 277 redditDiscussion A link to a related discussion on reddit.
y bool 278 features.hasTryCatch
y bool 279 features.hasEscapeCharacters
y bool 280 usesSemanticVersioning Does the official release of the language use semantic versioning?
y bool 281 features.hasPostfixNotation
y bool 282 features.hasPrefixNotation
y bool 283 features.hasStreams
y bool 284 features.hasLazyEvaluation
y bool 285 features.hasCharacters
n Type 286 funFact A text or code block containing a fun or unusual fact about the language.
y bool 287 features.hasSets
y bool 288 features.hasMethods
y bool 289 features.hasAbstractTypes
y bool 290 isDead Has the creator or maintainer announced it officially dead? Include a link to proof of the announcement.
y bool 291 features.canUseQuestionMarksAsPartOfIdentifier
y bool 292 features.hasTypeAnnotations
y bool 293 features.hasSymbols
y bool 294 features.hasDecimals
y bool 295 features.hasBlobs
y bool 296 features.hasSExpressions
y bool 297 features.hasLabels
y bool 298 features.hasIfElses
y bool 299 features.hasIfs
y bool 300 features.hasBoundedCheckedArrays
y bool 301 features.hasArraySlicingSyntax
y bool 302 features.hasTimestamps
y bool 303 features.hasMethodOverloading
y bool 304 features.hasVoidFunctions
y bool 305 features.hasGlobalScope
y bool 306 features.hasFnArguments
y bool 307 features.canReadCommandLineArgs
y bool 308 features.hasDynamicSizedArrays
y bool 309 features.hasRequiredMainFunction
y bool 310 features.hasSelfOrThisWord
n bool 311 features.hasStatementTerminatorCharacter
y bool 312 features.hasMemberVariables
y bool 313 features.hasStringConcatOperator
n Type 314 versions.2021 A release year and version. Perhaps in the future we could get more specific to month or even day.
y bool 315 features.hasAlgebraicTypes
y bool 316 features.hasTypeParameters
y bool 317 features.hasStaticMethods
y bool 318 features.hasRunTimeGuards
n Type 319 irc Link to official (or popular unofficial) IRC channel(s) for language development.
y bool 320 features.hasTraits
y bool 321 features.hasVirtualFunctions
n Type 322 discord Link to official (or popular unofficial) Discord for language development.
y bool 323 features.letterFirstIdentifiers Must identifiers start with a letter
y bool 324 features.hasReferences
y bool 325 features.hasImplicitTypeConversions
y bool 326 features.hasFirstClassFunctions
y bool 327 features.hasProcessorRegisters
y bool 328 features.hasSourceMaps
y bool 329 features.mergesWhitespace
y bool 330 features.supportsBreakpoints
y bool 331 features.hasMapFunctions
y bool 332 features.hasBinaryOperators
y bool 333 features.hasStatements
n Type 334 versions.2007 A release year and version. Perhaps in the future we could get more specific to month or even day.
n Type 335 versions.2023 A release year and version. Perhaps in the future we could get more specific to month or even day.
n Type 336 versions.2015 A release year and version. Perhaps in the future we could get more specific to month or even day.
n Type 337 versions.2019 A release year and version. Perhaps in the future we could get more specific to month or even day.
n Type 338 versions.2013 A release year and version. Perhaps in the future we could get more specific to month or even day.
y bool 339 features.hasRefinementTypes
y bool 340 features.hasPairs
y bool 341 features.hasValueReturnedFunctions
y bool 342 features.hasClobs
y bool 343 features.hasTriples
y bool 344 features.hasIds
n Type 345 ebook Link to a free eBook about this. Only include if the eBook is of high quality and not spammy.
y bool 346 features.hasExports
y bool 347 features.hasZippers
y bool 348 features.hasMonads
y sep(" ") 349 extensionOf What language is this language an extension of?
n Type 350 zulip Link to official (or popular unofficial) Zulip for language development.
y bool 351 features.hasImplicitArguments
y bool 352 features.hasDynamicTyping
y bool 353 features.hasMethodChaining
Can't render this file because it contains an unexpected character in line 16 and column 7.

@ -1 +0,0 @@
Subproject commit 822784d0ebce101249e38d928ce69033e30455f4

View File

@ -1,437 +0,0 @@
#define IMM_JSON_IMPLEMENTATION
#include "json/imm_json.hh"
#include <cassert>
#include <cctype>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <optional>
#include <span>
#include <utility>
/// Lazy, non-allocating splitting of a string on a single delimiter character.
///
/// `split::iterator` is both the range and its iterator: `begin()` returns a copy
/// of itself and `end()` returns a `split::sentinel`.
namespace split
{
	/// End-of-sequence marker; an iterator compares equal to it once exhausted.
	struct sentinel {};

	/// Input iterator yielding the delimiter-separated pieces of a string.
	///
	/// The iterator only stores views, so the text passed to the constructor must
	/// outlive the iterator and every piece obtained from it.
	///
	/// Empty pieces between two delimiters and before a leading delimiter are
	/// produced. An empty piece after a trailing delimiter is NOT produced, and an
	/// empty text produces no pieces at all.
	struct iterator
	{
		using difference_type = ptrdiff_t;
		using value_type = std::string_view;
		using iterator_category = std::input_iterator_tag;
		using pointer = void;
		// operator* returns by value, so the reference type is the value type.
		using reference = std::string_view;

		/// @param source text to split; anything implicitly convertible to std::string_view
		/// @param delim  character separating the pieces
		explicit iterator(std::string_view source, char delim)
			: source{source}
			, delim{delim}
		{
			++*this; // Compute first cell
		}

		/// The iterator doubles as its own range.
		inline iterator begin() const
		{
			return *this;
		}

		inline sentinel end() const
		{
			return sentinel{};
		}

		/// True once an increment found no text left to consume.
		inline bool operator==(sentinel) const
		{
			return reached_end;
		}

		/// Moves to the next piece, or marks the iterator as exhausted.
		inline iterator& operator++()
		{
			if (source.empty()) {
				// Nothing left; `current` intentionally keeps its last value.
				reached_end = true;
				return *this;
			}

			auto const delimiter_at = source.find(delim);
			if (delimiter_at != std::string_view::npos) {
				current = source.substr(0, delimiter_at);
				source.remove_prefix(delimiter_at + 1);
			} else {
				// Last piece: everything that remains.
				current = source;
				source = {};
			}
			return *this;
		}

		inline iterator operator++(int)
		{
			auto copy = *this;
			++*this;
			return copy;
		}

		/// The piece the iterator currently points at.
		inline std::string_view operator*() const
		{
			return current;
		}

		std::string_view current;  ///< piece returned by operator*
		std::string_view source;   ///< text not yet consumed
		char delim;
		bool reached_end = false;
	};
}
/// One node of a parsed normalization expression.
///
/// A node is either a plain symbol (also used for quoted string arguments) or a
/// call `name(argument)` whose single argument is stored in `sub`.
/// `symbol` is a view, so the parsed text must outlive the node.
struct Expression
{
	enum Type
	{
		Symbol,
		Call,
	};

	Type type;
	std::string_view symbol;
	std::vector<Expression> sub{};

	/// Builds a symbol node.
	Expression(std::string_view symbol)
		: type{Type::Symbol}, symbol{symbol}
	{
	}

	/// Builds a call node `name(arg)`.
	Expression(std::string_view name, Expression &&arg)
		: type{Type::Call}, symbol{name}, sub{std::move(arg)}
	{
	}

	/// Prints symbols quoted and calls as `name(argument)`.
	friend std::ostream& operator<<(std::ostream& os, Expression const& expr)
	{
		if (expr.type == Type::Symbol) {
			os << std::quoted(expr.symbol);
		} else if (expr.type == Type::Call) {
			os << expr.symbol << '(' << expr.sub.front() << ')';
		}
		return os;
	}
};
std::vector<Expression> parse_normalization_expression(std::string_view &source)
{
auto const skip_ws = [&] {
if (auto after_ws = source.find_first_not_of(" \t"); after_ws != std::string_view::npos) {
source.remove_prefix(after_ws);
}
};
std::vector<Expression> sequence;
for (;;) {
std::string_view symbol;
skip_ws();
// FIXME String escaping
if (source.starts_with('"')) {
source.remove_prefix(1);
auto const string_end = source.find('"');
if (string_end == std::string_view::npos) {
std::cerr << "[ERROR] Failed to parse '" << source << "': expected end of string\n";
std::exit(2);
}
std::string_view symbol{source.begin(), string_end};
source.remove_prefix(string_end+1);
skip_ws();
sequence.emplace_back(symbol);
goto next;
}
{
// Don't use islower since it uses locale (slow)
auto const symbol_end = std::find_if_not(source.begin(), source.end(), [](char c) { return c >= 'a' && c <= 'z'; });
if (symbol_end == source.begin()) {
std::cerr << "[ERROR] Failed to parse '" << source << "': expected symbol\n";
std::exit(2);
}
symbol = std::string_view{source.begin(), symbol_end};
source.remove_prefix(symbol.size());
skip_ws();
}
if (source.empty()) {
sequence.emplace_back(symbol);
goto next;
}
if (source.starts_with("(")) {
source.remove_prefix(1);
// FIXME Should separate expression sequence and expression argument parsing
sequence.emplace_back(symbol, std::move(parse_normalization_expression(source).front()));
skip_ws();
if (!source.starts_with(")")) {
std::cerr << "[ERROR] Failed to parse '" << source << "': expected closing bracket\n";
std::exit(2);
}
source.remove_prefix(1);
goto next;
}
next:
skip_ws();
if (source.starts_with('.')) {
source.remove_prefix(1);
continue;
} else {
break;
}
}
return sequence;
}
/// Dynamically typed value flowing through the normalization builtins.
///
/// `type` tells which of the payload members (`boolean`, `string`, `number`,
/// `vector`) is meaningful; the others keep their defaults.
struct Value
{
	enum class Type
	{
		Null,
		Bool,
		Number,
		String,
		Vector,
	};

	Type type = Type::Null;
	bool boolean = false;
	std::string string{};
	double number = 0;
	std::vector<Value> vector{};

	/// Null value.
	explicit Value()
		: type(Type::Null)
	{
	}

	explicit Value(bool b)
		: type(Type::Bool)
		, boolean(b)
	{
	}

	explicit Value(std::string_view s)
		: type(Type::String)
		, string(s)
	{
	}

	/// String value from a C string or string literal.
	///
	/// Without this overload `Value("text")` would select `Value(bool)`, because
	/// pointer-to-bool is a standard conversion and is preferred over the
	/// user-defined conversion to std::string_view. A null pointer yields an empty
	/// string.
	explicit Value(char const* s)
		: type(Type::String)
		, string(s ? s : "")
	{
	}

	explicit Value(double number)
		: type(Type::Number)
		, number(number)
	{
	}

	explicit Value(std::vector<Value> vector)
		: type(Type::Vector)
		, vector(std::move(vector))
	{
	}
};
/// Feeds `value` into `json`, dispatching on the value's dynamic type.
///
/// Scalars are assigned to `json` directly. A vector opens `json.array()` and
/// feeds every element recursively while the returned object is alive.
/// NOTE(review): `json.array()` presumably returns a scope guard that closes the
/// array when destroyed - confirm against imm_json.hh.
///
/// @return `json`, to allow chaining
Json& operator+=(Json& json, Value const& value)
{
	switch (value.type) {
	case Value::Type::Null:
		json = nullptr;
		break;

	case Value::Type::Bool:
		json = value.boolean;
		break;

	case Value::Type::String:
		json = value.string;
		break;

	case Value::Type::Number:
		json = value.number;
		break;

	case Value::Type::Vector: {
		// Keep the array handle alive until every element has been written.
		auto array_scope = json.array();
		for (auto const& item : value.vector) {
			json += item;
		}
		break;
	}
	}
	return json;
}
struct Builtin
{
std::string_view name;
Value(*handler)(Value, std::optional<std::string_view>);
bool accepts_vector = false;
};
using Env = std::vector<Builtin>;
/// Runs `value` through every step of `expressions`, in order.
///
/// Each step's symbol is looked up by name in `env`. When the current value is a
/// vector and the builtin does not accept vectors, the builtin is applied to each
/// element separately; otherwise it is applied to the value as a whole.
///
/// @param expressions steps to apply
/// @param value       initial value
/// @param env         available builtins
/// @return the value after the last step
///
/// An unknown builtin name is reported on stderr and terminates the process with
/// status 1.
Value eval(std::vector<Expression> const& expressions, Value value, Env const& env)
{
	for (auto const& expr : expressions) {
		// Capture by reference: copying the expression would also copy its sub-expressions.
		auto const builtin = std::find_if(env.begin(), env.end(),
			[&expr](Builtin const& candidate) { return candidate.name == expr.symbol; });

		if (builtin == env.end()) {
			std::cerr << "[ERROR] Unknown builtin: " << expr.symbol << '\n';
			std::exit(1);
		}

		// Invokes the builtin on one value, passing the call argument if there is one.
		auto const apply = [&](Value input) -> Value {
			if (expr.type == Expression::Call) {
				assert(expr.sub.size() == 1);
				assert(expr.sub.front().type == Expression::Symbol);
				return builtin->handler(std::move(input), expr.sub.front().symbol);
			}
			return builtin->handler(std::move(input), std::nullopt);
		};

		if (!builtin->accepts_vector && value.type == Value::Type::Vector) {
			for (auto& element : value.vector) {
				element = apply(std::move(element));
			}
		} else {
			value = apply(std::move(value));
		}
	}
	return value;
}
/// An output column: its name plus the parsed expression used to normalize
/// every cell of that column.
///
/// Both views refer to the text passed to the constructor, which therefore has
/// to outlive the Column.
struct Column
{
	std::string_view name;
	std::string_view expression_source;   ///< the expression exactly as written
	std::vector<Expression> expression;   ///< the parsed steps

	/// @param name                     column name
	/// @param normalization_expression expression text; parsed immediately
	Column(std::string_view name, std::string_view normalization_expression)
		: name(name)
		, expression_source(normalization_expression)
		// Members are initialized in declaration order, so the source was saved
		// above before the parser consumes the local view.
		, expression(parse_normalization_expression(normalization_expression))
	{
	}

	/// Applies this column's expression to the raw cell text.
	inline Value normalize(std::string_view source, Env const& env)
	{
		Value raw(source);
		return eval(expression, std::move(raw), env);
	}
};
/// Entry point.
///
/// Reads the column specification from the file named by the only argument
/// (rows of `Keep<TAB>Type<TAB>Index<TAB>Column...`; only rows whose first field
/// starts with "y" are used), then reads TSV rows from standard input, skips the
/// header row and feeds every cell, normalized by its column's expression, into
/// a Json object.
///
/// @return 0 on success, 1 on usage or input errors
int main(int argc, char** argv)
{
	if (argc != 2) {
		std::cerr << "usage: " << argv[0] << " <columns.tsv>\n";
		std::cerr << " convert tsv file from TSV using definitions from columns.tsv\n";
		// Without this return argv[1] would be used even though it was not supplied.
		return 1;
	}

	std::ifstream columns_file(argv[1]);
	if (!columns_file) {
		std::cerr << "[ERROR] Cannot open " << argv[1] << '\n';
		return 1;
	}

	// Declared before `columns`, whose entries hold views into this text.
	std::string const source{std::istreambuf_iterator<char>(columns_file), {}};

	if (auto it = std::next(split::iterator(source, '\t'), 1); it == split::sentinel{} || *it != "Type") {
		std::cerr << "[ERROR] Expected Type description in column 2\n";
		return 1;
	}

	std::vector<Column> columns;
	for (std::string_view line : split::iterator(source, '\n')) {
		auto field = split::iterator(line, '\t');

		// Field 1: keep marker. The header row does not start with "y" and is skipped here too.
		if (field == split::sentinel{} || !(*field++).starts_with("y")) { continue; }
		if (field == split::sentinel{}) { continue; }

		// Field 2: normalization expression.
		auto const type = *field++;
		if (field == split::sentinel{}) { continue; }

		// Field 3: column index, not needed here.
		++field;
		if (field == split::sentinel{}) { continue; }

		// Field 4: column name.
		auto const name = *field;
		columns.emplace_back(name, type);
	}

	Env const env = {
		Builtin { "lower", +[](Value v, std::optional<std::string_view>) -> Value {
			assert(v.type == Value::Type::String);
			// FIXME Proper UTF-8 lowercase
			// However, manual inspection of used TSV files proved that there aren't any non-ascii uppercase letters
			for (char &c : v.string) {
				if (c >= 'A' && c <= 'Z') {
					c = c - 'A' + 'a';
				}
			}
			return v;
		}},

		Builtin { "int", +[](Value v, std::optional<std::string_view>) -> Value {
			assert(v.type == Value::Type::String);
			if (v.string.empty()) {
				return Value{};
			}
			long long int n;
			// Parses the leading integer; text that does not start with one becomes Null.
			auto const parsed = std::from_chars(v.string.data(), v.string.data() + v.string.size(), n);
			if (parsed.ec != std::errc{}) {
				return Value{};
			}
			return Value(double(n));
		}},

		Builtin { "bool", +[](Value v, std::optional<std::string_view>) -> Value {
			assert(v.type == Value::Type::String);
			// NOTE(review): this yields true for an EMPTY cell, which looks inverted
			// for a "bool" column - confirm the intended meaning before relying on it.
			return Value(v.string.empty());
		}},

		Builtin { "str", +[](Value v, std::optional<std::string_view>) -> Value {
			assert(v.type == Value::Type::String);
			return v;
		}},

		Builtin { "sep", +[](Value v, std::optional<std::string_view> by) -> Value {
			assert(by && "sep requires parameter by which it can split");
			assert(v.type == Value::Type::String && "only string can be splitted");

			std::vector<Value> separated;
			std::string_view rest = v.string;

			// An empty separator matches at offset 0 without consuming anything,
			// which would never terminate; treat it as "nothing to split on".
			if (!by->empty()) {
				for (auto at = rest.find(*by); at != std::string_view::npos; at = rest.find(*by)) {
					separated.emplace_back(rest.substr(0, at));
					rest.remove_prefix(at + by->size());
				}
			}

			// The piece after the last separator is kept only when it is not empty.
			if (!rest.empty()) {
				separated.emplace_back(rest);
			}
			return Value(std::move(separated));
		}},

		Builtin {
			.name = "unless",
			.handler = +[](Value v, std::optional<std::string_view> needle) -> Value {
				assert(v.type == Value::Type::String);
				assert(needle && "Unless requires string to search for");
				if (v.string.find(*needle) == std::string::npos)
					return v;
				// Explicit string_view: a bare "" literal would convert to bool and
				// produce a Bool value instead of an empty string.
				return Value(std::string_view{});
			},
			.accepts_vector = false
		}
	};

	{
		Json json;
		auto _array = json.array();

		// The first input row is the header; it carries no data.
		std::string line;
		std::getline(std::cin, line);

		while (std::getline(std::cin, line)) {
			auto _object = json.object();
			auto tsv = split::iterator(line, '\t');
			for (auto column = 0u; tsv != split::sentinel{}; ++column, ++tsv) {
				if (column >= columns.size()) {
					std::cerr << "[ERROR] Input row has more fields than the " << columns.size() << " selected columns\n";
					return 1;
				}
				json.key(columns[column].name) += columns[column].normalize(*tsv, env);
			}
			// NOTE(review): this stops after the first data row, which looks like a
			// debugging leftover - confirm before removing.
			break;
		}
	}
	return 0;
}