From 929db9812c693262884b51ac8f37a2d282666e24 Mon Sep 17 00:00:00 2001 From: Robert Bendun Date: Sat, 18 Mar 2023 16:42:19 +0100 Subject: [PATCH] Download data and columns info; change format from csv to tsv --- .gitignore | 4 + Makefile | 25 ++++ checksums.sha256 | 1 + columns.user.tsv | 354 +++++++++++++++++++++++++++++++++++++++++++++++ csv2tsv.go | 32 +++++ 5 files changed, 416 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 checksums.sha256 create mode 100644 columns.user.tsv create mode 100644 csv2tsv.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0bc39a1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +languages.*.tsv +columns.pruned.tsv +columns.original.tsv +*.csv diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e8473f6 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +# # Prune poorly filled or uninteresting columns +# languages.pruned1.tsv: languages.original.tsv +# cut --complement -f1,16-18,22,23,39,40,50,52,53,58-61,64-68,70-75,79,82,84-86,100,103-104,107-109,112,114,116-118,121-136,142,149,150,157,162,163,163,164,165,176,177,185,186,195, $< >$@ + +# Allow user to mark in which columns is interested +columns.user.tsv: columns.pruned.tsv + awk 'NR==1{ printf("Keep\t"); print } NR>1{printf("n\t"); print}' $< >$@ + +# Prune columns that are not needed to create specification +columns.pruned.tsv: columns.original.tsv + cut --complement -f3,4,5,7,8,9,10 $< >$@ + +# Change data to TSV format since it is easier to process using standard UNIX tools +%.tsv: %.csv + go run ./csv2tsv.go <$< >$@ + +# Check while downloading that file is as expected. +# Otherwise automatic filter mechanism wouldn't work. 
+# If hashes differ then user of this repo must migrate columns.user.tsv to a new format +columns.original.csv: + wget 'https://pldb.com/columns.csv' -O $@ + sha256sum -c checksums.sha256 + +languages.original.csv: columns.original.csv + wget 'https://pldb.com/languages.csv' -O $@ diff --git a/checksums.sha256 b/checksums.sha256 new file mode 100644 index 0000000..89373f5 --- /dev/null +++ b/checksums.sha256 @@ -0,0 +1 @@ +27afba98520ff7c408492cab9ea8789db223ae8c134cb0cd6dfbebf7c7132100 columns.original.csv diff --git a/columns.user.tsv b/columns.user.tsv new file mode 100644 index 0000000..b2837d4 --- /dev/null +++ b/columns.user.tsv @@ -0,0 +1,354 @@ +Keep Index Column Description +y 1 title The official title of the language +y 2 appeared What year was the language publicly released and/or announced? +y 3 type Which category in PLDB's subjective ontology does this entity fit into. +y 4 pldbId computed +y 5 rank computed +y 6 languageRank computed +y 7 factCount computed +y 8 lastActivity computed +y 9 exampleCount computed +y 10 bookCount computed +y 11 paperCount computed +y 12 numberOfUsers computed +y 13 numberOfJobs computed +y 14 githubBigQuery.repos How many repos for this language are listed in Google's BigQuery Public GitHub Dataset snapshot. +y 15 creators Name(s) of the original creators of the language delimited by " and " +y 16 githubRepo URL of the official GitHub repo for the project if it hosted there. +y 17 website URL of the official homepage for the language project. +y 18 wikipedia URL of the entity on Wikipedia, if and only if it has a page dedicated to it. +y 19 originCommunity In what community(ies) did the language first originate? +y 20 country What country was the language first developed in? +y 21 centralPackageRepositoryCount If you've searched for a CPM for this language and can't find one, set 0 as the count. +n 22 reference A link to more info about this entity. 
You can add raw links and then auto "upgrade" them using some of the importer code. +n 23 hopl The matching language on Diarmuid Pigott's Online Historical Encyclopaedia of Programming Languages site (https://hopl.info/) +n 24 wikipedia.dailyPageViews How many page views per day does this Wikipedia page get? Useful as a signal for rankings. Available via WP api. +y 25 wikipedia.backlinksCount How many pages on WP link to this page? +y 26 wikipedia.summary What is the text summary of the language from the Wikipedia page? +y 27 wikipedia.pageId Waht is the internal ID for this entity on WP? +y 28 wikipedia.appeared When does Wikipedia claim this entity first appeared? +y 29 wikipedia.created When was the *Wikipedia page* for this entity created? +y 30 wikipedia.revisionCount How many revisions does this page have? +n 31 wikipedia.related What languages does Wikipedia have as related? +y 32 fileType What is the file encoding for programs in this language? +n 33 isbndb Books about this language from ISBNdb. +y 34 githubRepo.stars How many stars of the repo? +y 35 githubRepo.forks How many forks of the repo? +y 36 githubRepo.updated What year was the last commit made? +y 37 githubRepo.subscribers How many subscribers to the repo? +y 38 githubRepo.created When was the *Github repo* for this entity created? +y 39 githubRepo.description Description of the repo on GitHub. +n 40 description Description of the repo on GitHub. +n 41 githubRepo.issues How many isses on the repo? +n 42 domainName If the project website is on its own domain. +y 43 githubRepo.firstCommit What year the first commit made in this git repo? +n 44 semanticScholar Papers about this language from Semantic Scholar. +y 45 features.hasComments +n 46 domainName.registered When was this domain first registered? +n 47 isOpenSource Is it an open source project? 
+y 48 features.hasSemanticIndentation +y 49 features.hasLineComments +n 50 githubLanguage GitHub has a set of supported languages as defined here: https://raw.githubusercontent.com/github/linguist/master/lib/linguist/languages.yml +n 51 githubLanguage.tm_scope The TextMate scope that represents this programming language. This should match one of the scopes listed in the grammars.yml file. Use "none" if there is no grammar for this language. +n 52 githubLanguage.type Either data, programming, markup, prose, or nil. +n 53 githubLanguage.ace_mode A String name of the Ace Mode used for highlighting whenever a file is edited. This must match one of the filenames in http://git.io/3XO_Cg. Use "text" if a mode does not exist. +n 54 githubLanguage.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically). +n 55 numberOfRepos computed +n 56 githubLanguage.repos How many repos for this language does GitHub report? +n 57 lineCommentToken Defined as a token that can be placed anywhere on a line and starts a comment that cannot be stopped except by a line break character or end of file. +n 58 githubLanguage.trendingProjectsCount How many trending repos for this language does GitHub report? +n 59 domainName.awisRank.2022 +n 60 leachim6.filepath +n 61 leachim6 A link to this language in leachim6's hello-world project. +n 62 githubBigQuery Google BigQuery Public Datasets has a dataset with info on GitHub repos: https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code +n 63 githubBigQuery.users +n 64 linguistGrammarRepo Linguist is a library used by GitHub to syntax highlight files on GitHub via a grammar. The list of languages supported by Linguist and the grammar package used for each language is listed here: https://github.com/github/linguist/blob/master/vendor/README.md. If Linguist has support for a language, it will have a repo on GitHub. 
Given a language is supported by Linguist, that is a good indication it has at least 200 unique :user/:repo repositories, according to their docs. +n 65 linguistGrammarRepo.commitCount How many commits in this repo? +n 66 linguistGrammarRepo.committerCount How many people have made commits in this repo? +n 67 linguistGrammarRepo.lastCommit What year was the last commit made? +n 68 linguistGrammarRepo.firstCommit What year was the first commit made? +n 69 wordRank Some creators use a common English word as their language's name. For these we note how common the word is, where "the" is 1. +n 70 leachim6.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically). +n 71 linguistGrammarRepo.sampleCount How many language samples in this repo? +y 72 features.hasStrings +n 73 pygmentsHighlighter.filename +n 74 pygmentsHighlighter A link to a Pygments syntax highlighting class for this language (https://pygments.org/) +n 75 standsFor If the language name is an acronym what does/did it stand for? +n 76 stringToken What token(s) is used to delimite a string? +n 77 documentation Link to the official documentation for a language. +n 78 rosettaCode A link to this language on Rosetta Code - http://www.rosettacode.org/ +n 79 pygmentsHighlighter.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically). +y 80 features.hasPrintDebugging +n 81 printToken What token(s) is used to print a message? +n 82 twitter Official Twitter handle of the entity, if any. +y 83 features.hasMultiLineComments +n 84 rijuRepl A link to try this language on riju.codes +n 85 githubLanguage.codemirror_mime_type A String name of the file mime type used for highlighting whenever a file is edited. 
This should match the `mime` associated with the mode from https://git.io/f4SoQ +n 86 githubLanguage.codemirror_mode A String name of the CodeMirror Mode used for highlighting whenever a file is edited. This must match a mode from https://git.io/vi9Fx +y 87 fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically). +n 88 tiobe Tiobe maintains a well known ranking of programming languages here: https://www.tiobe.com/tiobe-index/ +y 89 related What languages are related? This serves as a catch all, and it is better to use a more specific relationship node such as "supersetOf". +n 90 multiLineCommentTokens A comment with a start delimiter and end token (which can be the same) that can span multiple lines. +n 91 aka Another name for the language. Entries can have multiple aka lines. +y 92 features.hasIntegers +n 93 helloWorldCollection Hello world written in this language from http://helloworldcollection.de/ +n 94 githubLanguage.aliases An Array of additional aliases (implicitly includes name.downcase). +y 95 features.hasFloats +n 96 tryItOnline A link to try this language on https://tio.run +y 97 writtenIn What language(s) is the main implementation written in? +y 98 features.hasBooleans +n 99 keywords What are all the keywords in this language? +n 100 indeedJobs How many job descriptions match this query for this language on indeed.com? +n 101 wikipedia.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically). +y 102 features.hasHexadecimals +n 103 projectEuler.memberCount.2022 +n 104 projectEuler Is this language one of the ones listed on https://projecteuler.net/? +n 105 booleanTokens What token(s) is used for true and false? +n 106 visualParadigm Is this a visual programming thing? Sometimes called "no code" or "low code"? 
+n 107 domainName.awisRank.2017 +n 108 projectEuler.memberCount.2019 +n 109 webRepl An online repl for the project. +n 110 subreddit.memberCount.2022 +n 111 subreddit Url of a subreddit(s) for this language. +n 112 codeMirror A link to a CodeMirror syntax highlighting package for this language (https://github.com/codemirror/codemirror5/tree/master/mode/LANGUAGE) +y 113 features.hasCaseInsensitiveIdentifiers +n 114 monaco A link to a Monaco syntax highlighting package for this language. +y 115 features.hasConditionals +n 116 jupyterKernel A link to a Jupyter Kernel for this language. +n 117 githubLanguage.interpreters An Array of associated interpreters +n 118 quineRelay The Quine Relay project (https://github.com/mame/quine-relay). +n 119 compilesTo Which language(s) does this language primarily compile to? +n 120 ubuntuPackage The name of an Ubuntu package for the language from https://packages.ubuntu.com/. +n 121 indeedJobs.2022 +n 122 packageRepository URL to the package repository for this language. +n 123 antlr A link to the ANTLR grammar for this language (https://github.com/antlr/grammars-v4/tree/master/LANGUAGE) +n 124 officialBlogUrl URL to the official blog for this language. +n 125 meetup.groupCount +n 126 meetup.memberCount +n 127 meetup Some languages have active meetup groups on Meetup.com +n 128 linkedInSkill.2018 +n 129 linkedInSkill How many people list this skill on LinkedIn? +n 130 languageServerProtocolProject A link to a project implementing LSP for this language. +n 131 githubLanguage.filenames Filenames commonly associated with the language. +y 132 features.hasOctals +n 133 releaseNotesUrl URL to the release notes for this language. +n 134 languageServerProtocolProject.writtenIn What language(s) is the main implementation written in? +y 135 features.hasAssignment +n 136 faqPageUrl URL to the frequently asked questions for this language. +n 137 tiobe.currentRank What is the current Tiobe rank of this language? 
+y 138 features.hasWhileLoops +n 139 forLanguages Which languages is this repository for? +n 140 packageCount How many packages are in the repository? A package is some code with a name and a namespace, shipped as an atomic unit, with an owner(s). +y 141 supersetOf Is this language a superset of another? If you specify this link then the superset language will inherit all features of subset language. +n 142 indeedJobs.2017 +y 143 features.hasBinaryNumbers +y 144 influencedBy What languages influenced this one? +y 145 features.hasOperatorOverloading +y 146 features.hasImports +y 147 features.hasFunctions +n 148 githubLanguage.group Name of the parent language. Languages in a group are counted in the statistics as the parent language. +n 149 rijuRepl.description Description of the repo on GitHub. +n 150 subreddit.memberCount.2017 +n 151 rijuRepl.gitRepo URL of the official git repo for the language project if not hosted on GitHub or GitLab or Sourcehut. +n 152 rijuRepl.fileExtensions An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically). +n 153 stackOverflowSurvey.2021.percentageUsing What percentage of survey respondents report using this language? +n 154 stackOverflowSurvey.2021.fans How many developers reported wanting to learn this language. +n 155 stackOverflowSurvey.2021.medianSalary Median salary reported by developers using this language. +n 156 stackOverflowSurvey.2021.users How many developers reported using this language. +n 157 downloadPageUrl URL to the download page for this language. +n 158 assignmentToken What token(s) is used for assignment to an identifier? +n 159 compilerExplorer This language's name on https://godbolt.org +y 160 features.hasMacros +y 161 features.hasClasses +n 162 replit A link to try this language on replit.com +n 163 rijuRepl.website URL of the official homepage for the language project. 
+n 164 pypl This language's id on https://pypl.github.io +n 165 emailList Link to the mailing list for a language. +y 166 features.hasTypeInference +y 167 features.isCaseSensitive +y 168 features.hasSwitch +y 169 features.hasConstants +y 170 features.hasGarbageCollection +n 171 spec Link to the official spec for a language. +y 172 features.hasExceptions +y 173 features.hasPointers +y 174 features.hasDirectives +y 175 features.hasAccessModifiers +n 176 eventsPageUrl URL to the events pages of this language. +n 177 cheatSheetUrl A link to a cheat sheet for this language. +y 178 features.hasLists +y 179 features.hasInheritance +n 180 esolang A link to this language on https://esolangs.org/ +n 181 features.hasMultipleInheritance +y 182 features.hasConstructors +n 183 nativeLanguage Nearly all programming languages are written in English, but some aren't. Set this field for the ones that are not. +y 184 features.hasRegularExpressionsSyntaxSugar +n 185 screenshot For visual languages, a picture is worth a thousand words. Provide the URL to the screenshot in the form: https://pldb.com/screenshots/[pldbId].png +n 186 githubLanguage.wrap Boolean wrap to enable line wrapping (default: false) +y 187 features.isLisp Is this in the Lisp family of languages? +y 188 features.hasTernaryOperators +y 189 features.hasScientificNotation +n 190 versions.2022 A release year and version. Perhaps in the future we could get more specific to month or even day. +y 191 features.hasMessagePassing +n 192 gdbSupport Is the language supported by the GNU Debugger? +y 193 features.hasEnums +n 194 announcementMethod How was the language first announced? +n 195 gitlabRepo URL of the official GitLab repo for the language project. +n 196 demoVideo Provide a url of a demo video of the language. +n 197 isPublicDomain Is it public domain? +y 198 features.hasMultilineStrings +y 199 features.hasVariableSubstitutionSyntax Do you use different syntax when assigning versus referencing a variable? 
+y 200 subsetOf Is this language a subset of another? +n 201 firstAnnouncement A url announcing the creation or release of a new language +n 202 packageInstallCount How many packages have been downloaded? +y 203 features.canWriteToDisk +y 204 features.hasBitWiseOperators +y 205 features.hasZeroBasedNumbering +n 206 oldName What is the old name of this language? +y 207 features.hasStaticTyping +y 208 features.hasUnitsOfMeasure +y 209 features.hasIncrementAndDecrementOperators +y 210 features.hasSingleDispatch +y 211 features.hasHomoiconicity +n 212 runsOnVm What virtual machine(s) does this language run on? +y 213 features.hasHereDocs +y 214 features.hasFixedPoint +y 215 features.hasNamespaces +y 216 features.hasThreads +y 217 features.hasModules +n 218 gitRepo URL of the official git repo for the language project if not hosted on GitHub or GitLab or Sourcehut. +y 219 features.hasPatternMatching +y 220 features.hasGotos +n 221 annualReportsUrl URL to the annual reports for this language. +y 222 features.hasFunctionComposition +y 223 features.hasFunctionOverloading +y 224 features.hasAsyncAwait +y 225 features.hasIterators +y 226 features.hasExplicitTypeCasting +y 227 features.hasStructs +y 228 features.hasMultipleDispatch +y 229 features.hasInterfaces +y 230 features.hasGenerics +y 231 features.hasForEachLoops +y 232 features.hasMaps +y 233 features.hasPipes +y 234 features.hasMixins +y 235 features.canDoShebang +y 236 features.hasVariadicFunctions +y 237 features.hasManualMemoryManagement +y 238 features.hasTemplates +y 239 features.hasInfixNotation +y 240 features.hasPolymorphism +y 241 features.hasPartialApplication +y 242 features.hasAssertStatements +n 243 sourcehutRepo URL of the official sourcehut repo for the project. +y 244 features.hasForLoops +n 245 renamedTo What is the new name of this language? +y 246 features.hasDocComments Is there a standard mini language written in comments for documenting code? 
+y 247 features.hasUnicodeIdentifiers +y 248 features.hasDependentTypes +n 249 conference Some languages have a recurring conference(s) focused on that specific language. +y 250 features.hasDuckTyping +y 251 features.hasDefaultParameters +y 252 features.hasAnonymousFunctions +y 253 features.hasMagicGettersAndSetters +n 254 packageAuthors How many people contribute packages to this cpm? +n 255 photo For notations, a picture is worth a thousand words. Provide a photo in the form: https://pldb.com/photos/[pldbId].png +n 256 successorOf Was this language launched as the successor of another? +y 257 features.hasBuiltInRegex +y 258 features.hasNull +y 259 features.hasUnaryOperators +y 260 features.hasUserDefinedOperators +y 261 features.hasBreak +y 262 features.hasContinue +n 263 includeToken What token(s) is used for including another file? +y 264 features.hasUnionTypes +y 265 features.hasSingleTypeArrays Has an array data structure that only can hold items of the same type. +y 266 features.hasTypedHoles +y 267 features.hasReservedWords Does a concept of reserved words exists? For example, not being able to use certain keywords as variable names. +y 268 features.hasRangeOperators +y 269 features.hasDisposeBlocks +y 270 features.hasSymbolTables +y 271 features.hasDestructuring +y 272 features.hasGenerators +y 273 features.hasDynamicProperties +y 274 features.hasExpressions +y 275 forkOf What language is this language a fork of? +n 276 inputLanguages Which language(s) does this take as input? For compilers, what languages does this compile compile? +n 277 redditDiscussion A link to a related discussion on reddit. +y 278 features.hasTryCatch +y 279 features.hasEscapeCharacters +y 280 usesSemanticVersioning Does the official release of the language use semantic versioning? 
+y 281 features.hasPostfixNotation +y 282 features.hasPrefixNotation +y 283 features.hasStreams +y 284 features.hasLazyEvaluation +y 285 features.hasCharacters +y 286 funFact A text or code block containing a fun or unusual fact about the language. +y 287 features.hasSets +y 288 features.hasMethods +y 289 features.hasAbstractTypes +y 290 isDead Has the creator or maintainer announced it officially dead? Include a link to proof of the announcement. +y 291 features.canUseQuestionMarksAsPartOfIdentifier +y 292 features.hasTypeAnnotations +y 293 features.hasSymbols +y 294 features.hasDecimals +y 295 features.hasBlobs +y 296 features.hasSExpressions +y 297 features.hasLabels +y 298 features.hasIfElses +y 299 features.hasIfs +y 300 features.hasBoundedCheckedArrays +y 301 features.hasArraySlicingSyntax +y 302 features.hasTimestamps +y 303 features.hasMethodOverloading +y 304 features.hasVoidFunctions +y 305 features.hasGlobalScope +y 306 features.hasFnArguments +y 307 features.canReadCommandLineArgs +y 308 features.hasDynamicSizedArrays +y 309 features.hasRequiredMainFunction +y 310 features.hasSelfOrThisWord +n 311 features.hasStatementTerminatorCharacter +y 312 features.hasMemberVariables +y 313 features.hasStringConcatOperator +n 314 versions.2021 A release year and version. Perhaps in the future we could get more specific to month or even day. +y 315 features.hasAlgebraicTypes +y 316 features.hasTypeParameters +y 317 features.hasStaticMethods +y 318 features.hasRunTimeGuards +n 319 irc Link to official (or popular unofficial) IRC channel(s) for language development. +y 320 features.hasTraits +y 321 features.hasVirtualFunctions +n 322 discord Link to official (or popular unofficial) Discord for language development. 
+y 323 features.letterFirstIdentifiers Must identifiers start with a letter +y 324 features.hasReferences +y 325 features.hasImplicitTypeConversions +y 326 features.hasFirstClassFunctions +y 327 features.hasProcessorRegisters +y 328 features.hasSourceMaps +y 329 features.mergesWhitespace +y 330 features.supportsBreakpoints +y 331 features.hasMapFunctions +y 332 features.hasBinaryOperators +y 333 features.hasStatements +n 334 versions.2007 A release year and version. Perhaps in the future we could get more specific to month or even day. +n 335 versions.2023 A release year and version. Perhaps in the future we could get more specific to month or even day. +n 336 versions.2015 A release year and version. Perhaps in the future we could get more specific to month or even day. +n 337 versions.2019 A release year and version. Perhaps in the future we could get more specific to month or even day. +n 338 versions.2013 A release year and version. Perhaps in the future we could get more specific to month or even day. +y 339 features.hasRefinementTypes +y 340 features.hasPairs +y 341 features.hasValueReturnedFunctions +y 342 features.hasClobs +y 343 features.hasTriples +y 344 features.hasIds +n 345 ebook Link to a free eBook about this. Only include if the eBook is of high quality and not spammy. +y 346 features.hasExports +y 347 features.hasZippers +y 348 features.hasMonads +y 349 extensionOf What language is this language an extension of? +n 350 zulip Link to official (or popular unofficial) Zulip for language development. 
// main converts CSV records read from standard input into tab-separated
// lines on standard output.
//
// Records may contain any number of fields. Fields are written verbatim, so
// a tab or newline embedded in a field is NOT escaped in the output.
//
// Error handling is best effort for malformed input and strict for
// unreadable input:
//   - a CSV syntax error is reported on standard error, any fields the
//     reader handed back for that record are still written, and conversion
//     continues with the next record;
//   - any other read error is reported and the program exits with status 1,
//     so that a calling Makefile rule fails instead of hanging.
func main() {
	reader := csv.NewReader(os.Stdin)
	// Accept rows of any width instead of enforcing the width of the first row.
	reader.FieldsPerRecord = -1
	// Safe because every record is fully printed before the next Read call.
	reader.ReuseRecord = true

	for lineno := 0; ; lineno++ {
		record, err := reader.Read()
		if err == io.EOF {
			return
		}
		if err != nil {
			fmt.Fprintf(os.Stderr, "[WARNING] Couldn't read line %d: %v\n", lineno, err)
			if _, malformed := err.(*csv.ParseError); !malformed {
				// Not a syntax problem: the input itself cannot be read.
				// Retrying would return the same error on every iteration
				// and never reach io.EOF, so stop here.
				os.Exit(1)
			}
			if len(record) == 0 {
				// Nothing was recovered from the malformed record; do not
				// inject an empty row into the TSV output.
				continue
			}
		}
		for i, entry := range record {
			if i != 0 {
				fmt.Print("\t")
			}
			fmt.Print(entry)
		}
		fmt.Println()
	}
}