From 9eaa7ff32041c66c3eb4aff71b9e0497795b2371 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 22 Feb 2020 12:35:19 +0100
Subject: [PATCH] Update README on headers

---
 README.md | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index a0d71cf..f285e3d 100644
--- a/README.md
+++ b/README.md
@@ -480,6 +480,32 @@ So now you can see that the accuracy is over 78% and the likelihood
 in<1>:Brytania     2 0.53333333 0.01357876718525224600
 in<1>:rewolucja    2 0.53333333 0.01357876718525224600
 
+## Handling headers
+
+When dealing with TSV files, you often face the dilemma of whether to
+add a header with field names as the first line of the file:
+
+* a header makes a TSV file more readable to humans, especially when
+  you use tools like [Visidata](https://www.visidata.org/) and when
+  there are a lot of input columns (features),
+* … but, on the other hand, it makes the file much more cumbersome to
+  process with text utilities (`cat`, `sort`, `shuf`, etc.) or similar tools.
+
+GEval can handle TSV files both _with_ and _without_ headers. By
+default, headerless TSV files are assumed, but you can specify column
+names for the input and output/expected files with the `--in-header
+in-header.tsv` and `--out-header out-header.tsv` options, respectively.
+
+A header file (`in-header.tsv` or `out-header.tsv`) should be a
+one-line TSV file with the column names. (Why a separate file? Because
+then you can easily combine it with the data, for instance with
+`cat in-header.tsv dev-0/in.tsv`.)
+
+GEval will then work as follows (see the example below):
+
+* when reading a file, it first checks whether the first field of the
+  first line is the same as the first column name; if so, it assumes
+  the given TSV file contains a header line (just make sure the column
+  name is specific enough not to be mistaken for actual data!),
+* otherwise, it assumes the file is headerless,
+* either way, the column names are used in human-readable output, for
+  instance when listing the worst features.
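+
+For example, you could prepare the header files and run GEval along
+the following lines (the column names `text` and `label` are just an
+illustration here, not anything GEval expects; use whatever fits your
+data):
+
+```bash
+# one-line header files (for multiple columns, separate the names with tabs)
+printf 'text\n' > in-header.tsv
+printf 'label\n' > out-header.tsv
+
+# a header file is just a TSV line, so it can be glued to the data
+cat in-header.tsv dev-0/in.tsv | head
+
+# evaluate the dev-0 set, with column names taken from the header files
+geval --test-name dev-0 --in-header in-header.tsv --out-header out-header.tsv
+```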
 
 ## Preparing a Gonito challenge
 
@@ -497,6 +523,8 @@ have the following structure:
   metric is specified here (e.g. `--metric BLEU`), also non-default
   file names could be given here (e.g. `--test-name test-B` for a
   non-standard test subdirectory)
+* `in-header.tsv` — one-line TSV file with column names for the input data (features),
+* `out-header.tsv` — one-line TSV file with column names for the output/expected data (usually just one column with labels),
 * `train/` — subdirectory with training data (if training data are
   supplied for a given Gonito challenge at all)
 * `train/in.tsv` — the input data for the training set
@@ -556,6 +584,8 @@
 all: $(output_directory)/train/in.tsv.xz $(output_directory)/train/expected.tsv \
 	$(output_directory)/dev-0/in.tsv.xz $(output_directory)/dev-0/expected.tsv \
 	$(output_directory)/test-A/in.tsv.xz $(output_directory)/test-A/expected.tsv \
-	$(output_directory)/README.md
+	$(output_directory)/README.md \
+	$(output_directory)/in-header.tsv \
+	$(output_directory)/out-header.tsv
 # always validate the challenge
 	geval --validate --expected-directory $(output_directory)
@@ -563,7 +593,14 @@
 # is kept as challenge-readme.md in the repo with this Makefile;
 # note that the title from README.md will be taken as the title of the challenge
 # and the first paragraph — as a short description
-$(output_directory)/README.md: challenge-readme.md
+$(output_directory)/README.md: challenge-readme.md $(output_directory)/config.txt
+	cp $< $@
+
+# prepare the header files (see the section on headers above)
+$(output_directory)/in-header.tsv: in-header.tsv $(output_directory)/config.txt
+	cp $< $@
+
+$(output_directory)/out-header.tsv: out-header.tsv $(output_directory)/config.txt
 	cp $< $@
 
 $(output_directory)/config.txt:
@@ -573,6 +610,7 @@ $(output_directory)/config.txt:
 # ... but we remove the `in/expected.tsv` files just in case
 # (we will overwrite them with our data anyway)
 	rm -f $(output_directory)/{train,dev-0,test-A}/{in,expected}.tsv
+	rm -f $(output_directory)/{README.md,in-header.tsv,out-header.tsv}
 
 # a "total" TSV containing all the data, we'll split it later
 all-data.tsv.xz: prepare.py some-other-files
@@ -590,7 +628,7 @@ all-data.tsv.xz: prepare.py some-other-files
 # that you created a script `filter.py` that takes as an argument a regular expression that will be applied
 # to the MD5 sum (written in the hexadecimal format).
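+#
+# (For illustration only: the real filter.py is up to you, but a minimal
+#  version, assuming the MD5 sum is computed over the whole input line,
+#  could look like this in plain Bash:
+#
+#      regex="$1"
+#      while IFS= read -r line; do
+#          md5=$(printf '%s' "$line" | md5sum | cut -d' ' -f1)
+#          [[ $md5 =~ $regex ]] && printf '%s\n' "$line"
+#      done
+#
+#  any stable scheme will do, as long as a given item always lands in the
+#  same split.)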
 
-$(output_directory)/train/in.tsv.xz $(output_directory)/train/expected.tsv: all-data.tsv.xz filter.py config.txt
+$(output_directory)/train/in.tsv.xz $(output_directory)/train/expected.tsv: all-data.tsv.xz filter.py $(output_directory)/config.txt
 # 1. xzcat for decompression
 # 2. ./filter.py will select 14/16 = 7/8 of the items in a stable random manner
 # 3. tee >(...) is Bash magic to fork the output into two streams
 # 4. cut -f 1 will select the first column, cut -f 2- everything but the first column
 # 5. xz will compress it back
 	xzcat $< | ./filter.py '[0-9abcd]$' | tee >(cut -f 1 > $(output_directory)/train/expected.tsv) | cut -f 2- | xz > $@
 
-$(output_directory)/dev-0/in.tsv.xz $(output_directory)/dev-0/expected.tsv: all-data.tsv.xz filter.py config.txt
+$(output_directory)/dev-0/in.tsv.xz $(output_directory)/dev-0/expected.tsv: all-data.tsv.xz filter.py $(output_directory)/config.txt
 # 1/16 of the items goes to the dev-0 set
 	xzcat $< | ./filter.py 'e$' | tee >(cut -f 1 > $(output_directory)/dev-0/expected.tsv) | cut -f 2- | xz > $@
 
-$(output_directory)/test-A/in.tsv.xz $(output_directory)/test-A/expected.tsv: all-data.tsv.xz filter.py config.txt
+$(output_directory)/test-A/in.tsv.xz $(output_directory)/test-A/expected.tsv: all-data.tsv.xz filter.py $(output_directory)/config.txt
 # (the other) 1/16 of the items goes to the test-A set
 	xzcat $< | ./filter.py 'f$' | tee >(cut -f 1 > $(output_directory)/test-A/expected.tsv) | cut -f 2- | xz > $@
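+
+# (Note: the >(...) process substitution requires Bash; if your make runs
+#  recipes with /bin/sh, set `SHELL := /bin/bash` at the top of the Makefile.
+#  To see the trick in isolation, try this toy example outside the Makefile:
+#
+#      printf '1\ta\n2\tb\n' | tee >(cut -f 1 > keys.txt) | cut -f 2- > values.txt
+#
+#  keys.txt then holds the first column, values.txt the remaining columns.)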