From 3ef341608260c5c57912a24bcc58e7d6c30bd1f5 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 15 Jan 2022 15:18:57 +0100
Subject: [PATCH] More on challenge preparation

---
 README.md                                   | 24 +++++--
 misc/challenge-preparation-example/Makefile | 78 +++++++++++++++++++++
 2 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 misc/challenge-preparation-example/Makefile

diff --git a/README.md b/README.md
index b83795c..44a1857 100644
--- a/README.md
+++ b/README.md
@@ -821,20 +821,36 @@ You can use `geval` to initiate a [Gonito](https://gonito.net) challenge:
 Of course, any other metric can be given to generate another type of
 toy challenge:
 
-    geval --init --expected-directory my-machine-translation-challenge --metric BLEU
+    geval --init --expected-directory my-machine-translation-challenge --metric BLEU --precision 4 -% -B 200
+
+Note that the `--precision 4` and `-%` options give you nicely
+formatted evaluation scores. You simply don't want ugly scores
+such as `0.1729801323401`! The `--precision 4` option limits the score to 4
+digits after the decimal point (`0.1730`), and `-%` turns it into a
+percent-like value (`17.30`).
+
+`-B 200` is yet another interesting option: if it is used, GEval will
+calculate confidence intervals using bootstrap sampling.
 
 ### Preparing a Git repository
 
 [Gonito](https://gonito.net) platform expects a Git repository with a
 challenge to be submitted. The suggested way to do this will be
 presented as a [Makefile](https://en.wikipedia.org/wiki/Makefile), but
-of course you could use any other scripting language and the commands
-should be clear if you know Bash and some basic facts about Makefiles:
+of course you could use any other scripting language (in any case, it is
+always a good idea to start with `geval --init` and then add or overwrite
+the generated files). The commands should be clear if you know Bash and some
+basic facts about Makefiles:
 
 * a Makefile consists of rules, each rule specifies how to build a
   _target_ out of _dependencies_ using shell commands
 * `$@` is the (first) target, whereas `$<` — the first dependency
-* the indentation should be done with TABs, not spaces!
+* the indentation should be done with **TABs, not spaces**! (see the
+  [file with TABs](misc/challenge-preparation-example/Makefile))
+
+Also, don't forget to aggressively compress large files (e.g.
+`train/in.tsv` and `train/expected.tsv`); the xz compressor is a good
+option and is handled by GEval.
 
 ```
 SHELL=/bin/bash
diff --git a/misc/challenge-preparation-example/Makefile b/misc/challenge-preparation-example/Makefile
new file mode 100644
index 0000000..304ab44
--- /dev/null
+++ b/misc/challenge-preparation-example/Makefile
@@ -0,0 +1,78 @@
+SHELL=/bin/bash
+
+# do not delete intermediate files
+.SECONDARY:
+
+# the directory where the challenge will be created
+output_directory=...
+
+# let's define which files are necessary, other files will be created if needed;
+# we'll compress the input files with xz and leave `expected.tsv` files uncompressed
+# (but you could decide otherwise)
+all: $(output_directory)/train/in.tsv.xz $(output_directory)/train/expected.tsv \
+	$(output_directory)/dev-0/in.tsv.xz $(output_directory)/dev-0/expected.tsv \
+	$(output_directory)/test-A/in.tsv.xz $(output_directory)/test-A/expected.tsv \
+	$(output_directory)/README.md \
+	$(output_directory)/in-header.tsv \
+	$(output_directory)/out-header.tsv
+	# always validate the challenge
+	geval --validate --expected-directory $(output_directory)
+
+# we need to replace the default README.md; we assume that it
+# is kept as challenge-readme.md in the repo with this Makefile;
+# note that the title from README.md will be taken as the title of the challenge
+# and the first paragraph — as a short description
+$(output_directory)/README.md: challenge-readme.md $(output_directory)/config.txt
+	cp $< $@
+
+# prepare header files (see the section on headers above)
+$(output_directory)/in-header.tsv: in-header.tsv $(output_directory)/config.txt
+	cp $< $@
+
+$(output_directory)/out-header.tsv: out-header.tsv $(output_directory)/config.txt
+	cp $< $@
+
+$(output_directory)/config.txt:
+	mkdir -p $(output_directory)
+	geval --init --expected-directory $(output_directory) --metric MAIN_METRIC --metric AUXILIARY_METRIC --precision N --gonito-host https://some.gonito.host.net
+	# `geval --init` will generate a toy challenge for the given metric(s)
+	# ... but we remove the `in.tsv`/`expected.tsv` files just in case
+	# (we will overwrite them with our data anyway)
+	rm -f $(output_directory)/{train,dev-0,test-A}/{in,expected}.tsv
+	rm $(output_directory)/{README.md,in-header.tsv,out-header.tsv}
+
+# a "total" TSV containing all the data; we'll split it later
+all-data.tsv.xz: prepare.py some-other-files
+	# the data are generated using your script, let's say prepare.py, and
+	# some other files (of course, it depends on your task);
+	# the file will be compressed with xz
+	./prepare.py some-other-files | xz > $@
+
+# and now the challenge files; note that they depend on config.txt so that
+# the challenge skeleton is generated first
+
+# The best way to split the data into the train, dev-0 and test-A sets is to do it in a random,
+# but _stable_ manner: the set to which an item is assigned should depend on the MD5 sum
+# of some field in the input data (a field unlikely to change). Let's assume
+# that you created a script `filter.py` that takes a regular expression as an argument and applies it
+# to the MD5 sum (written in hexadecimal format). Remember that a literal `$` in a Makefile recipe has to be written as `$$`.
+
+$(output_directory)/train/in.tsv.xz $(output_directory)/train/expected.tsv: all-data.tsv.xz filter.py $(output_directory)/config.txt
+	# 1. xzcat for decompression
+	# 2. ./filter.py will select 14/16=7/8 of the items in a stable random manner
+	# 3. tee >(...) is Bash magic to fork the output into two streams
+	# 4. cut will select the columns
+	# 5. xz will compress it back
+	xzcat $< | ./filter.py '[0-9abcd]$$' | tee >(cut -f 1 > $(output_directory)/train/expected.tsv) | cut -f 2- | xz > $(output_directory)/train/in.tsv.xz
+
+$(output_directory)/dev-0/in.tsv.xz $(output_directory)/dev-0/expected.tsv: all-data.tsv.xz filter.py $(output_directory)/config.txt
+	# 1/16 of the items goes to the dev-0 set
+	xzcat $< | ./filter.py 'e$$' | tee >(cut -f 1 > $(output_directory)/dev-0/expected.tsv) | cut -f 2- | xz > $(output_directory)/dev-0/in.tsv.xz
+
+$(output_directory)/test-A/in.tsv.xz $(output_directory)/test-A/expected.tsv: all-data.tsv.xz filter.py $(output_directory)/config.txt
+	# another 1/16 of the items goes to the test-A set
+	xzcat $< | ./filter.py 'f$$' | tee >(cut -f 1 > $(output_directory)/test-A/expected.tsv) | cut -f 2- | xz > $(output_directory)/test-A/in.tsv.xz
+
+# wiping out the challenge, if you are desperate
+clean:
+	rm -rf $(output_directory)
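
The Makefile above only assumes that a `filter.py` script exists; it is not part of this patch. A minimal sketch of such a script, in Python, could look as follows: it reads TSV lines from standard input, computes the MD5 sum of one field and prints only the lines whose hexadecimal digest matches the regular expression given as the first command-line argument. Hashing the last column here is just an illustrative assumption; use whatever field of your data is unlikely to change.

```python
#!/usr/bin/env python3
# A hypothetical sketch of the `filter.py` script assumed by the Makefile above.
# Usage: ./filter.py REGEX   (e.g. '[0-9abcd]$' for 14/16 of the items,
# 'e$' or 'f$' for 1/16 each), with TSV lines supplied on stdin.

import hashlib
import re
import sys


def main():
    pattern = re.compile(sys.argv[1])
    for line in sys.stdin:
        fields = line.rstrip("\n").split("\t")
        # hash a field that is unlikely to change; taking the last column
        # is only an example choice
        digest = hashlib.md5(fields[-1].encode("utf-8")).hexdigest()
        if pattern.search(digest):
            sys.stdout.write(line)


if __name__ == "__main__":
    main()
```

Splitting on a hash of a stable field (rather than, say, shuffling) guarantees that a given item always lands in the same split, even if the data set is later extended or regenerated.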
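
As for the `-B 200` option added to the README, the confidence intervals come from bootstrap sampling. The sketch below is only a generic illustration of that technique under simple assumptions (a toy accuracy metric, percentile intervals), not GEval's actual implementation: the metric is recomputed on resampled versions of the test set, and the 2.5th and 97.5th percentiles of the resulting scores form the interval.

```python
import random


def accuracy(pairs):
    # toy metric: the fraction of (expected, actual) pairs that agree
    return sum(e == a for e, a in pairs) / len(pairs)


def bootstrap_interval(pairs, metric=accuracy, b=200, alpha=0.05):
    # resample the test set with replacement b times and recompute the metric
    scores = sorted(metric([random.choice(pairs) for _ in pairs]) for _ in range(b))
    lower = scores[int(alpha / 2 * b)]
    upper = scores[min(int((1 - alpha / 2) * b), b - 1)]
    return lower, upper


if __name__ == "__main__":
    data = [("1", "1"), ("1", "0"), ("0", "0"), ("1", "1"), ("0", "1")]
    print(bootstrap_interval(data))
```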