From 3ebe158e55c4bdb96b3709cdfef673867778a8d9 Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Sat, 1 Aug 2020 21:27:04 +0200 Subject: [PATCH 1/4] Describe flags, add "c" and "t" flags. Also add tests for flags --- README.md | 259 +++++++++++++++++- src/GEval/EvaluationScheme.hs | 11 +- test/Spec.hs | 21 ++ .../flags-case-fold-solution/test-A/out.tsv | 10 + .../flags-case-fold/config.txt | 1 + .../flags-case-fold/test-A/expected.tsv | 10 + .../flags-filtering-solution/test-A/out.tsv | 10 + .../flags-filtering/config.txt | 1 + .../flags-filtering/test-A/expected.tsv | 10 + .../flags-filtering/test-A/in.tsv | 10 + .../flags-lowercase-solution/test-A/out.tsv | 10 + .../flags-lowercase/config.txt | 1 + .../flags-lowercase/test-A/expected.tsv | 10 + .../flags-none-solution/test-A/out.tsv | 10 + test/flags-none/flags-none/config.txt | 1 + .../flags-none/flags-none/test-A/expected.tsv | 10 + .../test-A/out.tsv | 10 + .../flags-regexp-matching-anchor/config.txt | 1 + .../test-A/expected.tsv | 10 + .../test-A/out.tsv | 10 + .../flags-regexp-matching/config.txt | 1 + .../flags-regexp-matching/test-A/expected.tsv | 10 + .../test-A/out.tsv | 10 + .../flags-regexp-substitution-ref/config.txt | 1 + .../test-A/expected.tsv | 10 + .../test-A/out.tsv | 10 + .../flags-regexp-substitution/config.txt | 1 + .../test-A/expected.tsv | 10 + .../test-A/out.tsv | 10 + .../config.txt | 1 + .../test-A/expected.tsv | 10 + .../test-A/out.tsv | 10 + .../flags-regexp-token-matching/config.txt | 1 + .../test-A/expected.tsv | 10 + .../flags-uppercase-solution/test-A/out.tsv | 10 + .../flags-uppercase/config.txt | 1 + .../flags-uppercase/test-A/expected.tsv | 10 + 37 files changed, 530 insertions(+), 2 deletions(-) create mode 100644 test/flags-case-fold/flags-case-fold-solution/test-A/out.tsv create mode 100644 test/flags-case-fold/flags-case-fold/config.txt create mode 100644 test/flags-case-fold/flags-case-fold/test-A/expected.tsv create mode 100644 
test/flags-filtering/flags-filtering-solution/test-A/out.tsv create mode 100644 test/flags-filtering/flags-filtering/config.txt create mode 100644 test/flags-filtering/flags-filtering/test-A/expected.tsv create mode 100644 test/flags-filtering/flags-filtering/test-A/in.tsv create mode 100644 test/flags-lowercase/flags-lowercase-solution/test-A/out.tsv create mode 100644 test/flags-lowercase/flags-lowercase/config.txt create mode 100644 test/flags-lowercase/flags-lowercase/test-A/expected.tsv create mode 100644 test/flags-none/flags-none-solution/test-A/out.tsv create mode 100644 test/flags-none/flags-none/config.txt create mode 100644 test/flags-none/flags-none/test-A/expected.tsv create mode 100644 test/flags-regexp-matching-anchor/flags-regexp-matching-anchor-solution/test-A/out.tsv create mode 100644 test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/config.txt create mode 100644 test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/test-A/expected.tsv create mode 100644 test/flags-regexp-matching/flags-regexp-matching-solution/test-A/out.tsv create mode 100644 test/flags-regexp-matching/flags-regexp-matching/config.txt create mode 100644 test/flags-regexp-matching/flags-regexp-matching/test-A/expected.tsv create mode 100644 test/flags-regexp-substitution-ref/flags-regexp-substitution-ref-solution/test-A/out.tsv create mode 100644 test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/config.txt create mode 100644 test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/test-A/expected.tsv create mode 100644 test/flags-regexp-substitution/flags-regexp-substitution-solution/test-A/out.tsv create mode 100644 test/flags-regexp-substitution/flags-regexp-substitution/config.txt create mode 100644 test/flags-regexp-substitution/flags-regexp-substitution/test-A/expected.tsv create mode 100644 test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor-solution/test-A/out.tsv create mode 100644 
test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/config.txt create mode 100644 test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/test-A/expected.tsv create mode 100644 test/flags-regexp-token-matching/flags-regexp-token-matching-solution/test-A/out.tsv create mode 100644 test/flags-regexp-token-matching/flags-regexp-token-matching/config.txt create mode 100644 test/flags-regexp-token-matching/flags-regexp-token-matching/test-A/expected.tsv create mode 100644 test/flags-uppercase/flags-uppercase-solution/test-A/out.tsv create mode 100644 test/flags-uppercase/flags-uppercase/config.txt create mode 100644 test/flags-uppercase/flags-uppercase/test-A/expected.tsv diff --git a/README.md b/README.md index 3ed16b9..e410d12 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ to happen on macOS, as these packages are usually installed out of the box on Li In case the `lzma` package is not installed on your Linux, you need to run (assuming Debian/Ubuntu): sudo apt-get install pkg-config liblzma-dev libpq-dev libpcre3-dev libcairo2-dev libbz2-dev - + #### Windows issues If you see this message on Windows during executing `stack test` command: @@ -480,6 +480,263 @@ So now you can see that the accuracy is over 78% and the likelihood in<1>:Brytania 2 0.53333333 0.01357876718525224600 in<1>:rewolucja 2 0.53333333 0.01357876718525224600 +## Metric flags + +GEval offers a number of *flags* to modify the way an evaluation +metric is calculated or presented. For instance, if you use `BLEU:u` +instead of `BLEU`, the BLEU metric (a standard metric for machine +translation) will be evaluated on the actual and expected outputs +upper-cased. In other words, flags can be used to _normalize_ the text +before running the actual evaluation metric. + +Flags are given after a colon (`:`) and can be combined. Some flags +can have arguments, they should be given in angle brackets (`<...>`). 
+ +The following files will be used in example calculations, `expected.tsv`: + + foo 123 bar + 29008 Straße + xyz + aaa 3 4 bbb + qwerty 100 + WWW WWW + test + 104 + BAR Foo baz + OK 7777 + +`out.tsv`: + + foo 999 BAR + 29008 STRASSE + xyz + aaa BBB 34 + qwerty 1000 + WWW WWW WWW WWW WWW WWW WWW WWW + testtttttt + 104 + Foo baz BAR + Ok 7777 + +`in.tsv`: + + + +Without any flags, the `Accuracy` metric is: + + $ geval -o out.tsv -e expected.tsv --metric Accuracy + 0.2 + +(As only two items are correct: `xyz` and `104`.) + +### Case change + +#### `l` — lower-case + + $ geval -o out.tsv -e expected.tsv --metric Accuracy:l + 0.3 + +#### `u` — upper-case + + $ geval -o out.tsv -e expected.tsv --metric Accuracy:l + 0.4 + +Why the result is differnt for lower-casing and upper-casing? Some +characters, e.g. German _ß_, are tricky. If you upper-case _Straße_ +you've got _STRASSE_, but if you lower-case it, you obtain _straße_, +not _strasse_! For this reason, when you want to disregard case when +evaluating your metric, it is better to use _case folding_ rather +than lower- or upper-casing: + +#### `c` — case fold + + $ geval -o out.tsv -e expected.tsv --metric Accuracy:c + 0.4 + +### Manipulations with regular expressions + +#### `m` matching a given PCRE regexp + +The evaluation metric will be calculated only on the parts of the +outputs matching a given regular expression. This can be used when you +want to focus on some specific parts of a text. For instance, we could +calculate Accuracy only considering (disregarding all other +characters, including spaces). + + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:m<\d+>' + 0.8 + +(Note that apostrophes are due to using Bash here, if you put it into +the `config.txt` file you should omit apostrophes: `--metric Accuracy:m<\d+>`.) + +All matches are considered and concatenated, if no match is found, an empty string is assumed +(hence, e.g., `testtttttt` is considered a hit for `test` after this normalization). 
+Note that both `aaa 3 4 bbb` and `aaa BBB 34` will be normalized to `34` here. + +You can use regexp anchoring operators (`^` or `$`). This will refer +to the beginning or end of the whole *line*. You could use it to +calculate the accuracy considering only the first two characters of output lines: + + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:m<^..>' + 0.8 + +#### `t` — filtering tokens using a PCRE regexp + +This applies a regexp for each token separately (tokens are separated +by spaces, you can use a non-standard tokenizer with the `--tokenizer` option if needed). +All the tokens not matching the regexp are filtered out (but spaces are recovered). + + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:t<\d+>' + 0.7 + +Now, the anchoring operators refer to the beginning or end of a +*token*. For instance, let's consider only tokens starting with _b_: + + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:t<^b>' + 0.8 + +With `m` or `t` flags you can only select parts of output lines. What +if you want to do some replacements, e.g. collapse some +characters/strings into a standard form? You should use the `s` flag for this: + +#### `s` — replace parts of output lines matching a regexp + +This will substitute all occurrences of strings matching REGEXP with +REPLACEMENT. For instance, we could replace all numbers with a special token NUMBER. +All the other parts of a line are left intact. + + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:s<\d+>' + 0.3 + +You can use special operators `\0`, `\1`, `\2` to refer to parts matched by the regexp. + + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:s<([A-Za-z])\S+>' + 0.5 + +### Other normalizations + +#### `S` — sort all tokens + +This will sort all tokens, e.g. `foo bar baz` will be treated as `bar baz foo`. + +### Filtering + +#### `f` — filtering + +Flags such as `u`, `m<...>`, `s<...><...>` etc. work within a line +(item), they won't change the number items being evaluated. 
To +consider only a subset of items, use the `f` flag — only the +lines containing the feature FEATURE will be considered during metric +calculation. Features are the same as listed by the `--worst-features` +option, e.g. `exp:foo` would accept only lines with the expected +output containing the token `foo`, `in[2]:bar` — lines with the second +columns of input contaning the token `bar` (contrary to +`--worst-features` square brackets should be used be instead of angle ones for indexing). + +You *MUST* supply an input file when you use the `f<...>` flag. Assume +the following `in.txt` file: + + 12 this aaa + 32 this bbb + 32 this ccc + 12 that aaa + 12 that aaa + 10 that aaa + 11 that + 11 that + 17 this + 12 that + + $ geval -o out.tsv -e expected.tsv -i in.tsv --metric 'Accuracy:f' + 0.25 + +### Presentation + +Some flags are used not for modifying the result, but rather changing +the way it is presented by GEval (or the associated +[Gonito](https://gonito.net) Web application). + +#### `N` — use an alternative name + +Sometimes, the metric name gets complicated, you can use the `N<...>` +flag to get a more human-readable name. + +This will be used: + +* by GEval when presenting results from more than one metric (when + only one metric is calculated, its name is not given anyway), +* by Gonito, e.g. in table headers. + + $ geval -o out.tsv -e expected.tsv --metric Accuracy --metric MultiLabel-F1:N --metric 'MultiLabel-F0:N' --metric 'MultiLabel-F9999:N' + Accuracy 0.200 + F-score 0.511 + Precision 0.462 + Recall 0.571 + +(GEval does not have separate Precision/Recall metrics, but they can +be easily obtained by setting the parameter of the F-score to, +respectively, 0 and a large number.) + +More than one name can be given. In such a case, the names will be concatenated with spaces. 
+ + $ geval --precision 3 -o out.tsv -e expected.tsv --metric 'Accuracy' --metric 'MultiLabel-F1:NNN' + Accuracy 0.200 + F-score on tokens 0.511 + +This is handy, when combined with the `{...}` operator (see below). + +#### `P` — set the priority (within the Gonito platform) + +This sets the priority level, considered when the results are displayed in the Gonito platform. +It has no effect in GEval as such (it is simply disregarded in GEval). + + $ geval --precision 3 -o out.tsv -e expected.tsv --metric 'Accuracy:P<1>' --metric 'MultiLabel-F1:P<3>' Accuracy:P<1> 0.200 + MultiLabel-F1.0:P<3> 0.511 + +The priority is interpreted by Gonito in the following way: + + * 1 — show everywhere, including the main leaderboard table + * 2 — show on the secondary leaderboard table and in detailed information for a submission + * 3 — show only in detailed information for a submission + +Although you can specify `P<...>` more than once, only the first value +will be considered for a given metric (this might be important when combined with the `{...}` operator). + +### Combining flags + +Flags can be combined, just by concatenation (`:` should be given only once): + + $ geval -o out.tsv -e expected.tsv -i in.tsv --metric Accuracy --metric 'Accuracy:fcs<\d>N' + Accuracy 0.2 + MyWeirdMetric 0.75 + +Note that the order of flags might be sometimes significant, in +general, they are considered from left to right. + +### Cartesian operator `{...}` + +Sometimes, you need to define a large number of similar metrics. Then +you can use the special `{...}` operator interpreted by GEval (not +Bash!). For instance `{foo,bar}xyz{aaa,bbb,ccc}` will be internally +considered as the Cartesian product (i.e. you'll get all the +combinations): `fooxyzaaa`, `fooxyzbbb`, `fooxyzccc`, `barxyzaaa`, +`barxyzbbb`, `barxyzccc`. + +For example, let's assume that we want accuracy, F-score, precision +and recall in both case-sensitive and case-insensitive versions. 
+Here's the way to calculate all these 8 metrics in a concise manner: + + $ geval --precision 3 -o out.tsv -e expected.tsv -i in.tsv --metric '{Accuracy:N,MultiLabel-F1:N,MultiLabel-F0:N

,MultiLabel-F9999:N}N{N,cN}' + sensitive non-sensitive + Acc case 0.200 0.400 + F1 case 0.511 0.681 + P case 0.462 0.615 + R case 0.571 0.762 + +Note that GEval automagically put the results in a table! (Well, +_case_ probably should be written in headers, but, well, it generates +the table totally on its own.) + ## Handling headers When dealing with TSV files, you often face a dilemma whether to add a diff --git a/src/GEval/EvaluationScheme.hs b/src/GEval/EvaluationScheme.hs index 135f9d8..b335087 100644 --- a/src/GEval/EvaluationScheme.hs +++ b/src/GEval/EvaluationScheme.hs @@ -12,7 +12,7 @@ import GEval.Metric import Text.Regex.PCRE.Heavy import Text.Regex.PCRE.Light.Base (Regex(..)) -import Data.Text (Text(..), concat, toLower, toUpper, pack, unpack, words, unwords) +import Data.Text (Text(..), concat, toCaseFold, toLower, toUpper, pack, unpack, words, unwords) import Data.List (intercalate, break, sort) import Data.Either import Data.Maybe (fromMaybe, catMaybes) @@ -23,8 +23,10 @@ data EvaluationScheme = EvaluationScheme Metric [PreprocessingOperation] deriving (Eq) data PreprocessingOperation = RegexpMatch Regex + | RegexpTokenMatch Regex | LowerCasing | UpperCasing + | CaseFolding | Sorting | SetName Text | SetPriority Int @@ -51,7 +53,10 @@ readOps ('l':theRest) = (LowerCasing:ops, theRest') where (ops, theRest') = readOps theRest readOps ('u':theRest) = (UpperCasing:ops, theRest') where (ops, theRest') = readOps theRest +readOps ('c':theRest) = (CaseFolding:ops, theRest') + where (ops, theRest') = readOps theRest readOps ('m':theRest) = handleParametrizedOp (RegexpMatch . (fromRight undefined) . ((flip compileM) []) . BSU.fromString) theRest +readOps ('t':theRest) = handleParametrizedOp (RegexpTokenMatch . (fromRight undefined) . ((flip compileM) []) . BSU.fromString) theRest readOps ('S':theRest) = (Sorting:ops, theRest') where (ops, theRest') = readOps theRest readOps ('N':theRest) = handleParametrizedOp (SetName . 
pack) theRest @@ -120,8 +125,10 @@ evaluationSchemeMetric (EvaluationScheme metric _) = metric instance Show PreprocessingOperation where show (RegexpMatch (Regex _ regexp)) = parametrizedOperation "m" (BSU.toString regexp) + show (RegexpTokenMatch (Regex _ regexp)) = parametrizedOperation "t" (BSU.toString regexp) show LowerCasing = "l" show UpperCasing = "u" + show CaseFolding = "c" show Sorting = "S" show (SetName t) = parametrizedOperation "N" (unpack t) show (SetPriority p) = parametrizedOperation "P" (show p) @@ -154,8 +161,10 @@ applyPreprocessingOperations (EvaluationScheme _ operations) t = foldl (flip app applyPreprocessingOperation :: PreprocessingOperation -> Text -> Text applyPreprocessingOperation (RegexpMatch regex) = Data.Text.concat . (map fst) . (scan regex) +applyPreprocessingOperation (RegexpTokenMatch regex) = Data.Text.unwords . (filter (≈ regex)) . Data.Text.words applyPreprocessingOperation LowerCasing = toLower applyPreprocessingOperation UpperCasing = toUpper +applyPreprocessingOperation CaseFolding = toCaseFold applyPreprocessingOperation Sorting = Data.Text.unwords . sort . 
Data.Text.words applyPreprocessingOperation (SetName _) = id applyPreprocessingOperation (SetPriority _) = id diff --git a/test/Spec.hs b/test/Spec.hs index 43b8a3d..2730046 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -364,6 +364,27 @@ main = hspec $ do runGEvalTest "f1-with-preprocessing" `shouldReturnAlmost` 0.57142857142857 it "Regexp substition" $ do runGEvalTest "accuracy-with-flags" `shouldReturnAlmost` 0.8 + describe "Flag examples" $ do + it "none" $ do + runGEvalTest "flags-none" `shouldReturnAlmost` 0.2 + it "lower-case" $ do + runGEvalTest "flags-lowercase" `shouldReturnAlmost` 0.3 + it "upper-case" $ do + runGEvalTest "flags-uppercase" `shouldReturnAlmost` 0.4 + it "regexp-matching" $ do + runGEvalTest "flags-regexp-matching" `shouldReturnAlmost` 0.8 + it "regexp-matching-anchor" $ do + runGEvalTest "flags-regexp-matching-anchor" `shouldReturnAlmost` 0.8 + it "regexp-token-matching" $ do + runGEvalTest "flags-regexp-token-matching" `shouldReturnAlmost` 0.7 + it "regexp-token-matching-anchor" $ do + runGEvalTest "flags-regexp-token-matching-anchor" `shouldReturnAlmost` 0.8 + it "regexp-substitution" $ do + runGEvalTest "flags-regexp-substitution" `shouldReturnAlmost` 0.3 + it "regexp-substitution-ref" $ do + runGEvalTest "flags-regexp-substitution-ref" `shouldReturnAlmost` 0.5 + it "filtering" $ do + runGEvalTest "flags-filtering" `shouldReturnAlmost` 0.25 describe "evaluating single lines" $ do it "RMSE" $ do (MetricOutput (SimpleRun v) _) <- gevalCoreOnSingleLines RMSE id RawItemTarget diff --git a/test/flags-case-fold/flags-case-fold-solution/test-A/out.tsv b/test/flags-case-fold/flags-case-fold-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-case-fold/flags-case-fold-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-case-fold/flags-case-fold/config.txt 
b/test/flags-case-fold/flags-case-fold/config.txt new file mode 100644 index 0000000..82c1775 --- /dev/null +++ b/test/flags-case-fold/flags-case-fold/config.txt @@ -0,0 +1 @@ +--metric Accuracy:c diff --git a/test/flags-case-fold/flags-case-fold/test-A/expected.tsv b/test/flags-case-fold/flags-case-fold/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-case-fold/flags-case-fold/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-filtering/flags-filtering-solution/test-A/out.tsv b/test/flags-filtering/flags-filtering-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-filtering/flags-filtering-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-filtering/flags-filtering/config.txt b/test/flags-filtering/flags-filtering/config.txt new file mode 100644 index 0000000..dfa6b66 --- /dev/null +++ b/test/flags-filtering/flags-filtering/config.txt @@ -0,0 +1 @@ +--metric Accuracy:f diff --git a/test/flags-filtering/flags-filtering/test-A/expected.tsv b/test/flags-filtering/flags-filtering/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-filtering/flags-filtering/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-filtering/flags-filtering/test-A/in.tsv b/test/flags-filtering/flags-filtering/test-A/in.tsv new file mode 100644 index 0000000..72292ef --- /dev/null +++ b/test/flags-filtering/flags-filtering/test-A/in.tsv @@ -0,0 +1,10 @@ +12 this aaa +32 this bbb +32 this ccc +12 that aaa +12 that aaa +10 that aaa +11 that +11 that +17 this +12 that diff --git 
a/test/flags-lowercase/flags-lowercase-solution/test-A/out.tsv b/test/flags-lowercase/flags-lowercase-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-lowercase/flags-lowercase-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-lowercase/flags-lowercase/config.txt b/test/flags-lowercase/flags-lowercase/config.txt new file mode 100644 index 0000000..e3c75cd --- /dev/null +++ b/test/flags-lowercase/flags-lowercase/config.txt @@ -0,0 +1 @@ +--metric Accuracy:l diff --git a/test/flags-lowercase/flags-lowercase/test-A/expected.tsv b/test/flags-lowercase/flags-lowercase/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-lowercase/flags-lowercase/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-none/flags-none-solution/test-A/out.tsv b/test/flags-none/flags-none-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-none/flags-none-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-none/flags-none/config.txt b/test/flags-none/flags-none/config.txt new file mode 100644 index 0000000..337a0cc --- /dev/null +++ b/test/flags-none/flags-none/config.txt @@ -0,0 +1 @@ +--metric Accuracy diff --git a/test/flags-none/flags-none/test-A/expected.tsv b/test/flags-none/flags-none/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-none/flags-none/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git 
a/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor-solution/test-A/out.tsv b/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/config.txt b/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/config.txt new file mode 100644 index 0000000..f420c60 --- /dev/null +++ b/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/config.txt @@ -0,0 +1 @@ +--metric Accuracy:m<^..> diff --git a/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/test-A/expected.tsv b/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-regexp-matching-anchor/flags-regexp-matching-anchor/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-regexp-matching/flags-regexp-matching-solution/test-A/out.tsv b/test/flags-regexp-matching/flags-regexp-matching-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-regexp-matching/flags-regexp-matching-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-regexp-matching/flags-regexp-matching/config.txt b/test/flags-regexp-matching/flags-regexp-matching/config.txt new file mode 100644 index 0000000..ce11160 --- /dev/null +++ b/test/flags-regexp-matching/flags-regexp-matching/config.txt @@ -0,0 +1 @@ +--metric Accuracy:m<\d+> diff --git 
a/test/flags-regexp-matching/flags-regexp-matching/test-A/expected.tsv b/test/flags-regexp-matching/flags-regexp-matching/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-regexp-matching/flags-regexp-matching/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref-solution/test-A/out.tsv b/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/config.txt b/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/config.txt new file mode 100644 index 0000000..2fca6f8 --- /dev/null +++ b/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/config.txt @@ -0,0 +1 @@ +--metric Accuracy:s<([A-Za-z])\S+> diff --git a/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/test-A/expected.tsv b/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-regexp-substitution-ref/flags-regexp-substitution-ref/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-regexp-substitution/flags-regexp-substitution-solution/test-A/out.tsv b/test/flags-regexp-substitution/flags-regexp-substitution-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-regexp-substitution/flags-regexp-substitution-solution/test-A/out.tsv @@ 
-0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-regexp-substitution/flags-regexp-substitution/config.txt b/test/flags-regexp-substitution/flags-regexp-substitution/config.txt new file mode 100644 index 0000000..3d1a792 --- /dev/null +++ b/test/flags-regexp-substitution/flags-regexp-substitution/config.txt @@ -0,0 +1 @@ +--metric Accuracy:s<\d+> diff --git a/test/flags-regexp-substitution/flags-regexp-substitution/test-A/expected.tsv b/test/flags-regexp-substitution/flags-regexp-substitution/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-regexp-substitution/flags-regexp-substitution/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor-solution/test-A/out.tsv b/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/config.txt b/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/config.txt new file mode 100644 index 0000000..8a33294 --- /dev/null +++ b/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/config.txt @@ -0,0 +1 @@ +--metric Accuracy:t<^b> diff --git a/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/test-A/expected.tsv b/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/test-A/expected.tsv new file mode 100644 index 
0000000..a95a323 --- /dev/null +++ b/test/flags-regexp-token-matching-anchor/flags-regexp-token-matching-anchor/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-regexp-token-matching/flags-regexp-token-matching-solution/test-A/out.tsv b/test/flags-regexp-token-matching/flags-regexp-token-matching-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-regexp-token-matching/flags-regexp-token-matching-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-regexp-token-matching/flags-regexp-token-matching/config.txt b/test/flags-regexp-token-matching/flags-regexp-token-matching/config.txt new file mode 100644 index 0000000..3aa87e0 --- /dev/null +++ b/test/flags-regexp-token-matching/flags-regexp-token-matching/config.txt @@ -0,0 +1 @@ +--metric Accuracy:t<\d+> diff --git a/test/flags-regexp-token-matching/flags-regexp-token-matching/test-A/expected.tsv b/test/flags-regexp-token-matching/flags-regexp-token-matching/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-regexp-token-matching/flags-regexp-token-matching/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 diff --git a/test/flags-uppercase/flags-uppercase-solution/test-A/out.tsv b/test/flags-uppercase/flags-uppercase-solution/test-A/out.tsv new file mode 100644 index 0000000..d94143b --- /dev/null +++ b/test/flags-uppercase/flags-uppercase-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-uppercase/flags-uppercase/config.txt b/test/flags-uppercase/flags-uppercase/config.txt new file mode 
100644 index 0000000..a94cfa3 --- /dev/null +++ b/test/flags-uppercase/flags-uppercase/config.txt @@ -0,0 +1 @@ +--metric Accuracy:u diff --git a/test/flags-uppercase/flags-uppercase/test-A/expected.tsv b/test/flags-uppercase/flags-uppercase/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-uppercase/flags-uppercase/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 From 23acb0133872ae0af20d40eb076d40067fbc3cb8 Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Sat, 1 Aug 2020 21:37:48 +0200 Subject: [PATCH 2/4] Fixes in README (description of flags) --- README.md | 28 ++++++++++--------- test/Spec.hs | 2 ++ .../flags-sort-solution/test-A/out.tsv | 10 +++++++ test/flags-sort/flags-sort/config.txt | 1 + .../flags-sort/flags-sort/test-A/expected.tsv | 10 +++++++ 5 files changed, 38 insertions(+), 13 deletions(-) create mode 100644 test/flags-sort/flags-sort-solution/test-A/out.tsv create mode 100644 test/flags-sort/flags-sort/config.txt create mode 100644 test/flags-sort/flags-sort/test-A/expected.tsv diff --git a/README.md b/README.md index e410d12..7609006 100644 --- a/README.md +++ b/README.md @@ -518,10 +518,6 @@ The following files will be used in example calculations, `expected.tsv`: Foo baz BAR Ok 7777 -`in.tsv`: - - - Without any flags, the `Accuracy` metric is: $ geval -o out.tsv -e expected.tsv --metric Accuracy @@ -541,7 +537,7 @@ Without any flags, the `Accuracy` metric is: $ geval -o out.tsv -e expected.tsv --metric Accuracy:l 0.4 -Why the result is differnt for lower-casing and upper-casing? Some +Why the result is different for lower-casing and upper-casing? Some characters, e.g. German _ß_, are tricky. If you upper-case _Straße_ you've got _STRASSE_, but if you lower-case it, you obtain _straße_, not _strasse_! 
For this reason, when you want to disregard case when @@ -555,12 +551,12 @@ than lower- or upper-casing: ### Manipulations with regular expressions -#### `m` matching a given PCRE regexp +#### `m` — matching a given PCRE regexp The evaluation metric will be calculated only on the parts of the outputs matching a given regular expression. This can be used when you want to focus on some specific parts of a text. For instance, we could -calculate Accuracy only considering (disregarding all other +calculate Accuracy only considering numbers (disregarding all other characters, including spaces). $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:m<\d+>' @@ -569,9 +565,11 @@ characters, including spaces). (Note that apostrophes are due to using Bash here, if you put it into the `config.txt` file you should omit apostrophes: `--metric Accuracy:m<\d+>`.) -All matches are considered and concatenated, if no match is found, an empty string is assumed -(hence, e.g., `testtttttt` is considered a hit for `test` after this normalization). -Note that both `aaa 3 4 bbb` and `aaa BBB 34` will be normalized to `34` here. +All matches are considered and concatenated; if no match is found, an +empty string is assumed (hence, e.g., `testtttttt` is considered a hit +for `test` after this normalization, as both will be transformed into +the empty string). Note that both `aaa 3 4 bbb` and `aaa BBB 34` will +be normalized to `34` here. You can use regexp anchoring operators (`^` or `$`). This will refer to the beginning or end of the whole *line*. You could use it to @@ -619,6 +617,9 @@ You can use special operators `\0`, `\1`, `\2` to refer to parts matched by the This will sort all tokens, e.g. `foo bar baz` will be treated as `bar baz foo`. + $ geval -o out.tsv -e expected.tsv --metric 'Accuracy:S' + 0.3 + ### Filtering #### `f` — filtering @@ -626,12 +627,12 @@ This will sort all tokens, e.g. `foo bar baz` will be treated as `bar baz foo`. 
Flags such as `u`, `m<...>`, `s<...><...>` etc. work within a line (item), they won't change the number items being evaluated. To consider only a subset of items, use the `f` flag — only the -lines containing the feature FEATURE will be considered during metric +lines containing the feature FEATURE will be taken during metric calculation. Features are the same as listed by the `--worst-features` option, e.g. `exp:foo` would accept only lines with the expected output containing the token `foo`, `in[2]:bar` — lines with the second columns of input contaning the token `bar` (contrary to -`--worst-features` square brackets should be used be instead of angle ones for indexing). +`--worst-features` square brackets should be used, instead of angle ones, for indexing). You *MUST* supply an input file when you use the `f<...>` flag. Assume the following `in.txt` file: @@ -690,7 +691,8 @@ This is handy, when combined with the `{...}` operator (see below). This sets the priority level, considered when the results are displayed in the Gonito platform. It has no effect in GEval as such (it is simply disregarded in GEval). 
- $ geval --precision 3 -o out.tsv -e expected.tsv --metric 'Accuracy:P<1>' --metric 'MultiLabel-F1:P<3>' Accuracy:P<1> 0.200 + $ geval --precision 3 -o out.tsv -e expected.tsv --metric 'Accuracy:P<1>' --metric 'MultiLabel-F1:P<3>' + Accuracy:P<1> 0.200 MultiLabel-F1.0:P<3> 0.511 The priority is interpreted by Gonito in the following way: diff --git a/test/Spec.hs b/test/Spec.hs index 2730046..d54bd3d 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -383,6 +383,8 @@ main = hspec $ do runGEvalTest "flags-regexp-substitution" `shouldReturnAlmost` 0.3 it "regexp-substitution-ref" $ do runGEvalTest "flags-regexp-substitution-ref" `shouldReturnAlmost` 0.5 + it "sort" $ do + runGEvalTest "flags-sort" `shouldReturnAlmost` 0.3 it "filtering" $ do runGEvalTest "flags-filtering" `shouldReturnAlmost` 0.25 describe "evaluating single lines" $ do diff --git a/test/flags-sort/flags-sort-solution/test-A/out.tsv b/test/flags-sort/flags-sort-solution/test-A/out.tsv new file mode 100644 index 0000000..4be9eae --- /dev/null +++ b/test/flags-sort/flags-sort-solution/test-A/out.tsv @@ -0,0 +1,10 @@ +foo 999 BAR +29008 STRASSE +xyz +aaa BBB 34 +qwerty 1000 +WWW WWW WWW WWW WWW WWW WWW WWW +testtttttt +104 +Foo baz BAR +Ok 7777 diff --git a/test/flags-sort/flags-sort/config.txt b/test/flags-sort/flags-sort/config.txt new file mode 100644 index 0000000..0de8e69 --- /dev/null +++ b/test/flags-sort/flags-sort/config.txt @@ -0,0 +1 @@ +--metric Accuracy:S diff --git a/test/flags-sort/flags-sort/test-A/expected.tsv b/test/flags-sort/flags-sort/test-A/expected.tsv new file mode 100644 index 0000000..a95a323 --- /dev/null +++ b/test/flags-sort/flags-sort/test-A/expected.tsv @@ -0,0 +1,10 @@ +foo 123 bar +29008 Straße +xyz +aaa 3 4 bbb +qwerty 100 +WWW WWW +test +104 +BAR Foo baz +OK 7777 From b719d3190d287d37e4a5a936f1ff0a3ca6fd6e93 Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Sat, 1 Aug 2020 21:44:27 +0200 Subject: [PATCH 3/4] Minor fix --- README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7609006..e0fbc7a 100644 --- a/README.md +++ b/README.md @@ -541,7 +541,7 @@ Why the result is different for lower-casing and upper-casing? Some characters, e.g. German _ß_, are tricky. If you upper-case _Straße_ you've got _STRASSE_, but if you lower-case it, you obtain _straße_, not _strasse_! For this reason, when you want to disregard case when -evaluating your metric, it is better to case _case folding_ rather +evaluating your metric, it is better to use _case folding_ rather than lower- or upper-casing: #### `c` — case fold From 198aa1f080d3d09493b9cf11869ccdc55bba4bd4 Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Sat, 1 Aug 2020 21:46:59 +0200 Subject: [PATCH 4/4] Bump up version number --- CHANGELOG.md | 4 ++++ geval.cabal | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f8cfaf..23eff90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ +## 1.36.1.0 + +* Add "c" and "t" flags + ## 1.36.0.0 * Add fuzzy matching for MultiLabel-F1 diff --git a/geval.cabal b/geval.cabal index 8009c99..8b88a10 100644 --- a/geval.cabal +++ b/geval.cabal @@ -1,5 +1,5 @@ name: geval -version: 1.36.0.0 +version: 1.36.1.0 synopsis: Machine learning evaluation tools description: Please see README.md homepage: http://github.com/name/project