From 819fbecedc6f744a1a08f21c11067191837f87a2 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 17 Oct 2020 16:55:40 +0200
Subject: [PATCH] Add CER metric (Character-Error Rate)

---
 src/GEval/Core.hs                             | 20 +++++++++-
 src/GEval/CreateChallenge.hs                  | 39 ++++++++++++++++++-
 src/GEval/Metric.hs                           |  5 ++-
 src/GEval/MetricsMechanics.hs                 |  9 ++++-
 src/GEval/MetricsMeta.hs                      | 38 +++++++++++++++++-
 test/Spec.hs                                  |  6 +++
 .../cer-mean-simple-solution/test-A/out.tsv   |  2 +
 .../cer-mean-simple/config.txt                |  1 +
 .../cer-mean-simple/test-A/expected.tsv       |  2 +
 .../cer-simple-solution/test-A/out.tsv        |  2 +
 test/cer-simple/cer-simple/config.txt         |  1 +
 .../cer-simple/cer-simple/test-A/expected.tsv |  2 +
 12 files changed, 121 insertions(+), 6 deletions(-)
 create mode 100644 test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv
 create mode 100644 test/cer-mean-simple/cer-mean-simple/config.txt
 create mode 100644 test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv
 create mode 100644 test/cer-simple/cer-simple-solution/test-A/out.tsv
 create mode 100644 test/cer-simple/cer-simple/config.txt
 create mode 100644 test/cer-simple/cer-simple/test-A/expected.tsv

diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index a9b0f16..e003f72 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -158,6 +158,7 @@ isPreprocessable Spearman = False
 isPreprocessable BLEU = True
 isPreprocessable GLEU = True
 isPreprocessable WER = True
+isPreprocessable CER = True
 isPreprocessable Accuracy = True
 isPreprocessable ClippEU = False
 isPreprocessable (FMeasure _) = False
@@ -691,7 +692,19 @@ gevalCoreOnSources (Mean WER)
         intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
         intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts

-gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1 and WER for the time being"
+gevalCoreOnSources (Mean CER)
+  = gevalCoreWithoutInputOnItemTargets (Right . getString)
+                                       (Right . getString)
+                                       ((uncurry (/.)) . (uncurry werStep))
+                                       averageC
+                                       id
+                                       noGraph
+  where
+    -- repeated as below, as it will be refactored into dependent types soon anyway
+    getString (RawItemTarget t) = unpack t
+    getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
+
+gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"

 -- only MultiLabel-F1 handled for JSONs for the time being...
 gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
@@ -925,6 +938,11 @@ continueGEvalCalculations SAWER WER = defineContinuation werAgg werFinal noGraph
         werFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
         werFinal (errors, ref) = errors /. ref

+continueGEvalCalculations SACER CER = defineContinuation cerAgg cerFinal noGraph
+  where cerAgg = CC.foldl cerFuse (0, 0)
+        cerFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
+        cerFinal (errors, ref) = errors /. ref
+
 continueGEvalCalculations SAAccuracy Accuracy = defineContinuation averageC id noGraph

 continueGEvalCalculations SAFMeasure (FMeasure beta) = defineContinuation countAgg (fMeasureOnCounts beta) noGraph
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index f9e7dc6..5e283f9 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -105,6 +105,15 @@ Directory structure
 * `${testName}/in.tsv` — Finnish input data for the test set
 * `${testName}/expected.tsv` — Māori reference translation for the test set
 |]
+readmeMDContents WER testName = readmeMDContents BLEU testName
+readmeMDContents CER testName = [i|
+GEval simple OCR challenge
+==========================
+
+Do OCR.
+
+This is a sample fake challenge for Gonito framework. Replace it with
+the description of your challenge.|] ++ (commonReadmeMDContents testName)

 readmeMDContents Accuracy testName = [i|
 GEval sample classification challenge
@@ -417,7 +426,8 @@ Directory structure
 * `README.md` — this file
 * `config.txt` — configuration file
 * `train/` — directory with training data
-* `train/train.tsv` — sample train set
+* `train/in.tsv` — input data for the train set
+* `train/expected.tsv` — expected (reference) data for the train set
 * `dev-0/` — directory with dev (test) data
 * `dev-0/in.tsv` — input data for the dev set
 * `dev-0/expected.tsv` — expected (reference) data for the dev set
@@ -469,6 +479,11 @@ trainContents BLEU = [hereLit|alussa loi jumala taivaan ja maan he mea hanga na
 ja maa oli autio ja tyhjä , ja pimeys oli syvyyden päällä a kahore he ahua o te whenua , i takoto kau ; he pouri ano a runga i te mata o te hohonu
 ja jumalan henki liikkui vetten päällä na ka whakapaho te wairua o te atua i runga i te kare o nga wai
 |]
+trainContents WER = trainContents BLEU
+trainContents CER = [hereLit|Hannibal ad portas train1.pdf
+equo ne credite train2.pdf
+errare humanum est train3.pdf
+|]
 trainContents Accuracy = [hereLit|Y 10 none yes
 N -2 strong no
@@ -568,6 +583,10 @@ devInContents GLEU = devInContents BLEU
 devInContents BLEU = [hereLit|ja jumala sanoi : " tulkoon valkeus " , ja valkeus tuli
 ja jumala näki , että valkeus oli hyvä ; ja jumala erotti valkeuden pimeydestä
 |]
+devInContents WER = devInContents BLEU
+devInContents CER = [hereLit|dev1.pdf
+dev2.pdf
+|]
 devInContents Accuracy = [hereLit|-8 none no
 1 mild no
 |]
@@ -636,6 +655,10 @@ devExpectedContents GLEU = devExpectedContents BLEU
 devExpectedContents BLEU = [hereLit|a ka ki te atua , kia marama : na ka marama
 a ka kite te atua i te marama , he pai : a ka wehea e te atua te marama i te pouri
 |]
+devExpectedContents WER = devExpectedContents BLEU
+devExpectedContents CER = [hereLit|et facta est lux
+Et tu, Brute?
+|]
 devExpectedContents Accuracy = [hereLit|N
 Y
 |]
@@ -702,11 +725,15 @@ devExpectedContents _ = [hereLit|0.82

 testInContents :: Metric -> String
 testInContents (Mean metric) = testInContents metric
-testInContents GLEU = [hereLit|Alice has a black
+testInContents GLEU = [hereLit|Alicella on musta kissa.
 |]
 testInContents BLEU = [hereLit|ja jumala kutsui valkeuden päiväksi , ja pimeyden hän kutsui yöksi
 ja tuli ehtoo , ja tuli aamu , ensimmäinen päivä
 |]
+testInContents WER = testInContents BLEU
+testInContents CER = [hereLit|test1.pdf
+test2.pdf
+|]
 testInContents Accuracy = [hereLit|2 mild yes
 -5 mild no
 |]
@@ -776,6 +803,10 @@ testExpectedContents (Mean metric) = testExpectedContents metric
 testExpectedContents BLEU = [hereLit|na ka huaina e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
 a ko te ahiahi , ko te ata , he ra kotahi
 |]
+testExpectedContents CER = [hereLit|esse est percipi
+tabula rasa
+|]
+testExpectedContents WER = testExpectedContents BLEU
 testExpectedContents Accuracy = [hereLit|N
 Y
 |]
@@ -848,6 +879,8 @@ inHeaderContents :: Metric -> Maybe [String]
 inHeaderContents (Mean metric) = inHeaderContents metric
 inHeaderContents GLEU = Nothing
 inHeaderContents BLEU = Nothing
+inHeaderContents WER = Nothing
+inHeaderContents CER = Just ["Filename"]
 inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
 inHeaderContents (FMeasure _) = Just ["seismic",
                                       "seismoacoustic",
@@ -894,6 +927,8 @@ outHeaderContents :: Metric -> Maybe [String]
 outHeaderContents (Mean metric) = outHeaderContents metric
 outHeaderContents BLEU = Nothing
 outHeaderContents GLEU = Nothing
+outHeaderContents WER = Nothing
+outHeaderContents CER = Just ["OCRedText"]
 outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
 outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
 outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index d30f214..77ab327 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -25,7 +25,7 @@ import Data.Attoparsec.Text (parseOnly)
 -- the evaluation procedures are defined in GEval.Core

 -- | evaluation metric
-data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
+data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accuracy | ClippEU
               | FMeasure Double | MacroFMeasure Double | NMI
               | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
               | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
@@ -48,6 +48,7 @@ instance Show Metric where
   show BLEU = "BLEU"
   show GLEU = "GLEU"
   show WER = "WER"
+  show CER = "CER"
   show Accuracy = "Accuracy"
   show ClippEU = "ClippEU"
   show (FMeasure beta) = "F" ++ (show beta)
@@ -119,6 +120,7 @@ instance Read Metric where
   readsPrec _ ('B':'L':'E':'U':theRest) = [(BLEU, theRest)]
   readsPrec _ ('G':'L':'E':'U':theRest) = [(GLEU, theRest)]
   readsPrec _ ('W':'E':'R':theRest) = [(WER, theRest)]
+  readsPrec _ ('C':'E':'R':theRest) = [(CER, theRest)]
   readsPrec _ ('A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(Accuracy, theRest)]
   readsPrec _ ('C':'l':'i':'p':'p':'E':'U':theRest) = [(ClippEU, theRest)]
   readsPrec _ ('N':'M':'I':theRest) = [(NMI, theRest)]
@@ -178,6 +180,7 @@ getMetricOrdering Spearman = TheHigherTheBetter
 getMetricOrdering BLEU = TheHigherTheBetter
 getMetricOrdering GLEU = TheHigherTheBetter
 getMetricOrdering WER = TheLowerTheBetter
+getMetricOrdering CER = TheLowerTheBetter
 getMetricOrdering Accuracy = TheHigherTheBetter
 getMetricOrdering ClippEU = TheHigherTheBetter
 getMetricOrdering (FMeasure _) = TheHigherTheBetter
diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs
index a743e88..c0c4c98 100644
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@@ -47,7 +47,7 @@ import GEval.MatchingSpecification
 -- | Helper type so that singleton can be used.
 -- | (The problem is that some metrics are parametrized by Double
 -- | Word32 and this is not handled by the singleton libary.)
-singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | AAccuracy | AClippEU
+singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | ACER | AAccuracy | AClippEU
                            | AFMeasure | AMacroFMeasure | ANMI | ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
                            | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE
                            | AMultiLabelFMeasure MatchingSpecification
@@ -66,6 +66,7 @@ toHelper Spearman = ASpearman
 toHelper BLEU = ABLEU
 toHelper GLEU = AGLEU
 toHelper WER = AWER
+toHelper CER = ACER
 toHelper Accuracy = AAccuracy
 toHelper ClippEU = AClippEU
 toHelper (FMeasure _) = AFMeasure
@@ -104,6 +105,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
   ParsedExpectedType ABLEU = [[String]]
   ParsedExpectedType AGLEU = [[String]]
   ParsedExpectedType AWER = [String]
+  ParsedExpectedType ACER = String
   ParsedExpectedType AAccuracy = Text
   ParsedExpectedType AClippEU = [ClippingSpec]
   ParsedExpectedType AFMeasure = Bool
@@ -138,6 +140,7 @@ expectedParser SASpearman = doubleParser
 expectedParser SABLEU = alternativeSentencesParser
 expectedParser SAGLEU = alternativeSentencesParser
 expectedParser SAWER = intoStringWords
+expectedParser SACER = Right . unpack
 expectedParser SAAccuracy = onlyStrip
 expectedParser SAClippEU = controlledParse lineClippingSpecsParser
 expectedParser SAFMeasure = zeroOneParser
@@ -185,6 +188,7 @@ outputParser SASpearman = expectedParser SASpearman
 outputParser SABLEU = Right . Prelude.words . unpack
 outputParser SAGLEU = Right . Prelude.words . unpack
 outputParser SAWER = expectedParser SAWER
+outputParser SACER = expectedParser SACER
 outputParser SAAccuracy = expectedParser SAAccuracy
 outputParser SAClippEU = controlledParse lineClippingsParser
 outputParser SAFMeasure = probToZeroOneParser
@@ -236,6 +240,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
   ItemIntermediateRepresentationType ALikelihoodHashed = (Text, Text)
   ItemIntermediateRepresentationType ACharMatch = (Text, Text)
   ItemIntermediateRepresentationType AWER = (Int, Int)
+  ItemIntermediateRepresentationType ACER = (Int, Int)
   ItemIntermediateRepresentationType t = Double

 itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t
@@ -246,6 +251,8 @@ itemStep SASpearman = id
 itemStep SABLEU = uncurry bleuStep
 itemStep SAGLEU = uncurry gleuStep
 itemStep SAWER = uncurry werStep
+-- strings are character lists, so we could re-use werStep
+itemStep SACER = uncurry werStep
 itemStep SAAccuracy = hitOrMiss
 itemStep SAClippEU = clippEUMatchStep
 itemStep SAFMeasure = getCount
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index d5c3516..c4b7b73 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -58,6 +58,7 @@ listOfAvailableMetrics = [RMSE,
                           BLEU,
                           GLEU,
                           WER,
+                          CER,
                           NMI,
                           ClippEU,
                           LogLossHashed defaultLogLossHashedSize,
@@ -78,6 +79,7 @@ listOfAvailableMetrics = [RMSE,
                           CharMatch]

 extraInfo :: EvaluationScheme -> Maybe String
+extraInfo (EvaluationScheme CER []) = Just "Character-Error Rate"
 extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
 extraInfo (EvaluationScheme BLEU [LowerCasing,
                                   RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
@@ -97,6 +99,8 @@ isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
 isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
 isMetricDescribed GLEU = True
+isMetricDescribed WER = True
+isMetricDescribed CER = True
 isMetricDescribed SegmentAccuracy = True
 isMetricDescribed _ = False

@@ -138,6 +142,17 @@ metric on a corpus level but does not have its drawbacks for our per
 sentence reward objective.
 see: https://arxiv.org/pdf/1609.08144.pdf
 |]
+getMetricDescription WER =
+  [i|WER (Word-Error Rate) is the number of word-level mistakes divided
+by the number of words in the expected output. Possible mistakes are
+deletions, insertions and substitutions — as in the Levenshtein distance.
+|]
+getMetricDescription CER =
+  [i|CER (Character-Error Rate) is the number of character-level mistakes divided
+by the total length of the expected output. Possible mistakes are
+deletions, insertions and substitutions — as in the Levenshtein distance.
+|]
+
 getMetricDescription SegmentAccuracy =
   [i|Accuracy counted for segments, i.e. labels with positions.
 The percentage of labels in the ground truth retrieved in the actual output is returned.
@@ -157,6 +172,12 @@ first-name/3:0.9
 |]
 outContents GLEU = [hereLit|Alice has a black
 |]
+outContents WER = [hereLit|na ka huainaua e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
+a ko te ahiahi , ko ata , he ra ko kotahi
+|]
+outContents CER = [hereLit|esse esi perctp
+tabula rasai
+|]
 outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
 N:1-4 V:6-7 A:9-13
 |]
@@ -178,6 +199,10 @@ expectedScore (EvaluationScheme GLEU [])
   = 0.7142857142857143
 expectedScore (EvaluationScheme SegmentAccuracy [])
   = 0.875
+expectedScore (EvaluationScheme WER [])
+  = 0.08571
+expectedScore (EvaluationScheme CER [])
+  = 0.14814

 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -226,7 +251,7 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
 formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label
 probability can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
 |]
-formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words.
+formatDescription GLEU = [hereLit|In each line there is a space-separated sequence of words.
 |]
 formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas is a list of 1-based
 indexes or spans separated by commas (spans are inclusive
@@ -235,6 +260,9 @@ label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that
 no overlapping segments can be returned (evaluation will fail in such
 a case).
 |]
+formatDescription WER = formatDescription GLEU
+formatDescription CER = [hereLit|Any text; whitespace and punctuation marks are also considered.
+|]

 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@@ -257,6 +285,14 @@ Now we have to calculate precision and recall:
 scoreExplanation (EvaluationScheme SegmentAccuracy [])
   = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correcly
 (accuracy is 3/4=0.75). The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
+scoreExplanation (EvaluationScheme WER [])
+  = Just [hereLit|The total length of expected output (in words) is 35. There are 3 errors
+(1 word substituted, 1 inserted, 1 deleted) in the actual output. Hence,
+WER = (1+1+1) / 35 = 3 / 35 = 0.08571.|]
+scoreExplanation (EvaluationScheme CER [])
+  = Just [hereLit|The total length of expected output (in characters) is 27. There are 4 errors
+(2 characters substituted, 1 deleted, 1 inserted) in the actual output. Hence,
+CER = (2+1+1) / 27 = 4 / 27 = 0.14814.|]

 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
diff --git a/test/Spec.hs b/test/Spec.hs
index f7d81f6..23f0ba1 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -128,6 +128,12 @@ main = hspec $ do
   describe "WER" $ do
     it "simple example" $
       runGEvalTest "wer-simple" `shouldReturnAlmost` 0.5555555555
+  describe "CER" $ do
+    it "simple example" $
+      runGEvalTest "cer-simple" `shouldReturnAlmost` 0.28947368421
+  describe "CER" $ do
+    it "simple example (Mean/CER)" $
+      runGEvalTest "cer-mean-simple" `shouldReturnAlmost` 0.277777777777778
   describe "Accuracy" $ do
     it "simple example" $
       runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
diff --git a/test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv b/test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..22a8f53
--- /dev/null
+++ b/test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv
@@ -0,0 +1,2 @@
+To be or mot to be
+Thas is the
diff --git a/test/cer-mean-simple/cer-mean-simple/config.txt b/test/cer-mean-simple/cer-mean-simple/config.txt
new file mode 100644
index 0000000..5b4aac4
--- /dev/null
+++ b/test/cer-mean-simple/cer-mean-simple/config.txt
@@ -0,0 +1 @@
+--metric Mean/CER
diff --git a/test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv b/test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv
new file mode 100644
index 0000000..a62528a
--- /dev/null
+++ b/test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv
@@ -0,0 +1,2 @@
+To be or not to be
+That is the question
diff --git a/test/cer-simple/cer-simple-solution/test-A/out.tsv b/test/cer-simple/cer-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..22a8f53
--- /dev/null
+++ b/test/cer-simple/cer-simple-solution/test-A/out.tsv
@@ -0,0 +1,2 @@
+To be or mot to be
+Thas is the
diff --git a/test/cer-simple/cer-simple/config.txt b/test/cer-simple/cer-simple/config.txt
new file mode 100644
index 0000000..3b10195
--- /dev/null
+++ b/test/cer-simple/cer-simple/config.txt
@@ -0,0 +1 @@
+--metric CER
diff --git a/test/cer-simple/cer-simple/test-A/expected.tsv b/test/cer-simple/cer-simple/test-A/expected.tsv
new file mode 100644
index 0000000..a62528a
--- /dev/null
+++ b/test/cer-simple/cer-simple/test-A/expected.tsv
@@ -0,0 +1,2 @@
+To be or not to be
+That is the question
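Note appended for reference (not part of the patch itself): as the comment added to MetricsMechanics.hs says, a Haskell String is just a list of characters, so werStep can be reused unchanged for CER; the corpus-level CER continuation aggregates (errors, referenceLength) pairs and divides only at the end, while Mean/CER averages the per-item quotients. The standalone sketch below follows that reading and reproduces the values asserted in test/Spec.hs. It is illustrative only: the helpers levenshtein, cerCorpus and cerMean are invented for this note and are not GEval functions.

-- Illustrative sketch only; these helpers do not exist in GEval.
import Data.List (foldl')

-- Classic one-row dynamic-programming Levenshtein distance.  Because String
-- is [Char], the same edit-distance step that counts word-level errors for
-- WER counts character-level errors when applied directly to the raw text.
levenshtein :: Eq a => [a] -> [a] -> Int
levenshtein xs ys = last (foldl' step [0 .. length xs] ys)
  where
    step row@(d:ds) y = scanl next (d + 1) (zip3 xs row ds)
      where next acc (x, diag, above) =
              minimum [acc + 1, above + 1, diag + if x == y then 0 else 1]
    step [] _ = []

-- Corpus-level CER (what `--metric CER` reports here): total number of edits
-- divided by the total length of the expected output.
cerCorpus :: [String] -> [String] -> Double
cerCorpus expected output =
  fromIntegral (sum (zipWith levenshtein expected output))
    / fromIntegral (sum (map length expected))

-- Mean/CER: the per-item rates are computed first and then averaged.
cerMean :: [String] -> [String] -> Double
cerMean expected output =
  sum (zipWith itemCer expected output) / fromIntegral (length expected)
  where itemCer e o = fromIntegral (levenshtein e o) / fromIntegral (length e)

main :: IO ()
main = do
  let expected = ["To be or not to be", "That is the question"]
      output   = ["To be or mot to be", "Thas is the"]
  print (cerCorpus expected output)  -- 11 / 38          ~ 0.28947368421    (cer-simple)
  print (cerMean   expected output)  -- (1/18 + 10/20)/2 ~ 0.277777777778   (cer-mean-simple)

The same arithmetic shows why the two fixtures expect different scores: the corpus-level variant weights each item by the length of its reference, and since the badly recognized second line is also the longer one, the pooled rate (11/38) comes out slightly higher than the plain per-line average.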