Add CER metric

(Character-Error Rate)
Filip Gralinski 2020-10-17 16:55:40 +02:00
parent 51c29aabf6
commit 819fbecedc
12 changed files with 121 additions and 6 deletions


@@ -158,6 +158,7 @@ isPreprocessable Spearman = False
isPreprocessable BLEU = True
isPreprocessable GLEU = True
isPreprocessable WER = True
isPreprocessable CER = True
isPreprocessable Accuracy = True
isPreprocessable ClippEU = False
isPreprocessable (FMeasure _) = False
@@ -691,7 +692,19 @@ gevalCoreOnSources (Mean WER)
intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1 and WER for the time being"
gevalCoreOnSources (Mean CER)
= gevalCoreWithoutInputOnItemTargets (Right . getString)
(Right . getString)
((uncurry (/.)) . (uncurry werStep))
averageC
id
noGraph
where
-- repeated as below, as it will be refactored into dependent types soon anyway
getString (RawItemTarget t) = unpack t
getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"
-- only MultiLabel-F1 handled for JSONs for the time being...
gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
@@ -925,6 +938,11 @@ continueGEvalCalculations SAWER WER = defineContinuation werAgg werFinal noGraph
werFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
werFinal (errors, ref) = errors /. ref
continueGEvalCalculations SACER CER = defineContinuation cerAgg cerFinal noGraph
where cerAgg = CC.foldl cerFuse (0, 0)
cerFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
cerFinal (errors, ref) = errors /. ref
continueGEvalCalculations SAAccuracy Accuracy = defineContinuation averageC id noGraph
continueGEvalCalculations SAFMeasure (FMeasure beta) = defineContinuation countAgg (fMeasureOnCounts beta) noGraph
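
For orientation, the cerAgg/cerFuse/cerFinal continuation above follows the same pattern as WER: each item contributes a pair of (edit errors, reference length), the pairs are summed over the corpus, and the final score is the ratio of the two sums. A minimal self-contained sketch of that aggregation, outside GEval's conduit machinery and with illustrative names only:

import Data.List (foldl')

-- Corpus-level CER from per-item (errors, referenceLength) pairs,
-- mirroring cerFuse (pairwise sum) and cerFinal (final division) above.
aggregateCER :: [(Int, Int)] -> Double
aggregateCER items = fromIntegral errors / fromIntegral refLen
  where
    (errors, refLen) = foldl' fuse (0, 0) items
    fuse (e1, r1) (e2, r2) = (e1 + e2, r1 + r2)

-- aggregateCER [(3, 16), (1, 11)]  ==  4 / 27  ≈  0.1481
-- (the error/length pairs of the sample OCR items documented later in this commit)

Note that the Mean/CER branch above instead divides per item ((uncurry (/.)) . (uncurry werStep)) and then averages with averageC, so on the same data it can yield a different score than plain CER.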


@@ -105,6 +105,15 @@ Directory structure
* `${testName}/in.tsv` Finnish input data for the test set
* `${testName}/expected.tsv` Māori reference translation for the test set
|]
readmeMDContents WER testName = readmeMDContents BLEU testName
readmeMDContents CER testName = [i|
GEval simple OCR challenge
==========================
Do OCR.
This is a sample fake challenge for Gonito framework. Replace it with
the description of your challenge.|] ++ (commonReadmeMDContents testName)
readmeMDContents Accuracy testName = [i|
GEval sample classification challenge
@@ -417,7 +426,8 @@ Directory structure
* `README.md` this file
* `config.txt` configuration file
* `train/` directory with training data
* `train/train.tsv` sample train set
* `train/in.tsv` input data for the train set
* `train/expected.tsv` expected (reference) data for the train set
* `dev-0/` directory with dev (test) data
* `dev-0/in.tsv` input data for the dev set
* `dev-0/expected.tsv` expected (reference) data for the dev set
@@ -469,6 +479,11 @@ trainContents BLEU = [hereLit|alussa loi jumala taivaan ja maan he mea hanga na
ja maa oli autio ja tyhjä , ja pimeys oli syvyyden päällä a kahore he ahua o te whenua , i takoto kau ; he pouri ano a runga i te mata o te hohonu
ja jumalan henki liikkui vetten päällä na ka whakapaho te wairua o te atua i runga i te kare o nga wai
|]
trainContents WER = trainContents BLEU
trainContents CER = [hereLit|Hannibal ad portas train1.pdf
equo ne credite train2.pdf
errare humanum est train3.pdf
|]
trainContents Accuracy = [hereLit|Y 10 none yes
N -2 strong no
@@ -568,6 +583,10 @@ devInContents GLEU = devInContents BLEU
devInContents BLEU = [hereLit|ja jumala sanoi : " tulkoon valkeus " , ja valkeus tuli
ja jumala näki , että valkeus oli hyvä ; ja jumala erotti valkeuden pimeydestä
|]
devInContents WER = devInContents BLEU
devInContents CER = [hereLit|dev1.pdf
dev2.pdf
|]
devInContents Accuracy = [hereLit|-8 none no
1 mild no
|]
@@ -636,6 +655,10 @@ devExpectedContents GLEU = devExpectedContents BLEU
devExpectedContents BLEU = [hereLit|a ka ki te atua , kia marama : na ka marama
a ka kite te atua i te marama , he pai : a ka wehea e te atua te marama i te pouri
|]
devExpectedContents WER = devExpectedContents BLEU
devExpectedContents CER = [hereLit|et facta est lux
Et tu, Brute?
|]
devExpectedContents Accuracy = [hereLit|N
Y
|]
@@ -702,11 +725,15 @@ devExpectedContents _ = [hereLit|0.82
testInContents :: Metric -> String
testInContents (Mean metric) = testInContents metric
testInContents GLEU = [hereLit|Alice has a black
testInContents GLEU = [hereLit|Alicella on musta kissa.
|]
testInContents BLEU = [hereLit|ja jumala kutsui valkeuden päiväksi , ja pimeyden hän kutsui yöksi
ja tuli ehtoo , ja tuli aamu , ensimmäinen päivä
|]
testInContents WER = testInContents BLEU
testInContents CER = [hereLit|test1.pdf
test2.pdf
|]
testInContents Accuracy = [hereLit|2 mild yes
-5 mild no
|]
@@ -776,6 +803,10 @@ testExpectedContents (Mean metric) = testExpectedContents metric
testExpectedContents BLEU = [hereLit|na ka huaina e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
a ko te ahiahi , ko te ata , he ra kotahi
|]
testExpectedContents CER = [hereLit|esse est percipi
tabula rasa
|]
testExpectedContents WER = testExpectedContents BLEU
testExpectedContents Accuracy = [hereLit|N
Y
|]
@@ -848,6 +879,8 @@ inHeaderContents :: Metric -> Maybe [String]
inHeaderContents (Mean metric) = inHeaderContents metric
inHeaderContents GLEU = Nothing
inHeaderContents BLEU = Nothing
inHeaderContents WER = Nothing
inHeaderContents CER = Just ["Filename"]
inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
inHeaderContents (FMeasure _) = Just ["seismic",
"seismoacoustic",
@@ -894,6 +927,8 @@ outHeaderContents :: Metric -> Maybe [String]
outHeaderContents (Mean metric) = outHeaderContents metric
outHeaderContents BLEU = Nothing
outHeaderContents GLEU = Nothing
outHeaderContents WER = Nothing
outHeaderContents CER = Just ["OCRedText"]
outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
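
For reference, the headers above describe the layout of the generated sample CER challenge: in.tsv has a single Filename column (e.g. dev1.pdf), while expected.tsv and a submitted out.tsv carry the transcription under OCRedText (e.g. et facta est lux). The filenames are placeholders in this fake OCR challenge; only the text lines are compared when the metric is evaluated.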


@@ -25,7 +25,7 @@ import Data.Attoparsec.Text (parseOnly)
-- the evaluation procedures are defined in GEval.Core
-- | evaluation metric
data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accuracy | ClippEU
| FMeasure Double | MacroFMeasure Double | NMI
| LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
| BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
@@ -48,6 +48,7 @@ instance Show Metric where
show BLEU = "BLEU"
show GLEU = "GLEU"
show WER = "WER"
show CER = "CER"
show Accuracy = "Accuracy"
show ClippEU = "ClippEU"
show (FMeasure beta) = "F" ++ (show beta)
@@ -119,6 +120,7 @@ instance Read Metric where
readsPrec _ ('B':'L':'E':'U':theRest) = [(BLEU, theRest)]
readsPrec _ ('G':'L':'E':'U':theRest) = [(GLEU, theRest)]
readsPrec _ ('W':'E':'R':theRest) = [(WER, theRest)]
readsPrec _ ('C':'E':'R':theRest) = [(CER, theRest)]
readsPrec _ ('A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(Accuracy, theRest)]
readsPrec _ ('C':'l':'i':'p':'p':'E':'U':theRest) = [(ClippEU, theRest)]
readsPrec _ ('N':'M':'I':theRest) = [(NMI, theRest)]
@@ -178,6 +180,7 @@ getMetricOrdering Spearman = TheHigherTheBetter
getMetricOrdering BLEU = TheHigherTheBetter
getMetricOrdering GLEU = TheHigherTheBetter
getMetricOrdering WER = TheLowerTheBetter
getMetricOrdering CER = TheLowerTheBetter
getMetricOrdering Accuracy = TheHigherTheBetter
getMetricOrdering ClippEU = TheHigherTheBetter
getMetricOrdering (FMeasure _) = TheHigherTheBetter


@@ -47,7 +47,7 @@ import GEval.MatchingSpecification
-- | Helper type so that singleton can be used.
-- | (The problem is that some metrics are parametrized by Double
-- | or Word32 and this is not handled by the singleton library.)
singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | AAccuracy | AClippEU
singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | ACER | AAccuracy | AClippEU
| AFMeasure | AMacroFMeasure | ANMI
| ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
| ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification
@@ -66,6 +66,7 @@ toHelper Spearman = ASpearman
toHelper BLEU = ABLEU
toHelper GLEU = AGLEU
toHelper WER = AWER
toHelper CER = ACER
toHelper Accuracy = AAccuracy
toHelper ClippEU = AClippEU
toHelper (FMeasure _) = AFMeasure
@@ -104,6 +105,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
ParsedExpectedType ABLEU = [[String]]
ParsedExpectedType AGLEU = [[String]]
ParsedExpectedType AWER = [String]
ParsedExpectedType ACER = String
ParsedExpectedType AAccuracy = Text
ParsedExpectedType AClippEU = [ClippingSpec]
ParsedExpectedType AFMeasure = Bool
@@ -138,6 +140,7 @@ expectedParser SASpearman = doubleParser
expectedParser SABLEU = alternativeSentencesParser
expectedParser SAGLEU = alternativeSentencesParser
expectedParser SAWER = intoStringWords
expectedParser SACER = Right . unpack
expectedParser SAAccuracy = onlyStrip
expectedParser SAClippEU = controlledParse lineClippingSpecsParser
expectedParser SAFMeasure = zeroOneParser
@@ -185,6 +188,7 @@ outputParser SASpearman = expectedParser SASpearman
outputParser SABLEU = Right . Prelude.words . unpack
outputParser SAGLEU = Right . Prelude.words . unpack
outputParser SAWER = expectedParser SAWER
outputParser SACER = expectedParser SACER
outputParser SAAccuracy = expectedParser SAAccuracy
outputParser SAClippEU = controlledParse lineClippingsParser
outputParser SAFMeasure = probToZeroOneParser
@@ -236,6 +240,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
ItemIntermediateRepresentationType ALikelihoodHashed = (Text, Text)
ItemIntermediateRepresentationType ACharMatch = (Text, Text)
ItemIntermediateRepresentationType AWER = (Int, Int)
ItemIntermediateRepresentationType ACER = (Int, Int)
ItemIntermediateRepresentationType t = Double
itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t
@@ -246,6 +251,8 @@ itemStep SASpearman = id
itemStep SABLEU = uncurry bleuStep
itemStep SAGLEU = uncurry gleuStep
itemStep SAWER = uncurry werStep
-- strings are character lists, so we could re-use werStep
itemStep SACER = uncurry werStep
itemStep SAAccuracy = hitOrMiss
itemStep SAClippEU = clippEUMatchStep
itemStep SAFMeasure = getCount
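
The comment above ("strings are character lists") is the whole trick: WER items are parsed into word lists, while CER items stay as plain Strings, i.e. [Char], so the same generic edit-distance step over token lists serves both metrics and produces the same (errors, referenceLength) pair. A small sketch of the two shapes (illustrative values only, not GEval code):

-- WER compares lists of word tokens ...
werItem :: ([String], [String])            -- (expected, actual output)
werItem = (words "tabula rasa", words "tabula rasai")

-- ... while CER compares Strings, i.e. lists of character tokens (spaces included).
cerItem :: (String, String)
cerItem = ("tabula rasa", "tabula rasai")

-- Both feed a step of type [a] -> [a] -> (Int, Int); only the token type differs.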


@@ -58,6 +58,7 @@ listOfAvailableMetrics = [RMSE,
BLEU,
GLEU,
WER,
CER,
NMI,
ClippEU,
LogLossHashed defaultLogLossHashedSize,
@@ -78,6 +79,7 @@ listOfAvailableMetrics = [RMSE,
CharMatch]
extraInfo :: EvaluationScheme -> Maybe String
extraInfo (EvaluationScheme CER []) = Just "Character-Error Rate"
extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
extraInfo (EvaluationScheme BLEU [LowerCasing,
RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
@@ -97,6 +99,8 @@ isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed (Soft2DFMeasure _) = True
isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
isMetricDescribed GLEU = True
isMetricDescribed WER = True
isMetricDescribed CER = True
isMetricDescribed SegmentAccuracy = True
isMetricDescribed _ = False
@@ -138,6 +142,17 @@ metric on a corpus level but does not have its drawbacks for our per
sentence reward objective.
see: https://arxiv.org/pdf/1609.08144.pdf
|]
getMetricDescription WER =
[i|WER (Word-Error Rate) is the number of word-level mistakes divided
by the number of words in the expected output. Possible mistakes are
deletions, insertions and substitutions as in the Levenshtein distance.
|]
getMetricDescription CER =
[i|CER (Character-Error Rate) is the number of character-level mistakes divided
by the total length of the expected output. Possible mistakes are
deletions, insertions and substitutions as in the Levenshtein distance.
|]
getMetricDescription SegmentAccuracy =
[i|Accuracy counted for segments, i.e. labels with positions.
The percentage of labels in the ground truth retrieved in the actual output is returned.
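
The WER/CER descriptions above amount to: edit distance over tokens divided by the length of the reference. As a hedged reference sketch (a plain textbook Levenshtein, not GEval's internal werStep), character-level CER could be computed like this:

-- Illustrative only: edit distance (insertions, deletions, substitutions) over any token list.
levenshtein :: Eq a => [a] -> [a] -> Int
levenshtein xs ys = last (foldl nextRow firstRow ys)
  where
    firstRow = [0 .. length xs]
    nextRow row@(d:ds) y = scanl step (d + 1) (zip3 xs row ds)
      where
        step left (x, diag, up) =
          minimum [ left + 1                          -- insertion
                  , up + 1                            -- deletion
                  , diag + if x == y then 0 else 1 ]  -- substitution or match
    nextRow [] _ = []

-- Per-item CER step: Strings are [Char], so the reference length is the character count.
cerStep :: String -> String -> (Int, Int)
cerStep expected got = (levenshtein expected got, length expected)

-- Corpus-level CER: sum the errors and the reference lengths, then divide.
cer :: [(String, String)] -> Double
cer items = fromIntegral (sum errs) / fromIntegral (sum lens)
  where (errs, lens) = unzip (map (uncurry cerStep) items)

-- cer [("esse est percipi", "esse esi perctp"), ("tabula rasa", "tabula rasai")]
--   == 4 / 27 ≈ 0.14814, matching the sample score documented below.

Since levenshtein is polymorphic over token lists, applying it to word lists instead of Strings yields the WER numerator in the same way.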
@@ -157,6 +172,12 @@ first-name/3:0.9
|]
outContents GLEU = [hereLit|Alice has a black
|]
outContents WER = [hereLit|na ka huainaua e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
a ko te ahiahi , ko ata , he ra ko kotahi
|]
outContents CER = [hereLit|esse esi perctp
tabula rasai
|]
outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
N:1-4 V:6-7 A:9-13
|]
@@ -178,6 +199,10 @@ expectedScore (EvaluationScheme GLEU [])
= 0.7142857142857143
expectedScore (EvaluationScheme SegmentAccuracy [])
= 0.875
expectedScore (EvaluationScheme WER [])
= 0.08571
expectedScore (EvaluationScheme CER [])
= 0.14814
helpMetricParameterMetricsList :: String
helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -226,7 +251,7 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
|]
formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words.
formatDescription GLEU = [hereLit|In each line there is a space-separated sequence of words.
|]
formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas is a list of
1-based indexes or spans separated by commas (spans are inclusive
@@ -235,6 +260,9 @@ label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no
overlapping segments can be returned (evaluation will fail in
such a case).
|]
formatDescription WER = formatDescription GLEU
formatDescription CER = [hereLit|Any text; whitespace and punctuation marks are also taken into account.
|]
scoreExplanation :: EvaluationScheme -> Maybe String
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@@ -257,6 +285,14 @@ Now we have to calculate precision and recall:
scoreExplanation (EvaluationScheme SegmentAccuracy [])
= Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correctly (accuracy is 3/4=0.75).
The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
scoreExplanation (EvaluationScheme WER [])
= Just [hereLit|The total length of expected output (in words) is 35. There are 3 errors
(1 word substituted, 1 inserted, 1 deleted) in the actual output. Hence,
WER = (1+1+1) / 35 = 3 / 35 = 0.08571.|]
scoreExplanation (EvaluationScheme CER [])
= Just [hereLit|The total length of expected output (in characters) is 27. There are 4 errors
(2 characters substituted, 1 inserted, 1 deleted) in the actual output. Hence,
CER = (2+1+1) / 27 = 4 / 27 = 0.14814.|]
pasteLines :: String -> String -> String
pasteLines a b = printf "%-35s %s\n" a b


@@ -128,6 +128,12 @@ main = hspec $ do
describe "WER" $ do
it "simple example" $
runGEvalTest "wer-simple" `shouldReturnAlmost` 0.5555555555
describe "CER" $ do
it "simple example" $
runGEvalTest "cer-simple" `shouldReturnAlmost` 0.28947368421
describe "CER" $ do
it "simple example (Mean/CER)" $
runGEvalTest "cer-mean-simple" `shouldReturnAlmost` 0.277777777777778
describe "Accuracy" $ do
it "simple example" $
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
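
The expected values follow from the cer-simple fixtures added below ("To be or mot to be" / "Thas is the" against "To be or not to be" / "That is the question"): the first line has 1 substituted character over 18 reference characters, the second has 1 substitution plus 9 deletions (the missing " question") over 20. Hence:

-- plain CER pools errors and reference lengths over the corpus:
--   (1 + 10) / (18 + 20) = 11 / 38 ≈ 0.28947368421
-- Mean/CER scores each line separately and then averages:
--   (1/18 + 10/20) / 2 = 5/18 ≈ 0.277777777777778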


@@ -0,0 +1,2 @@
To be or mot to be
Thas is the


@@ -0,0 +1 @@
--metric Mean/CER


@@ -0,0 +1,2 @@
To be or not to be
That is the question


@@ -0,0 +1,2 @@
To be or mot to be
Thas is the


@@ -0,0 +1 @@
--metric CER


@@ -0,0 +1,2 @@
To be or not to be
That is the question