From 819fbecedc6f744a1a08f21c11067191837f87a2 Mon Sep 17 00:00:00 2001
From: Filip Gralinski
Date: Sat, 17 Oct 2020 16:55:40 +0200
Subject: [PATCH] Add CER metric (Character-Error Rate)

---
 src/GEval/Core.hs                             | 20 +++++++++-
 src/GEval/CreateChallenge.hs                  | 39 ++++++++++++++++++-
 src/GEval/Metric.hs                           |  5 ++-
 src/GEval/MetricsMechanics.hs                 |  9 ++++-
 src/GEval/MetricsMeta.hs                      | 38 +++++++++++++++++-
 test/Spec.hs                                  |  6 +++
 .../cer-mean-simple-solution/test-A/out.tsv   |  2 +
 .../cer-mean-simple/config.txt                |  1 +
 .../cer-mean-simple/test-A/expected.tsv       |  2 +
 .../cer-simple-solution/test-A/out.tsv        |  2 +
 test/cer-simple/cer-simple/config.txt         |  1 +
 .../cer-simple/cer-simple/test-A/expected.tsv |  2 +
 12 files changed, 121 insertions(+), 6 deletions(-)
 create mode 100644 test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv
 create mode 100644 test/cer-mean-simple/cer-mean-simple/config.txt
 create mode 100644 test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv
 create mode 100644 test/cer-simple/cer-simple-solution/test-A/out.tsv
 create mode 100644 test/cer-simple/cer-simple/config.txt
 create mode 100644 test/cer-simple/cer-simple/test-A/expected.tsv

diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index a9b0f16..e003f72 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -158,6 +158,7 @@ isPreprocessable Spearman = False
 isPreprocessable BLEU = True
 isPreprocessable GLEU = True
 isPreprocessable WER = True
+isPreprocessable CER = True
 isPreprocessable Accuracy = True
 isPreprocessable ClippEU = False
 isPreprocessable (FMeasure _) = False
@@ -691,7 +692,19 @@ gevalCoreOnSources (Mean WER)
         intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
         intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts

-gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1 and WER for the time being"
+gevalCoreOnSources (Mean CER)
+  = gevalCoreWithoutInputOnItemTargets (Right . getString)
+                                       (Right . getString)
+                                       ((uncurry (/.)) . (uncurry werStep))
+                                       averageC
+                                       id
+                                       noGraph
+  where
+    -- repeated as below, as it will be refactored into dependent types soon anyway
+    getString (RawItemTarget t) = unpack t
+    getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
+
+gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"

 -- only MultiLabel-F1 handled for JSONs for the time being...
 gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
@@ -925,6 +938,11 @@ continueGEvalCalculations SAWER WER = defineContinuation werAgg werFinal noGraph
         werFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
         werFinal (errors, ref) = errors /. ref

+continueGEvalCalculations SACER CER = defineContinuation cerAgg cerFinal noGraph
+  where cerAgg = CC.foldl cerFuse (0, 0)
+        cerFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
+        cerFinal (errors, ref) = errors /. ref
+
 continueGEvalCalculations SAAccuracy Accuracy = defineContinuation averageC id noGraph

 continueGEvalCalculations SAFMeasure (FMeasure beta) = defineContinuation countAgg (fMeasureOnCounts beta) noGraph
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index f9e7dc6..5e283f9 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -105,6 +105,15 @@ Directory structure
 * `${testName}/in.tsv` — Finnish input data for the test set
 * `${testName}/expected.tsv` — Māori reference translation for the test set
 |]
+readmeMDContents WER testName = readmeMDContents BLEU testName
+readmeMDContents CER testName = [i|
+GEval simple OCR challenge
+==========================
+
+Do OCR.
+
+This is a sample fake challenge for Gonito framework. Replace it with
+the description of your challenge.|] ++ (commonReadmeMDContents testName)

 readmeMDContents Accuracy testName = [i|
 GEval sample classification challenge
@@ -417,7 +426,8 @@ Directory structure
 * `README.md` — this file
 * `config.txt` — configuration file
 * `train/` — directory with training data
-* `train/train.tsv` — sample train set
+* `train/in.tsv` — input data for the train set
+* `train/expected.tsv` — expected (reference) data for the train set
 * `dev-0/` — directory with dev (test) data
 * `dev-0/in.tsv` — input data for the dev set
 * `dev-0/expected.tsv` — expected (reference) data for the dev set
@@ -469,6 +479,11 @@ trainContents BLEU = [hereLit|alussa loi jumala taivaan ja maan he mea hanga na
 ja maa oli autio ja tyhjä , ja pimeys oli syvyyden päällä a kahore he ahua o te whenua , i takoto kau ; he pouri ano a runga i te mata o te hohonu
 ja jumalan henki liikkui vetten päällä na ka whakapaho te wairua o te atua i runga i te kare o nga wai
 |]
+trainContents WER = trainContents BLEU
+trainContents CER = [hereLit|Hannibal ad portas train1.pdf
+equo ne credite train2.pdf
+errare humanum est train3.pdf
+|]
 trainContents Accuracy = [hereLit|Y 10 none yes
 N -2 strong no
@@ -568,6 +583,10 @@ devInContents GLEU = devInContents BLEU
 devInContents BLEU = [hereLit|ja jumala sanoi : " tulkoon valkeus " , ja valkeus tuli
 ja jumala näki , että valkeus oli hyvä ; ja jumala erotti valkeuden pimeydestä
 |]
+devInContents WER = devInContents BLEU
+devInContents CER = [hereLit|dev1.pdf
+dev2.pdf
+|]
 devInContents Accuracy = [hereLit|-8 none no
 1 mild no
 |]
@@ -636,6 +655,10 @@ devExpectedContents GLEU = devExpectedContents BLEU
 devExpectedContents BLEU = [hereLit|a ka ki te atua , kia marama : na ka marama
 a ka kite te atua i te marama , he pai : a ka wehea e te atua te marama i te pouri
 |]
+devExpectedContents WER = devExpectedContents BLEU
+devExpectedContents CER = [hereLit|et facta est lux
+Et tu, Brute?
+|]
 devExpectedContents Accuracy = [hereLit|N
 Y
 |]
@@ -702,11 +725,15 @@ devExpectedContents _ = [hereLit|0.82

 testInContents :: Metric -> String
 testInContents (Mean metric) = testInContents metric
-testInContents GLEU = [hereLit|Alice has a black
+testInContents GLEU = [hereLit|Alicella on musta kissa.
 |]
 testInContents BLEU = [hereLit|ja jumala kutsui valkeuden päiväksi , ja pimeyden hän kutsui yöksi
 ja tuli ehtoo , ja tuli aamu , ensimmäinen päivä
 |]
+testInContents WER = testInContents BLEU
+testInContents CER = [hereLit|test1.pdf
+test2.pdf
+|]
 testInContents Accuracy = [hereLit|2 mild yes
 -5 mild no
 |]
@@ -776,6 +803,10 @@ testExpectedContents (Mean metric) = testExpectedContents metric
 testExpectedContents BLEU = [hereLit|na ka huaina e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
 a ko te ahiahi , ko te ata , he ra kotahi
 |]
+testExpectedContents CER = [hereLit|esse est percipi
+tabula rasa
+|]
+testExpectedContents WER = testExpectedContents BLEU
 testExpectedContents Accuracy = [hereLit|N
 Y
 |]
@@ -848,6 +879,8 @@ inHeaderContents :: Metric -> Maybe [String]
 inHeaderContents (Mean metric) = inHeaderContents metric
 inHeaderContents GLEU = Nothing
 inHeaderContents BLEU = Nothing
+inHeaderContents WER = Nothing
+inHeaderContents CER = Just ["Filename"]
 inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
 inHeaderContents (FMeasure _) = Just ["seismic",
                                       "seismoacoustic",
@@ -894,6 +927,8 @@ outHeaderContents :: Metric -> Maybe [String]
 outHeaderContents (Mean metric) = outHeaderContents metric
 outHeaderContents BLEU = Nothing
 outHeaderContents GLEU = Nothing
+outHeaderContents WER = Nothing
+outHeaderContents CER = Just ["OCRedText"]
 outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
 outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
 outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index d30f214..77ab327 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -25,7 +25,7 @@ import Data.Attoparsec.Text (parseOnly)
 -- the evaluation procedures are defined in GEval.Core

 -- | evaluation metric
-data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
+data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accuracy | ClippEU
               | FMeasure Double | MacroFMeasure Double | NMI
               | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
               | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
@@ -48,6 +48,7 @@ instance Show Metric where
   show BLEU = "BLEU"
   show GLEU = "GLEU"
   show WER = "WER"
+  show CER = "CER"
   show Accuracy = "Accuracy"
   show ClippEU = "ClippEU"
   show (FMeasure beta) = "F" ++ (show beta)
@@ -119,6 +120,7 @@ instance Read Metric where
   readsPrec _ ('B':'L':'E':'U':theRest) = [(BLEU, theRest)]
   readsPrec _ ('G':'L':'E':'U':theRest) = [(GLEU, theRest)]
   readsPrec _ ('W':'E':'R':theRest) = [(WER, theRest)]
+  readsPrec _ ('C':'E':'R':theRest) = [(CER, theRest)]
   readsPrec _ ('A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(Accuracy, theRest)]
   readsPrec _ ('C':'l':'i':'p':'p':'E':'U':theRest) = [(ClippEU, theRest)]
   readsPrec _ ('N':'M':'I':theRest) = [(NMI, theRest)]
@@ -178,6 +180,7 @@ getMetricOrdering Spearman = TheHigherTheBetter
 getMetricOrdering BLEU = TheHigherTheBetter
 getMetricOrdering GLEU = TheHigherTheBetter
 getMetricOrdering WER = TheLowerTheBetter
+getMetricOrdering CER = TheLowerTheBetter
 getMetricOrdering Accuracy = TheHigherTheBetter
 getMetricOrdering ClippEU = TheHigherTheBetter
 getMetricOrdering (FMeasure _) = TheHigherTheBetter
diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs
index a743e88..c0c4c98 100644
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@@ -47,7 +47,7 @@ import GEval.MatchingSpecification
 -- | Helper type so that singleton can be used.
 -- | (The problem is that some metrics are parametrized by Double
 -- | Word32 and this is not handled by the singleton libary.)
-singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | AAccuracy | AClippEU
+singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | ACER | AAccuracy | AClippEU
                            | AFMeasure | AMacroFMeasure | ANMI | ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
                            | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE
                            | AMultiLabelFMeasure MatchingSpecification
@@ -66,6 +66,7 @@ toHelper Spearman = ASpearman
 toHelper BLEU = ABLEU
 toHelper GLEU = AGLEU
 toHelper WER = AWER
+toHelper CER = ACER
 toHelper Accuracy = AAccuracy
 toHelper ClippEU = AClippEU
 toHelper (FMeasure _) = AFMeasure
@@ -104,6 +105,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
   ParsedExpectedType ABLEU = [[String]]
   ParsedExpectedType AGLEU = [[String]]
   ParsedExpectedType AWER = [String]
+  ParsedExpectedType ACER = String
   ParsedExpectedType AAccuracy = Text
   ParsedExpectedType AClippEU = [ClippingSpec]
   ParsedExpectedType AFMeasure = Bool
@@ -138,6 +140,7 @@ expectedParser SASpearman = doubleParser
 expectedParser SABLEU = alternativeSentencesParser
 expectedParser SAGLEU = alternativeSentencesParser
 expectedParser SAWER = intoStringWords
+expectedParser SACER = Right . unpack
 expectedParser SAAccuracy = onlyStrip
 expectedParser SAClippEU = controlledParse lineClippingSpecsParser
 expectedParser SAFMeasure = zeroOneParser
@@ -185,6 +188,7 @@ outputParser SASpearman = expectedParser SASpearman
 outputParser SABLEU = Right . Prelude.words . unpack
 outputParser SAGLEU = Right . Prelude.words . unpack
 outputParser SAWER = expectedParser SAWER
+outputParser SACER = expectedParser SACER
 outputParser SAAccuracy = expectedParser SAAccuracy
 outputParser SAClippEU = controlledParse lineClippingsParser
 outputParser SAFMeasure = probToZeroOneParser
@@ -236,6 +240,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
   ItemIntermediateRepresentationType ALikelihoodHashed = (Text, Text)
   ItemIntermediateRepresentationType ACharMatch = (Text, Text)
   ItemIntermediateRepresentationType AWER = (Int, Int)
+  ItemIntermediateRepresentationType ACER = (Int, Int)
   ItemIntermediateRepresentationType t = Double

 itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t
@@ -246,6 +251,8 @@ itemStep SASpearman = id
 itemStep SABLEU = uncurry bleuStep
 itemStep SAGLEU = uncurry gleuStep
 itemStep SAWER = uncurry werStep
+-- strings are character lists, so we could re-use werStep
+itemStep SACER = uncurry werStep
 itemStep SAAccuracy = hitOrMiss
 itemStep SAClippEU = clippEUMatchStep
 itemStep SAFMeasure = getCount
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index d5c3516..c4b7b73 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -58,6 +58,7 @@ listOfAvailableMetrics = [RMSE,
                           BLEU,
                           GLEU,
                           WER,
+                          CER,
                           NMI,
                           ClippEU,
                           LogLossHashed defaultLogLossHashedSize,
@@ -78,6 +79,7 @@ listOfAvailableMetrics = [RMSE,
                           CharMatch]

 extraInfo :: EvaluationScheme -> Maybe String
+extraInfo (EvaluationScheme CER []) = Just "Character-Error Rate"
 extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
 extraInfo (EvaluationScheme BLEU [LowerCasing,
                                   RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
@@ -97,6 +99,8 @@ isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
 isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
 isMetricDescribed GLEU = True
+isMetricDescribed WER = True
+isMetricDescribed CER = True
 isMetricDescribed SegmentAccuracy = True
 isMetricDescribed _ = False

@@ -138,6 +142,17 @@ metric on a corpus level but does not have its drawbacks for our per
 sentence reward objective.
 see: https://arxiv.org/pdf/1609.08144.pdf
 |]
+getMetricDescription WER =
+  [i|WER (Word-Error Rate) is the number of word-level mistakes divided
+by the number of words in the expected output. Possible mistakes are
+deletions, insertions and substitutions — as in the Levenshtein distance.
+|]
+getMetricDescription CER =
+  [i|CER (Character-Error Rate) is the number of character-level mistakes divided
+by the total length of the expected output. Possible mistakes are
+deletions, insertions and substitutions — as in the Levenshtein distance.
+|]
+
 getMetricDescription SegmentAccuracy =
   [i|Accuracy counted for segments, i.e. labels with positions.
 The percentage of labels in the ground truth retrieved in the actual output is returned.
@@ -157,6 +172,12 @@ first-name/3:0.9
 |]
 outContents GLEU = [hereLit|Alice has a black
 |]
+outContents WER = [hereLit|na ka huainaua e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
+a ko te ahiahi , ko ata , he ra ko kotahi
+|]
+outContents CER = [hereLit|esse esi perctp
+tabula rasai
+|]
 outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
 N:1-4 V:6-7 A:9-13
 |]
@@ -178,6 +199,10 @@ expectedScore (EvaluationScheme GLEU [])
   = 0.7142857142857143
 expectedScore (EvaluationScheme SegmentAccuracy [])
   = 0.875
+expectedScore (EvaluationScheme WER [])
+  = 0.08571
+expectedScore (EvaluationScheme CER [])
+  = 0.14814

 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -226,7 +251,7 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
 formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label
 probability can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
 |]
-formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words.
+formatDescription GLEU = [hereLit|In each line there is a space-separated sequence of words.
 |]
 formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas is a list of 1-based
 indexes or spans separated by commas (spans are inclusive
@@ -235,6 +260,9 @@ label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that
 no overlapping segments can be returned (evaluation will fail in such
 a case).
 |]
+formatDescription WER = formatDescription GLEU
+formatDescription CER = [hereLit|Any text; whitespace and punctuation marks are also considered.
+|]

 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@@ -257,6 +285,14 @@ Now we have to calculate precision and recall:
 scoreExplanation (EvaluationScheme SegmentAccuracy [])
   = Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correcly
 (accuracy is 3/4=0.75). The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
+scoreExplanation (EvaluationScheme WER [])
+  = Just [hereLit|The total length of expected output (in words) is 35. There are 3 errors
+(1 word substituted, 1 inserted, 1 deleted) in the actual output. Hence,
+WER = (1+1+1) / 35 = 3 / 35 = 0.08571.|]
+scoreExplanation (EvaluationScheme CER [])
+  = Just [hereLit|The total length of expected output (in characters) is 27. There are 4 errors
+(2 characters substituted, 1 deleted, 1 inserted) in the actual output. Hence,
+CER = (2+1+1) / 27 = 4 / 27 = 0.14814.|]

 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
diff --git a/test/Spec.hs b/test/Spec.hs
index f7d81f6..23f0ba1 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -128,6 +128,12 @@ main = hspec $ do
   describe "WER" $ do
     it "simple example" $
       runGEvalTest "wer-simple" `shouldReturnAlmost` 0.5555555555
+  describe "CER" $ do
+    it "simple example" $
+      runGEvalTest "cer-simple" `shouldReturnAlmost` 0.28947368421
+  describe "CER" $ do
+    it "simple example (Mean/CER)" $
+      runGEvalTest "cer-mean-simple" `shouldReturnAlmost` 0.277777777777778
   describe "Accuracy" $ do
     it "simple example" $
       runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
diff --git a/test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv b/test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..22a8f53
--- /dev/null
+++ b/test/cer-mean-simple/cer-mean-simple-solution/test-A/out.tsv
@@ -0,0 +1,2 @@
+To be or mot to be
+Thas is the
diff --git a/test/cer-mean-simple/cer-mean-simple/config.txt b/test/cer-mean-simple/cer-mean-simple/config.txt
new file mode 100644
index 0000000..5b4aac4
--- /dev/null
+++ b/test/cer-mean-simple/cer-mean-simple/config.txt
@@ -0,0 +1 @@
+--metric Mean/CER
diff --git a/test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv b/test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv
new file mode 100644
index 0000000..a62528a
--- /dev/null
+++ b/test/cer-mean-simple/cer-mean-simple/test-A/expected.tsv
@@ -0,0 +1,2 @@
+To be or not to be
+That is the question
diff --git a/test/cer-simple/cer-simple-solution/test-A/out.tsv b/test/cer-simple/cer-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..22a8f53
--- /dev/null
+++ b/test/cer-simple/cer-simple-solution/test-A/out.tsv
@@ -0,0 +1,2 @@
+To be or mot to be
+Thas is the
diff --git a/test/cer-simple/cer-simple/config.txt b/test/cer-simple/cer-simple/config.txt
new file mode 100644
index 0000000..3b10195
--- /dev/null
+++ b/test/cer-simple/cer-simple/config.txt
@@ -0,0 +1 @@
+--metric CER
diff --git a/test/cer-simple/cer-simple/test-A/expected.tsv b/test/cer-simple/cer-simple/test-A/expected.tsv
new file mode 100644
index 0000000..a62528a
--- /dev/null
+++ b/test/cer-simple/cer-simple/test-A/expected.tsv
@@ -0,0 +1,2 @@
+To be or not to be
+That is the question
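Note appended for reference (not part of the patch itself): as the comment added to MetricsMechanics.hs says, a Haskell String is just a list of characters, so werStep can be reused unchanged for CER; the corpus-level CER continuation aggregates (errors, referenceLength) pairs and divides only at the end, while Mean/CER averages the per-item quotients. The standalone sketch below follows that reading and reproduces the values asserted in test/Spec.hs. It is illustrative only: the helpers levenshtein, cerCorpus and cerMean are invented for this note and are not GEval functions.

-- Illustrative sketch only; these helpers do not exist in GEval.
import Data.List (foldl')

-- Classic one-row dynamic-programming Levenshtein distance.  Because String
-- is [Char], the same edit-distance step that counts word-level errors for
-- WER counts character-level errors when applied directly to the raw text.
levenshtein :: Eq a => [a] -> [a] -> Int
levenshtein xs ys = last (foldl' step [0 .. length xs] ys)
  where
    step row@(d:ds) y = scanl next (d + 1) (zip3 xs row ds)
      where next acc (x, diag, above) =
              minimum [acc + 1, above + 1, diag + if x == y then 0 else 1]
    step [] _ = []

-- Corpus-level CER (what `--metric CER` reports here): total number of edits
-- divided by the total length of the expected output.
cerCorpus :: [String] -> [String] -> Double
cerCorpus expected output =
  fromIntegral (sum (zipWith levenshtein expected output))
    / fromIntegral (sum (map length expected))

-- Mean/CER: the per-item rates are computed first and then averaged.
cerMean :: [String] -> [String] -> Double
cerMean expected output =
  sum (zipWith itemCer expected output) / fromIntegral (length expected)
  where itemCer e o = fromIntegral (levenshtein e o) / fromIntegral (length e)

main :: IO ()
main = do
  let expected = ["To be or not to be", "That is the question"]
      output   = ["To be or mot to be", "Thas is the"]
  print (cerCorpus expected output)  -- 11 / 38          ~ 0.28947368421    (cer-simple)
  print (cerMean   expected output)  -- (1/18 + 10/20)/2 ~ 0.277777777778   (cer-mean-simple)

The same arithmetic shows why the two fixtures expect different scores: the corpus-level variant weights each item by the length of its reference, and since the badly recognized second line is also the longer one, the pooled rate (11/38) comes out slightly higher than the plain per-line average.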