Add CER metric

(Character-Error Rate)
Filip Gralinski 2020-10-17 16:55:40 +02:00
parent 51c29aabf6
commit 819fbecedc
12 changed files with 121 additions and 6 deletions


@@ -158,6 +158,7 @@ isPreprocessable Spearman = False
isPreprocessable BLEU = True
isPreprocessable GLEU = True
isPreprocessable WER = True
isPreprocessable CER = True
isPreprocessable Accuracy = True
isPreprocessable ClippEU = False
isPreprocessable (FMeasure _) = False
@@ -691,7 +692,19 @@ gevalCoreOnSources (Mean WER)
intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1 and WER for the time being"
gevalCoreOnSources (Mean CER)
= gevalCoreWithoutInputOnItemTargets (Right . getString)
(Right . getString)
((uncurry (/.)) . (uncurry werStep))
averageC
id
noGraph
where
-- repeated as below, as it will be refactored into dependent types soon anyway
getString (RawItemTarget t) = unpack t
getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"
-- only MultiLabel-F1 handled for JSONs for the time being...
gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
@@ -925,6 +938,11 @@ continueGEvalCalculations SAWER WER = defineContinuation werAgg werFinal noGraph
werFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
werFinal (errors, ref) = errors /. ref
continueGEvalCalculations SACER CER = defineContinuation cerAgg cerFinal noGraph
where cerAgg = CC.foldl cerFuse (0, 0)
cerFuse (a1, a2) (b1, b2) = (a1 + b1, a2 + b2)
cerFinal (errors, ref) = errors /. ref
continueGEvalCalculations SAAccuracy Accuracy = defineContinuation averageC id noGraph
continueGEvalCalculations SAFMeasure (FMeasure beta) = defineContinuation countAgg (fMeasureOnCounts beta) noGraph
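
For orientation, the cerAgg/cerFuse/cerFinal continuation above follows the same pattern as WER: each item contributes a pair of (edit errors, reference length), the pairs are summed over the corpus, and the final score is the ratio of the two sums. A minimal self-contained sketch of that aggregation, outside GEval's conduit machinery and with illustrative names only:

import Data.List (foldl')

-- Corpus-level CER from per-item (errors, referenceLength) pairs,
-- mirroring cerFuse (pairwise sum) and cerFinal (final division) above.
aggregateCER :: [(Int, Int)] -> Double
aggregateCER items = fromIntegral errors / fromIntegral refLen
  where
    (errors, refLen) = foldl' fuse (0, 0) items
    fuse (e1, r1) (e2, r2) = (e1 + e2, r1 + r2)

-- aggregateCER [(3, 16), (1, 11)]  ==  4 / 27  ≈  0.1481
-- (the error/length pairs of the sample OCR items documented later in this commit)

Note that the Mean/CER branch above instead divides per item ((uncurry (/.)) . (uncurry werStep)) and then averages with averageC, so on the same data it can yield a different score than plain CER.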


@@ -105,6 +105,15 @@ Directory structure
* `${testName}/in.tsv` Finnish input data for the test set
* `${testName}/expected.tsv` Māori reference translation for the test set
|]
readmeMDContents WER testName = readmeMDContents BLEU testName
readmeMDContents CER testName = [i|
GEval simple OCR challenge
==========================
Do OCR.
This is a sample fake challenge for Gonito framework. Replace it with
the description of your challenge.|] ++ (commonReadmeMDContents testName)
readmeMDContents Accuracy testName = [i|
GEval sample classification challenge
@@ -417,7 +426,8 @@ Directory structure
* `README.md` this file
* `config.txt` configuration file
* `train/` directory with training data
* `train/train.tsv` sample train set
* `train/in.tsv` input data for the train set
* `train/expected.tsv` expected (reference) data for the train set
* `dev-0/` directory with dev (test) data
* `dev-0/in.tsv` input data for the dev set
* `dev-0/expected.tsv` expected (reference) data for the dev set
@@ -469,6 +479,11 @@ trainContents BLEU = [hereLit|alussa loi jumala taivaan ja maan he mea hanga na
ja maa oli autio ja tyhjä , ja pimeys oli syvyyden päällä a kahore he ahua o te whenua , i takoto kau ; he pouri ano a runga i te mata o te hohonu
ja jumalan henki liikkui vetten päällä na ka whakapaho te wairua o te atua i runga i te kare o nga wai
|]
trainContents WER = trainContents BLEU
trainContents CER = [hereLit|Hannibal ad portas train1.pdf
equo ne credite train2.pdf
errare humanum est train3.pdf
|]
trainContents Accuracy = [hereLit|Y 10 none yes
N -2 strong no
@@ -568,6 +583,10 @@ devInContents GLEU = devInContents BLEU
devInContents BLEU = [hereLit|ja jumala sanoi : " tulkoon valkeus " , ja valkeus tuli
ja jumala näki , että valkeus oli hyvä ; ja jumala erotti valkeuden pimeydestä
|]
devInContents WER = devInContents BLEU
devInContents CER = [hereLit|dev1.pdf
dev2.pdf
|]
devInContents Accuracy = [hereLit|-8 none no
1 mild no
|]
@@ -636,6 +655,10 @@ devExpectedContents GLEU = devExpectedContents BLEU
devExpectedContents BLEU = [hereLit|a ka ki te atua , kia marama : na ka marama
a ka kite te atua i te marama , he pai : a ka wehea e te atua te marama i te pouri
|]
devExpectedContents WER = devExpectedContents BLEU
devExpectedContents CER = [hereLit|et facta est lux
Et tu, Brute?
|]
devExpectedContents Accuracy = [hereLit|N
Y
|]
@@ -702,11 +725,15 @@ devExpectedContents _ = [hereLit|0.82
testInContents :: Metric -> String
testInContents (Mean metric) = testInContents metric
testInContents GLEU = [hereLit|Alice has a black
testInContents GLEU = [hereLit|Alicella on musta kissa.
|]
testInContents BLEU = [hereLit|ja jumala kutsui valkeuden päiväksi , ja pimeyden hän kutsui yöksi
ja tuli ehtoo , ja tuli aamu , ensimmäinen päivä
|]
testInContents WER = testInContents BLEU
testInContents CER = [hereLit|test1.pdf
test2.pdf
|]
testInContents Accuracy = [hereLit|2 mild yes
-5 mild no
|]
@@ -776,6 +803,10 @@ testExpectedContents (Mean metric) = testExpectedContents metric
testExpectedContents BLEU = [hereLit|na ka huaina e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
a ko te ahiahi , ko te ata , he ra kotahi
|]
testExpectedContents CER = [hereLit|esse est percipi
tabula rasa
|]
testExpectedContents WER = testExpectedContents BLEU
testExpectedContents Accuracy = [hereLit|N
Y
|]
@@ -848,6 +879,8 @@ inHeaderContents :: Metric -> Maybe [String]
inHeaderContents (Mean metric) = inHeaderContents metric
inHeaderContents GLEU = Nothing
inHeaderContents BLEU = Nothing
inHeaderContents WER = Nothing
inHeaderContents CER = Just ["Filename"]
inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
inHeaderContents (FMeasure _) = Just ["seismic",
"seismoacoustic",
@@ -894,6 +927,8 @@ outHeaderContents :: Metric -> Maybe [String]
outHeaderContents (Mean metric) = outHeaderContents metric
outHeaderContents BLEU = Nothing
outHeaderContents GLEU = Nothing
outHeaderContents WER = Nothing
outHeaderContents CER = Just ["OCRedText"]
outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
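
For reference, the headers above describe the layout of the generated sample CER challenge: in.tsv has a single Filename column (e.g. dev1.pdf), while expected.tsv and a submitted out.tsv carry the transcription under OCRedText (e.g. et facta est lux). The filenames are placeholders in this fake OCR challenge; only the text lines are compared when the metric is evaluated.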


@@ -25,7 +25,7 @@ import Data.Attoparsec.Text (parseOnly)
-- the evaluation procedures are defined in GEval.Core
-- | evaluation metric
data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | ClippEU
data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accuracy | ClippEU
| FMeasure Double | MacroFMeasure Double | NMI
| LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
| BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
@@ -48,6 +48,7 @@ instance Show Metric where
show BLEU = "BLEU"
show GLEU = "GLEU"
show WER = "WER"
show CER = "CER"
show Accuracy = "Accuracy"
show ClippEU = "ClippEU"
show (FMeasure beta) = "F" ++ (show beta)
@@ -119,6 +120,7 @@ instance Read Metric where
readsPrec _ ('B':'L':'E':'U':theRest) = [(BLEU, theRest)]
readsPrec _ ('G':'L':'E':'U':theRest) = [(GLEU, theRest)]
readsPrec _ ('W':'E':'R':theRest) = [(WER, theRest)]
readsPrec _ ('C':'E':'R':theRest) = [(CER, theRest)]
readsPrec _ ('A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(Accuracy, theRest)]
readsPrec _ ('C':'l':'i':'p':'p':'E':'U':theRest) = [(ClippEU, theRest)]
readsPrec _ ('N':'M':'I':theRest) = [(NMI, theRest)]
@@ -178,6 +180,7 @@ getMetricOrdering Spearman = TheHigherTheBetter
getMetricOrdering BLEU = TheHigherTheBetter
getMetricOrdering GLEU = TheHigherTheBetter
getMetricOrdering WER = TheLowerTheBetter
getMetricOrdering CER = TheLowerTheBetter
getMetricOrdering Accuracy = TheHigherTheBetter
getMetricOrdering ClippEU = TheHigherTheBetter
getMetricOrdering (FMeasure _) = TheHigherTheBetter


@@ -47,7 +47,7 @@ import GEval.MatchingSpecification
-- | Helper type so that singleton can be used.
-- | (The problem is that some metrics are parametrized by Double
-- | or Word32 and this is not handled by the singleton library.)
singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | AAccuracy | AClippEU
singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | ACER | AAccuracy | AClippEU
| AFMeasure | AMacroFMeasure | ANMI
| ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
| ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification
@@ -66,6 +66,7 @@ toHelper Spearman = ASpearman
toHelper BLEU = ABLEU
toHelper GLEU = AGLEU
toHelper WER = AWER
toHelper CER = ACER
toHelper Accuracy = AAccuracy
toHelper ClippEU = AClippEU
toHelper (FMeasure _) = AFMeasure
@@ -104,6 +105,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
ParsedExpectedType ABLEU = [[String]]
ParsedExpectedType AGLEU = [[String]]
ParsedExpectedType AWER = [String]
ParsedExpectedType ACER = String
ParsedExpectedType AAccuracy = Text
ParsedExpectedType AClippEU = [ClippingSpec]
ParsedExpectedType AFMeasure = Bool
@@ -138,6 +140,7 @@ expectedParser SASpearman = doubleParser
expectedParser SABLEU = alternativeSentencesParser
expectedParser SAGLEU = alternativeSentencesParser
expectedParser SAWER = intoStringWords
expectedParser SACER = Right . unpack
expectedParser SAAccuracy = onlyStrip
expectedParser SAClippEU = controlledParse lineClippingSpecsParser
expectedParser SAFMeasure = zeroOneParser
@@ -185,6 +188,7 @@ outputParser SASpearman = expectedParser SASpearman
outputParser SABLEU = Right . Prelude.words . unpack
outputParser SAGLEU = Right . Prelude.words . unpack
outputParser SAWER = expectedParser SAWER
outputParser SACER = expectedParser SACER
outputParser SAAccuracy = expectedParser SAAccuracy
outputParser SAClippEU = controlledParse lineClippingsParser
outputParser SAFMeasure = probToZeroOneParser
@@ -236,6 +240,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
ItemIntermediateRepresentationType ALikelihoodHashed = (Text, Text)
ItemIntermediateRepresentationType ACharMatch = (Text, Text)
ItemIntermediateRepresentationType AWER = (Int, Int)
ItemIntermediateRepresentationType ACER = (Int, Int)
ItemIntermediateRepresentationType t = Double
itemStep :: SAMetric t -> (ParsedExpectedType t, ParsedOutputType t) -> ItemIntermediateRepresentationType t
@@ -246,6 +251,8 @@ itemStep SASpearman = id
itemStep SABLEU = uncurry bleuStep
itemStep SAGLEU = uncurry gleuStep
itemStep SAWER = uncurry werStep
-- strings are character lists, so we could re-use werStep
itemStep SACER = uncurry werStep
itemStep SAAccuracy = hitOrMiss
itemStep SAClippEU = clippEUMatchStep
itemStep SAFMeasure = getCount
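
The comment above ("strings are character lists") is the whole trick: WER items are parsed into word lists, while CER items stay as plain Strings, i.e. [Char], so the same generic edit-distance step over token lists serves both metrics and produces the same (errors, referenceLength) pair. A small sketch of the two shapes (illustrative values only, not GEval code):

-- WER compares lists of word tokens ...
werItem :: ([String], [String])            -- (expected, actual output)
werItem = (words "tabula rasa", words "tabula rasai")

-- ... while CER compares Strings, i.e. lists of character tokens (spaces included).
cerItem :: (String, String)
cerItem = ("tabula rasa", "tabula rasai")

-- Both feed a step of type [a] -> [a] -> (Int, Int); only the token type differs.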


@@ -58,6 +58,7 @@ listOfAvailableMetrics = [RMSE,
BLEU,
GLEU,
WER,
CER,
NMI,
ClippEU,
LogLossHashed defaultLogLossHashedSize,
@@ -78,6 +79,7 @@ listOfAvailableMetrics = [RMSE,
CharMatch]
extraInfo :: EvaluationScheme -> Maybe String
extraInfo (EvaluationScheme CER []) = Just "Character-Error Rate"
extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
extraInfo (EvaluationScheme BLEU [LowerCasing,
RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
@@ -97,6 +99,8 @@ isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed (Soft2DFMeasure _) = True
isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
isMetricDescribed GLEU = True
isMetricDescribed WER = True
isMetricDescribed CER = True
isMetricDescribed SegmentAccuracy = True
isMetricDescribed _ = False
@@ -138,6 +142,17 @@ metric on a corpus level but does not have its drawbacks for our per
sentence reward objective.
see: https://arxiv.org/pdf/1609.08144.pdf
|]
getMetricDescription WER =
[i|WER (Word-Error Rate) is the number of word-level mistakes divided
by the number of words in the expected output. Possible mistakes are
deletions, insertions and substitutions as in the Levenshtein distance.
|]
getMetricDescription CER =
[i|CER (Character-Error Rate) is the number of character-level mistakes divided
by the total length of the expected output. Possible mistakes are
deletions, insertions and substitutions as in the Levenshtein distance.
|]
getMetricDescription SegmentAccuracy =
[i|Accuracy counted for segments, i.e. labels with positions.
The percentage of labels in the ground truth retrieved in the actual output is returned.
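
The WER/CER descriptions above amount to: edit distance over tokens divided by the length of the reference. As a hedged reference sketch (a plain textbook Levenshtein, not GEval's internal werStep), character-level CER could be computed like this:

-- Illustrative only: edit distance (insertions, deletions, substitutions) over any token list.
levenshtein :: Eq a => [a] -> [a] -> Int
levenshtein xs ys = last (foldl nextRow firstRow ys)
  where
    firstRow = [0 .. length xs]
    nextRow row@(d:ds) y = scanl step (d + 1) (zip3 xs row ds)
      where
        step left (x, diag, up) =
          minimum [ left + 1                          -- insertion
                  , up + 1                            -- deletion
                  , diag + if x == y then 0 else 1 ]  -- substitution or match
    nextRow [] _ = []

-- Per-item CER step: Strings are [Char], so the reference length is the character count.
cerStep :: String -> String -> (Int, Int)
cerStep expected got = (levenshtein expected got, length expected)

-- Corpus-level CER: sum the errors and the reference lengths, then divide.
cer :: [(String, String)] -> Double
cer items = fromIntegral (sum errs) / fromIntegral (sum lens)
  where (errs, lens) = unzip (map (uncurry cerStep) items)

-- cer [("esse est percipi", "esse esi perctp"), ("tabula rasa", "tabula rasai")]
--   == 4 / 27 ≈ 0.14814, matching the sample score documented below.

Since levenshtein is polymorphic over token lists, applying it to word lists instead of Strings yields the WER numerator in the same way.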
@@ -157,6 +172,12 @@ first-name/3:0.9
|]
outContents GLEU = [hereLit|Alice has a black
|]
outContents WER = [hereLit|na ka huainaua e te atua te marama ko te awatea , a ko te pouri i huaina e ia ko te po
a ko te ahiahi , ko ata , he ra ko kotahi
|]
outContents CER = [hereLit|esse esi perctp
tabula rasai
|]
outContents SegmentAccuracy = [hereLit|N:1-4 V:5-6 N:8-10 V:12-13 A:15-17
N:1-4 V:6-7 A:9-13
|]
@@ -178,6 +199,10 @@ expectedScore (EvaluationScheme GLEU [])
= 0.7142857142857143
expectedScore (EvaluationScheme SegmentAccuracy [])
= 0.875
expectedScore (EvaluationScheme WER [])
= 0.08571
expectedScore (EvaluationScheme CER [])
= 0.14814
helpMetricParameterMetricsList :: String
helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -226,7 +251,7 @@ the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page numbe
formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
|]
formatDescription GLEU = [hereLit|In each line a there is a space sparated sentence of words.
formatDescription GLEU = [hereLit|In each line there is a space-separated sequence of words.
|]
formatDescription SegmentAccuracy = [hereLit|Labels can be any strings (without spaces), whereas is a list of
1-based indexes or spans separated by commas (spans are inclusive
@@ -235,6 +260,9 @@ label "foo:bar" for positions 2, 4, 5, 6, 7 and 10. Note that no
overlapping segments can be returned (evaluation will fail in
such a case).
|]
formatDescription WER = formatDescription GLEU
formatDescription CER = [hereLit|Any text; whitespace and punctuation marks are also taken into account.
|]
scoreExplanation :: EvaluationScheme -> Maybe String
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@@ -257,6 +285,14 @@ Now we have to calculate precision and recall:
scoreExplanation (EvaluationScheme SegmentAccuracy [])
= Just [hereLit|Out of 4 segments in the expected output for the first item, 3 were retrieved correctly (accuracy is 3/4=0.75).
The second item was retrieved perfectly (accuracy is 1.0). Hence, the average is (0.75+1.0)/2=0.875.|]
scoreExplanation (EvaluationScheme WER [])
= Just [hereLit|The total length of expected output (in words) is 35. There are 3 errors
(1 word substituted, 1 inserted, 1 deleted) in the actual output. Hence,
WER = (1+1+1) / 35 = 3 / 35 = 0.08571.|]
scoreExplanation (EvaluationScheme CER [])
= Just [hereLit|The total length of expected output (in characters) is 27. There are 4 errors
(2 characters substituted, 1 inserted, 1 deleted) in the actual output. Hence,
CER = (2+1+1) / 27 = 4 / 27 = 0.14814.|]
pasteLines :: String -> String -> String
pasteLines a b = printf "%-35s %s\n" a b


@@ -128,6 +128,12 @@ main = hspec $ do
describe "WER" $ do
it "simple example" $
runGEvalTest "wer-simple" `shouldReturnAlmost` 0.5555555555
describe "CER" $ do
it "simple example" $
runGEvalTest "cer-simple" `shouldReturnAlmost` 0.28947368421
describe "CER" $ do
it "simple example (Mean/CER)" $
runGEvalTest "cer-mean-simple" `shouldReturnAlmost` 0.277777777777778
describe "Accuracy" $ do
it "simple example" $
runGEvalTest "accuracy-simple" `shouldReturnAlmost` 0.6
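
The expected values follow from the cer-simple fixtures added below ("To be or mot to be" / "Thas is the" against "To be or not to be" / "That is the question"): the first line has 1 substituted character over 18 reference characters, the second has 1 substitution plus 9 deletions (the missing " question") over 20. Hence:

-- plain CER pools errors and reference lengths over the corpus:
--   (1 + 10) / (18 + 20) = 11 / 38 ≈ 0.28947368421
-- Mean/CER scores each line separately and then averages:
--   (1/18 + 10/20) / 2 = 5/18 ≈ 0.277777777777778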


@@ -0,0 +1,2 @@
To be or mot to be
Thas is the


@@ -0,0 +1 @@
--metric Mean/CER


@@ -0,0 +1,2 @@
To be or not to be
That is the question


@@ -0,0 +1,2 @@
To be or mot to be
Thas is the


@@ -0,0 +1 @@
--metric CER


@@ -0,0 +1,2 @@
To be or not to be
That is the question