From c71bba81f3f821456b90608e751692a37805eb4a Mon Sep 17 00:00:00 2001
From: Filip Gralinski <filipg@amu.edu.pl>
Date: Fri, 23 Jul 2021 17:26:41 +0200
Subject: [PATCH] Fix bug with inconsistent handling of probs in MultiLabel-F1

---
 src/GEval/Core.hs                             | 48 +++++++++----------
 src/GEval/Metric.hs                           | 12 +++--
 src/GEval/MetricsMechanics.hs                 | 23 ++++-----
 src/GEval/MetricsMeta.hs                      | 23 +++++----
 test/Spec.hs                                  |  2 +
 .../test-A/out.tsv                            |  3 ++
 .../multilabel-f1-ie-probs/config.txt         |  1 +
 .../test-A/expected.tsv                       |  3 ++
 8 files changed, 67 insertions(+), 48 deletions(-)
 create mode 100644 test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv
 create mode 100644 test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt
 create mode 100644 test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv

diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index 3a2c54c..c7aba0d 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -680,60 +680,60 @@ gevalCoreOnSources (LikelihoodHashed nbOfBits) = helperLogLossHashed nbOfBits lo
 
 
 gevalCoreOnSources (Mean (MultiLabelFMeasure beta matchingSpec))
-  = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
-                                       (Right . getWords)
-                                       ((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForString matchingSpec)))
+  = gevalCoreWithoutInputOnItemTargets intoWords
+                                       getWords
+                                       ((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForText matchingSpec)))
                                        averageC
                                        id
                                        noGraph
     where
       -- repeated as below, as it will be refactored into dependent types soon anyway
-      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
-      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
-      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
-      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
+      getWords (PartiallyParsedItemTarget ts) = Right ts
+      intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
+      intoWords (PartiallyParsedItemTarget ts) = Right ts
 
 gevalCoreOnSources (Mean WER)
-  = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
-                                       (Right . getWords)
+  = gevalCoreWithoutInputOnItemTargets intoWords
+                                       getWords
                                        ((uncurry (/.)) . (uncurry werStep))
                                        averageC
                                        id
                                        noGraph
     where
       -- repeated as below, as it will be refactored into dependent types soon anyway
-      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
-      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
-      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
-      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      getWords (RawItemTarget t) = outputParser SAWER t
+      getWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts
+      intoWords (RawItemTarget t) = expectedParser SAWER t
+      intoWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts
 
 gevalCoreOnSources (Mean CER)
-  = gevalCoreWithoutInputOnItemTargets (Right . getString)
-                                       (Right . getString)
+  = gevalCoreWithoutInputOnItemTargets getString
+                                       getString
                                        ((uncurry (/.)) . (uncurry werStep))
                                        averageC
                                        id
                                        noGraph
     where
       -- repeated as below, as it will be refactored into dependent types soon anyway
-      getString (RawItemTarget t) = unpack t
-      getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
+      getString (RawItemTarget t) = expectedParser SACER t
+      getString (PartiallyParsedItemTarget ts) = Right $ Prelude.unwords $ Prelude.map unpack ts
 
 gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"
 
 -- only MultiLabel-F1 handled for JSONs for the time being...
 gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
-  gevalCoreWithoutInputOnItemTargets (Right . intoWords)
-                                     (Right . getWords)
-                                     (getWeightedCounts (getMatchingFunctionForString matchingSpec))
+  gevalCoreWithoutInputOnItemTargets intoWords
+                                     getWords
+                                     (getWeightedCounts (getMatchingFunctionForText matchingSpec))
                                      countAgg
                                      (fMeasureOnCounts beta)
                                      noGraph
     where
-      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
-      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
-      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
-      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
+      getWords (PartiallyParsedItemTarget ts) = Right ts
+      intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
+      intoWords (PartiallyParsedItemTarget ts) = Right ts
 
 gevalCoreOnSources Pearson = gevalCoreByCorrelationMeasure pearson
 gevalCoreOnSources Spearman = gevalCoreByCorrelationMeasure spearman
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index d8ee8fe..e878ca9 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -13,7 +13,7 @@ module GEval.Metric
   where
 
 import Data.Word
-import Data.Text
+import Data.Text hiding (map)
 import Data.Monoid ((<>))
 
 import GEval.Common
@@ -262,10 +262,11 @@ fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False
 fixedNumberOfColumnsInInput (Soft2DFMeasure _) = False
 fixedNumberOfColumnsInInput _ = True
 
+
 perfectOutLineFromExpectedLine :: Metric -> Text -> Text
 perfectOutLineFromExpectedLine (Mean metric) t = perfectOutLineFromExpectedLine metric t
-perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
-perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
+perfectOutLineFromExpectedLine (LogLossHashed _) t = addProbOne t
+perfectOutLineFromExpectedLine (LikelihoodHashed _) t = addProbOne t
 perfectOutLineFromExpectedLine BLEU t = getFirstColumn t
 perfectOutLineFromExpectedLine GLEU t = getFirstColumn t
 perfectOutLineFromExpectedLine ClippEU t = cleanMarginFromClippEU t
@@ -273,6 +274,9 @@ perfectOutLineFromExpectedLine (Accuracy ExactMatch) t = t
 perfectOutLineFromExpectedLine (Accuracy _) t = getFirstColumn t
 perfectOutLineFromExpectedLine _ t = t
 
+addProbOne :: Text -> Text
+addProbOne = (<> ":1.0")
+
 getFirstColumn :: Text -> Text
 getFirstColumn t = case splitOn "\t" t of
   [] -> ""
@@ -280,7 +284,7 @@ getFirstColumn t = case splitOn "\t" t of
 
 cleanMarginFromClippEU :: Text -> Text
 cleanMarginFromClippEU t = Data.Text.unwords outs
-  where outs = Prelude.map toOut specs
+  where outs = map toOut specs
         (Right specs) = parseOnly lineClippingSpecsParser t
         toOut (ClippingSpec (PageNumber pageNumber) (Rectangle (Point x0 y0) (Point x1 y1)) _) =
           pack ((show pageNumber) ++ "/" ++ (show x0) ++ "," ++ (show y0) ++ "," ++ (show x1) ++ "," ++ (show y1))
diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs
index 3222617..34179ed 100644
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@@ -27,7 +27,7 @@ import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPFo
 
 import Control.Exception
 
-import Data.Text
+import Data.Text hiding (map, maximum, zip)
 import Data.Text.Read as TR
 import qualified Data.List.Split as DLS
 import Data.Attoparsec.Text (parseOnly)
@@ -41,7 +41,8 @@ import GEval.Annotation (Annotation, ObtainedAnnotation,
 import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser)
 import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization)
 import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair)
-import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList)
+import GEval.ProbList (ProbList(..), WordWithProb(..),
+                       parseIntoProbList, countLogLossOnProbList, selectByStandardThreshold)
 import GEval.MatchingSpecification
 import GEval.Haversine
 
@@ -222,7 +223,7 @@ outputParser SATokenAccuracy = intoWords
 outputParser SASegmentAccuracy = parseSegmentAnnotations
 outputParser SAMAE = doubleParser
 outputParser SASMAPE = doubleParser
-outputParser (SAMultiLabelFMeasure _) = intoWords
+outputParser (SAMultiLabelFMeasure _) = Right . selectByStandardThreshold . parseIntoProbList
 outputParser SAMultiLabelLogLoss = Right . parseIntoProbList
 outputParser SAMultiLabelLikelihood = Right . parseIntoProbList
 outputParser SAHaversine = parseSpherePoints
@@ -258,7 +259,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
   ItemIntermediateRepresentationType t = Double
 
 findBest :: (Text -> Text -> Double) -> (Text -> Text -> Double)
-findBest fun expected got = Prelude.maximum $ Prelude.map (fun got) expectedVals
+findBest fun expected got = maximum $ map (fun got) expectedVals
   where expectedVals = case splitOn "\t" expected of
                          [] -> [""]
                          l -> l
@@ -310,7 +311,7 @@ intoWords = Right . Data.Text.words
 
 intoStringWords = Right . Prelude.words . unpack
 
-alternativeSentencesParser = Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack
+alternativeSentencesParser = Right . map Prelude.words . DLS.splitOn "\t" . unpack
 
 onlyStrip = Right . strip
 
@@ -322,8 +323,8 @@ predictedParser got =
   case parseWordSpecs got of
     Right wordSpecs -> if Prelude.null pairs
                       then Nothing
-                      else Just $ snd $ Prelude.maximum pairs
-      where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
+                      else Just $ snd $ maximum pairs
+      where pairs = catMaybes $ map wordSpecToPair wordSpecs
     Left _ -> Just got
 
 splitByTabs = Right . DLS.splitOn "\t" . unpack
@@ -361,8 +362,8 @@ hitOrMiss (exp, got) =
   case parseWordSpecs got of
     Right wordSpecs -> if Prelude.null pairs
                       then 0.0
-                      else indicator (exp == (snd $ Prelude.maximum pairs))
-      where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
+                      else indicator (exp == (snd $ maximum pairs))
+      where pairs = catMaybes $ map wordSpecToPair wordSpecs
     Left _ ->  indicator ((normalizeProbForAccuracy exp got) == exp)
               -- if the expected value is 0 or 1 treat values
               -- between 0.0 and 1.0 as probabilities
@@ -405,7 +406,7 @@ getSoft2DCounts (expected, got) = (tpArea, expArea, gotArea)
 
 getFragCounts :: CoverableEntityWithProbability e => ([BareEntity e], [e]) -> (Double, Double, Int, Int)
 getFragCounts (expected, got)
-  | allDisjoint (Prelude.map getBareEntity got) = (
+  | allDisjoint (map getBareEntity got) = (
       recallScoreTotal expected got,
       precisionScoreTotal got expected,
       Prelude.length expected,
@@ -418,7 +419,7 @@ countHitsAndTotals (es, os) =
   then throw $ OtherException "wrong number of tokens"
   else Prelude.foldl matchFun
                      (0, 0)
-                     (Prelude.zip es os)
+                     (zip es os)
   where  matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int)
          matchFun (h, t) (e, o)
            | e == (pack "*") = (h, t)
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index 51edec1..8c5f74a 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -126,7 +126,7 @@ Mean/Multilabel-F1.
 |]
 getMetricDescription (SoftFMeasure _) =
   [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
-if a label `foo` is expected for the span 2-9 and this label is returned but with
+if a label `foo` is expected for the span 2-9 and this label is returned, but with
 the span 8-12, it is counted as 2/8=0.25 instead of 0 or 1 when precision/recall counts
 are gathered.
 |]
@@ -183,9 +183,9 @@ getMetricDescription BIOWeightedF1 =
 |]
 
 outContents :: Metric -> String
-outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
-surname/2
-first-name/3
+outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1:0.8 first-name/3:0.75
+surname/2 county/1:0.33
+first-name/3:0.52
 |]
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
 inwords:1-3 indigits:5
@@ -279,8 +279,12 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche
 IF YOU WANT TO HAVE IT DESCRIBED|]
 
 formatDescription :: Metric -> String
-formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. They are
-not intepreted in any way when the metric is calculated.
+formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. Labels are not
+interpreted except that they can be accompanied by probabilities
+(after a colon): only labels with probabilities >= 0.5 are considered.
+This is for compatibility with probalistic metrics. By default, 1.0 is
+assumed as the probability, but it is recommended to add probabilities
+explicitly.
 |]
 formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of
 the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such
@@ -316,9 +320,10 @@ B-tags and I-tags can accompanied by an extra label after a slash.
 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
   = Just [hereLit|Out of the total 5 labels in the output, 3 are correct (person/1,3, first-name/1 and
-first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard,
-again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall
-is 2/(4/3 + 5/3) = 2/3 = 0.6666|]
+first-name/3, only labels with probabilities >= 0.5 are considered, otherwise the probabilities are just
+discarded), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard, again 3 were
+retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall is 2/(4/3 + 5/3) =
+= 2/3 = 0.6666|]
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
   = Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75,
 Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|]
diff --git a/test/Spec.hs b/test/Spec.hs
index 4bc5841..6faeb37 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -368,6 +368,8 @@ main = hspec $ do
       runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444
     it "information extraction with smart fuzzy matching hardened" $ do
       runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555
+    it "information extraction" $ do
+      runGEvalTest "multilabel-f1-ie-probs" `shouldReturnAlmost` 0.1111111111
   describe "Mean/MultiLabel-F" $ do
     it "simple" $ do
       runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5
diff --git a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv
new file mode 100644
index 0000000..4ac0083
--- /dev/null
+++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv
@@ -0,0 +1,3 @@
+important-person=JOHN_BROWN:0.52 important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
+company-name=Foo_Bar profit=1220:0.8223 unwanted=none:0.49
+company-name=Whatever important-person=PIERRE_MENARD
diff --git a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt
new file mode 100644
index 0000000..b79da4c
--- /dev/null
+++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt
@@ -0,0 +1 @@
+--metric MultiLabel-F1
diff --git a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv
new file mode 100644
index 0000000..ff50263
--- /dev/null
+++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv
@@ -0,0 +1,3 @@
+company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
+company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
+company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard