Fix bug with inconsistent handling of probs in MultiLabel-F1

2021-07-23 17:26:41 +02:00 · 2021-07-23 17:26:41 +02:00 · c71bba81f3
commit c71bba81f3
parent 4b80625cc2
8 changed files with 67 additions and 48 deletions
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -680,60 +680,60 @@ gevalCoreOnSources (LikelihoodHashed nbOfBits) = helperLogLossHashed nbOfBits lo
 gevalCoreOnSources (Mean (MultiLabelFMeasure beta matchingSpec))
-  = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
+  = gevalCoreWithoutInputOnItemTargets intoWords
-                                       (Right . getWords)
+                                       getWords
-                                       ((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForString matchingSpec)))
+                                       ((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForText matchingSpec)))
                                       averageC
                                       id
                                       noGraph
    where
      -- repeated as below, as it will be refactored into dependent types soon anyway
-      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
+      getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
-      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      getWords (PartiallyParsedItemTarget ts) = Right ts
-      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
+      intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
-      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      intoWords (PartiallyParsedItemTarget ts) = Right ts
 gevalCoreOnSources (Mean WER)
-  = gevalCoreWithoutInputOnItemTargets (Right . intoWords)
+  = gevalCoreWithoutInputOnItemTargets intoWords
-                                       (Right . getWords)
+                                       getWords
                                       ((uncurry (/.)) . (uncurry werStep))
                                       averageC
                                       id
                                       noGraph
    where
      -- repeated as below, as it will be refactored into dependent types soon anyway
-      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
+      getWords (RawItemTarget t) = outputParser SAWER t
-      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      getWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts
-      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
+      intoWords (RawItemTarget t) = expectedParser SAWER t
-      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      intoWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts
 gevalCoreOnSources (Mean CER)
-  = gevalCoreWithoutInputOnItemTargets (Right . getString)
+  = gevalCoreWithoutInputOnItemTargets getString
-                                       (Right . getString)
+                                       getString
                                       ((uncurry (/.)) . (uncurry werStep))
                                       averageC
                                       id
                                       noGraph
    where
      -- repeated as below, as it will be refactored into dependent types soon anyway
-      getString (RawItemTarget t) = unpack t
+      getString (RawItemTarget t) = expectedParser SACER t
-      getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
+      getString (PartiallyParsedItemTarget ts) = Right $ Prelude.unwords $ Prelude.map unpack ts
 gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"
 -- only MultiLabel-F1 handled for JSONs for the time being...
 gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
-  gevalCoreWithoutInputOnItemTargets (Right . intoWords)
+  gevalCoreWithoutInputOnItemTargets intoWords
-                                     (Right . getWords)
+                                     getWords
-                                     (getWeightedCounts (getMatchingFunctionForString matchingSpec))
+                                     (getWeightedCounts (getMatchingFunctionForText matchingSpec))
                                     countAgg
                                     (fMeasureOnCounts beta)
                                     noGraph
    where
-      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
+      getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
-      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      getWords (PartiallyParsedItemTarget ts) = Right ts
-      intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
+      intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
-      intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
+      intoWords (PartiallyParsedItemTarget ts) = Right ts
 gevalCoreOnSources Pearson = gevalCoreByCorrelationMeasure pearson
 gevalCoreOnSources Spearman = gevalCoreByCorrelationMeasure spearman
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@ -13,7 +13,7 @@ module GEval.Metric
  where
 import Data.Word
-import Data.Text
+import Data.Text hiding (map)
 import Data.Monoid ((<>))
 import GEval.Common
@ -262,10 +262,11 @@ fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False
 fixedNumberOfColumnsInInput (Soft2DFMeasure _) = False
 fixedNumberOfColumnsInInput _ = True
 perfectOutLineFromExpectedLine :: Metric -> Text -> Text
 perfectOutLineFromExpectedLine (Mean metric) t = perfectOutLineFromExpectedLine metric t
-perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
+perfectOutLineFromExpectedLine (LogLossHashed _) t = addProbOne t
-perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
+perfectOutLineFromExpectedLine (LikelihoodHashed _) t = addProbOne t
 perfectOutLineFromExpectedLine BLEU t = getFirstColumn t
 perfectOutLineFromExpectedLine GLEU t = getFirstColumn t
 perfectOutLineFromExpectedLine ClippEU t = cleanMarginFromClippEU t
@ -273,6 +274,9 @@ perfectOutLineFromExpectedLine (Accuracy ExactMatch) t = t
 perfectOutLineFromExpectedLine (Accuracy _) t = getFirstColumn t
 perfectOutLineFromExpectedLine _ t = t
 addProbOne :: Text -> Text
 addProbOne = (<> ":1.0")
 getFirstColumn :: Text -> Text
 getFirstColumn t = case splitOn "\t" t of
  [] -> ""
@ -280,7 +284,7 @@ getFirstColumn t = case splitOn "\t" t of
 cleanMarginFromClippEU :: Text -> Text
 cleanMarginFromClippEU t = Data.Text.unwords outs
-  where outs = Prelude.map toOut specs
+  where outs = map toOut specs
        (Right specs) = parseOnly lineClippingSpecsParser t
        toOut (ClippingSpec (PageNumber pageNumber) (Rectangle (Point x0 y0) (Point x1 y1)) _) =
          pack ((show pageNumber) ++ "/" ++ (show x0) ++ "," ++ (show y0) ++ "," ++ (show x1) ++ "," ++ (show y1))
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@ -27,7 +27,7 @@ import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPFo
 import Control.Exception
-import Data.Text
+import Data.Text hiding (map, maximum, zip)
 import Data.Text.Read as TR
 import qualified Data.List.Split as DLS
 import Data.Attoparsec.Text (parseOnly)
@ -41,7 +41,8 @@ import GEval.Annotation (Annotation, ObtainedAnnotation,
 import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser)
 import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization)
 import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair)
-import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList)
+import GEval.ProbList (ProbList(..), WordWithProb(..),
                       parseIntoProbList, countLogLossOnProbList, selectByStandardThreshold)
 import GEval.MatchingSpecification
 import GEval.Haversine
@ -222,7 +223,7 @@ outputParser SATokenAccuracy = intoWords
 outputParser SASegmentAccuracy = parseSegmentAnnotations
 outputParser SAMAE = doubleParser
 outputParser SASMAPE = doubleParser
-outputParser (SAMultiLabelFMeasure _) = intoWords
+outputParser (SAMultiLabelFMeasure _) = Right . selectByStandardThreshold . parseIntoProbList
 outputParser SAMultiLabelLogLoss = Right . parseIntoProbList
 outputParser SAMultiLabelLikelihood = Right . parseIntoProbList
 outputParser SAHaversine = parseSpherePoints
@ -258,7 +259,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
  ItemIntermediateRepresentationType t = Double
 findBest :: (Text -> Text -> Double) -> (Text -> Text -> Double)
-findBest fun expected got = Prelude.maximum $ Prelude.map (fun got) expectedVals
+findBest fun expected got = maximum $ map (fun got) expectedVals
  where expectedVals = case splitOn "\t" expected of
                         [] -> [""]
                         l -> l
@ -310,7 +311,7 @@ intoWords = Right . Data.Text.words
 intoStringWords = Right . Prelude.words . unpack
-alternativeSentencesParser = Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack
+alternativeSentencesParser = Right . map Prelude.words . DLS.splitOn "\t" . unpack
 onlyStrip = Right . strip
@ -322,8 +323,8 @@ predictedParser got =
  case parseWordSpecs got of
    Right wordSpecs -> if Prelude.null pairs
                      then Nothing
-                      else Just $ snd $ Prelude.maximum pairs
+                      else Just $ snd $ maximum pairs
-      where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
+      where pairs = catMaybes $ map wordSpecToPair wordSpecs
    Left _ -> Just got
 splitByTabs = Right . DLS.splitOn "\t" . unpack
@ -361,8 +362,8 @@ hitOrMiss (exp, got) =
  case parseWordSpecs got of
    Right wordSpecs -> if Prelude.null pairs
                      then 0.0
-                      else indicator (exp == (snd $ Prelude.maximum pairs))
+                      else indicator (exp == (snd $ maximum pairs))
-      where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
+      where pairs = catMaybes $ map wordSpecToPair wordSpecs
    Left _ ->  indicator ((normalizeProbForAccuracy exp got) == exp)
              -- if the expected value is 0 or 1 treat values
              -- between 0.0 and 1.0 as probabilities
@ -405,7 +406,7 @@ getSoft2DCounts (expected, got) = (tpArea, expArea, gotArea)
 getFragCounts :: CoverableEntityWithProbability e => ([BareEntity e], [e]) -> (Double, Double, Int, Int)
 getFragCounts (expected, got)
-  | allDisjoint (Prelude.map getBareEntity got) = (
+  | allDisjoint (map getBareEntity got) = (
      recallScoreTotal expected got,
      precisionScoreTotal got expected,
      Prelude.length expected,
@ -418,7 +419,7 @@ countHitsAndTotals (es, os) =
  then throw $ OtherException "wrong number of tokens"
  else Prelude.foldl matchFun
                     (0, 0)
-                     (Prelude.zip es os)
+                     (zip es os)
  where  matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int)
         matchFun (h, t) (e, o)
           | e == (pack "*") = (h, t)
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@ -126,7 +126,7 @@ Mean/Multilabel-F1.
 |]
 getMetricDescription (SoftFMeasure _) =
  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
-if a label `foo` is expected for the span 2-9 and this label is returned but with
+if a label `foo` is expected for the span 2-9 and this label is returned, but with
 the span 8-12, it is counted as 2/8=0.25 instead of 0 or 1 when precision/recall counts
 are gathered.
 |]
@ -183,9 +183,9 @@ getMetricDescription BIOWeightedF1 =
 |]
 outContents :: Metric -> String
-outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
+outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1:0.8 first-name/3:0.75
-surname/2
+surname/2 county/1:0.33
-first-name/3
+first-name/3:0.52
 |]
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
 inwords:1-3 indigits:5
@ -279,8 +279,12 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche
 IF YOU WANT TO HAVE IT DESCRIBED|]
 formatDescription :: Metric -> String
-formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. They are
+formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. Labels are not
-not intepreted in any way when the metric is calculated.
+interpreted except that they can be accompanied by probabilities
 (after a colon): only labels with probabilities >= 0.5 are considered.
 This is for compatibility with probabilistic metrics. By default, 1.0 is
 assumed as the probability, but it is recommended to add probabilities
 explicitly.
 |]
 formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of
 the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such
@ -316,9 +320,10 @@ B-tags and I-tags can accompanied by an extra label after a slash.
 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
  = Just [hereLit|Out of the total 5 labels in the output, 3 are correct (person/1,3, first-name/1 and
-first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard,
+first-name/3; only labels with probabilities >= 0.5 are considered, otherwise the probabilities are just
-again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall
+discarded), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard, again 3 were
-is 2/(4/3 + 5/3) = 2/3 = 0.6666|]
+retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall is 2/(4/3 + 5/3) =
 2/3 = 0.6666|]
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
  = Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75,
 Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|]
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -368,6 +368,8 @@ main = hspec $ do
      runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444
    it "information extraction with smart fuzzy matching hardened" $ do
      runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555
    it "information extraction" $ do
      runGEvalTest "multilabel-f1-ie-probs" `shouldReturnAlmost` 0.1111111111
  describe "Mean/MultiLabel-F" $ do
    it "simple" $ do
      runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5
--- a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv
+++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv
@ -0,0 +1,3 @@
 important-person=JOHN_BROWN:0.52 important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
 company-name=Foo_Bar profit=1220:0.8223 unwanted=none:0.49
 company-name=Whatever important-person=PIERRE_MENARD
--- a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt
+++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt
@ -0,0 +1 @@
 --metric MultiLabel-F1
--- a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv
+++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv
@ -0,0 +1,3 @@
 company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
 company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
 company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard