From c71bba81f3f821456b90608e751692a37805eb4a Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Fri, 23 Jul 2021 17:26:41 +0200 Subject: [PATCH] Fix bug with inconsistent handling of probs in MultiLabel-F1 --- src/GEval/Core.hs | 48 +++++++++---------- src/GEval/Metric.hs | 12 +++-- src/GEval/MetricsMechanics.hs | 23 ++++----- src/GEval/MetricsMeta.hs | 23 +++++---- test/Spec.hs | 2 + .../test-A/out.tsv | 3 ++ .../multilabel-f1-ie-probs/config.txt | 1 + .../test-A/expected.tsv | 3 ++ 8 files changed, 67 insertions(+), 48 deletions(-) create mode 100644 test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv create mode 100644 test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt create mode 100644 test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index 3a2c54c..c7aba0d 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -680,60 +680,60 @@ gevalCoreOnSources (LikelihoodHashed nbOfBits) = helperLogLossHashed nbOfBits lo gevalCoreOnSources (Mean (MultiLabelFMeasure beta matchingSpec)) - = gevalCoreWithoutInputOnItemTargets (Right . intoWords) - (Right . getWords) - ((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForString matchingSpec))) + = gevalCoreWithoutInputOnItemTargets intoWords + getWords + ((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForText matchingSpec))) averageC id noGraph where -- repeated as below, as it will be refactored into dependent types soon anyway - getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t - getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts - intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t - intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts + getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t + getWords (PartiallyParsedItemTarget ts) = Right ts + intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t + intoWords (PartiallyParsedItemTarget ts) = Right ts gevalCoreOnSources (Mean WER) - = gevalCoreWithoutInputOnItemTargets (Right . intoWords) - (Right . getWords) + = gevalCoreWithoutInputOnItemTargets intoWords + getWords ((uncurry (/.)) . (uncurry werStep)) averageC id noGraph where -- repeated as below, as it will be refactored into dependent types soon anyway - getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t - getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts - intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t - intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts + getWords (RawItemTarget t) = outputParser SAWER t + getWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts + intoWords (RawItemTarget t) = expectedParser SAWER t + intoWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts gevalCoreOnSources (Mean CER) - = gevalCoreWithoutInputOnItemTargets (Right . getString) - (Right . getString) + = gevalCoreWithoutInputOnItemTargets getString + getString ((uncurry (/.)) . (uncurry werStep)) averageC id noGraph where -- repeated as below, as it will be refactored into dependent types soon anyway - getString (RawItemTarget t) = unpack t - getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts + getString (RawItemTarget t) = expectedParser SACER t + getString (PartiallyParsedItemTarget ts) = Right $ Prelude.unwords $ Prelude.map unpack ts gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being" -- only MultiLabel-F1 handled for JSONs for the time being... gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) = - gevalCoreWithoutInputOnItemTargets (Right . intoWords) - (Right . getWords) - (getWeightedCounts (getMatchingFunctionForString matchingSpec)) + gevalCoreWithoutInputOnItemTargets intoWords + getWords + (getWeightedCounts (getMatchingFunctionForText matchingSpec)) countAgg (fMeasureOnCounts beta) noGraph where - getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t - getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts - intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t - intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts + getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t + getWords (PartiallyParsedItemTarget ts) = Right ts + intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t + intoWords (PartiallyParsedItemTarget ts) = Right ts gevalCoreOnSources Pearson = gevalCoreByCorrelationMeasure pearson gevalCoreOnSources Spearman = gevalCoreByCorrelationMeasure spearman diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs index d8ee8fe..e878ca9 100644 --- a/src/GEval/Metric.hs +++ b/src/GEval/Metric.hs @@ -13,7 +13,7 @@ module GEval.Metric where import Data.Word -import Data.Text +import Data.Text hiding (map) import Data.Monoid ((<>)) import GEval.Common @@ -262,10 +262,11 @@ fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False fixedNumberOfColumnsInInput (Soft2DFMeasure _) = False fixedNumberOfColumnsInInput _ = True + perfectOutLineFromExpectedLine :: Metric -> Text -> Text perfectOutLineFromExpectedLine (Mean metric) t = perfectOutLineFromExpectedLine metric t -perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0" -perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0" +perfectOutLineFromExpectedLine (LogLossHashed _) t = addProbOne t +perfectOutLineFromExpectedLine (LikelihoodHashed _) t = addProbOne t perfectOutLineFromExpectedLine BLEU t = getFirstColumn t perfectOutLineFromExpectedLine GLEU t = getFirstColumn t perfectOutLineFromExpectedLine ClippEU t = cleanMarginFromClippEU t @@ -273,6 +274,9 @@ perfectOutLineFromExpectedLine (Accuracy ExactMatch) t = t perfectOutLineFromExpectedLine (Accuracy _) t = getFirstColumn t perfectOutLineFromExpectedLine _ t = t +addProbOne :: Text -> Text +addProbOne = (<> ":1.0") + getFirstColumn :: Text -> Text getFirstColumn t = case splitOn "\t" t of [] -> "" @@ -280,7 +284,7 @@ getFirstColumn t = case splitOn "\t" t of cleanMarginFromClippEU :: Text -> Text cleanMarginFromClippEU t = Data.Text.unwords outs - where outs = Prelude.map toOut specs + where outs = map toOut specs (Right specs) = parseOnly lineClippingSpecsParser t toOut (ClippingSpec (PageNumber pageNumber) (Rectangle (Point x0 y0) (Point x1 y1)) _) = pack ((show pageNumber) ++ "/" ++ (show x0) ++ "," ++ (show y0) ++ "," ++ (show x1) ++ "," ++ (show y1)) diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs index 3222617..34179ed 100644 --- a/src/GEval/MetricsMechanics.hs +++ b/src/GEval/MetricsMechanics.hs @@ -27,7 +27,7 @@ import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPFo import Control.Exception -import Data.Text +import Data.Text hiding (map, maximum, zip) import Data.Text.Read as TR import qualified Data.List.Split as DLS import Data.Attoparsec.Text (parseOnly) @@ -41,7 +41,8 @@ import GEval.Annotation (Annotation, ObtainedAnnotation, import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser) import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization) import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair) -import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList) +import GEval.ProbList (ProbList(..), WordWithProb(..), + parseIntoProbList, countLogLossOnProbList, selectByStandardThreshold) import GEval.MatchingSpecification import GEval.Haversine @@ -222,7 +223,7 @@ outputParser SATokenAccuracy = intoWords outputParser SASegmentAccuracy = parseSegmentAnnotations outputParser SAMAE = doubleParser outputParser SASMAPE = doubleParser -outputParser (SAMultiLabelFMeasure _) = intoWords +outputParser (SAMultiLabelFMeasure _) = Right . selectByStandardThreshold . parseIntoProbList outputParser SAMultiLabelLogLoss = Right . parseIntoProbList outputParser SAMultiLabelLikelihood = Right . parseIntoProbList outputParser SAHaversine = parseSpherePoints @@ -258,7 +259,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where ItemIntermediateRepresentationType t = Double findBest :: (Text -> Text -> Double) -> (Text -> Text -> Double) -findBest fun expected got = Prelude.maximum $ Prelude.map (fun got) expectedVals +findBest fun expected got = maximum $ map (fun got) expectedVals where expectedVals = case splitOn "\t" expected of [] -> [""] l -> l @@ -310,7 +311,7 @@ intoWords = Right . Data.Text.words intoStringWords = Right . Prelude.words . unpack -alternativeSentencesParser = Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack +alternativeSentencesParser = Right . map Prelude.words . DLS.splitOn "\t" . unpack onlyStrip = Right . strip @@ -322,8 +323,8 @@ predictedParser got = case parseWordSpecs got of Right wordSpecs -> if Prelude.null pairs then Nothing - else Just $ snd $ Prelude.maximum pairs - where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs + else Just $ snd $ maximum pairs + where pairs = catMaybes $ map wordSpecToPair wordSpecs Left _ -> Just got splitByTabs = Right . DLS.splitOn "\t" . unpack @@ -361,8 +362,8 @@ hitOrMiss (exp, got) = case parseWordSpecs got of Right wordSpecs -> if Prelude.null pairs then 0.0 - else indicator (exp == (snd $ Prelude.maximum pairs)) - where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs + else indicator (exp == (snd $ maximum pairs)) + where pairs = catMaybes $ map wordSpecToPair wordSpecs Left _ -> indicator ((normalizeProbForAccuracy exp got) == exp) -- if the expected value is 0 or 1 treat values -- between 0.0 and 1.0 as probabilities @@ -405,7 +406,7 @@ getSoft2DCounts (expected, got) = (tpArea, expArea, gotArea) getFragCounts :: CoverableEntityWithProbability e => ([BareEntity e], [e]) -> (Double, Double, Int, Int) getFragCounts (expected, got) - | allDisjoint (Prelude.map getBareEntity got) = ( + | allDisjoint (map getBareEntity got) = ( recallScoreTotal expected got, precisionScoreTotal got expected, Prelude.length expected, @@ -418,7 +419,7 @@ countHitsAndTotals (es, os) = then throw $ OtherException "wrong number of tokens" else Prelude.foldl matchFun (0, 0) - (Prelude.zip es os) + (zip es os) where matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int) matchFun (h, t) (e, o) | e == (pack "*") = (h, t) diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs index 51edec1..8c5f74a 100644 --- a/src/GEval/MetricsMeta.hs +++ b/src/GEval/MetricsMeta.hs @@ -126,7 +126,7 @@ Mean/Multilabel-F1. |] getMetricDescription (SoftFMeasure _) = [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance, -if a label `foo` is expected for the span 2-9 and this label is returned but with +if a label `foo` is expected for the span 2-9 and this label is returned, but with the span 8-12, it is counted as 2/8=0.25 instead of 0 or 1 when precision/recall counts are gathered. |] @@ -183,9 +183,9 @@ getMetricDescription BIOWeightedF1 = |] outContents :: Metric -> String -outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3 -surname/2 -first-name/3 +outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1:0.8 first-name/3:0.75 +surname/2 county/1:0.33 +first-name/3:0.52 |] outContents (SoftFMeasure _) = [hereLit|inwords:1-4 inwords:1-3 indigits:5 @@ -279,8 +279,12 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche IF YOU WANT TO HAVE IT DESCRIBED|] formatDescription :: Metric -> String -formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. They are -not intepreted in any way when the metric is calculated. +formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. Labels are not +interpreted except that they can be accompanied by probabilities +(after a colon): only labels with probabilities >= 0.5 are considered. +This is for compatibility with probalistic metrics. By default, 1.0 is +assumed as the probability, but it is recommended to add probabilities +explicitly. |] formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such @@ -316,9 +320,10 @@ B-tags and I-tags can accompanied by an extra label after a slash. scoreExplanation :: EvaluationScheme -> Maybe String scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) []) = Just [hereLit|Out of the total 5 labels in the output, 3 are correct (person/1,3, first-name/1 and -first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard, -again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall -is 2/(4/3 + 5/3) = 2/3 = 0.6666|] +first-name/3, only labels with probabilities >= 0.5 are considered, otherwise the probabilities are just +discarded), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard, again 3 were +retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall is 2/(4/3 + 5/3) = += 2/3 = 0.6666|] scoreExplanation (EvaluationScheme (SoftFMeasure _) []) = Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75, Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|] diff --git a/test/Spec.hs b/test/Spec.hs index 4bc5841..6faeb37 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -368,6 +368,8 @@ main = hspec $ do runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444 it "information extraction with smart fuzzy matching hardened" $ do runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555 + it "information extraction" $ do + runGEvalTest "multilabel-f1-ie-probs" `shouldReturnAlmost` 0.1111111111 describe "Mean/MultiLabel-F" $ do it "simple" $ do runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5 diff --git a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv new file mode 100644 index 0000000..4ac0083 --- /dev/null +++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs-solution/test-A/out.tsv @@ -0,0 +1,3 @@ +important-person=JOHN_BROWN:0.52 important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031 +company-name=Foo_Bar profit=1220:0.8223 unwanted=none:0.49 +company-name=Whatever important-person=PIERRE_MENARD diff --git a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt new file mode 100644 index 0000000..b79da4c --- /dev/null +++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/config.txt @@ -0,0 +1 @@ +--metric MultiLabel-F1 diff --git a/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv new file mode 100644 index 0000000..ff50263 --- /dev/null +++ b/test/multilabel-f1-ie-probs/multilabel-f1-ie-probs/test-A/expected.tsv @@ -0,0 +1,3 @@ +company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown +company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith +company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard