Fix bug with inconsistent handling of probs in MultiLabel-F1

This commit is contained in:
Filip Gralinski 2021-07-23 17:26:41 +02:00
parent 4b80625cc2
commit c71bba81f3
8 changed files with 67 additions and 48 deletions

View File

@ -680,60 +680,60 @@ gevalCoreOnSources (LikelihoodHashed nbOfBits) = helperLogLossHashed nbOfBits lo
gevalCoreOnSources (Mean (MultiLabelFMeasure beta matchingSpec))
= gevalCoreWithoutInputOnItemTargets (Right . intoWords)
(Right . getWords)
((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForString matchingSpec)))
= gevalCoreWithoutInputOnItemTargets intoWords
((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForText matchingSpec)))
-- repeated as below, as it will be refactored into dependent types soon anyway
getWords (RawItemTarget t) = unpack $ selectByStandardThreshold $ parseIntoProbList t
getWords (PartiallyParsedItemTarget ts) = unpack ts
intoWords (RawItemTarget t) = unpack $ Data.Text.words t
intoWords (PartiallyParsedItemTarget ts) = unpack ts
getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
getWords (PartiallyParsedItemTarget ts) = Right ts
intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
intoWords (PartiallyParsedItemTarget ts) = Right ts
gevalCoreOnSources (Mean WER)
= gevalCoreWithoutInputOnItemTargets (Right . intoWords)
(Right . getWords)
= gevalCoreWithoutInputOnItemTargets intoWords
((uncurry (/.)) . (uncurry werStep))
-- repeated as below, as it will be refactored into dependent types soon anyway
getWords (RawItemTarget t) = unpack $ selectByStandardThreshold $ parseIntoProbList t
getWords (PartiallyParsedItemTarget ts) = unpack ts
intoWords (RawItemTarget t) = unpack $ Data.Text.words t
intoWords (PartiallyParsedItemTarget ts) = unpack ts
getWords (RawItemTarget t) = outputParser SAWER t
getWords (PartiallyParsedItemTarget ts) = Right $ unpack ts
intoWords (RawItemTarget t) = expectedParser SAWER t
intoWords (PartiallyParsedItemTarget ts) = Right $ unpack ts
gevalCoreOnSources (Mean CER)
= gevalCoreWithoutInputOnItemTargets (Right . getString)
(Right . getString)
= gevalCoreWithoutInputOnItemTargets getString
((uncurry (/.)) . (uncurry werStep))
-- repeated as below, as it will be refactored into dependent types soon anyway
getString (RawItemTarget t) = unpack t
getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ unpack ts
getString (RawItemTarget t) = expectedParser SACER t
getString (PartiallyParsedItemTarget ts) = Right $ Prelude.unwords $ unpack ts
gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"
-- only MultiLabel-F1 handled for JSONs for the time being...
gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
gevalCoreWithoutInputOnItemTargets (Right . intoWords)
(Right . getWords)
(getWeightedCounts (getMatchingFunctionForString matchingSpec))
gevalCoreWithoutInputOnItemTargets intoWords
(getWeightedCounts (getMatchingFunctionForText matchingSpec))
(fMeasureOnCounts beta)
getWords (RawItemTarget t) = unpack $ selectByStandardThreshold $ parseIntoProbList t
getWords (PartiallyParsedItemTarget ts) = unpack ts
intoWords (RawItemTarget t) = unpack $ Data.Text.words t
intoWords (PartiallyParsedItemTarget ts) = unpack ts
getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
getWords (PartiallyParsedItemTarget ts) = Right ts
intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
intoWords (PartiallyParsedItemTarget ts) = Right ts
gevalCoreOnSources Pearson = gevalCoreByCorrelationMeasure pearson
gevalCoreOnSources Spearman = gevalCoreByCorrelationMeasure spearman

View File

@ -13,7 +13,7 @@ module GEval.Metric
import Data.Word
import Data.Text
import Data.Text hiding (map)
import Data.Monoid ((<>))
import GEval.Common
@ -262,10 +262,11 @@ fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False
fixedNumberOfColumnsInInput (Soft2DFMeasure _) = False
fixedNumberOfColumnsInInput _ = True
perfectOutLineFromExpectedLine :: Metric -> Text -> Text
perfectOutLineFromExpectedLine (Mean metric) t = perfectOutLineFromExpectedLine metric t
perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
perfectOutLineFromExpectedLine (LogLossHashed _) t = addProbOne t
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = addProbOne t
perfectOutLineFromExpectedLine BLEU t = getFirstColumn t
perfectOutLineFromExpectedLine GLEU t = getFirstColumn t
perfectOutLineFromExpectedLine ClippEU t = cleanMarginFromClippEU t
@ -273,6 +274,9 @@ perfectOutLineFromExpectedLine (Accuracy ExactMatch) t = t
perfectOutLineFromExpectedLine (Accuracy _) t = getFirstColumn t
perfectOutLineFromExpectedLine _ t = t
-- | Append the literal suffix @":1.0"@ to the given text.
addProbOne :: Text -> Text
addProbOne = (<> ":1.0")
getFirstColumn :: Text -> Text
getFirstColumn t = case splitOn "\t" t of
[] -> ""
@ -280,7 +284,7 @@ getFirstColumn t = case splitOn "\t" t of
cleanMarginFromClippEU :: Text -> Text
cleanMarginFromClippEU t = Data.Text.unwords outs
where outs = toOut specs
where outs = map toOut specs
(Right specs) = parseOnly lineClippingSpecsParser t
toOut (ClippingSpec (PageNumber pageNumber) (Rectangle (Point x0 y0) (Point x1 y1)) _) =
pack ((show pageNumber) ++ "/" ++ (show x0) ++ "," ++ (show y0) ++ "," ++ (show x1) ++ "," ++ (show y1))

View File

@ -27,7 +27,7 @@ import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPFo
import Control.Exception
import Data.Text
import Data.Text hiding (map, maximum, zip)
import Data.Text.Read as TR
import qualified Data.List.Split as DLS
import Data.Attoparsec.Text (parseOnly)
@ -41,7 +41,8 @@ import GEval.Annotation (Annotation, ObtainedAnnotation,
import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser)
import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization)
import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair)
import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList)
import GEval.ProbList (ProbList(..), WordWithProb(..),
parseIntoProbList, countLogLossOnProbList, selectByStandardThreshold)
import GEval.MatchingSpecification
import GEval.Haversine
@ -222,7 +223,7 @@ outputParser SATokenAccuracy = intoWords
outputParser SASegmentAccuracy = parseSegmentAnnotations
outputParser SAMAE = doubleParser
outputParser SASMAPE = doubleParser
outputParser (SAMultiLabelFMeasure _) = intoWords
outputParser (SAMultiLabelFMeasure _) = Right . selectByStandardThreshold . parseIntoProbList
outputParser SAMultiLabelLogLoss = Right . parseIntoProbList
outputParser SAMultiLabelLikelihood = Right . parseIntoProbList
outputParser SAHaversine = parseSpherePoints
@ -258,7 +259,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
ItemIntermediateRepresentationType t = Double
findBest :: (Text -> Text -> Double) -> (Text -> Text -> Double)
findBest fun expected got = Prelude.maximum $ (fun got) expectedVals
findBest fun expected got = maximum $ map (fun got) expectedVals
where expectedVals = case splitOn "\t" expected of
[] -> [""]
l -> l
@ -310,7 +311,7 @@ intoWords = Right . Data.Text.words
intoStringWords = Right . Prelude.words . unpack
alternativeSentencesParser = Right . Prelude.words . DLS.splitOn "\t" . unpack
alternativeSentencesParser = Right . map Prelude.words . DLS.splitOn "\t" . unpack
onlyStrip = Right . strip
@ -322,8 +323,8 @@ predictedParser got =
case parseWordSpecs got of
Right wordSpecs -> if Prelude.null pairs
then Nothing
else Just $ snd $ Prelude.maximum pairs
where pairs = catMaybes $ wordSpecToPair wordSpecs
else Just $ snd $ maximum pairs
where pairs = catMaybes $ map wordSpecToPair wordSpecs
Left _ -> Just got
splitByTabs = Right . DLS.splitOn "\t" . unpack
@ -361,8 +362,8 @@ hitOrMiss (exp, got) =
case parseWordSpecs got of
Right wordSpecs -> if Prelude.null pairs
then 0.0
else indicator (exp == (snd $ Prelude.maximum pairs))
where pairs = catMaybes $ wordSpecToPair wordSpecs
else indicator (exp == (snd $ maximum pairs))
where pairs = catMaybes $ map wordSpecToPair wordSpecs
Left _ -> indicator ((normalizeProbForAccuracy exp got) == exp)
-- if the expected value is 0 or 1 treat values
-- between 0.0 and 1.0 as probabilities
@ -405,7 +406,7 @@ getSoft2DCounts (expected, got) = (tpArea, expArea, gotArea)
getFragCounts :: CoverableEntityWithProbability e => ([BareEntity e], [e]) -> (Double, Double, Int, Int)
getFragCounts (expected, got)
| allDisjoint ( getBareEntity got) = (
| allDisjoint (map getBareEntity got) = (
recallScoreTotal expected got,
precisionScoreTotal got expected,
Prelude.length expected,
@ -418,7 +419,7 @@ countHitsAndTotals (es, os) =
then throw $ OtherException "wrong number of tokens"
else Prelude.foldl matchFun
(0, 0)
( es os)
(zip es os)
where matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int)
matchFun (h, t) (e, o)
| e == (pack "*") = (h, t)

View File

@ -126,7 +126,7 @@ Mean/Multilabel-F1.
getMetricDescription (SoftFMeasure _) =
[i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
if a label `foo` is expected for the span 2-9 and this label is returned but with
if a label `foo` is expected for the span 2-9 and this label is returned, but with
the span 8-12, it is counted as 2/8=0.25 instead of 0 or 1 when precision/recall counts
are gathered.
@ -183,9 +183,9 @@ getMetricDescription BIOWeightedF1 =
outContents :: Metric -> String
outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1:0.8 first-name/3:0.75
surname/2 county/1:0.33
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
inwords:1-3 indigits:5
@ -279,8 +279,12 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche
formatDescription :: Metric -> String
formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. They are
not intepreted in any way when the metric is calculated.
formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. Labels are not
interpreted except that they can be accompanied by probabilities
(after a colon): only labels with probabilities >= 0.5 are considered.
This is for compatibility with probabilistic metrics. By default, 1.0 is
assumed as the probability, but it is recommended to add probabilities
formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of
the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such
@ -316,9 +320,10 @@ B-tags and I-tags can accompanied by an extra label after a slash.
scoreExplanation :: EvaluationScheme -> Maybe String
scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
= Just [hereLit|Out of the total 5 labels in the output, 3 are correct (person/1,3, first-name/1 and
first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard,
again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall
is 2/(4/3 + 5/3) = 2/3 = 0.6666|]
first-name/3; only labels with probabilities >= 0.5 are considered, otherwise the probabilities are just
discarded), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard, again 3 were
retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall is 2/(4/3 + 5/3) =
2/3 = 0.6666|]
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
= Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75,
Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|]

View File

@ -368,6 +368,8 @@ main = hspec $ do
runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444
it "information extraction with smart fuzzy matching hardened" $ do
runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555
it "information extraction" $ do
runGEvalTest "multilabel-f1-ie-probs" `shouldReturnAlmost` 0.1111111111
describe "Mean/MultiLabel-F" $ do
it "simple" $ do
runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5

View File

@ -0,0 +1,3 @@
important-person=JOHN_BROWN:0.52 important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
company-name=Foo_Bar profit=1220:0.8223 unwanted=none:0.49
company-name=Whatever important-person=PIERRE_MENARD
1 important-person=JOHN_BROWN:0.52 important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
2 company-name=Foo_Bar profit=1220:0.8223 unwanted=none:0.49
3 company-name=Whatever important-person=PIERRE_MENARD

View File

@ -0,0 +1 @@
--metric MultiLabel-F1

View File

@ -0,0 +1,3 @@
company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard
1 company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
2 company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
3 company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard