Fix bug with inconsistent handling of probs in MultiLabel-F1
This commit is contained in:
parent
4b80625cc2
commit
c71bba81f3
@ -680,60 +680,60 @@ gevalCoreOnSources (LikelihoodHashed nbOfBits) = helperLogLossHashed nbOfBits lo
|
||||
|
||||
|
||||
gevalCoreOnSources (Mean (MultiLabelFMeasure beta matchingSpec))
|
||||
= gevalCoreWithoutInputOnItemTargets (Right . intoWords)
|
||||
(Right . getWords)
|
||||
((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForString matchingSpec)))
|
||||
= gevalCoreWithoutInputOnItemTargets intoWords
|
||||
getWords
|
||||
((fMeasureOnCounts beta) . (getWeightedCounts (getMatchingFunctionForText matchingSpec)))
|
||||
averageC
|
||||
id
|
||||
noGraph
|
||||
where
|
||||
-- repeated as below, as it will be refactored into dependent types soon anyway
|
||||
getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
|
||||
getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
|
||||
intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
|
||||
intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
|
||||
getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
|
||||
getWords (PartiallyParsedItemTarget ts) = Right ts
|
||||
intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
|
||||
intoWords (PartiallyParsedItemTarget ts) = Right ts
|
||||
|
||||
gevalCoreOnSources (Mean WER)
|
||||
= gevalCoreWithoutInputOnItemTargets (Right . intoWords)
|
||||
(Right . getWords)
|
||||
= gevalCoreWithoutInputOnItemTargets intoWords
|
||||
getWords
|
||||
((uncurry (/.)) . (uncurry werStep))
|
||||
averageC
|
||||
id
|
||||
noGraph
|
||||
where
|
||||
-- repeated as below, as it will be refactored into dependent types soon anyway
|
||||
getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
|
||||
getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
|
||||
intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
|
||||
intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
|
||||
getWords (RawItemTarget t) = outputParser SAWER t
|
||||
getWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts
|
||||
intoWords (RawItemTarget t) = expectedParser SAWER t
|
||||
intoWords (PartiallyParsedItemTarget ts) = Right $ Prelude.map unpack ts
|
||||
|
||||
gevalCoreOnSources (Mean CER)
|
||||
= gevalCoreWithoutInputOnItemTargets (Right . getString)
|
||||
(Right . getString)
|
||||
= gevalCoreWithoutInputOnItemTargets getString
|
||||
getString
|
||||
((uncurry (/.)) . (uncurry werStep))
|
||||
averageC
|
||||
id
|
||||
noGraph
|
||||
where
|
||||
-- repeated as below, as it will be refactored into dependent types soon anyway
|
||||
getString (RawItemTarget t) = unpack t
|
||||
getString (PartiallyParsedItemTarget ts) = Prelude.unwords $ Prelude.map unpack ts
|
||||
getString (RawItemTarget t) = expectedParser SACER t
|
||||
getString (PartiallyParsedItemTarget ts) = Right $ Prelude.unwords $ Prelude.map unpack ts
|
||||
|
||||
gevalCoreOnSources (Mean _) = error $ "Mean/ meta-metric defined only for MultiLabel-F1, WER and CER for the time being"
|
||||
|
||||
-- only MultiLabel-F1 handled for JSONs for the time being...
|
||||
gevalCoreOnSources (MultiLabelFMeasure beta matchingSpec) =
|
||||
gevalCoreWithoutInputOnItemTargets (Right . intoWords)
|
||||
(Right . getWords)
|
||||
(getWeightedCounts (getMatchingFunctionForString matchingSpec))
|
||||
gevalCoreWithoutInputOnItemTargets intoWords
|
||||
getWords
|
||||
(getWeightedCounts (getMatchingFunctionForText matchingSpec))
|
||||
countAgg
|
||||
(fMeasureOnCounts beta)
|
||||
noGraph
|
||||
where
|
||||
getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
|
||||
getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
|
||||
intoWords (RawItemTarget t) = Prelude.map unpack $ Data.Text.words t
|
||||
intoWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
|
||||
getWords (RawItemTarget t) = outputParser (SAMultiLabelFMeasure SExactMatch) t
|
||||
getWords (PartiallyParsedItemTarget ts) = Right ts
|
||||
intoWords (RawItemTarget t) = expectedParser (SAMultiLabelFMeasure SExactMatch) t
|
||||
intoWords (PartiallyParsedItemTarget ts) = Right ts
|
||||
|
||||
gevalCoreOnSources Pearson = gevalCoreByCorrelationMeasure pearson
|
||||
gevalCoreOnSources Spearman = gevalCoreByCorrelationMeasure spearman
|
||||
|
@ -13,7 +13,7 @@ module GEval.Metric
|
||||
where
|
||||
|
||||
import Data.Word
|
||||
import Data.Text
|
||||
import Data.Text hiding (map)
|
||||
import Data.Monoid ((<>))
|
||||
|
||||
import GEval.Common
|
||||
@ -262,10 +262,11 @@ fixedNumberOfColumnsInInput (ProbabilisticSoftFMeasure _) = False
|
||||
fixedNumberOfColumnsInInput (Soft2DFMeasure _) = False
|
||||
fixedNumberOfColumnsInInput _ = True
|
||||
|
||||
|
||||
perfectOutLineFromExpectedLine :: Metric -> Text -> Text
|
||||
perfectOutLineFromExpectedLine (Mean metric) t = perfectOutLineFromExpectedLine metric t
|
||||
perfectOutLineFromExpectedLine (LogLossHashed _) t = t <> ":1.0"
|
||||
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = t <> ":1.0"
|
||||
perfectOutLineFromExpectedLine (LogLossHashed _) t = addProbOne t
|
||||
perfectOutLineFromExpectedLine (LikelihoodHashed _) t = addProbOne t
|
||||
perfectOutLineFromExpectedLine BLEU t = getFirstColumn t
|
||||
perfectOutLineFromExpectedLine GLEU t = getFirstColumn t
|
||||
perfectOutLineFromExpectedLine ClippEU t = cleanMarginFromClippEU t
|
||||
@ -273,6 +274,9 @@ perfectOutLineFromExpectedLine (Accuracy ExactMatch) t = t
|
||||
perfectOutLineFromExpectedLine (Accuracy _) t = getFirstColumn t
|
||||
perfectOutLineFromExpectedLine _ t = t
|
||||
|
||||
addProbOne :: Text -> Text
|
||||
addProbOne = (<> ":1.0")
|
||||
|
||||
getFirstColumn :: Text -> Text
|
||||
getFirstColumn t = case splitOn "\t" t of
|
||||
[] -> ""
|
||||
@ -280,7 +284,7 @@ getFirstColumn t = case splitOn "\t" t of
|
||||
|
||||
cleanMarginFromClippEU :: Text -> Text
|
||||
cleanMarginFromClippEU t = Data.Text.unwords outs
|
||||
where outs = Prelude.map toOut specs
|
||||
where outs = map toOut specs
|
||||
(Right specs) = parseOnly lineClippingSpecsParser t
|
||||
toOut (ClippingSpec (PageNumber pageNumber) (Rectangle (Point x0 y0) (Point x1 y1)) _) =
|
||||
pack ((show pageNumber) ++ "/" ++ (show x0) ++ "," ++ (show y0) ++ "," ++ (show x1) ++ "," ++ (show y1))
|
||||
|
@ -27,7 +27,7 @@ import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPFo
|
||||
|
||||
import Control.Exception
|
||||
|
||||
import Data.Text
|
||||
import Data.Text hiding (map, maximum, zip)
|
||||
import Data.Text.Read as TR
|
||||
import qualified Data.List.Split as DLS
|
||||
import Data.Attoparsec.Text (parseOnly)
|
||||
@ -41,7 +41,8 @@ import GEval.Annotation (Annotation, ObtainedAnnotation,
|
||||
import GEval.Clippings (Clipping, ClippingSpec, LabeledClipping, lineClippingsParser, lineClippingSpecsParser, lineLabeledClippingsParser)
|
||||
import GEval.BIO (TaggedEntity, parseBioSequenceIntoEntities, parseBioSequenceIntoEntitiesWithoutNormalization)
|
||||
import GEval.LogLossHashed (parseWordSpecs, wordSpecToPair)
|
||||
import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countLogLossOnProbList)
|
||||
import GEval.ProbList (ProbList(..), WordWithProb(..),
|
||||
parseIntoProbList, countLogLossOnProbList, selectByStandardThreshold)
|
||||
import GEval.MatchingSpecification
|
||||
import GEval.Haversine
|
||||
|
||||
@ -222,7 +223,7 @@ outputParser SATokenAccuracy = intoWords
|
||||
outputParser SASegmentAccuracy = parseSegmentAnnotations
|
||||
outputParser SAMAE = doubleParser
|
||||
outputParser SASMAPE = doubleParser
|
||||
outputParser (SAMultiLabelFMeasure _) = intoWords
|
||||
outputParser (SAMultiLabelFMeasure _) = Right . selectByStandardThreshold . parseIntoProbList
|
||||
outputParser SAMultiLabelLogLoss = Right . parseIntoProbList
|
||||
outputParser SAMultiLabelLikelihood = Right . parseIntoProbList
|
||||
outputParser SAHaversine = parseSpherePoints
|
||||
@ -258,7 +259,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
|
||||
ItemIntermediateRepresentationType t = Double
|
||||
|
||||
findBest :: (Text -> Text -> Double) -> (Text -> Text -> Double)
|
||||
findBest fun expected got = Prelude.maximum $ Prelude.map (fun got) expectedVals
|
||||
findBest fun expected got = maximum $ map (fun got) expectedVals
|
||||
where expectedVals = case splitOn "\t" expected of
|
||||
[] -> [""]
|
||||
l -> l
|
||||
@ -310,7 +311,7 @@ intoWords = Right . Data.Text.words
|
||||
|
||||
intoStringWords = Right . Prelude.words . unpack
|
||||
|
||||
alternativeSentencesParser = Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack
|
||||
alternativeSentencesParser = Right . map Prelude.words . DLS.splitOn "\t" . unpack
|
||||
|
||||
onlyStrip = Right . strip
|
||||
|
||||
@ -322,8 +323,8 @@ predictedParser got =
|
||||
case parseWordSpecs got of
|
||||
Right wordSpecs -> if Prelude.null pairs
|
||||
then Nothing
|
||||
else Just $ snd $ Prelude.maximum pairs
|
||||
where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
|
||||
else Just $ snd $ maximum pairs
|
||||
where pairs = catMaybes $ map wordSpecToPair wordSpecs
|
||||
Left _ -> Just got
|
||||
|
||||
splitByTabs = Right . DLS.splitOn "\t" . unpack
|
||||
@ -361,8 +362,8 @@ hitOrMiss (exp, got) =
|
||||
case parseWordSpecs got of
|
||||
Right wordSpecs -> if Prelude.null pairs
|
||||
then 0.0
|
||||
else indicator (exp == (snd $ Prelude.maximum pairs))
|
||||
where pairs = catMaybes $ Prelude.map wordSpecToPair wordSpecs
|
||||
else indicator (exp == (snd $ maximum pairs))
|
||||
where pairs = catMaybes $ map wordSpecToPair wordSpecs
|
||||
Left _ -> indicator ((normalizeProbForAccuracy exp got) == exp)
|
||||
-- if the expected value is 0 or 1 treat values
|
||||
-- between 0.0 and 1.0 as probabilities
|
||||
@ -405,7 +406,7 @@ getSoft2DCounts (expected, got) = (tpArea, expArea, gotArea)
|
||||
|
||||
getFragCounts :: CoverableEntityWithProbability e => ([BareEntity e], [e]) -> (Double, Double, Int, Int)
|
||||
getFragCounts (expected, got)
|
||||
| allDisjoint (Prelude.map getBareEntity got) = (
|
||||
| allDisjoint (map getBareEntity got) = (
|
||||
recallScoreTotal expected got,
|
||||
precisionScoreTotal got expected,
|
||||
Prelude.length expected,
|
||||
@ -418,7 +419,7 @@ countHitsAndTotals (es, os) =
|
||||
then throw $ OtherException "wrong number of tokens"
|
||||
else Prelude.foldl matchFun
|
||||
(0, 0)
|
||||
(Prelude.zip es os)
|
||||
(zip es os)
|
||||
where matchFun :: (Int, Int) -> (Text, Text) -> (Int, Int)
|
||||
matchFun (h, t) (e, o)
|
||||
| e == (pack "*") = (h, t)
|
||||
|
@ -126,7 +126,7 @@ Mean/Multilabel-F1.
|
||||
|]
|
||||
getMetricDescription (SoftFMeasure _) =
|
||||
[i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
|
||||
if a label `foo` is expected for the span 2-9 and this label is returned but with
|
||||
if a label `foo` is expected for the span 2-9 and this label is returned, but with
|
||||
the span 8-12, it is counted as 2/8=0.25 instead of 0 or 1 when precision/recall counts
|
||||
are gathered.
|
||||
|]
|
||||
@ -183,9 +183,9 @@ getMetricDescription BIOWeightedF1 =
|
||||
|]
|
||||
|
||||
outContents :: Metric -> String
|
||||
outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
|
||||
surname/2
|
||||
first-name/3
|
||||
outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1:0.8 first-name/3:0.75
|
||||
surname/2 county/1:0.33
|
||||
first-name/3:0.52
|
||||
|]
|
||||
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
|
||||
inwords:1-3 indigits:5
|
||||
@ -279,8 +279,12 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche
|
||||
IF YOU WANT TO HAVE IT DESCRIBED|]
|
||||
|
||||
formatDescription :: Metric -> String
|
||||
formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. They are
|
||||
not intepreted in any way when the metric is calculated.
|
||||
formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any label separated by spaces can be used. Labels are not
|
||||
interpreted except that they can be accompanied by probabilities
|
||||
(after a colon): only labels with probabilities >= 0.5 are considered.
|
||||
This is for compatibility with probalistic metrics. By default, 1.0 is
|
||||
assumed as the probability, but it is recommended to add probabilities
|
||||
explicitly.
|
||||
|]
|
||||
formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of
|
||||
the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such
|
||||
@ -316,9 +320,10 @@ B-tags and I-tags can accompanied by an extra label after a slash.
|
||||
scoreExplanation :: EvaluationScheme -> Maybe String
|
||||
scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
|
||||
= Just [hereLit|Out of the total 5 labels in the output, 3 are correct (person/1,3, first-name/1 and
|
||||
first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard,
|
||||
again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall
|
||||
is 2/(4/3 + 5/3) = 2/3 = 0.6666|]
|
||||
first-name/3, only labels with probabilities >= 0.5 are considered, otherwise the probabilities are just
|
||||
discarded), hence precision is 3/5=0.6, whereas out of the 4 labels in gold standard, again 3 were
|
||||
retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall is 2/(4/3 + 5/3) =
|
||||
= 2/3 = 0.6666|]
|
||||
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
|
||||
= Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75,
|
||||
Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|]
|
||||
|
@ -368,6 +368,8 @@ main = hspec $ do
|
||||
runGEvalTest "multilabel-f1-ie-fuzzy-smart" `shouldReturnAlmost` 0.598444
|
||||
it "information extraction with smart fuzzy matching hardened" $ do
|
||||
runGEvalTest "multilabel-f1-ie-fuzzy-harden" `shouldReturnAlmost` 0.555555555
|
||||
it "information extraction" $ do
|
||||
runGEvalTest "multilabel-f1-ie-probs" `shouldReturnAlmost` 0.1111111111
|
||||
describe "Mean/MultiLabel-F" $ do
|
||||
it "simple" $ do
|
||||
runGEvalTest "mean-multilabel-f1-simple" `shouldReturnAlmost` 0.5
|
||||
|
@ -0,0 +1,3 @@
|
||||
important-person=JOHN_BROWN:0.52 important-person=JOHN_SMITH company-name=Axaxaxaxas_Mlo profit=12031
|
||||
company-name=Foo_Bar profit=1220:0.8223 unwanted=none:0.49
|
||||
company-name=Whatever important-person=PIERRE_MENARD
|
|
@ -0,0 +1 @@
|
||||
--metric MultiLabel-F1
|
@ -0,0 +1,3 @@
|
||||
company-name=Axaxaxas_Mlö profit=12031 important-person=John_Smith important-person=James_Brown
|
||||
company-name=Orbis_Tertius profit=1020 important-person=Anna_Smith
|
||||
company-name=Whatever_Inc profit=5600 important-person=Pierre_Menard
|
|
Loading…
Reference in New Issue
Block a user