From e4a6ed347d68231ea8ca46c010a408e323b8bdf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20Grali=C5=84ski?= Date: Thu, 22 Aug 2019 17:07:32 +0200 Subject: [PATCH] Change the meaning of Soft2D-F1 metric. Now it is averaged per line. --- src/GEval/Core.hs | 15 +++++++++------ src/GEval/CreateChallenge.hs | 4 ++-- src/GEval/MetricsMeta.hs | 31 ++++++++++++++++++++++++++++--- test/Spec.hs | 4 ++-- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index cbe0184..526129f 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -48,6 +48,8 @@ module GEval.Core somethingWrongWithFilesMessage ) where +import Debug.Trace + import GEval.Metric import GEval.EvaluationScheme @@ -649,15 +651,16 @@ gevalCore' (ProbabilisticSoftFMeasure beta) _ = gevalCoreWithoutInput parseAnnot gevalCore' (Soft2DFMeasure beta) _ = gevalCoreWithoutInput parseLabeledClippings parseLabeledClippings - get2DCounts - countAgg - (fMeasureOnCounts beta) + count2DFScore + averageC + id noGraph where parseLabeledClippings = controlledParse lineLabeledClippingsParser - get2DCounts (expected, got) = (coveredBy expected got, - totalArea expected, - totalArea got) + count2DFScore (expected, got) = fMeasureOnCounts beta (tpArea, expArea, gotArea) + where tpArea = coveredBy expected got + expArea = totalArea expected + gotArea = totalArea got gevalCore' ClippEU _ = gevalCoreWithoutInput parseClippingSpecs parseClippings matchStep clippeuAgg finalStep noGraph where diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs index e209fb9..2b72c6e 100644 --- a/src/GEval/CreateChallenge.hs +++ b/src/GEval/CreateChallenge.hs @@ -743,8 +743,8 @@ testExpectedContents MultiLabelLikelihood = testExpectedContents MultiLabelLogLo testExpectedContents MultiLabelLogLoss = [hereLit|SADNESS HATE |] -testExpectedContents (Soft2DFMeasure _) = [hereLit|3/0,0,100,100 -1/10,10,1000,1000 +testExpectedContents (Soft2DFMeasure _) = [hereLit|foo:3/0,0,100,100 
+bar:1/50,50,1000,1000 |] testExpectedContents ClippEU = [hereLit|3/0,0,100,100/10 1/10,10,1000,1000/10 diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs index 1c007df..9720c81 100644 --- a/src/GEval/MetricsMeta.hs +++ b/src/GEval/MetricsMeta.hs @@ -87,6 +87,7 @@ isEvaluationSchemeDescribed _ = False isMetricDescribed :: Metric -> Bool isMetricDescribed (SoftFMeasure _) = True +isMetricDescribed (Soft2DFMeasure _) = True isMetricDescribed _ = False getEvaluationSchemeDescription :: EvaluationScheme -> String @@ -96,18 +97,33 @@ getMetricDescription :: Metric -> String getMetricDescription (SoftFMeasure _) = [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance, if a label `foo` is expected for the span 2-9 and this label is returned but with -the span 8-12, it is counted as 1/4 for recall and 2/5 for precision. +the span 8-12, it is counted as 2/8=0.25 instead of 0 or 1 when precision/recall counts +are gathered. +|] +getMetricDescription (Soft2DFMeasure _) = + [i|"Soft" F-measure on rectangles, i.e. precision and recall are calculated for areas. For instance, +if a label `foo` is expected for the rectangle (0, 0)-(100, 200) and this label is returned but with +the rectangle (50, 100)-(150, 150), it is treated as recall=1/8 and precision=1/2. For each item (line) F-score +is evaluated separately and finally averaged. 
|] outContents :: Metric -> String outContents (SoftFMeasure _) = [hereLit|inwords:1-4 inwords:1-3 indigits:5 |] +outContents (Soft2DFMeasure _) = [hereLit|foo:3/250,130,340,217 +bar:1/0,0,100,200 foo:1/40,50,1000,1000 bar:1/400,600,1000,1000 +|] expectedScore :: EvaluationScheme -> MetricValue -expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall - where precision = 0.25 +expectedScore (EvaluationScheme (SoftFMeasure beta) []) + = let precision = 0.25 recall = 0.75 + in weightedHarmonicMean beta precision recall +expectedScore (EvaluationScheme (Soft2DFMeasure beta) []) + = let precision = 0.21117747440273 + recall = 0.27423822714681 + in (weightedHarmonicMean beta precision recall) / 2.0 listOfAvailableEvaluationSchemes :: [EvaluationScheme] listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics @@ -143,11 +159,20 @@ formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entitie the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such units separated with commas. |] +formatDescription (Soft2DFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of +the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, PAGE is the page number (starting from 1) and +(X0, Y0) and (X1, Y1) are clipping corners. +|] scoreExplanation :: EvaluationScheme -> Maybe String scoreExplanation (EvaluationScheme (SoftFMeasure _) []) = Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75, Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|] +scoreExplanation (EvaluationScheme (Soft2DFMeasure _) []) + = Just [hereLit|The F-score for the first item is 0 (the entity was found in the completely wrong place). +As far as the second item is concerned, the total area covered by the output is 50*150+600*400=247500. 
+Hence, recall is 247500/902500=0.274 and precision is 247500/(20000+912000+240000)=0.211. Therefore, the F-score +for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|] pasteLines :: String -> String -> String pasteLines a b = printf "%-35s %s\n" a b diff --git a/test/Spec.hs b/test/Spec.hs index f635f88..57222e9 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -276,7 +276,7 @@ main = hspec $ do runGEvalTest "probabilistic-soft-f1-calibrated" `shouldReturnAlmost` 0.88888888888 describe "Soft2D-F1" $ do it "simple test" $ do - runGEvalTest "soft2d-f1-simple" `shouldReturnAlmost` 0.30152621462832535 + runGEvalTest "soft2d-f1-simple" `shouldReturnAlmost` 0.218457349437945 describe "test edit-distance library" $ do it "for handling UTF8" $ do levenshteinDistance defaultEditCosts "źdźbło" "źd好bło" `shouldBe` 1 @@ -551,7 +551,7 @@ main = hspec $ do let outFile = tempDir "test-A" "out.tsv" writeFile outFile (outContents metric) obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal - obtainedScore `shouldBe` (expectedScore scheme) + obtainedScore `shouldBeAlmost` (expectedScore scheme) describe "submit" $ do it "current branch" $ do runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"