From 5998f8a31602d0b6337f7c1de37a785f4dcd7fdf Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Sat, 7 Sep 2019 15:48:13 +0200 Subject: [PATCH] Documentation on Probabilistic-MultiLabel-F1 metric --- .gitignore | 2 ++ src/GEval/CreateChallenge.hs | 24 ++++++++++++++---------- src/GEval/MetricsMeta.hs | 26 ++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index abf0f01..18cbbd7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *~ .stack-work +.shake/ +geval diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs index 2b72c6e..85430eb 100644 --- a/src/GEval/CreateChallenge.hs +++ b/src/GEval/CreateChallenge.hs @@ -297,6 +297,7 @@ in the expected file (but not in the output file). |] ++ (commonReadmeMDContents testName) +readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName readmeMDContents (MultiLabelFMeasure beta) testName = [i| Tag names and their component ============================= @@ -308,9 +309,7 @@ Tags: * surname * first-name -For each tag a sequence of token IDs separated with commas should be given (after a colon). - -The metric is F1 on labels. +For each tag a sequence of token IDs separated with commas should be given (after a slash). 
|] ++ (commonReadmeMDContents testName) readmeMDContents MultiLabelLikelihood testName = readmeMDContents MultiLabelLogLoss testName @@ -474,9 +473,10 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman trainContents TokenAccuracy = [hereLit|* V N I like cats * * V * N I can see the rainbow |] -trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith person:3,4,5 first-name:4 surname:5 -Steven bloody Brown person:1,3 first-name:1 surname:3 -James and James first-name:1 firstname:3 +trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta) +trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith person/3,4,5 first-name/4 surname/5 +Steven bloody Brown person/1,3 first-name/1 surname/3 +James and James first-name/1 firstname/3 |] trainContents MultiLabelLikelihood = [hereLit|I hate you! HATE Love and hate LOVE HATE @@ -540,6 +540,7 @@ Mr Jan Kowalski devInContents TokenAccuracy = [hereLit|The cats on the mat Ala has a cat |] +devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta) devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here I see him Barbara @@ -603,9 +604,10 @@ O B-firstname/JAN B-surname/KOWALSKI devExpectedContents TokenAccuracy = [hereLit|* N * * N N V * N |] -devExpectedContents (MultiLabelFMeasure _) = [hereLit|person:1,2 first-name:1 surname:2 +devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta) +devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2 -first-name:1 +first-name/1 |] devExpectedContents MultiLabelLikelihood = devExpectedContents MultiLabelLogLoss devExpectedContents MultiLabelLogLoss = [hereLit|LOVE @@ -670,6 +672,7 @@ No name here testInContents TokenAccuracy = [hereLit|I have cats I know |] +testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta) testInContents 
(MultiLabelFMeasure _) = [hereLit|John bloody Smith Nobody is there I saw Marketa @@ -735,9 +738,10 @@ O O O testExpectedContents TokenAccuracy = [hereLit|* V N * V |] -testExpectedContents (MultiLabelFMeasure _) = [hereLit|person:1,3 first-name:1 surname:3 +testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta) +testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3 -first-name:3 +first-name/3 |] testExpectedContents MultiLabelLikelihood = testExpectedContents MultiLabelLogLoss testExpectedContents MultiLabelLogLoss = [hereLit|SADNESS diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs index 84c0fb0..f65cde8 100644 --- a/src/GEval/MetricsMeta.hs +++ b/src/GEval/MetricsMeta.hs @@ -47,6 +47,9 @@ listOfAvailableMetrics = [RMSE, MultiLabelFMeasure 1.0, MultiLabelFMeasure 2.0, MultiLabelFMeasure 0.25, + ProbabilisticMultiLabelFMeasure 1.0, + ProbabilisticMultiLabelFMeasure 2.0, + ProbabilisticMultiLabelFMeasure 0.25, MultiLabelLikelihood, MAP, BLEU, @@ -88,6 +91,7 @@ isEvaluationSchemeDescribed _ = False isMetricDescribed :: Metric -> Bool isMetricDescribed (SoftFMeasure _) = True isMetricDescribed (Soft2DFMeasure _) = True +isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True isMetricDescribed _ = False getEvaluationSchemeDescription :: EvaluationScheme -> String @@ -106,6 +110,15 @@ if a label `foo` is expected for the rectangle (0, 0)-(100, 200) and this label the span (50, 100)-(150, 150), it is treated as recall=1/8 and precision=1/2. For each item (line) F-score is evaluated separately and finally averaged. |] +getMetricDescription (ProbabilisticMultiLabelFMeasure _) = + [i|F-measure generalised so that labels could be annotated with probabilities and the quality +of probabilities is assessed as well. It is calculated as the harmonic mean of calibration and recall +where calibration measures the quality of probabilities (how well they are calibrated, e.g. 
+if we have 10 items with probability 0.5 and 5 of them are correct, then the calibration +is perfect). +|] + + outContents :: Metric -> String outContents (SoftFMeasure _) = [hereLit|inwords:1-4 @@ -114,6 +127,10 @@ inwords:1-3 indigits:5 outContents (Soft2DFMeasure _) = [hereLit|foo:3/250,130,340,217 bar:1/0,0,100,200 foo:1/40,50,1000,1000 bar:1/400,600,1000,1000 |] +outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surname/3:1.0 +surname/1:0.4 +first-name/3:0.9 +|] expectedScore :: EvaluationScheme -> MetricValue expectedScore (EvaluationScheme (SoftFMeasure beta) []) @@ -124,6 +141,10 @@ expectedScore (EvaluationScheme (Soft2DFMeasure beta) []) = let precision = 0.211622914314256 recall = 0.2749908502976 in (weightedHarmonicMean beta precision recall) / 2.0 +expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) []) + = let precision = 0.6569596940847289 + recall = 0.675 + in weightedHarmonicMean beta precision recall listOfAvailableEvaluationSchemes :: [EvaluationScheme] listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics @@ -163,6 +184,10 @@ formatDescription (Soft2DFMeasure _) = [hereLit|Each line is a sequence of entit the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page number (starting from 1) and (X0, Y0) and (X1, Y1) are clipping corners. |] +formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability +can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed. +|] + scoreExplanation :: EvaluationScheme -> Maybe String scoreExplanation (EvaluationScheme (SoftFMeasure _) []) @@ -173,6 +198,7 @@ scoreExplanation (EvaluationScheme (Soft2DFMeasure _) []) As far as the second item is concerned, the total area that covered by the output is 50*150+600*400=247500. Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000)=0.211. 
Therefore, the F-score for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|] +scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing pasteLines :: String -> String -> String pasteLines a b = printf "%-35s %s\n" a b