Documentation on Probabilistic-MultiLabel-F1 metric

2019-09-07 15:48:13 +02:00 · 2019-09-07 15:48:13 +02:00 · 5998f8a316
commit 5998f8a316
parent b540cba7da
3 changed files with 42 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@
 *~
 .stack-work
 .shake/
 geval
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@ -297,6 +297,7 @@ in the expected file (but not in the output file).
 |] ++ (commonReadmeMDContents testName)
 readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName
 readmeMDContents (MultiLabelFMeasure beta) testName = [i|
 Tag names and their component
 =============================
@ -308,9 +309,7 @@ Tags:
 * surname
 * first-name
-For each tag a sequence of token IDs separated with commas should be given (after a colon).
+For each tag a sequence of token IDs separated with commas should be given (after a slash).
 The metric is F1 on labels.
 |] ++ (commonReadmeMDContents testName)
 readmeMDContents MultiLabelLikelihood testName = readmeMDContents MultiLabelLogLoss testName
@ -474,9 +473,10 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN	John von Nueman
 trainContents TokenAccuracy = [hereLit|* V N	I like cats
 * * V * N	I can see the rainbow
 |]
-trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith	person:3,4,5 first-name:4 surname:5
+trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta)
-Steven bloody Brown	person:1,3 first-name:1 surname:3
+trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith	person/3,4,5 first-name/4 surname/5
-James and James	first-name:1 firstname:3
+Steven bloody Brown	person/1,3 first-name/1 surname/3
 James and James	first-name/1 firstname/3
 |]
 trainContents MultiLabelLikelihood = [hereLit|I hate you!	HATE
 Love and hate	LOVE HATE
@ -540,6 +540,7 @@ Mr Jan Kowalski
 devInContents TokenAccuracy = [hereLit|The cats on the mat
 Ala has a cat
 |]
 devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta)
 devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here
 I see him
 Barbara
@ -603,9 +604,10 @@ O B-firstname/JAN B-surname/KOWALSKI
 devExpectedContents TokenAccuracy = [hereLit|* N * * N
 N V * N
 |]
-devExpectedContents (MultiLabelFMeasure _) = [hereLit|person:1,2 first-name:1 surname:2
+devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta)
 devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2
-first-name:1
+first-name/1
 |]
 devExpectedContents MultiLabelLikelihood = devExpectedContents MultiLabelLogLoss
 devExpectedContents MultiLabelLogLoss = [hereLit|LOVE
@ -670,6 +672,7 @@ No name here
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
 |]
 testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta)
 testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith
 Nobody is there
 I saw Marketa
@ -735,9 +738,10 @@ O O O
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
 |]
-testExpectedContents (MultiLabelFMeasure _) = [hereLit|person:1,3 first-name:1 surname:3
+testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta)
 testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3
-first-name:3
+first-name/3
 |]
 testExpectedContents MultiLabelLikelihood = testExpectedContents MultiLabelLogLoss
 testExpectedContents MultiLabelLogLoss = [hereLit|SADNESS
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@ -47,6 +47,9 @@ listOfAvailableMetrics = [RMSE,
                          MultiLabelFMeasure 1.0,
                          MultiLabelFMeasure 2.0,
                          MultiLabelFMeasure 0.25,
                          ProbabilisticMultiLabelFMeasure 1.0,
                          ProbabilisticMultiLabelFMeasure 2.0,
                          ProbabilisticMultiLabelFMeasure 0.25,
                          MultiLabelLikelihood,
                          MAP,
                          BLEU,
@ -88,6 +91,7 @@ isEvaluationSchemeDescribed _ = False
 isMetricDescribed :: Metric -> Bool
 isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
 isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
 isMetricDescribed _ = False
 getEvaluationSchemeDescription :: EvaluationScheme -> String
@ -106,6 +110,15 @@ if a label `foo` is expected for the rectangle (0, 0)-(100, 200) and this label
 the span (50, 100)-(150, 150), it is treated as recall=1/8 and precision=1/2. For each item (line) F-score
 is evaluated separately and finally averaged.
 |]
 -- Description text for the probabilistic multi-label F-measure; the beta
 -- argument is ignored, so the same text is returned for every beta.
 getMetricDescription (ProbabilisticMultiLabelFMeasure _) =
  [i|F-measure generalised so that labels could be annotated with probabilities and the quality
 of probabilities is assessed as well. It is calculated as the harmonic mean of calibration and recall
 where calibration measures the quality of probabilities (how well they are calibrated, e.g.
 if we have 10 items with probability 0.5 and 5 of them are correct, then the calibration
 is perfect).
 |]
 outContents :: Metric -> String
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
@ -114,6 +127,10 @@ inwords:1-3 indigits:5
 outContents (Soft2DFMeasure _) = [hereLit|foo:3/250,130,340,217
 bar:1/0,0,100,200 foo:1/40,50,1000,1000 bar:1/400,600,1000,1000
 |]
 outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surname/3:1.0
 surname/1:0.4
 first-name/3:0.9
 |]
 expectedScore :: EvaluationScheme -> MetricValue
 expectedScore (EvaluationScheme (SoftFMeasure beta) [])
@ -124,6 +141,10 @@ expectedScore (EvaluationScheme (Soft2DFMeasure beta) [])
  = let precision = 0.211622914314256
        recall = 0.2749908502976
      in (weightedHarmonicMean beta precision recall) / 2.0
 expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) [])
  = let precision = 0.6569596940847289
        recall = 0.675
      in weightedHarmonicMean beta precision recall
 listOfAvailableEvaluationSchemes :: [EvaluationScheme]
 listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
@ -163,6 +184,10 @@ formatDescription (Soft2DFMeasure _) = [hereLit|Each line is a sequence of entit
 the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page number (starting from 1) and
 (X0, Y0) and (X1, Y1) are clipping corners.
 |]
 formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
 can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
 |]
 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@ -173,6 +198,7 @@ scoreExplanation (EvaluationScheme (Soft2DFMeasure _) [])
 As far as the second item is concerned, the total area that covered by the output is 50*150+600*400=247500.
 Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000)=0.211. Therefore, the F-score
 for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|]
 scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing
 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b