From 5998f8a31602d0b6337f7c1de37a785f4dcd7fdf Mon Sep 17 00:00:00 2001
From: Filip Gralinski <filipg@amu.edu.pl>
Date: Sat, 7 Sep 2019 15:48:13 +0200
Subject: [PATCH] Documentation on Probabilistic-MultiLabel-F1 metric

---
 .gitignore                   |  2 ++
 src/GEval/CreateChallenge.hs | 24 ++++++++++++++----------
 src/GEval/MetricsMeta.hs     | 26 ++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index abf0f01..18cbbd7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 *~
 .stack-work
+.shake/
+geval
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index 2b72c6e..85430eb 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -297,6 +297,7 @@ in the expected file (but not in the output file).
 
 |] ++ (commonReadmeMDContents testName)
 
+readmeMDContents (ProbabilisticMultiLabelFMeasure beta) testName = readmeMDContents (MultiLabelFMeasure beta) testName
 readmeMDContents (MultiLabelFMeasure beta) testName = [i|
 Tag names and their component
 =============================
@@ -308,9 +309,7 @@ Tags:
 * surname
 * first-name
 
-For each tag a sequence of token IDs separated with commas should be given (after a colon).
-
-The metric is F1 on labels.
+For each tag a sequence of token IDs separated with commas should be given (after a slash).
 |] ++ (commonReadmeMDContents testName)
 
 readmeMDContents MultiLabelLikelihood testName = readmeMDContents MultiLabelLogLoss testName
@@ -474,9 +473,10 @@ B-firstname/JOHN I-surname/VON I-surname/NEUMANN	John von Nueman
 trainContents TokenAccuracy = [hereLit|* V N	I like cats
 * * V * N	I can see the rainbow
 |]
-trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith	person:3,4,5 first-name:4 surname:5
-Steven bloody Brown	person:1,3 first-name:1 surname:3
-James and James	first-name:1 firstname:3
+trainContents (ProbabilisticMultiLabelFMeasure beta) = trainContents (MultiLabelFMeasure beta)
+trainContents (MultiLabelFMeasure _) = [hereLit|I know Mr John Smith	person/3,4,5 first-name/4 surname/5
+Steven bloody Brown	person/1,3 first-name/1 surname/3
+James and James	first-name/1 firstname/3
 |]
 trainContents MultiLabelLikelihood = [hereLit|I hate you!	HATE
 Love and hate	LOVE HATE
@@ -540,6 +540,7 @@ Mr Jan Kowalski
 devInContents TokenAccuracy = [hereLit|The cats on the mat
 Ala has a cat
 |]
+devInContents (ProbabilisticMultiLabelFMeasure beta) = devInContents (MultiLabelFMeasure beta)
 devInContents (MultiLabelFMeasure _) = [hereLit|Jan Kowalski is here
 I see him
 Barbara
@@ -603,9 +604,10 @@ O B-firstname/JAN B-surname/KOWALSKI
 devExpectedContents TokenAccuracy = [hereLit|* N * * N
 N V * N
 |]
-devExpectedContents (MultiLabelFMeasure _) = [hereLit|person:1,2 first-name:1 surname:2
+devExpectedContents (ProbabilisticMultiLabelFMeasure beta) = devExpectedContents (MultiLabelFMeasure beta)
+devExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,2 first-name/1 surname/2
 
-first-name:1
+first-name/1
 |]
 devExpectedContents MultiLabelLikelihood = devExpectedContents MultiLabelLogLoss
 devExpectedContents MultiLabelLogLoss = [hereLit|LOVE
@@ -670,6 +672,7 @@ No name here
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
 |]
+testInContents (ProbabilisticMultiLabelFMeasure beta) = testInContents (MultiLabelFMeasure beta)
 testInContents (MultiLabelFMeasure _) = [hereLit|John bloody Smith
 Nobody is there
 I saw Marketa
@@ -735,9 +738,10 @@ O O O
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
 |]
-testExpectedContents (MultiLabelFMeasure _) = [hereLit|person:1,3 first-name:1 surname:3
+testExpectedContents (ProbabilisticMultiLabelFMeasure beta) = testExpectedContents (MultiLabelFMeasure beta)
+testExpectedContents (MultiLabelFMeasure _) = [hereLit|person/1,3 first-name/1 surname/3
 
-first-name:3
+first-name/3
 |]
 testExpectedContents MultiLabelLikelihood = testExpectedContents MultiLabelLogLoss
 testExpectedContents MultiLabelLogLoss = [hereLit|SADNESS
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index 84c0fb0..f65cde8 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -47,6 +47,9 @@ listOfAvailableMetrics = [RMSE,
                           MultiLabelFMeasure 1.0,
                           MultiLabelFMeasure 2.0,
                           MultiLabelFMeasure 0.25,
+                          ProbabilisticMultiLabelFMeasure 1.0,
+                          ProbabilisticMultiLabelFMeasure 2.0,
+                          ProbabilisticMultiLabelFMeasure 0.25,
                           MultiLabelLikelihood,
                           MAP,
                           BLEU,
@@ -88,6 +91,7 @@ isEvaluationSchemeDescribed _ = False
 isMetricDescribed :: Metric -> Bool
 isMetricDescribed (SoftFMeasure _) = True
 isMetricDescribed (Soft2DFMeasure _) = True
+isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
 isMetricDescribed _ = False
 
 getEvaluationSchemeDescription :: EvaluationScheme -> String
@@ -106,6 +110,15 @@ if a label `foo` is expected for the rectangle (0, 0)-(100, 200) and this label
 the span (50, 100)-(150, 150), it is treated as recall=1/8 and precision=1/2. For each item (line) F-score
 is evaluated separately and finally averaged.
 |]
+-- Prose description of the Probabilistic-MultiLabel-F1 metric; the beta argument is ignored.
+getMetricDescription (ProbabilisticMultiLabelFMeasure _) =
+  [i|F-measure generalised so that labels could be annotated with probabilities and the quality
+of probabilities is assessed as well. It is calculated as the harmonic mean of calibration and recall
+where calibration measures the quality of probabilities (how well they are calibrated, e.g.
+if we have 10 items with probability 0.5 and 5 of them are correct, then the calibration
+is perfect).
+|]
+
 
 outContents :: Metric -> String
 outContents (SoftFMeasure _) = [hereLit|inwords:1-4
@@ -114,6 +127,10 @@ inwords:1-3 indigits:5
 outContents (Soft2DFMeasure _) = [hereLit|foo:3/250,130,340,217
 bar:1/0,0,100,200 foo:1/40,50,1000,1000 bar:1/400,600,1000,1000
 |]
+outContents (ProbabilisticMultiLabelFMeasure _) = [hereLit|first-name/1:0.8 surname/3:1.0
+surname/1:0.4
+first-name/3:0.9
+|]
 
 expectedScore :: EvaluationScheme -> MetricValue
 expectedScore (EvaluationScheme (SoftFMeasure beta) [])
@@ -124,6 +141,10 @@ expectedScore (EvaluationScheme (Soft2DFMeasure beta) [])
   = let precision = 0.211622914314256
         recall = 0.2749908502976
       in (weightedHarmonicMean beta precision recall) / 2.0
+expectedScore (EvaluationScheme (ProbabilisticMultiLabelFMeasure beta) []) -- plain scheme only: the second field must be the empty list
+  = let precision = 0.6569596940847289 -- NOTE(review): presumably the "calibration" term named in getMetricDescription, for the outContents sample - confirm
+        recall = 0.675
+      in weightedHarmonicMean beta precision recall -- combines the two hard-coded constants above, weighted by beta
 
 listOfAvailableEvaluationSchemes :: [EvaluationScheme]
 listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
@@ -163,6 +184,10 @@ formatDescription (Soft2DFMeasure _) = [hereLit|Each line is a sequence of entit
 the form LABEL:PAGE/X0,Y0,X1,Y1 where LABEL is any label, page is the page number (starting from 1) and
 (X0, Y0) and (X1, Y1) are clipping corners.
 |]
+formatDescription (ProbabilisticMultiLabelFMeasure _) = [hereLit|In each line a number of labels (entities) can be given. A label probability
+can be provided with a colon (e.g. "foo:0.7"). By default, 1.0 is assumed.
+|]
+
 
 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
@@ -173,6 +198,7 @@ scoreExplanation (EvaluationScheme (Soft2DFMeasure _) [])
 As far as the second item is concerned, the total area that covered by the output is 50*150+600*400=247500.
 Hence, recall is 247500/902500=0.274 and precision - 247500/(20000+912000+240000)=0.211. Therefore, the F-score
 for the second item is 0.238 and the F-score for the whole set is (0 + 0.238)/2 = 0.119.|]
+scoreExplanation (EvaluationScheme (ProbabilisticMultiLabelFMeasure _) []) = Nothing
 
 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b