Describe MultiLabel-F1

Filip Gralinski 2021-04-09 15:00:06 +02:00
parent a49abb560b
commit 61eb437909

@@ -95,6 +95,7 @@ isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed met
isEvaluationSchemeDescribed _ = False
isMetricDescribed :: Metric -> Bool
isMetricDescribed (MultiLabelFMeasure 1.0 ExactMatch) = True
isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed (Soft2DFMeasure _) = True
isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
@@ -108,6 +109,17 @@ getEvaluationSchemeDescription :: EvaluationScheme -> String
getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
getMetricDescription :: Metric -> String
getMetricDescription (MultiLabelFMeasure 1.0 ExactMatch) =
[i|F-measure (or F-score), i.e. the harmonic mean of precision and
recall. It can be calculated for any set of labels. Precision is the
fraction of correct labels among the ones in the output, whereas
recall is the fraction of labels in the gold standard that were
correctly retrieved. Counts for precision and recall are calculated
for the whole test set; in other words, it is a micro-average (it is
NOT the case that F-scores are calculated for each document or label
class separately and then averaged). For a macro-average per document,
see Mean/MultiLabel-F1.
|]
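As a quick illustration of the micro-average described above, here is a rough standalone sketch (my own assumption of the computation, not the actual GEval implementation) of micro-averaged F1 over a list of (output labels, gold labels) pairs:

import Data.List (delete)

-- Rough sketch of micro-averaged F1 (hypothetical helper, not GEval's code):
-- true positives, output sizes and gold sizes are summed over the whole test
-- set first, and only then combined into the harmonic mean. Assumes at least
-- one output label and one gold label in the test set.
microF1 :: [([String], [String])] -> Double
microF1 pairs = 2 * p * r / (p + r)
  where
    tp = sum [tpCount out gold | (out, gold) <- pairs]
    p  = fromIntegral tp / fromIntegral (sum [length out  | (out, _)  <- pairs])
    r  = fromIntegral tp / fromIntegral (sum [length gold | (_, gold) <- pairs])
    -- multiset intersection size: each gold label can be matched at most once
    tpCount out gold = length gold - length (foldl (flip delete) gold out)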
getMetricDescription (SoftFMeasure _) =
[i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
if a label `foo` is expected for the span 2-9 and this label is returned but with
@@ -160,6 +172,10 @@ Accuracy is calculated separately for each item and then averaged.
|]
outContents :: Metric -> String
outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
surname/2
first-name/3
|]
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
inwords:1-3 indigits:5
|]
@@ -183,6 +199,7 @@ N:1-4 V:6-7 A:9-13
|]
expectedScore :: EvaluationScheme -> MetricValue
expectedScore (EvaluationScheme (MultiLabelFMeasure 1.0 ExactMatch) []) = 0.6666
expectedScore (EvaluationScheme (SoftFMeasure beta) [])
  = let precision = 0.25
        recall = 0.75
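The hunk cuts off before the combination step; presumably it plugs these numbers into the generic F-beta formula. A minimal sketch of that formula, assuming the standard textbook definition rather than GEval's actual code:

-- Standard F-beta (assumed definition, not copied from GEval):
fBeta :: Double -> Double -> Double -> Double
fBeta beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

-- With beta = 1, precision = 0.25 and recall = 0.75 this gives 0.375,
-- matching the SoftFMeasure score explanation further down.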
@@ -240,6 +257,9 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche
IF YOU WANT TO HAVE IT DESCRIBED|] IF YOU WANT TO HAVE IT DESCRIBED|]
formatDescription :: Metric -> String
formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any labels separated by spaces can be used. They are
not interpreted in any way when the metric is calculated.
|]
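Since labels are uninterpreted, splitting an output line into labels is plain whitespace tokenisation; a trivial illustration (hypothetical, not GEval's actual parser):

-- Hypothetical illustration: labels are whatever `words` yields and are
-- compared only as opaque strings.
parseLabels :: String -> [String]
parseLabels = words
-- parseLabels "person/1,3 first-name/1" == ["person/1,3","first-name/1"]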
formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of
the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such
units separated with commas.
@@ -265,6 +285,11 @@ formatDescription CER = [hereLit|Any text, whitespace and punctuation marks are
|]
scoreExplanation :: EvaluationScheme -> Maybe String
scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
  = Just [hereLit|Out of the 5 labels in the output, 3 are correct (person/1,3, first-name/1 and
first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in the gold standard,
again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall
is 2/(4/3 + 5/3) = 2/3 = 0.6666|]
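This arithmetic can be double-checked against the hypothetical fBeta sketch above:

-- ghci> fBeta 1.0 0.6 0.75
-- 0.6666666666666666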
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
  = Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75,
Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|]