Describe MultiLabel-F1

Filip Gralinski 2021-04-09 15:00:06 +02:00
parent a49abb560b
commit 61eb437909

@@ -95,6 +95,7 @@ isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed met
isEvaluationSchemeDescribed _ = False
isMetricDescribed :: Metric -> Bool
isMetricDescribed (MultiLabelFMeasure 1.0 ExactMatch) = True
isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed (Soft2DFMeasure _) = True
isMetricDescribed (ProbabilisticMultiLabelFMeasure _) = True
@@ -108,6 +109,17 @@ getEvaluationSchemeDescription :: EvaluationScheme -> String
getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
getMetricDescription :: Metric -> String
getMetricDescription (MultiLabelFMeasure 1.0 ExactMatch) =
[i|F-measure (or F-score), i.e. the harmonic mean of precision and
recall. It can be calculated for any set of labels. Precision is the
fraction of correct labels among the ones in the output, whereas
recall is the fraction of labels in the gold standard that were
correctly retrieved. Counts for precision and recall are calculated
for the whole test set; in other words, it is a micro-average (it is
NOT the case that F-scores are calculated for each document or label
class separately and then averaged). For a macro-average per document,
see Mean/MultiLabel-F1.
|]
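As a quick illustration of the micro-average described above, here is a rough standalone sketch (my own assumption of the computation, not the actual GEval implementation) of micro-averaged F1 over a list of (output labels, gold labels) pairs:

import Data.List (delete)

-- Rough sketch of micro-averaged F1 (hypothetical helper, not GEval's code):
-- true positives, output sizes and gold sizes are summed over the whole test
-- set first, and only then combined into the harmonic mean. Assumes at least
-- one output label and one gold label in the test set.
microF1 :: [([String], [String])] -> Double
microF1 pairs = 2 * p * r / (p + r)
  where
    tp = sum [tpCount out gold | (out, gold) <- pairs]
    p  = fromIntegral tp / fromIntegral (sum [length out  | (out, _)  <- pairs])
    r  = fromIntegral tp / fromIntegral (sum [length gold | (_, gold) <- pairs])
    -- multiset intersection size: each gold label can be matched at most once
    tpCount out gold = length gold - length (foldl (flip delete) gold out)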
getMetricDescription (SoftFMeasure _) =
[i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
if a label `foo` is expected for the span 2-9 and this label is returned but with
@@ -160,6 +172,10 @@ Accuracy is calculated separately for each item and then averaged.
|]
outContents :: Metric -> String
outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
surname/2
first-name/3
|]
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
inwords:1-3 indigits:5
|]
@@ -183,6 +199,7 @@ N:1-4 V:6-7 A:9-13
|]
expectedScore :: EvaluationScheme -> MetricValue
expectedScore (EvaluationScheme (MultiLabelFMeasure 1.0 ExactMatch) []) = 0.6666
expectedScore (EvaluationScheme (SoftFMeasure beta) [])
  = let precision = 0.25
        recall = 0.75
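The hunk cuts off before the combination step; presumably it plugs these numbers into the generic F-beta formula. A minimal sketch of that formula, assuming the standard textbook definition rather than GEval's actual code:

-- Standard F-beta (assumed definition, not copied from GEval):
fBeta :: Double -> Double -> Double -> Double
fBeta beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

-- With beta = 1, precision = 0.25 and recall = 0.75 this gives 0.375,
-- matching the SoftFMeasure score explanation further down.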
@@ -240,6 +257,9 @@ formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show sche
IF YOU WANT TO HAVE IT DESCRIBED|] IF YOU WANT TO HAVE IT DESCRIBED|]
formatDescription :: Metric -> String
formatDescription (MultiLabelFMeasure _ ExactMatch) = [hereLit|Any labels separated by spaces can be used. They are
not interpreted in any way when the metric is calculated.
|]
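Since labels are uninterpreted, splitting an output line into labels is plain whitespace tokenisation; a trivial illustration (hypothetical, not GEval's actual parser):

-- Hypothetical illustration: labels are whatever `words` yields and are
-- compared only as opaque strings.
parseLabels :: String -> [String]
parseLabels = words
-- parseLabels "person/1,3 first-name/1" == ["person/1,3","first-name/1"]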
formatDescription (SoftFMeasure _) = [hereLit|Each line is a sequence of entities separated by spaces, each entity is of
the form LABEL:SPAN, where LABEL is any label and SPAN is defined using single integers, intervals or such
units separated with commas.
@@ -265,6 +285,11 @@ formatDescription CER = [hereLit|Any text, whitespace and punctuation marks are
|]
scoreExplanation :: EvaluationScheme -> Maybe String
scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
  = Just [hereLit|Out of the 5 labels in the output, 3 are correct (person/1,3, first-name/1 and
first-name/3), hence precision is 3/5=0.6, whereas out of the 4 labels in the gold standard,
again 3 were retrieved, so recall is 3/4=0.75. The harmonic mean of precision and recall
is 2/(4/3 + 5/3) = 2/3 = 0.6666|]
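This arithmetic can be double-checked against the hypothetical fBeta sketch above:

-- ghci> fBeta 1.0 0.6 0.75
-- 0.6666666666666666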
scoreExplanation (EvaluationScheme (SoftFMeasure _) [])
  = Just [hereLit|We have a partial (0.75) success for the entity `inwords:1-4`, hence Recall = 0.75/1 = 0.75,
Precision = (0 + 0.75 + 0) / 3 = 0.25, so F-score = 0.375|]