Add BIOWeightedF1 metric

2021-06-09 22:16:13 +02:00 · 2021-06-09 22:16:13 +02:00 · 0afa1fe0ba
commit 0afa1fe0ba
parent e26275eff2
11 changed files with 97 additions and 5 deletions
--- a/src/GEval/BIO.hs
+++ b/src/GEval/BIO.hs
@ -3,7 +3,7 @@
 module GEval.BIO
       (BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities,
        parseBioSequenceIntoEntitiesWithoutNormalization,
-        TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO,
+        TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO, gatherSeparatedCountsForBIO,
        eraseNormalisation)
       where

@ -16,9 +16,12 @@ import Data.Attoparsec.Combinator
 import Control.Applicative
 import Data.Char
 import Data.Maybe (catMaybes)
+import Data.List (groupBy, sortBy)

 import GEval.Common

+import qualified Data.HashMap.Strict as M
+
 data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text)
                deriving (Eq, Show)

@ -43,6 +46,23 @@ gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, len
  where
    laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b

+-- | Order two entities by their label (the second field of 'TaggedEntity');
+-- the other two fields are ignored.
+compareByLabel :: TaggedEntity -> TaggedEntity -> Ordering
+compareByLabel left right = compare (labelOf left) (labelOf right)
+  where labelOf (TaggedEntity _ lab _) = lab
+
+-- | Check whether two entities carry the same label (the second field of
+-- 'TaggedEntity'); the other two fields are ignored.
+equalLabel :: TaggedEntity -> TaggedEntity -> Bool
+equalLabel left right = labelOf left == labelOf right
+  where labelOf (TaggedEntity _ lab _) = lab
+
+-- | Per-label variant of 'gatherCountsForBIO' for a single item.
+--
+-- The entities of both sides are bucketed by label and every bucket is
+-- scored with 'gatherCountsForBIO', so the result maps each label to the
+-- count triple that function yields for the entities carrying that label.
+--
+-- Labels are taken from the union of @expected@ and @got@. A predicted
+-- entity whose label does not occur in @expected@ for this item still has to
+-- be counted for its label; otherwise it disappears once the per-item maps
+-- are summed up and the precision for that label is overstated.
+gatherSeparatedCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> M.HashMap T.Text (Int, Int, Int)
+gatherSeparatedCountsForBIO expected got = M.mapWithKey process allLabels
+  where expectedMapped = groupEntitiesByLabel expected
+        gotMapped = groupEntitiesByLabel got
+        -- Only the key set matters here; 'process' looks both sides up itself.
+        allLabels = M.union expectedMapped gotMapped
+        -- 'sortBy' is stable, so entities keep their original relative order
+        -- inside each bucket. 'groupBy' never yields an empty group, so the
+        -- comprehension pattern drops nothing, and (unlike a lambda pattern)
+        -- it is total.
+        groupEntitiesByLabel entities =
+          M.fromList [ (lab, grp)
+                     | grp@(TaggedEntity _ lab _ : _) <- groupBy equalLabel (sortBy compareByLabel entities) ]
+        process lab _ = gatherCountsForBIO (M.lookupDefault [] lab expectedMapped)
+                                           (M.lookupDefault [] lab gotMapped)
+
 parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
 parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t)

--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -177,6 +177,7 @@ isPreprocessable LogLoss = False
 isPreprocessable Likelihood = False
 isPreprocessable BIOF1 = False
 isPreprocessable BIOF1Labels = False
+isPreprocessable BIOWeightedF1 = False
 isPreprocessable TokenAccuracy = True
 isPreprocessable SegmentAccuracy = True
 isPreprocessable MAE = False
@ -788,6 +789,11 @@ generalizedProbabilisticFMeasure beta metric = gevalCoreWithoutInput metric
 countAgg :: (Num n, Num v, Monad m) => ConduitM (n, v, v) o m (n, v, v)
 countAgg = CC.foldl countFolder (fromInteger 0, fromInteger 0, fromInteger 0)

+-- | Sink that merges a stream of per-label count maps into a single map.
+-- Starts from the empty map; when a label is present on both sides of a
+-- merge, its two triples are combined with 'countFolder'.
+separatedCountAgg :: Monad m => ConduitM (M.HashMap Text (Int, Int, Int)) o m (M.HashMap Text (Int, Int, Int))
+separatedCountAgg = CC.foldl (M.unionWith countFolder) M.empty
+
+
 countFragAgg :: (Num n, Num v, Monad m) => ConduitM (n, n, v, v) o m (n, n, v, v)
 countFragAgg = CC.foldl countFragFolder (fromInteger 0, fromInteger 0, fromInteger 0, fromInteger 0)

@ -990,6 +996,8 @@ continueGEvalCalculations SABIOF1 BIOF1 = defineContinuation countAgg f1MeasureO

 continueGEvalCalculations SABIOF1Labels BIOF1Labels = defineContinuation countAgg f1MeasureOnCounts noGraph

+continueGEvalCalculations SABIOWeightedF1 BIOWeightedF1 = defineContinuation separatedCountAgg f1MeasureOnSeparatedCounts noGraph
+
 continueGEvalCalculations SASegmentAccuracy SegmentAccuracy = defineContinuation averageC id noGraph

 continueGEvalCalculations SATokenAccuracy TokenAccuracy = defineContinuation hitsAndTotalsAgg
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@ -316,6 +316,7 @@ This a sample challenge for the likelihood metric.
 |] ++ (commonReadmeMDContents testName)

 readmeMDContents BIOF1Labels testName = readmeMDContents BIOF1 testName
+readmeMDContents BIOWeightedF1 testName = readmeMDContents BIOF1 testName
 readmeMDContents BIOF1 testName = [i|
 Tag and normalize names
 =======================
@ -568,6 +569,7 @@ trainContents LogLoss = [hereLit|0.0	Hell, no!!!
 0.0	Boring, boring, boring
 |]
 trainContents BIOF1Labels = trainContents BIOF1
+trainContents BIOWeightedF1 = trainContents BIOF1
 trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND	My name is Bond , James Bond
 O O O O O	There is no name here
 B-firstname/JOHN B-surname/VON I-surname/NEUMANN	John von Nueman
@ -646,6 +648,7 @@ Boring stuff
 That's good
 |]
 devInContents BIOF1Labels = devInContents BIOF1
+devInContents BIOWeightedF1 = devInContents BIOF1
 devInContents BIOF1 = [hereLit|Adam and Eve
 Mr Jan Kowalski
 |]
@ -720,6 +723,7 @@ devExpectedContents LogLoss = [hereLit|1.0
 1.0
 |]
 devExpectedContents BIOF1Labels = devExpectedContents BIOF1
+devExpectedContents BIOWeightedF1 = devExpectedContents BIOF1
 devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
 O B-firstname/JAN B-surname/KOWALSKI
 |]
@ -799,8 +803,10 @@ Super-duper!!
 That is incredibly boring.
 |]
 testInContents BIOF1Labels = testInContents BIOF1
+testInContents BIOWeightedF1 = testInContents BIOF1
 testInContents BIOF1 = [hereLit|Alan Tring
 No name here
+Tarski is NOT here
 |]
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
@ -875,8 +881,10 @@ testExpectedContents LogLoss = [hereLit|1.0
 0.0
 |]
 testExpectedContents BIOF1Labels = testExpectedContents BIOF1
+testExpectedContents BIOWeightedF1 = testExpectedContents BIOF1
 testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
 O O O
+B-surname/TARSKI O O O
 |]
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
@ -945,6 +953,7 @@ inHeaderContents MAP = Just ["Dialect", "PolishPhrase"]
 inHeaderContents Likelihood = inHeaderContents LogLoss
 inHeaderContents LogLoss = Just ["Text"]
 inHeaderContents BIOF1Labels = inHeaderContents BIOF1
+inHeaderContents BIOWeightedF1 = inHeaderContents BIOF1
 inHeaderContents BIOF1 = Just ["Text"]
 inHeaderContents TokenAccuracy = Just ["TokenizedText"]
 inHeaderContents SegmentAccuracy = Just ["Segment"]
@ -976,6 +985,7 @@ outHeaderContents MAP = Nothing
 outHeaderContents Likelihood = outHeaderContents LogLoss
 outHeaderContents LogLoss = Just ["Probability"]
 outHeaderContents BIOF1Labels = outHeaderContents BIOF1
+outHeaderContents BIOWeightedF1 = outHeaderContents BIOF1
 outHeaderContents BIOF1 = Just ["BIOOutput"]
 outHeaderContents TokenAccuracy = Just ["PartsOfSpeech"]
 outHeaderContents SegmentAccuracy = Just ["PartsOfSpeech"]
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@ -28,7 +28,7 @@ import Data.Attoparsec.Text (parseOnly)
 data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accuracy | ClippEU
              | FMeasure Double | MacroFMeasure Double | NMI
              | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
-              | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
+              | BIOF1 | BIOWeightedF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
              | MultiLabelFMeasure Double MatchingSpecification
              | MultiLabelLogLoss | MultiLabelLikelihood
              | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double
@ -78,6 +78,7 @@ instance Show Metric where
  show Likelihood = "Likelihood"
  show BIOF1 = "BIO-F1"
  show BIOF1Labels = "BIO-F1-Labels"
+  show BIOWeightedF1 = "BIO-Weighted-F1"
  show TokenAccuracy = "TokenAccuracy"
  show SegmentAccuracy = "SegmentAccuracy"
  show MAE = "MAE"
@ -161,6 +162,7 @@ instance Read Metric where
  readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
  readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
  readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
+  readsPrec _ ('B':'I':'O':'-':'W':'e':'i':'g':'h':'t':'e':'d':'-':'F':'1': theRest) = [(BIOWeightedF1, theRest)]
  readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
  readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)]
  readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)]
@ -201,6 +203,7 @@ getMetricOrdering MAP = TheHigherTheBetter
 getMetricOrdering LogLoss = TheLowerTheBetter
 getMetricOrdering Likelihood = TheHigherTheBetter
 getMetricOrdering BIOF1 = TheHigherTheBetter
+getMetricOrdering BIOWeightedF1 = TheHigherTheBetter
 getMetricOrdering BIOF1Labels = TheHigherTheBetter
 getMetricOrdering TokenAccuracy = TheHigherTheBetter
 getMetricOrdering SegmentAccuracy = TheHigherTheBetter
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@ -20,7 +20,7 @@ import GEval.Common
 import GEval.BLEU (bleuStep, gleuStep)
 import GEval.WER (werStep)
 import GEval.Clippings (totalArea, coveredBy, clippEUMatchStep)
-import GEval.BIO (gatherCountsForBIO)
+import GEval.BIO (gatherCountsForBIO, gatherSeparatedCountsForBIO)

 import GEval.Probability
 import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPForOneResult, getProbabilisticCounts, getCounts)
@ -45,13 +45,15 @@ import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countL
 import GEval.MatchingSpecification
 import GEval.Haversine

+import qualified Data.HashMap.Strict as M
+
 -- | Helper type so that singleton can be used.
 -- | (The problem is that some metrics are parametrized by Double
 -- | Word32 and this is not handled by the singleton libary.)
 singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | ACER | AAccuracy | AClippEU
                             | AFMeasure | AMacroFMeasure | ANMI
                             | ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
-                             | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification
+                             | ABIOF1 | ABIOWeightedF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification
                             | AMultiLabelLogLoss | AMultiLabelLikelihood
                             | ASoftFMeasure | AProbabilisticMultiLabelFMeasure | AProbabilisticSoftFMeasure | ASoft2DFMeasure
                             | AFLCFMeasure | AHaversine
@ -79,6 +81,7 @@ toHelper MAP = AMAP
 toHelper LogLoss = ALogLoss
 toHelper Likelihood = ALikelihood
 toHelper BIOF1 = ABIOF1
+toHelper BIOWeightedF1 = ABIOWeightedF1
 toHelper BIOF1Labels = ABIOF1Labels
 toHelper TokenAccuracy = ATokenAccuracy
 toHelper SegmentAccuracy = ASegmentAccuracy
@ -125,6 +128,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
  ParsedExpectedType ALogLoss = Double
  ParsedExpectedType ALikelihood = Double
  ParsedExpectedType ABIOF1 = [TaggedEntity]
+  ParsedExpectedType ABIOWeightedF1 = [TaggedEntity]
  ParsedExpectedType ABIOF1Labels = [TaggedEntity]
  ParsedExpectedType ATokenAccuracy = [Text]
  ParsedExpectedType ASegmentAccuracy = [Annotation]
@ -161,6 +165,7 @@ expectedParser SAMAP = splitByTabs
 expectedParser SALogLoss = doubleParser
 expectedParser SALikelihood = doubleParser
 expectedParser SABIOF1 = parseBioSequenceIntoEntities
+expectedParser SABIOWeightedF1 = parseBioSequenceIntoEntities
 expectedParser SABIOF1Labels = parseBioSequenceIntoEntitiesWithoutNormalization
 expectedParser SATokenAccuracy = intoWords
 expectedParser SASegmentAccuracy = parseSegmentAnnotations
@ -211,6 +216,7 @@ outputParser SAMAP = splitByTabs
 outputParser SALogLoss = doubleParser
 outputParser SALikelihood = doubleParser
 outputParser SABIOF1 = parseBioSequenceIntoEntities
+outputParser SABIOWeightedF1 = parseBioSequenceIntoEntities
 outputParser SABIOF1Labels = parseBioSequenceIntoEntitiesWithoutNormalization
 outputParser SATokenAccuracy = intoWords
 outputParser SASegmentAccuracy = parseSegmentAnnotations
@ -232,6 +238,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
  ItemIntermediateRepresentationType AClippEU = (Int, Int, Int)
  ItemIntermediateRepresentationType ANMI = (Text, Text)
  ItemIntermediateRepresentationType ABIOF1 = (Int, Int, Int)
+  ItemIntermediateRepresentationType ABIOWeightedF1 = M.HashMap Text (Int, Int, Int)
  ItemIntermediateRepresentationType ABIOF1Labels = (Int, Int, Int)
  ItemIntermediateRepresentationType ATokenAccuracy = (Int, Int)
  ItemIntermediateRepresentationType AProbabilisticMultiLabelFMeasure = ([Double], [Double], Double, Int)
@ -277,6 +284,7 @@ itemStep SAMAP = uncurry calculateMAPForOneResult
 itemStep SALogLoss = itemLogLossError
 itemStep SALikelihood = itemLogLossError
 itemStep SABIOF1 = uncurry gatherCountsForBIO
+itemStep SABIOWeightedF1 = uncurry gatherSeparatedCountsForBIO
 itemStep SABIOF1Labels = uncurry gatherCountsForBIO
 itemStep SATokenAccuracy = countHitsAndTotals
 itemStep SASegmentAccuracy = uncurry segmentAccuracy
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@ -64,6 +64,7 @@ listOfAvailableMetrics = [RMSE,
                          LogLossHashed defaultLogLossHashedSize,
                          LikelihoodHashed defaultLogLossHashedSize,
                          BIOF1,
+                          BIOWeightedF1,
                          BIOF1Labels,
                          TokenAccuracy,
                          SegmentAccuracy,
@ -105,6 +106,7 @@ isMetricDescribed WER = True
 isMetricDescribed CER = True
 isMetricDescribed SegmentAccuracy = True
 isMetricDescribed Haversine = True
+isMetricDescribed BIOWeightedF1 = True
 isMetricDescribed _ = False

 getEvaluationSchemeDescription :: EvaluationScheme -> String
@ -176,6 +178,9 @@ getMetricDescription Haversine =
  [i|The haversine formula determines the great-circle distance between
 two points on a sphere given their longitudes and latitudes (in degrees).
 |]
+getMetricDescription BIOWeightedF1 =
+  [i|Weighted-average F1-score calculated on output expressed in the BIO format.
+|]

 outContents :: Metric -> String
 outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
@ -206,6 +211,10 @@ N:1-4 V:6-7 A:9-13
 outContents Haversine = [hereLit|39.575264	-56.995928
 29.949932	-90.070116
 |]
+outContents BIOWeightedF1 = [hereLit|B-firstname/ALAN B-surname/TURING
+O O O
+B-surname/TARSKI O B-surname/NOT O
+|]

 expectedScore :: EvaluationScheme -> MetricValue
 expectedScore (EvaluationScheme (MultiLabelFMeasure 1.0 ExactMatch) []) = 0.6666
@ -231,6 +240,8 @@ expectedScore (EvaluationScheme CER [])
  = 0.14814
 expectedScore (EvaluationScheme Haversine [])
  = 1044.2633358563135
+expectedScore (EvaluationScheme BIOWeightedF1 [])
+  = 0.86666666

 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@ -297,6 +308,10 @@ formatDescription CER = [hereLit|Any text, whitespace and punctuation marks are
 formatDescription Haversine = [hereLit|Each line is a latitude and longitude of sphere separated by tabulation,
 e.g. "41.558153 -73.051497".
 |]
+-- Format description text for the BIO-Weighted-F1 metric.
+formatDescription BIOWeightedF1 = [hereLit|Each line is a sequence of tags encoded in the BIO format, i.e. O, B-tag, I-tag;
+B-tags and I-tags can be accompanied by an extra label after a slash.
+|]
+

 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
@ -332,6 +347,11 @@ scoreExplanation (EvaluationScheme CER [])
  = Just [hereLit|The total length of expected output (in characters) is 27. There are 4 errors
 (1 word substituted, 1 inserted, 1 deleted)  in the actual output. Hence,
 CER = (2+1+1) / 27 = 4 / 27 = 0.14814.|]
+scoreExplanation (EvaluationScheme Haversine []) = Nothing
+scoreExplanation (EvaluationScheme BIOWeightedF1 [])
+  = Just [hereLit|There are two labels (firstname and surname, O is not considered). Firstname was
+predicted in the perfect way, hence F1=1, whereas for surname recall is 1, precision - 2/3 and F1 - 4/5.
+The weighted average is (1 * 1 + 2 * 4/5) / 3 = 13/15 = 0.8667.|]

 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
--- a/src/GEval/PrecisionRecall.hs
+++ b/src/GEval/PrecisionRecall.hs
@ -7,9 +7,11 @@ module GEval.PrecisionRecall(calculateMAPForOneResult,
                             precisionAndRecall, precisionAndRecallFromCounts,
                             maxMatch, maxMatchOnOrdered, getCounts, weightedMaxMatch, weightedMaxMatching,
                             getProbabilisticCounts,
-                             countFragFolder)
+                             countFragFolder, fMeasureOnSeparatedCounts, f1MeasureOnSeparatedCounts)
       where

+import Debug.Trace
+
 import GEval.Common
 import GEval.Probability

@ -21,6 +23,7 @@ import Data.List (find, foldl', nub)
 import Data.Algorithm.Munkres
 import qualified Data.Array.IArray as DAI

+import qualified Data.HashMap.Strict as M

 calculateMAPForOneResult :: (Eq a) => [a] -> [a] -> Double
 calculateMAPForOneResult expected got = precisionSum / fromIntegral (length expected)
@ -69,6 +72,15 @@ fMeasureOnFragCounts beta (rC, pC, nbExpected, nbGot) =
  where r = rC /. nbExpected
        p = pC /. nbGot

+-- | 'fMeasureOnSeparatedCounts' with beta fixed at 1.
+f1MeasureOnSeparatedCounts :: M.HashMap a (Int, Int, Int) -> Double
+f1MeasureOnSeparatedCounts = fMeasureOnSeparatedCounts 1.0
+
+-- | Weighted average of the per-key F-measures stored in a map of count
+-- triples.
+--
+-- Every triple is scored with 'fMeasureOnCounts' for the given @beta@ and
+-- weighted by its second component (presumably the number of expected
+-- items for that key -- confirm against the producer of the triples); the
+-- weighted sum is then divided, via '/.', by the sum of those weights.
+fMeasureOnSeparatedCounts :: Double -> M.HashMap a (Int, Int, Int) -> Double
+fMeasureOnSeparatedCounts beta m = weightedSum /. totalWeight
+  where triples = M.elems m
+        weightedSum = sum [ fromIntegral weight * fMeasureOnCounts beta triple
+                          | triple@(_, weight, _) <- triples ]
+        totalWeight = sum [ weight | (_, weight, _) <- triples ]
+
+
 countFolder :: (Num n, Num v) => (n, v, v) -> (n, v, v) -> (n, v, v)
 countFolder (a1, a2, a3) (b1, b2, b3) = (a1+b1, a2+b2, a3+b3)

--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -536,6 +536,8 @@ main = hspec $ do
      runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
    it "check inconsistent input" $ do
      runGEvalTest "bio-f1-error" `shouldThrow` (== UnexpectedData 2 "inconsistent label sequence `B-NAME/JOHN I-FOO/SMITH I-FOO/X`")
+    it "weighted F1" $ do
+      runGEvalTest "bio-weighted-f1-simple" `shouldReturnAlmost` 0.82539682
  describe "automatic decompression" $ do
    it "more complex test" $ do
      runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
--- a/test/bio-weighted-f1-simple/bio-weighted-f1-simple-solution/test-A/out.tsv
+++ b/test/bio-weighted-f1-simple/bio-weighted-f1-simple-solution/test-A/out.tsv
@ -0,0 +1,4 @@
+O B-FOO O O O B-FOO I-FOO I-FOO B-BAR
+B-BAR O B-XYZ
+B-BAZ O B-BAZ O B-BAR
+O B-BAZ I-BAZ B-FOO
--- a/test/bio-weighted-f1-simple/bio-weighted-f1-simple/config.txt
+++ b/test/bio-weighted-f1-simple/bio-weighted-f1-simple/config.txt
@ -0,0 +1 @@
+--metric BIO-Weighted-F1
--- a/test/bio-weighted-f1-simple/bio-weighted-f1-simple/test-A/expected.tsv
+++ b/test/bio-weighted-f1-simple/bio-weighted-f1-simple/test-A/expected.tsv
@ -0,0 +1,4 @@
+O B-FOO O O O B-FOO I-FOO O B-BAR
+B-BAR O O
+B-BAZ B-BAZ B-BAZ O O
+O B-BAZ I-BAZ B-FOO