From 0afa1fe0bab3e5c39e568f40e5a4d44932dc2fa2 Mon Sep 17 00:00:00 2001
From: Filip Gralinski <filipg@amu.edu.pl>
Date: Wed, 9 Jun 2021 22:16:13 +0200
Subject: [PATCH] Add BIOWeightedF1 metric

---
 src/GEval/BIO.hs                              | 22 ++++++++++++++++++-
 src/GEval/Core.hs                             |  8 +++++++
 src/GEval/CreateChallenge.hs                  | 10 +++++++++
 src/GEval/Metric.hs                           |  5 ++++-
 src/GEval/MetricsMechanics.hs                 | 12 ++++++++--
 src/GEval/MetricsMeta.hs                      | 20 +++++++++++++++++
 src/GEval/PrecisionRecall.hs                  | 14 +++++++++++-
 test/Spec.hs                                  |  2 ++
 .../test-A/out.tsv                            |  4 ++++
 .../bio-weighted-f1-simple/config.txt         |  1 +
 .../test-A/expected.tsv                       |  4 ++++
 11 files changed, 97 insertions(+), 5 deletions(-)
 create mode 100644 test/bio-weighted-f1-simple/bio-weighted-f1-simple-solution/test-A/out.tsv
 create mode 100644 test/bio-weighted-f1-simple/bio-weighted-f1-simple/config.txt
 create mode 100644 test/bio-weighted-f1-simple/bio-weighted-f1-simple/test-A/expected.tsv

diff --git a/src/GEval/BIO.hs b/src/GEval/BIO.hs
index f909dd3..acffa80 100644
--- a/src/GEval/BIO.hs
+++ b/src/GEval/BIO.hs
@@ -3,7 +3,7 @@
 module GEval.BIO
        (BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities,
         parseBioSequenceIntoEntitiesWithoutNormalization,
-        TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO,
+        TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO, gatherSeparatedCountsForBIO,
         eraseNormalisation)
        where
 
@@ -16,9 +16,12 @@ import Data.Attoparsec.Combinator
 import Control.Applicative
 import Data.Char
 import Data.Maybe (catMaybes)
+import Data.List (groupBy, sortBy)
 
 import GEval.Common
 
+import qualified Data.HashMap.Strict as M
+
 data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text)
                 deriving (Eq, Show)
 
@@ -43,6 +46,23 @@ gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, len
   where
     laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b
 
+compareByLabel :: TaggedEntity -> TaggedEntity -> Ordering  -- orders two entities by their label field; the other two fields are ignored
+compareByLabel (TaggedEntity _ lhs _) (TaggedEntity _ rhs _) = compare lhs rhs
+
+equalLabel :: TaggedEntity -> TaggedEntity -> Bool  -- True when both entities carry the same label; the other two fields are ignored
+equalLabel entityA entityB = compareByLabel entityA entityB == EQ
+
+gatherSeparatedCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> M.HashMap T.Text (Int, Int, Int)  -- one gatherCountsForBIO triple per label
+gatherSeparatedCountsForBIO expected got = M.mapWithKey process expectedMapped  -- result keys are exactly the labels found in `expected`
+  where expectedMapped = groupEntitiesByLabel expected
+        gotMapped = groupEntitiesByLabel got
+        groupEntitiesByLabel =  -- builds a map from label to the entities carrying that label
+          M.fromList
+          . map (\l@((TaggedEntity _ lab _):_) -> (lab, l))  -- groupBy never yields an empty group, so the (_:_) pattern cannot fail here
+          . groupBy equalLabel
+          . sortBy compareByLabel  -- sorting first makes equal labels adjacent, which groupBy requires
+        process lab expectedGroup = gatherCountsForBIO expectedGroup (M.lookupDefault [] lab gotMapped)  -- NOTE(review): a label occurring only in `got` yields no key, so its entities are not counted for this item - confirm this is intended
+
 parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
 parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t)
 
diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index ad4b90e..00d7361 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -177,6 +177,7 @@ isPreprocessable LogLoss = False
 isPreprocessable Likelihood = False
 isPreprocessable BIOF1 = False
 isPreprocessable BIOF1Labels = False
+isPreprocessable BIOWeightedF1 = False
 isPreprocessable TokenAccuracy = True
 isPreprocessable SegmentAccuracy = True
 isPreprocessable MAE = False
@@ -788,6 +789,11 @@ generalizedProbabilisticFMeasure beta metric = gevalCoreWithoutInput metric
 countAgg :: (Num n, Num v, Monad m) => ConduitM (n, v, v) o m (n, v, v)
 countAgg = CC.foldl countFolder (fromInteger 0, fromInteger 0, fromInteger 0)
 
+separatedCountAgg :: Monad m => ConduitM (M.HashMap Text (Int, Int, Int)) o m (M.HashMap Text (Int, Int, Int))
+separatedCountAgg =  -- folds the stream of per-label count maps into one map, starting from the empty map
+  CC.foldl (M.unionWith countFolder) M.empty  -- a label present on both sides has its two triples combined by countFolder
+
+
 countFragAgg :: (Num n, Num v, Monad m) => ConduitM (n, n, v, v) o m (n, n, v, v)
 countFragAgg = CC.foldl countFragFolder (fromInteger 0, fromInteger 0, fromInteger 0, fromInteger 0)
 
@@ -990,6 +996,8 @@ continueGEvalCalculations SABIOF1 BIOF1 = defineContinuation countAgg f1MeasureO
 
 continueGEvalCalculations SABIOF1Labels BIOF1Labels = defineContinuation countAgg f1MeasureOnCounts noGraph
 
+continueGEvalCalculations SABIOWeightedF1 BIOWeightedF1 = defineContinuation separatedCountAgg f1MeasureOnSeparatedCounts noGraph
+
 continueGEvalCalculations SASegmentAccuracy SegmentAccuracy = defineContinuation averageC id noGraph
 
 continueGEvalCalculations SATokenAccuracy TokenAccuracy = defineContinuation hitsAndTotalsAgg
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index a52c663..41129e6 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -316,6 +316,7 @@ This a sample challenge for the likelihood metric.
 |] ++ (commonReadmeMDContents testName)
 
 readmeMDContents BIOF1Labels testName = readmeMDContents BIOF1 testName
+readmeMDContents BIOWeightedF1 testName = readmeMDContents BIOF1 testName
 readmeMDContents BIOF1 testName = [i|
 Tag and normalize names
 =======================
@@ -568,6 +569,7 @@ trainContents LogLoss = [hereLit|0.0	Hell, no!!!
 0.0	Boring, boring, boring
 |]
 trainContents BIOF1Labels = trainContents BIOF1
+trainContents BIOWeightedF1 = trainContents BIOF1
 trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND	My name is Bond , James Bond
 O O O O O	There is no name here
 B-firstname/JOHN B-surname/VON I-surname/NEUMANN	John von Nueman
@@ -646,6 +648,7 @@ Boring stuff
 That's good
 |]
 devInContents BIOF1Labels = devInContents BIOF1
+devInContents BIOWeightedF1 = devInContents BIOF1
 devInContents BIOF1 = [hereLit|Adam and Eve
 Mr Jan Kowalski
 |]
@@ -720,6 +723,7 @@ devExpectedContents LogLoss = [hereLit|1.0
 1.0
 |]
 devExpectedContents BIOF1Labels = devExpectedContents BIOF1
+devExpectedContents BIOWeightedF1 = devExpectedContents BIOF1
 devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
 O B-firstname/JAN B-surname/KOWALSKI
 |]
@@ -799,8 +803,10 @@ Super-duper!!
 That is incredibly boring.
 |]
 testInContents BIOF1Labels = testInContents BIOF1
+testInContents BIOWeightedF1 = testInContents BIOF1
 testInContents BIOF1 = [hereLit|Alan Tring
 No name here
+Tarski is NOT here
 |]
 testInContents TokenAccuracy = [hereLit|I have cats
 I know
@@ -875,8 +881,10 @@ testExpectedContents LogLoss = [hereLit|1.0
 0.0
 |]
 testExpectedContents BIOF1Labels = testExpectedContents BIOF1
+testExpectedContents BIOWeightedF1 = testExpectedContents BIOF1
 testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
 O O O
+B-surname/TARSKI O O O
 |]
 testExpectedContents TokenAccuracy = [hereLit|* V N
 * V
@@ -945,6 +953,7 @@ inHeaderContents MAP = Just ["Dialect", "PolishPhrase"]
 inHeaderContents Likelihood = inHeaderContents LogLoss
 inHeaderContents LogLoss = Just ["Text"]
 inHeaderContents BIOF1Labels = inHeaderContents BIOF1
+inHeaderContents BIOWeightedF1 = inHeaderContents BIOF1
 inHeaderContents BIOF1 = Just ["Text"]
 inHeaderContents TokenAccuracy = Just ["TokenizedText"]
 inHeaderContents SegmentAccuracy = Just ["Segment"]
@@ -976,6 +985,7 @@ outHeaderContents MAP = Nothing
 outHeaderContents Likelihood = outHeaderContents LogLoss
 outHeaderContents LogLoss = Just ["Probability"]
 outHeaderContents BIOF1Labels = outHeaderContents BIOF1
+outHeaderContents BIOWeightedF1 = outHeaderContents BIOF1
 outHeaderContents BIOF1 = Just ["BIOOutput"]
 outHeaderContents TokenAccuracy = Just ["PartsOfSpeech"]
 outHeaderContents SegmentAccuracy = Just ["PartsOfSpeech"]
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index 8597600..2b04067 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -28,7 +28,7 @@ import Data.Attoparsec.Text (parseOnly)
 data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | CER | Accuracy | ClippEU
               | FMeasure Double | MacroFMeasure Double | NMI
               | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
-              | BIOF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
+              | BIOF1 | BIOWeightedF1 | BIOF1Labels | TokenAccuracy | SegmentAccuracy | LikelihoodHashed Word32 | MAE | SMAPE
               | MultiLabelFMeasure Double MatchingSpecification
               | MultiLabelLogLoss | MultiLabelLikelihood
               | SoftFMeasure Double | ProbabilisticMultiLabelFMeasure Double
@@ -78,6 +78,7 @@ instance Show Metric where
   show Likelihood = "Likelihood"
   show BIOF1 = "BIO-F1"
   show BIOF1Labels = "BIO-F1-Labels"
+  show BIOWeightedF1 = "BIO-Weighted-F1"
   show TokenAccuracy = "TokenAccuracy"
   show SegmentAccuracy = "SegmentAccuracy"
   show MAE = "MAE"
@@ -161,6 +162,7 @@ instance Read Metric where
   readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
   readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
   readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
+  readsPrec _ ('B':'I':'O':'-':'W':'e':'i':'g':'h':'t':'e':'d':'-':'F':'1': theRest) = [(BIOWeightedF1, theRest)]
   readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
   readsPrec _ ('T':'o':'k':'e':'n':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(TokenAccuracy, theRest)]
   readsPrec _ ('S':'e':'g':'m':'e':'n':'t':'A':'c':'c':'u':'r':'a':'c':'y':theRest) = [(SegmentAccuracy, theRest)]
@@ -201,6 +203,7 @@ getMetricOrdering MAP = TheHigherTheBetter
 getMetricOrdering LogLoss = TheLowerTheBetter
 getMetricOrdering Likelihood = TheHigherTheBetter
 getMetricOrdering BIOF1 = TheHigherTheBetter
+getMetricOrdering BIOWeightedF1 = TheHigherTheBetter
 getMetricOrdering BIOF1Labels = TheHigherTheBetter
 getMetricOrdering TokenAccuracy = TheHigherTheBetter
 getMetricOrdering SegmentAccuracy = TheHigherTheBetter
diff --git a/src/GEval/MetricsMechanics.hs b/src/GEval/MetricsMechanics.hs
index 6e66336..ff4be79 100644
--- a/src/GEval/MetricsMechanics.hs
+++ b/src/GEval/MetricsMechanics.hs
@@ -20,7 +20,7 @@ import GEval.Common
 import GEval.BLEU (bleuStep, gleuStep)
 import GEval.WER (werStep)
 import GEval.Clippings (totalArea, coveredBy, clippEUMatchStep)
-import GEval.BIO (gatherCountsForBIO)
+import GEval.BIO (gatherCountsForBIO, gatherSeparatedCountsForBIO)
 
 import GEval.Probability
 import GEval.PrecisionRecall (weightedMaxMatch, fMeasureOnCounts, calculateMAPForOneResult, getProbabilisticCounts, getCounts)
@@ -45,13 +45,15 @@ import GEval.ProbList (ProbList(..), parseIntoProbList, WordWithProb(..), countL
 import GEval.MatchingSpecification
 import GEval.Haversine
 
+import qualified Data.HashMap.Strict as M
+
 -- | Helper type so that singleton can be used.
 -- | (The problem is that some metrics are parametrized by Double
 -- | Word32 and this is not handled by the singleton libary.)
 singletons [d|data AMetric = ARMSE | AMSE | APearson | ASpearman | ABLEU | AGLEU | AWER | ACER | AAccuracy | AClippEU
                              | AFMeasure | AMacroFMeasure | ANMI
                              | ALogLossHashed | ACharMatch | AMAP | ALogLoss | ALikelihood
-                             | ABIOF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification
+                             | ABIOF1 | ABIOWeightedF1 | ABIOF1Labels | ATokenAccuracy | ASegmentAccuracy | ALikelihoodHashed | AMAE | ASMAPE | AMultiLabelFMeasure MatchingSpecification
                              | AMultiLabelLogLoss | AMultiLabelLikelihood
                              | ASoftFMeasure | AProbabilisticMultiLabelFMeasure | AProbabilisticSoftFMeasure | ASoft2DFMeasure
                              | AFLCFMeasure | AHaversine
@@ -79,6 +81,7 @@ toHelper MAP = AMAP
 toHelper LogLoss = ALogLoss
 toHelper Likelihood = ALikelihood
 toHelper BIOF1 = ABIOF1
+toHelper BIOWeightedF1 = ABIOWeightedF1
 toHelper BIOF1Labels = ABIOF1Labels
 toHelper TokenAccuracy = ATokenAccuracy
 toHelper SegmentAccuracy = ASegmentAccuracy
@@ -125,6 +128,7 @@ type family ParsedExpectedType (t :: AMetric) :: * where
   ParsedExpectedType ALogLoss = Double
   ParsedExpectedType ALikelihood = Double
   ParsedExpectedType ABIOF1 = [TaggedEntity]
+  ParsedExpectedType ABIOWeightedF1 = [TaggedEntity]
   ParsedExpectedType ABIOF1Labels = [TaggedEntity]
   ParsedExpectedType ATokenAccuracy = [Text]
   ParsedExpectedType ASegmentAccuracy = [Annotation]
@@ -161,6 +165,7 @@ expectedParser SAMAP = splitByTabs
 expectedParser SALogLoss = doubleParser
 expectedParser SALikelihood = doubleParser
 expectedParser SABIOF1 = parseBioSequenceIntoEntities
+expectedParser SABIOWeightedF1 = parseBioSequenceIntoEntities
 expectedParser SABIOF1Labels = parseBioSequenceIntoEntitiesWithoutNormalization
 expectedParser SATokenAccuracy = intoWords
 expectedParser SASegmentAccuracy = parseSegmentAnnotations
@@ -211,6 +216,7 @@ outputParser SAMAP = splitByTabs
 outputParser SALogLoss = doubleParser
 outputParser SALikelihood = doubleParser
 outputParser SABIOF1 = parseBioSequenceIntoEntities
+outputParser SABIOWeightedF1 = parseBioSequenceIntoEntities
 outputParser SABIOF1Labels = parseBioSequenceIntoEntitiesWithoutNormalization
 outputParser SATokenAccuracy = intoWords
 outputParser SASegmentAccuracy = parseSegmentAnnotations
@@ -232,6 +238,7 @@ type family ItemIntermediateRepresentationType (t :: AMetric) :: * where
   ItemIntermediateRepresentationType AClippEU = (Int, Int, Int)
   ItemIntermediateRepresentationType ANMI = (Text, Text)
   ItemIntermediateRepresentationType ABIOF1 = (Int, Int, Int)
+  ItemIntermediateRepresentationType ABIOWeightedF1 = M.HashMap Text (Int, Int, Int)
   ItemIntermediateRepresentationType ABIOF1Labels = (Int, Int, Int)
   ItemIntermediateRepresentationType ATokenAccuracy = (Int, Int)
   ItemIntermediateRepresentationType AProbabilisticMultiLabelFMeasure = ([Double], [Double], Double, Int)
@@ -277,6 +284,7 @@ itemStep SAMAP = uncurry calculateMAPForOneResult
 itemStep SALogLoss = itemLogLossError
 itemStep SALikelihood = itemLogLossError
 itemStep SABIOF1 = uncurry gatherCountsForBIO
+itemStep SABIOWeightedF1 = uncurry gatherSeparatedCountsForBIO
 itemStep SABIOF1Labels = uncurry gatherCountsForBIO
 itemStep SATokenAccuracy = countHitsAndTotals
 itemStep SASegmentAccuracy = uncurry segmentAccuracy
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
index 6f62d5d..8fba402 100644
--- a/src/GEval/MetricsMeta.hs
+++ b/src/GEval/MetricsMeta.hs
@@ -64,6 +64,7 @@ listOfAvailableMetrics = [RMSE,
                           LogLossHashed defaultLogLossHashedSize,
                           LikelihoodHashed defaultLogLossHashedSize,
                           BIOF1,
+                          BIOWeightedF1,
                           BIOF1Labels,
                           TokenAccuracy,
                           SegmentAccuracy,
@@ -105,6 +106,7 @@ isMetricDescribed WER = True
 isMetricDescribed CER = True
 isMetricDescribed SegmentAccuracy = True
 isMetricDescribed Haversine = True
+isMetricDescribed BIOWeightedF1 = True
 isMetricDescribed _ = False
 
 getEvaluationSchemeDescription :: EvaluationScheme -> String
@@ -176,6 +178,9 @@ getMetricDescription Haversine =
   [i|The haversine formula determines the great-circle distance between
 two points on a sphere given their longitudes and latitudes (in degrees).
 |]
+getMetricDescription BIOWeightedF1 =
+  [i|Weighted-average F1-score calculated on output expressed in the BIO format.
+|]
 
 outContents :: Metric -> String
 outContents (MultiLabelFMeasure _ _) = [hereLit|person/1,3 first-name/1 first-name/3
@@ -206,6 +211,10 @@ N:1-4 V:6-7 A:9-13
 outContents Haversine = [hereLit|39.575264	-56.995928
 29.949932	-90.070116
 |]
+outContents BIOWeightedF1 = [hereLit|B-firstname/ALAN B-surname/TURING
+O O O
+B-surname/TARSKI O B-surname/NOT O
+|]
 
 expectedScore :: EvaluationScheme -> MetricValue
 expectedScore (EvaluationScheme (MultiLabelFMeasure 1.0 ExactMatch) []) = 0.6666
@@ -231,6 +240,8 @@ expectedScore (EvaluationScheme CER [])
   = 0.14814
 expectedScore (EvaluationScheme Haversine [])
   = 1044.2633358563135
+expectedScore (EvaluationScheme BIOWeightedF1 [])
+  = 0.86666666
 
 helpMetricParameterMetricsList :: String
 helpMetricParameterMetricsList = intercalate ", " $ map (\s -> (show s) ++ (case extraInfo s of
@@ -297,6 +308,10 @@ formatDescription CER = [hereLit|Any text, whitespace and punctuation marks are
 formatDescription Haversine = [hereLit|Each line is a latitude and longitude of sphere separated by tabulation,
 e.g. "41.558153 -73.051497".
 |]
+formatDescription BIOWeightedF1 = [hereLit|Each line is a sequence of tags encoded in the BIO format, i.e. O, B-tag, I-tag;
+B-tags and I-tags can be accompanied by an extra label after a slash.
+|]
+
 
 scoreExplanation :: EvaluationScheme -> Maybe String
 scoreExplanation (EvaluationScheme (MultiLabelFMeasure _ ExactMatch) [])
@@ -332,6 +347,11 @@ scoreExplanation (EvaluationScheme CER [])
   = Just [hereLit|The total length of expected output (in characters) is 27. There are 4 errors
 (1 word substituted, 1 inserted, 1 deleted)  in the actual output. Hence,
 CER = (2+1+1) / 27 = 4 / 27 = 0.14814.|]
+scoreExplanation (EvaluationScheme Haversine []) = Nothing
+scoreExplanation (EvaluationScheme BIOWeightedF1 [])
+  = Just [hereLit|There are two labels (firstname and surname, O is not considered). Firstname was
+predicted in the perfect way, hence F1=1, whereas for surname recall is 1, precision - 2/3 and F1 - 4/5.
+The weighted average is (1 * 1 + 2 * 4/5) / 3 = 13/15 = 0.8667.|]
 
 pasteLines :: String -> String -> String
 pasteLines a b = printf "%-35s %s\n" a b
diff --git a/src/GEval/PrecisionRecall.hs b/src/GEval/PrecisionRecall.hs
index 15dd33e..a4e889e 100644
--- a/src/GEval/PrecisionRecall.hs
+++ b/src/GEval/PrecisionRecall.hs
@@ -7,9 +7,11 @@ module GEval.PrecisionRecall(calculateMAPForOneResult,
                              precisionAndRecall, precisionAndRecallFromCounts,
                              maxMatch, maxMatchOnOrdered, getCounts, weightedMaxMatch, weightedMaxMatching,
                              getProbabilisticCounts,
-                             countFragFolder)
+                             countFragFolder, fMeasureOnSeparatedCounts, f1MeasureOnSeparatedCounts)
        where
 
+import Debug.Trace
+
 import GEval.Common
 import GEval.Probability
 
@@ -21,6 +23,7 @@ import Data.List (find, foldl', nub)
 import Data.Algorithm.Munkres
 import qualified Data.Array.IArray as DAI
 
+import qualified Data.HashMap.Strict as M
 
 calculateMAPForOneResult :: (Eq a) => [a] -> [a] -> Double
 calculateMAPForOneResult expected got = precisionSum / fromIntegral (length expected)
@@ -69,6 +72,15 @@ fMeasureOnFragCounts beta (rC, pC, nbExpected, nbGot) =
   where r = rC /. nbExpected
         p = pC /. nbGot
 
+f1MeasureOnSeparatedCounts :: M.HashMap a (Int, Int, Int) -> Double  -- fMeasureOnSeparatedCounts specialised to beta = 1.0
+f1MeasureOnSeparatedCounts = fMeasureOnSeparatedCounts 1.0
+
+fMeasureOnSeparatedCounts :: Double -> M.HashMap a (Int, Int, Int) -> Double  -- combines the per-label F-measures, weighting each label by the middle component of its triple
+fMeasureOnSeparatedCounts beta m = sum [fromIntegral w * fMeasureOnCounts beta c | c@(_, w, _) <- perLabel] /. weightSum
+  where perLabel = M.elems m  -- only the count triples matter; the map keys are not used
+        weightSum = sum [w | (_, w, _) <- perLabel]  -- right-hand operand of /. (defined outside this block; its zero-denominator behaviour is not visible here)
+
+
 countFolder :: (Num n, Num v) => (n, v, v) -> (n, v, v) -> (n, v, v)
 countFolder (a1, a2, a3) (b1, b2, b3) = (a1+b1, a2+b2, a3+b3)
 
diff --git a/test/Spec.hs b/test/Spec.hs
index 22834fc..eefc80a 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -536,6 +536,8 @@ main = hspec $ do
       runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
     it "check inconsistent input" $ do
       runGEvalTest "bio-f1-error" `shouldThrow` (== UnexpectedData 2 "inconsistent label sequence `B-NAME/JOHN I-FOO/SMITH I-FOO/X`")
+    it "weighted F1" $ do
+      runGEvalTest "bio-weighted-f1-simple" `shouldReturnAlmost` 0.82539682
   describe "automatic decompression" $ do
     it "more complex test" $ do
       runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
diff --git a/test/bio-weighted-f1-simple/bio-weighted-f1-simple-solution/test-A/out.tsv b/test/bio-weighted-f1-simple/bio-weighted-f1-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..8c2aa2f
--- /dev/null
+++ b/test/bio-weighted-f1-simple/bio-weighted-f1-simple-solution/test-A/out.tsv
@@ -0,0 +1,4 @@
+O B-FOO O O O B-FOO I-FOO I-FOO B-BAR
+B-BAR O B-XYZ
+B-BAZ O B-BAZ O B-BAR
+O B-BAZ I-BAZ B-FOO
diff --git a/test/bio-weighted-f1-simple/bio-weighted-f1-simple/config.txt b/test/bio-weighted-f1-simple/bio-weighted-f1-simple/config.txt
new file mode 100644
index 0000000..80706b6
--- /dev/null
+++ b/test/bio-weighted-f1-simple/bio-weighted-f1-simple/config.txt
@@ -0,0 +1 @@
+--metric BIO-Weighted-F1
diff --git a/test/bio-weighted-f1-simple/bio-weighted-f1-simple/test-A/expected.tsv b/test/bio-weighted-f1-simple/bio-weighted-f1-simple/test-A/expected.tsv
new file mode 100644
index 0000000..3c5ccb0
--- /dev/null
+++ b/test/bio-weighted-f1-simple/bio-weighted-f1-simple/test-A/expected.tsv
@@ -0,0 +1,4 @@
+O B-FOO O O O B-FOO I-FOO O B-BAR
+B-BAR O O
+B-BAZ B-BAZ B-BAZ O O
+O B-BAZ I-BAZ B-FOO