From 8735610745c6663fb89b173137fa010820cc2f26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filip=20Grali=C5=84ski?= <filipg@amu.edu.pl>
Date: Wed, 17 Oct 2018 17:52:43 +0200
Subject: [PATCH] Implement soft f-score

---
 geval.cabal                                   |  2 +
 src/GEval/Annotation.hs                       | 46 +++++++++++++++++++
 src/GEval/Common.hs                           | 15 +++++-
 src/GEval/Core.hs                             | 27 +++++++++--
 src/GEval/CreateChallenge.hs                  | 24 +++++++++-
 src/GEval/OptionsParser.hs                    |  2 +-
 src/GEval/PrecisionRecall.hs                  | 32 ++++++++++---
 stack.yaml                                    |  2 +-
 test/Spec.hs                                  | 25 ++++++++++
 .../soft-f1-perfect-solution/test-A/out.tsv   |  3 ++
 .../soft-f1-perfect/config.txt                |  1 +
 .../soft-f1-perfect/test-A/expected.tsv       |  3 ++
 .../soft-f1-simple-solution/test-A/out.tsv    |  3 ++
 test/soft-f1-simple/soft-f1-simple/config.txt |  1 +
 .../soft-f1-simple/test-A/expected.tsv        |  3 ++
 15 files changed, 173 insertions(+), 16 deletions(-)
 create mode 100644 src/GEval/Annotation.hs
 create mode 100644 test/soft-f1-perfect/soft-f1-perfect-solution/test-A/out.tsv
 create mode 100644 test/soft-f1-perfect/soft-f1-perfect/config.txt
 create mode 100644 test/soft-f1-perfect/soft-f1-perfect/test-A/expected.tsv
 create mode 100644 test/soft-f1-simple/soft-f1-simple-solution/test-A/out.tsv
 create mode 100644 test/soft-f1-simple/soft-f1-simple/config.txt
 create mode 100644 test/soft-f1-simple/soft-f1-simple/test-A/expected.tsv

diff --git a/geval.cabal b/geval.cabal
index ffcf5af..77b0464 100644
--- a/geval.cabal
+++ b/geval.cabal
@@ -36,6 +36,7 @@ library
                      , GEval.FeatureExtractor
                      , GEval.WER
                      , Text.Tokenizer
+                     , GEval.Annotation
                      , Paths_geval
   build-depends:       base >= 4.7 && < 5
                      , cond
@@ -76,6 +77,7 @@ library
                      , uri-encode
                      , MissingH
                      , array
+                     , Munkres
   default-language:    Haskell2010
 
 executable geval
diff --git a/src/GEval/Annotation.hs b/src/GEval/Annotation.hs
new file mode 100644
index 0000000..68f3386
--- /dev/null
+++ b/src/GEval/Annotation.hs
@@ -0,0 +1,46 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module GEval.Annotation
+       (parseAnnotations, Annotation(..), matchScore)
+       where
+
+import qualified Data.IntSet as IS
+import qualified Data.Text as T
+
+import Data.Attoparsec.Text
+import Data.Attoparsec.Combinator
+import Control.Applicative
+import GEval.Common (sepByWhitespaces, (/.))
+import Data.Char
+
+data Annotation = Annotation T.Text IS.IntSet
+                  deriving (Eq, Show)
+
+parseAnnotations :: T.Text -> Either String [Annotation]
+parseAnnotations t = parseOnly (annotationsParser <* endOfInput) t
+
+annotationsParser :: Parser [Annotation]
+annotationsParser = sepByWhitespaces annotationParser
+
+annotationParser :: Parser Annotation
+annotationParser = do
+  label <- takeWhile1 (\c -> not (isSpace c) && c /= ':')
+  string ":"
+  intSet <- intSetParser
+  return $ Annotation label intSet
+
+intSetParser :: Parser IS.IntSet
+intSetParser = IS.unions <$> intervalParser `sepBy` (string ",")
+
+intervalParser :: Parser IS.IntSet
+intervalParser = do
+  startIx <- decimal
+  endIx <- (string "-" *> decimal <|> pure startIx)
+  pure $ IS.fromList [startIx..endIx]
+
+matchScore :: Annotation -> Annotation -> Double
+matchScore (Annotation labelA intSetA) (Annotation labelB intSetB)
+  | labelA == labelB = (intSetLength intersect) /. (intSetLength $ intSetA `IS.union` intSetB)
+  | otherwise = 0.0
+  where intSetLength = Prelude.length . IS.toList
+        intersect = intSetA `IS.intersection` intSetB
diff --git a/src/GEval/Common.hs b/src/GEval/Common.hs
index 38828ab..31fa578 100644
--- a/src/GEval/Common.hs
+++ b/src/GEval/Common.hs
@@ -6,9 +6,20 @@ import Data.Text.Read as TR
 
 import Data.Attoparsec.Text
 
-(/.) :: (Eq a, Integral a) => a -> a -> Double
+-- some operations can be "hard" (on ints) or "soft" (on doubles),
+-- introduce a typeclass so that we could generalise easily
+class ConvertibleToDouble n where
+  toDouble :: n -> Double
+
+instance ConvertibleToDouble Double where
+  toDouble = id
+
+instance ConvertibleToDouble Int where
+  toDouble = fromIntegral
+
+(/.) :: (ConvertibleToDouble f, Integral a) => f -> a -> Double
 x /. 0 = 1.0
-x /. y = (fromIntegral x) / (fromIntegral y)
+x /. y = (toDouble x) / (fromIntegral y)
 
 safeDoubleDiv :: Double -> Double -> Double
 safeDoubleDiv _ 0.0 = 0.0
diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index 56e6293..df7641f 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -71,6 +71,8 @@ import Data.Attoparsec.Text (parseOnly)
 
 import Data.Conduit.SmartSource
 
+import qualified Data.IntSet as IS
+
 import GEval.BLEU
 import GEval.Common
 import GEval.ClippEU
@@ -83,6 +85,7 @@ import GEval.ProbList
 import GEval.WER
 import Data.Conduit.AutoDecompress
 import Text.Tokenizer
+import GEval.Annotation
 
 import qualified Data.HashMap.Strict as M
 import qualified Data.Vector as V
@@ -107,6 +110,7 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
               | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
               | BIOF1 | BIOF1Labels | LikelihoodHashed Word32 | MAE | MultiLabelFMeasure Double
               | MultiLabelLogLoss | MultiLabelLikelihood
+              | SoftFMeasure Double
               deriving (Eq)
 
 instance Show Metric where
@@ -121,6 +125,7 @@ instance Show Metric where
   show ClippEU = "ClippEU"
   show (FMeasure beta) = "F" ++ (show beta)
   show (MacroFMeasure beta) = "Macro-F" ++ (show beta)
+  show (SoftFMeasure beta) = "Soft-F" ++ (show beta)
   show NMI = "NMI"
   show (LogLossHashed nbOfBits) = "LogLossHashed" ++ (if
                                                        nbOfBits == defaultLogLossHashedSize
@@ -165,6 +170,9 @@ instance Read Metric where
   readsPrec p ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'F':theRest) = case readsPrec p theRest of
     [(beta, theRest)] -> [(MultiLabelFMeasure beta, theRest)]
     _ -> []
+  readsPrec p ('S':'o':'f':'t':'-':'F':theRest) = case readsPrec p theRest of
+    [(beta, theRest)] -> [(SoftFMeasure beta, theRest)]
+    _ -> []
   readsPrec p ('L':'o':'g':'L':'o':'s':'s':'H':'a':'s':'h':'e':'d':theRest) = case readsPrec p theRest of
     [(nbOfBits, theRest)] -> [(LogLossHashed nbOfBits, theRest)]
     _ -> [(LogLossHashed defaultLogLossHashedSize, theRest)]
@@ -198,6 +206,7 @@ getMetricOrdering Accuracy = TheHigherTheBetter
 getMetricOrdering ClippEU  = TheHigherTheBetter
 getMetricOrdering (FMeasure _) = TheHigherTheBetter
 getMetricOrdering (MacroFMeasure _) = TheHigherTheBetter
+getMetricOrdering (SoftFMeasure _) = TheHigherTheBetter
 getMetricOrdering NMI = TheHigherTheBetter
 getMetricOrdering (LogLossHashed _) = TheLowerTheBetter
 getMetricOrdering (LikelihoodHashed _) = TheHigherTheBetter
@@ -586,6 +595,7 @@ gevalCore' (FMeasure beta) _ = gevalCoreWithoutInput outParser outParser getCoun
           | prob >= detectionThreshold && prob <= 1.0 = Right True
           | otherwise = Left "expected probability"
         detectionThreshold = 0.5
+        getCount :: (Bool, Bool) -> (Int, Int, Int)
         getCount (True, True)   = (1, 1, 1)
         getCount (True, False)  = (0, 1, 0)
         getCount (False, True)  = (0, 0, 1)
@@ -615,11 +625,20 @@ gevalCore' (MacroFMeasure beta) _ = gevalCoreWithoutInput (Right . Just . strip)
                             insertMaybeToMap (Just c) m = M.insertWith (+) c 1 m
                             macroAverageOnCounts (tpMap, expectedMap, gotMap) =
                               (Prelude.sum
-                               $ Prelude.map (\c -> fMeasureOnCounts beta (M.lookupDefault 0 c tpMap,
+                               $ Prelude.map (\c -> fMeasureOnCounts beta (M.lookupDefault (0::Int) c tpMap,
                                                                          M.lookupDefault 0 c expectedMap,
                                                                          M.lookupDefault 0 c gotMap))
                                $ M.keys expectedMap) / (fromIntegral $ Prelude.length $ M.keys expectedMap)
 
+gevalCore' (SoftFMeasure beta) _ = gevalCoreWithoutInput parseAnnotations
+                                                         parseAnnotations
+                                                         getSoftCounts
+                                                         countAgg
+                                                         (fMeasureOnCounts beta)
+                      where getSoftCounts (expected, got) = (weightedMaxMatch matchScore expected got,
+                                                             Prelude.length expected,
+                                                             Prelude.length got)
+
 gevalCore' ClippEU _ = gevalCoreWithoutInput parseClippingSpecs parseClippings matchStep clippeuAgg finalStep
   where
     parseClippings = controlledParse lineClippingsParser
@@ -680,8 +699,8 @@ gevalCore' MultiLabelLogLoss _ = gevalCoreWithoutInput intoWords
     where
       intoWords = Right . Data.Text.words
 
-countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
-countAgg = CC.foldl countFolder (0, 0, 0)
+countAgg :: (Num n, Monad m) => ConduitM (n, Int, Int) o m (n, Int, Int)
+countAgg = CC.foldl countFolder (fromInteger 0, 0, 0)
 
 gevalCoreByCorrelationMeasure :: (MonadUnliftIO m, MonadThrow m, MonadIO m) =>
                                 (V.Vector (Double, Double) -> Double) -> -- ^ correlation function
@@ -707,7 +726,7 @@ skipLineNumber fun = fun . snd
 -- | A helper function to run evaluation when the input is not needed to calculate the metric value.
 gevalCoreWithoutInput :: (MonadUnliftIO m, MonadThrow m, MonadIO m) =>
                       (Text -> Either String a) ->  -- ^ parser for values in the expected output
-                      (Text -> Either String b) ->  -- ^ parser for values in the output
+                      (Text -> Either String b) ->  -- ^ parser for values in the actual output
                       ((a, b) -> c) ->              -- ^ function which combines parsed values into a single value
                                                   -- (will be launched for each item, e.g. an error/cost function
                                                   -- could be calculated here)
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index dbd114d..557430e 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -110,6 +110,13 @@ GEval sample challenge — guess the language of a first name
 This is a sample/toy classification challenge for Gonito framework with Macro-F-measure as the metric.
 |] ++ (commonReadmeMDContents testName)
 
+readmeMDContents (SoftFMeasure _) testName = [i|
+GEval sample challenge — mark numbers
+=====================================
+
+This is a sample/toy classification challenge for Gonito framework with Soft-F-measure as the metric.
+|] ++ (commonReadmeMDContents testName)
+
 readmeMDContents NMI testName = [i|
 Cluster proverbs
 ================
@@ -358,7 +365,10 @@ pl	Kazimierz
 en	Matthew
 en	Richard
 |]
-
+trainContents (SoftFMeasure _) = [hereLit|indigits:8	I have 3 daughters
+indigits:1-2 indigits:9-12	12 July 1812
+inwords:11-13	I can see two dogs
+|]
 trainContents NMI = [hereLit|pl	Kto pod kim dołki kopie, ten sam w nie wpada.
 en	The pen is mightier than the sword.
 pl	Baba z wozu, koniom lżej.
@@ -424,6 +434,9 @@ devInContents (MacroFMeasure _) = [hereLit|Władysław
 Steven
 Helmut
 |]
+devInContents (SoftFMeasure _) = [hereLit|I have two kids
+7 April 2003
+|]
 devInContents (LikelihoodHashed b) = devInContents (LogLossHashed b)
 devInContents (LogLossHashed _) = [hereLit|Nie kupuj	w worku
 Ona	psa
@@ -472,6 +485,9 @@ devExpectedContents (MacroFMeasure _) = [hereLit|pl
 en
 de
 |]
+devExpectedContents (SoftFMeasure _) = [hereLit|inwords:8-10
+indigits:1 indigits:9-12
+|]
 devExpectedContents NMI = [hereLit|en
 pl
 en
@@ -524,6 +540,9 @@ testInContents (MacroFMeasure _) = [hereLit|Arkadiusz
 Heinrich
 Henry
 |]
+testInContents (SoftFMeasure _) = [hereLit|Nothing
+Four sides
+|]
 testInContents NMI = [hereLit|Fortune favors the bold.
 People who live in glass houses should not throw stones.
 W marcu, jak w garncu.
@@ -575,6 +594,9 @@ testExpectedContents (MacroFMeasure _) = [hereLit|pl
 de
 en
 |]
+testExpectedContents (SoftFMeasure _) = [hereLit|
+inwords:1-4
+|]
 testExpectedContents NMI = [hereLit|en
 en
 pl
diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs
index e842d18..8828b7d 100644
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@@ -169,7 +169,7 @@ metricReader = many $ option auto         -- actually `some` should be used inst
                ( long "metric"            -- --metric might be in the config.txt file...
                  <> short 'm'
                  <> metavar "METRIC"
-                 <> help "Metric to be used - RMSE, MSE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels or CharMatch" )
+                 <> help "Metric to be used - RMSE, MSE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25) or CharMatch" )
 
 altMetricReader :: Parser (Maybe Metric)
 altMetricReader = optional $ option auto
diff --git a/src/GEval/PrecisionRecall.hs b/src/GEval/PrecisionRecall.hs
index ff38ca5..00d3fab 100644
--- a/src/GEval/PrecisionRecall.hs
+++ b/src/GEval/PrecisionRecall.hs
@@ -1,10 +1,11 @@
 {-# LANGUAGE PartialTypeSignatures #-}
+{-# LANGUAGE ScopedTypeVariables #-}
 
 module GEval.PrecisionRecall(calculateMAPForOneResult,
                              fMeasure, f1Measure, f2Measure, precision, recall,
                              fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
                              precisionAndRecall, precisionAndRecallFromCounts,
-                             maxMatch, maxMatchOnOrdered, getCounts)
+                             maxMatch, maxMatchOnOrdered, getCounts, weightedMaxMatch)
        where
 
 import GEval.Common
@@ -14,9 +15,13 @@ import Data.Graph.Inductive.Query.MaxFlow
 
 import Data.List (nub, foldl')
 
+import Data.Algorithm.Munkres
+import qualified Data.Array.IArray as DAI
+
+
 calculateMAPForOneResult :: (Eq a) => [a] -> [a] -> Double
 calculateMAPForOneResult expected got = precisionSum / fromIntegral (length expected)
-  where (_, _, precisionSum) = calculateMAPForOneResultCore expected (nub got)
+  where (_::Int, _, precisionSum) = calculateMAPForOneResultCore expected (nub got)
         calculateMAPForOneResultCore expected got = foldl' (oneMAPStep expected) (0, 0, 0.0) got
         oneMAPStep expected (gotCount, allCount, precisionSum) gotItem
           | gotItem `elem` expected = (newGotCount, newAllCount, precisionSum + (newGotCount /. newAllCount))
@@ -41,19 +46,19 @@ fMeasure beta matchingFun expected got =
   where betaSquared = beta ^ 2
         (p, r) = precisionAndRecall matchingFun expected got
 
-f2MeasureOnCounts :: (Int, Int, Int) -> Double
+f2MeasureOnCounts :: ConvertibleToDouble n => (n, Int, Int) -> Double
 f2MeasureOnCounts = fMeasureOnCounts 2.0
 
-f1MeasureOnCounts :: (Int, Int, Int) -> Double
+f1MeasureOnCounts :: ConvertibleToDouble n => (n, Int, Int) -> Double
 f1MeasureOnCounts = fMeasureOnCounts 1.0
 
-fMeasureOnCounts :: Double -> (Int, Int, Int) -> Double
+fMeasureOnCounts :: ConvertibleToDouble n => Double -> (n, Int, Int) -> Double
 fMeasureOnCounts beta (tp, nbExpected, nbGot) =
   (1 + betaSquared) * p * r `safeDoubleDiv` (betaSquared * p + r)
   where betaSquared = beta ^ 2
         (p, r) = precisionAndRecallFromCounts (tp, nbExpected, nbGot)
 
-countFolder :: (Int, Int, Int) -> (Int, Int, Int) -> (Int, Int, Int)
+countFolder :: Num n => (n, Int, Int) -> (n, Int, Int) -> (n, Int, Int)
 countFolder (a1, a2, a3) (b1, b2, b3) = (a1+b1, a2+b2, a3+b3)
 
 getCounts :: (a -> b -> Bool) -> ([a], [b]) -> (Int, Int, Int)
@@ -72,7 +77,7 @@ precisionAndRecall matchFun expected got
   = precisionAndRecallFromCounts (tp, length expected, length got)
     where tp = maxMatch matchFun expected got
 
-precisionAndRecallFromCounts :: (Int, Int, Int) -> (Double, Double)
+precisionAndRecallFromCounts :: ConvertibleToDouble n => (n, Int, Int) -> (Double, Double)
 precisionAndRecallFromCounts (tp, nbExpected, nbGot) =
   (tp /. nbGot, tp /. nbExpected)
 
@@ -116,3 +121,16 @@ buildGraph matchFun expected got = (b, e, g)
                 return (b,e)
                 where expectedIxs = [2..1+(length expected)]
                       gotIxs = [2+(length expected)..1+(length expected)+(length got)]
+
+-- the weight are assumed to be between 0.0 and 1.0
+weightedMaxMatch :: (a -> b -> Double) -> [a] -> [b] -> Double
+weightedMaxMatch matchFun expected got = (fromIntegral $ length matching) - score
+   where (matching, score) = hungarianMethodDouble complementWeightArray
+                             -- unfortunately `hungarianMethodDouble` looks
+                             -- for minimal bipartite matching
+                             -- rather than the maximal one
+         complementWeightArray = DAI.array ((1, 1), (m, n)) weightList
+         m = length expected
+         n = length got
+         weightList = [((i, j), 1.0 - (matchFun x y)) | (i, x) <- zip [1..m] expected,
+                                                        (j, y) <- zip [1..n] got]
diff --git a/stack.yaml b/stack.yaml
index 478c62b..572ce27 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -1,5 +1,5 @@
 flags: {}
 packages:
 - '.'
-extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3]
+extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1]
 resolver: lts-11.9
diff --git a/test/Spec.hs b/test/Spec.hs
index 3134483..f8badf9 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -18,6 +18,7 @@ import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
 import Text.EditDistance
+import GEval.Annotation
 
 import Data.Map.Strict
 
@@ -32,6 +33,8 @@ import System.IO.Silently
 
 import qualified Test.HUnit as HU
 
+import qualified Data.IntSet as IS
+
 import Data.Conduit.SmartSource
 import Data.Conduit.Rank
 import qualified Data.Conduit.Text as CT
@@ -185,6 +188,10 @@ main = hspec $ do
       precision alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 0.5
       recall alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3, 4, 5] `shouldBeAlmost` 1.0
       f1Measure alwaysMatch ['a', 'b', 'c'] [0, 1, 2, 3 , 4, 5] `shouldBeAlmost` 0.66666666666666
+  describe "max match" $ do
+    it "simple" $ do
+      maxMatch (==) [1,2,2] [3,2] `shouldBe` 1
+      maxMatch (==) [3,2] [1,2,2] `shouldBe` 1
   describe "ClippEU" $ do
     it "parsing rectangles" $ do
       let (Right r) = parseOnly (lineClippingsParser <* endOfInput) "2/0,0,2,3 10/20,30,40,50 18/0,1,500,3 "
@@ -214,6 +221,11 @@ main = hspec $ do
       read "F2" `shouldBe` (FMeasure 2.0)
       read "F1" `shouldBe` (FMeasure 1.0)
       read "F0.5" `shouldBe` (FMeasure 0.5)
+  describe "Soft-F1" $ do
+    it "simple test" $ do
+      runGEvalTest "soft-f1-simple" `shouldReturnAlmost` 0.33333333333333
+    it "perfect test" $ do
+      runGEvalTest "soft-f1-perfect" `shouldReturnAlmost` 1.0
   describe "test edit-distance library" $ do
     it "for handling UTF8" $ do
       levenshteinDistance defaultEditCosts "źdźbło" "źd好bło" `shouldBe` 1
@@ -261,6 +273,19 @@ main = hspec $ do
       gevalCoreOnSingleLines RMSE id (LineInFile (FilePathSpec "stub1") 1 "blabla")
                                      (LineInFile (FilePathSpec "stub2") 1 "3.4")
                                      (LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8
+  describe "Annotation format" $ do
+    it "just parse" $ do
+      parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),
+                                                               Annotation "baz" (IS.fromList [4,5,6])]
+    it "empty" $ do
+      parseAnnotations "" `shouldBe` Right []
+    it "empty (just spaces)" $ do
+      parseAnnotations "   " `shouldBe` Right []
+    it "match score" $ do
+      matchScore (Annotation "x" (IS.fromList [3..6])) (Annotation "y" (IS.fromList [3..6])) `shouldBeAlmost` 0.0
+      matchScore (Annotation "x" (IS.fromList [3..6])) (Annotation "x" (IS.fromList [3..6])) `shouldBeAlmost` 1.0
+      matchScore (Annotation "x" (IS.fromList [123..140])) (Annotation "x" (IS.fromList [125..130])) `shouldBeAlmost` 0.33333
+      matchScore (Annotation "x" (IS.fromList [3..4])) (Annotation "x" (IS.fromList [2..13])) `shouldBeAlmost` 0.1666666
   describe "BIO format" $ do
     it "just parse" $ do
       let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
diff --git a/test/soft-f1-perfect/soft-f1-perfect-solution/test-A/out.tsv b/test/soft-f1-perfect/soft-f1-perfect-solution/test-A/out.tsv
new file mode 100644
index 0000000..c0cc5c1
--- /dev/null
+++ b/test/soft-f1-perfect/soft-f1-perfect-solution/test-A/out.tsv
@@ -0,0 +1,3 @@
+foo:3-4 bar:5-11 foo:12
+
+baz:10-16 xyz:123-140
diff --git a/test/soft-f1-perfect/soft-f1-perfect/config.txt b/test/soft-f1-perfect/soft-f1-perfect/config.txt
new file mode 100644
index 0000000..987f184
--- /dev/null
+++ b/test/soft-f1-perfect/soft-f1-perfect/config.txt
@@ -0,0 +1 @@
+--metric Soft-F1
diff --git a/test/soft-f1-perfect/soft-f1-perfect/test-A/expected.tsv b/test/soft-f1-perfect/soft-f1-perfect/test-A/expected.tsv
new file mode 100644
index 0000000..c0cc5c1
--- /dev/null
+++ b/test/soft-f1-perfect/soft-f1-perfect/test-A/expected.tsv
@@ -0,0 +1,3 @@
+foo:3-4 bar:5-11 foo:12
+
+baz:10-16 xyz:123-140
diff --git a/test/soft-f1-simple/soft-f1-simple-solution/test-A/out.tsv b/test/soft-f1-simple/soft-f1-simple-solution/test-A/out.tsv
new file mode 100644
index 0000000..520c40d
--- /dev/null
+++ b/test/soft-f1-simple/soft-f1-simple-solution/test-A/out.tsv
@@ -0,0 +1,3 @@
+foo:2-13
+bar:3-7
+baz:10-16 foo:123-140
diff --git a/test/soft-f1-simple/soft-f1-simple/config.txt b/test/soft-f1-simple/soft-f1-simple/config.txt
new file mode 100644
index 0000000..987f184
--- /dev/null
+++ b/test/soft-f1-simple/soft-f1-simple/config.txt
@@ -0,0 +1 @@
+--metric Soft-F1
diff --git a/test/soft-f1-simple/soft-f1-simple/test-A/expected.tsv b/test/soft-f1-simple/soft-f1-simple/test-A/expected.tsv
new file mode 100644
index 0000000..417b57f
--- /dev/null
+++ b/test/soft-f1-simple/soft-f1-simple/test-A/expected.tsv
@@ -0,0 +1,3 @@
+foo:3-4 baz:3-7 foo:12
+
+baz:10-16 foo:125-130