implement BIO-F1

2018-05-15 09:38:13 +02:00 · 2018-05-15 09:38:13 +02:00 · 82e794ae3c
commit 82e794ae3c
parent a1c357948e
16 changed files with 239 additions and 6 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -26,6 +26,7 @@ library
                     , GEval.LogLossHashed
                     , GEval.CharMatch
                     , GEval.LineByLine
+                     , GEval.BIO
  build-depends:       base >= 4.7 && < 5
                     , cond
                     , conduit
--- a/src/GEval/BIO.hs
+++ b/src/GEval/BIO.hs
@ -0,0 +1,99 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module GEval.BIO
+       (BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO)
+       where
+
+import GEval.PrecisionRecall
+
+import qualified Data.Text as T
+
+import Data.Attoparsec.Text
+import Data.Attoparsec.Combinator
+import Control.Applicative
+import Data.Char
+import Data.Maybe (catMaybes)
+
+import GEval.Common
+
+-- | A single token-level tag of a BIO sequence.
+--
+-- 'Beginning' and 'Inside' carry the entity type and, optionally, the
+-- normalized form that followed the slash in the textual label.
+data BIOLabel
+  = Outside
+  | Beginning T.Text (Maybe T.Text)
+  | Inside T.Text (Maybe T.Text)
+  deriving (Eq, Show)
+
+-- | Positions of the first and of the last label belonging to an entity.
+-- Positions are counted from 1 within one label sequence.
+data TaggedSpan
+  = TaggedSpan Int Int
+  deriving (Eq, Show)
+
+-- | An entity read off a BIO sequence: the span it covers, its type and,
+-- when at least one of its labels carried one, the normalized form.
+data TaggedEntity
+  = TaggedEntity TaggedSpan T.Text (Maybe T.Text)
+  deriving (Eq, Show)
+
+-- | Counts needed for precision/recall on entities: the number of matched
+-- entities (as computed by 'maxMatchOnOrdered'), the number of expected
+-- entities and the number of entities in the output.
+gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int)
+gatherCountsForBIO expected got = (matches, length expected, length got)
+  where
+    matches = maxMatchOnOrdered startsAfter expected got
+    -- an entity is "later" when its span begins at a greater position
+    startsAfter x y = spanStart x > spanStart y
+    spanStart (TaggedEntity (TaggedSpan s _) _ _) = s
+
+-- | Parse a whole line of BIO labels (the entire input must be consumed)
+-- and convert the labels into entities. Both a parse failure and an
+-- ill-formed label sequence are reported as 'Left'.
+parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
+parseBioSequenceIntoEntities t =
+  parseOnly (bioSequenceParser <* endOfInput) t >>= labelsIntoEntities
+
+-- | Number the labels starting from 1 and convert them into entities.
+labelsIntoEntities :: [BIOLabel] -> Either String [TaggedEntity]
+labelsIntoEntities labels = labelsIntoEntities' numbered
+  where
+    numbered = zip labels [1..]
+
+-- | Split position-tagged labels into groups and turn every group into an
+-- entity; the first group that is not well formed makes the whole result
+-- a 'Left'.
+labelsIntoEntities' :: [(BIOLabel, Int)] -> Either String [TaggedEntity]
+labelsIntoEntities' = mapM labelSplitToEntity . splitLabels
+
+-- | Turn one group of consecutive labels (as produced by 'splitLabels')
+-- into a single entity.
+--
+-- A group is well formed when it starts with a 'Beginning' label and is
+-- followed only by 'Inside' labels of the same entity type. The resulting
+-- span runs from the position of the first label to the position of the
+-- last one. Normalized forms present on the individual labels are joined
+-- with underscores; when no label carries one, the entity has none either.
+--
+-- Any other group, including the empty one, yields 'Left'.
+labelSplitToEntity :: [(BIOLabel, Int)] -> Either String TaggedEntity
+labelSplitToEntity ((Beginning btp firstNormalized, begIx):t) =
+  case mapM continuation t of
+    Nothing -> Left "something wrong with label sequence"
+    Just continuations ->
+      let -- 'last' is safe here: the list always contains at least begIx
+          lastItemIx = last (begIx : map snd continuations)
+          normalizedParts = catMaybes (firstNormalized : map fst continuations)
+          mNormalized
+            | null normalizedParts = Nothing
+            | otherwise = Just $ T.intercalate "_" normalizedParts
+      in Right $ TaggedEntity (TaggedSpan begIx lastItemIx) btp mNormalized
+  where
+    -- Accept only 'Inside' labels that continue the entity type opened by
+    -- the leading 'Beginning' label; keep their normalized form and position.
+    continuation (Inside tp n, ix)
+      | tp == btp = Just (n, ix)
+    continuation _ = Nothing
+labelSplitToEntity _ = Left "something wrong with label sequence"
+
+-- | Group position-tagged labels into candidate entities.
+--
+-- 'Outside' labels are dropped. Working from the end of the sequence, a
+-- label is prepended to the group that follows it only when that group
+-- starts with an 'Inside' label at the directly following position;
+-- otherwise the label opens a group of its own. No validation happens
+-- here, so a group may still start with an 'Inside' label.
+splitLabels :: [(BIOLabel, Int)] -> [[(BIOLabel, Int)]]
+splitLabels [] = []
+splitLabels ((Outside, _):r) = splitLabels r
+splitLabels (e@(_, ix):r) =
+  case splitLabels r of
+    (s@((Inside _ _, ix'):_):l)
+      | ix' == ix + 1 -> (e:s):l
+    -- Nothing to attach to: no following group, a group opened by a
+    -- 'Beginning' label, or an 'Inside' label that is not adjacent.
+    groups -> [e]:groups
+
+-- | Parser for a sequence of BIO labels, combined with 'sepByWhitespaces'.
+bioSequenceParser :: Parser [BIOLabel]
+bioSequenceParser =
+  sepByWhitespaces bioLabelParser
+
+-- | Parser for one label: either a bare @O@, or a marker accepted by
+-- 'bioMarkerParser' followed by a dash, the entity type (no whitespace,
+-- no slash) and, optionally, a slash with the normalized form.
+bioLabelParser :: Parser BIOLabel
+bioLabelParser = outsideLabel <|> markedLabel
+  where
+    outsideLabel = Outside <$ string "O"
+    markedLabel = do
+      mkLabel <- bioMarkerParser
+      _ <- string "-"
+      entityType <- takeWhile1 (\c -> not (isSpace c || c == '/'))
+      mNorm <- optional (string "/" *> takeWhile1 (not . isSpace))
+      return $ mkLabel entityType mNorm
+
+-- | Parser for the @B@ / @I@ marker; yields the matching 'BIOLabel'
+-- constructor, still waiting for the entity type and the normalized form.
+bioMarkerParser :: Parser (T.Text -> Maybe T.Text -> BIOLabel)
+bioMarkerParser =
+  (Beginning <$ string "B") <|> (Inside <$ string "I")
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -53,6 +53,7 @@ import qualified System.Directory as D
 import System.Posix
 import System.FilePath
 import Data.Maybe
+import Data.Tuple
 import qualified Data.List.Split as DLS

 import Control.Monad.IO.Class
@ -67,6 +68,7 @@ import GEval.PrecisionRecall
 import GEval.ClusteringMetrics
 import GEval.LogLossHashed
 import GEval.CharMatch
+import GEval.BIO

 import qualified Data.HashMap.Strict as M

@ -80,7 +82,7 @@ defaultLogLossHashedSize :: Word32
 defaultLogLossHashedSize = 10

 data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
-              | MAP | LogLoss
+              | MAP | LogLoss | BIOF1
              deriving (Eq)

 instance Show Metric where
@ -100,6 +102,7 @@ instance Show Metric where
  show CharMatch = "CharMatch"
  show MAP = "MAP"
  show LogLoss = "LogLoss"
+  show BIOF1 = "BIO-F1"

 instance Read Metric where
  readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
@ -117,6 +120,7 @@ instance Read Metric where
  readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)]
  readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
  readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
+  readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]

 data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter

@ -132,6 +136,7 @@ getMetricOrdering (LogLossHashed _) = TheLowerTheBetter
 getMetricOrdering CharMatch = TheHigherTheBetter
 getMetricOrdering MAP = TheHigherTheBetter
 getMetricOrdering LogLoss = TheLowerTheBetter
+getMetricOrdering BIOF1 = TheHigherTheBetter

 defaultOutDirectory = "."
 defaultTestName = "test-A"
@ -381,6 +386,10 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource
   helper inputLineSource expectedLineSource outputLineSource = do
     gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
   step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
+
+gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
+
+countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
 countAgg = CC.foldl countFolder (0, 0, 0)

 parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@ -199,6 +199,18 @@ This a sample challenge for the log-loss metric.

 |] ++ (commonReadmeMDContents testName)

+readmeMDContents BIOF1 testName = [i|
+Tag and normalize names
+=======================
+
+Tag names in the tokenized text and normalize them.
+
+The output should be given in the BIO format with the normalized forms given after slashes (see
+`dev-0/expected.tsv` for an example).
+
+The metric is F1 counted on entities (not labels).
+|] ++ (commonReadmeMDContents testName)
+
 readmeMDContents _ testName = [i|
 GEval sample challenge
 ======================
@ -288,6 +300,10 @@ trainContents LogLoss = [hereLit|0.0	Hell, no!!!
 1.0	Lekker!!!
 0.0	Boring, boring, boring
 |]
+trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND	My name is Bond , James Bond
+O O O O O	There is no name here
+B-firstname/JOHN B-surname/VON I-surname/NEUMANN	John von Neumann
+|]
 trainContents _ = [hereLit|0.06	0.39	0	0.206
 1.00	1.00	1	0.017
 317.8	5.20	67	0.048
@ -323,6 +339,9 @@ devInContents LogLoss = [hereLit|Great stuff!
 Boring stuff
 That's good
 |]
+devInContents BIOF1 = [hereLit|Adam and Eve
+Mr Jan Kowalski
+|]
 devInContents _ = [hereLit|0.72	0	0.007
 9.54	62	0.054
 |]
@ -356,6 +375,9 @@ devExpectedContents LogLoss = [hereLit|1.0
 0.0
 1.0
 |]
+devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
+O B-firstname/JAN B-surname/KOWALSKI
+|]
 devExpectedContents _ = [hereLit|0.82
 95.2
 |]
@ -391,6 +413,9 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it!
 Super-duper!!
 That is incredibly boring.
 |]
+testInContents BIOF1 = [hereLit|Alan Turing
+No name here
+|]
 testInContents _ = [hereLit|1.52	2	0.093
 30.06	14	0.009
 |]
@ -426,6 +451,9 @@ testExpectedContents LogLoss = [hereLit|1.0
 1.0
 0.0
 |]
+testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
+O O O
+|]
 testExpectedContents _ = [hereLit|0.11
 17.2
 |]
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -100,7 +100,7 @@ metricReader = option auto
                 <> value defaultMetric
                 <> showDefault
                 <> metavar "METRIC"
-                 <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed or CharMatch" )
+                 <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, BIO-F1 or CharMatch" )

 runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
 runGEval args = do
--- a/src/GEval/PrecisionRecall.hs
+++ b/src/GEval/PrecisionRecall.hs
@ -3,7 +3,7 @@
 module GEval.PrecisionRecall(calculateMAPForOneResult,
                             fMeasure, f1Measure, f2Measure, precision, recall,
                             fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
-                             precisionAndRecall, precisionAndRecallFromCounts, maxMatch)
+                             precisionAndRecall, precisionAndRecallFromCounts, maxMatch, maxMatchOnOrdered)
       where

 import GEval.Common
@ -65,6 +65,17 @@ precision matchFun expected got = fst $ precisionAndRecall matchFun expected got
 recall :: (a -> b -> Bool) -> [a] -> [b] -> Double
 recall matchFun expected got = snd $ precisionAndRecall matchFun expected got

+
+-- | Count matches between two lists by walking them in parallel.
+--
+-- For every item of @got@ the head of the remaining @expected@ items is
+-- inspected: equal items count as a match and the head is consumed; a
+-- head that is @laterThan@ the item is kept for the next item; any other
+-- head is discarded and the same item is tried against the next one.
+maxMatchOnOrdered :: Eq a => (a -> a -> Bool) -> [a] -> [a] -> Int
+maxMatchOnOrdered laterThan expected got = fst $ foldl' advance (0, expected) got
+  where
+    advance (hits, []) _ = (hits, [])
+    advance acc@(hits, e:rest) g
+      | e == g = (hits + 1, rest)
+      | e `laterThan` g = acc
+      | otherwise = advance (hits, rest) g
+
 -- counting maximum match with maximum bipartite matching
 -- (we build an auxiliary graph and do a max-flow on this)
 maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int
@ -72,7 +83,6 @@ maxMatch matchFun expected got = mf
   where (b, e, g) = buildGraph matchFun expected got
         mf = maxFlow g (fst b) (fst e)

-
 buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int)
 buildGraph matchFun expected got = (b, e, g)
   where ((b, e), (_, g)) = buildGraph' matchFun expected got
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -8,6 +8,7 @@ import GEval.BLEU
 import GEval.ClippEU
 import GEval.PrecisionRecall
 import GEval.ClusteringMetrics
+import GEval.BIO
 import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
@ -191,7 +192,63 @@ main = hspec $ do
      gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
                                  (LineInFile "stub2" 1 "3.4")
                                  (LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
-
+  describe "BIO format" $ do
+    it "just parse" $ do
+      let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
+      r `shouldBe` [Outside,
+                    Beginning "city" (Just "NEW_YORK"),
+                    Inside "city" Nothing,
+                    Beginning "city" (Just "KALISZ"),
+                    Inside "city" Nothing,
+                    Outside,
+                    Beginning "name" Nothing]
+    it "simplest entity" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-city"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing]
+    it "multi-word entity" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-date I-date"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing]
+    it "multi-word entity with normalized text" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")]
+    it "simplest entity with something outside" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "O B-city"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing]
+    it "another simple case" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-city B-city"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing,
+                       TaggedEntity (TaggedSpan 2 2) "city" Nothing]
+    it "just parse into entities" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"),
+                       TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"),
+                       TaggedEntity (TaggedSpan 7 7) "name" (Nothing),
+                       TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")]
+    it "another entity parse" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"),
+                       TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"),
+                       TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")]
+    it "another entity parse" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"),
+                       TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")]
+    it "parse entity" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
+                       TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
+    it "parse entity" $ do
+      let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH"
+      ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
+    it "check counting" $ do
+      gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
+                                                                                   TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
+    it "check F1 on a more complicated example" $ do
+      runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
+    it "calculate F1" $ do
+      runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
+    it "check perfect score" $ do
+      runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0

 neverMatch :: Char -> Int -> Bool
 neverMatch _ _ = False
--- a/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv
+++ b/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv
@ -0,0 +1,6 @@
+B-wrong
+B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
+B-surname/BROWN B-surname/SMITH
+B-month/JULY B-month/JULY O O B-foo/bar
+O B-class I-class I-class
+O B-wrong
--- a/test/bio-f1-complex/bio-f1-complex/config.txt
+++ b/test/bio-f1-complex/bio-f1-complex/config.txt
@ -0,0 +1 @@
+--metric BIO-F1
--- a/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv
+++ b/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv
@ -0,0 +1,6 @@
+O
+B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
+O B-surname/SMITH
+B-month/JULY O O O B-foo/bar
+O B-class I-class I-class
+O O
--- a/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv
+++ b/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv
@ -0,0 +1,4 @@
+O O O
+O B-city/NEW I-city/YORK I-city/CITY O B-month/July
+B-surname/SMITH
+B-city/LONDON B-city/PARIS
--- a/test/bio-f1-perfect/bio-f1-perfect/config.txt
+++ b/test/bio-f1-perfect/bio-f1-perfect/config.txt
@ -0,0 +1 @@
+--metric BIO-F1
--- a/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv
+++ b/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv
@ -0,0 +1,4 @@
+O O O
+O B-city/NEW I-city/YORK I-city/CITY O B-month/July
+B-surname/SMITH
+B-city/LONDON B-city/PARIS
--- a/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv
+++ b/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv
@ -0,0 +1,3 @@
+O O B-city/POZNAŃ O O B-date/MARCH I-date/12
+B-city/BUK O O O
+B-name/FOO O B-surname/KOWALSKI
--- a/test/bio-f1-simple/bio-f1-simple/config.txt
+++ b/test/bio-f1-simple/bio-f1-simple/config.txt
@ -0,0 +1 @@
+--metric BIO-F1
--- a/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv
+++ b/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv
@ -0,0 +1,3 @@
+O O B-city/POZNAŃ O O B-date/MARCH I-date/12
+O O O O
+O B-city/KONIN O