implement BIO-F1

2018-05-15 09:38:13 +02:00 · 2018-05-15 09:38:13 +02:00 · 82e794ae3c
commit 82e794ae3c
parent a1c357948e
16 changed files with 239 additions and 6 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -26,6 +26,7 @@ library
                     , GEval.LogLossHashed
                     , GEval.CharMatch
                     , GEval.LineByLine
                     , GEval.BIO
  build-depends:       base >= 4.7 && < 5
                     , cond
                     , conduit
--- a/src/GEval/BIO.hs
+++ b/src/GEval/BIO.hs
@ -0,0 +1,99 @@
 {-# LANGUAGE OverloadedStrings #-}
 module GEval.BIO
       (BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO)
       where
 import GEval.PrecisionRecall
 import qualified Data.Text as T
 import Data.Attoparsec.Text
 import Data.Attoparsec.Combinator
 import Control.Applicative
 import Data.Char
 import Data.Maybe (catMaybes)
 import GEval.Common
 data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text)
                deriving (Eq, Show)
 data TaggedSpan = TaggedSpan Int Int
                  deriving (Eq, Show)
 data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text)
                    deriving (Eq, Show)
 gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int)
 gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got)
  where
    laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b
 parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
 parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t)
 labelsIntoEntities :: [BIOLabel] -> Either String [TaggedEntity]
 labelsIntoEntities labels = labelsIntoEntities' $ zip labels [1..]
 labelsIntoEntities' :: [(BIOLabel, Int)] -> Either String [TaggedEntity]
 labelsIntoEntities' labelsWithPositions = mapM labelSplitToEntity labelsGathered
  where labelsGathered = splitLabels labelsWithPositions
 labelSplitToEntity :: [(BIOLabel, Int)] -> Either String TaggedEntity
 labelSplitToEntity labs@(h@(_,begIx):t) = if isBeginning h && all (\tp -> isInside tp && tt tp == btp) t
                                     then
                                       Right $ TaggedEntity (TaggedSpan begIx lastItemIx) btp mNormalized
                                     else
                                       Left "something wrong with label sequence"
  where isBeginning (Beginning _ _, _) = True
        isBeginning _ = False
        isInside (Inside _ _, _) = True
        isInside _ = False
        tt (Beginning t _, _) = t
        tt (Inside t _, _) = t
        btp = tt h
        lastItemIx = case t of
          [] -> begIx
          _ -> let (_, ix) = last t
              in ix
        normalized (Beginning _ n, _) = n
        normalized (Inside _ n, _) = n
        mNormalized = if all (\tp -> normalized tp == Nothing) labs
                        then
                          Nothing
                        else
                          Just $ T.intercalate "_" $ catMaybes $ map normalized labs
 splitLabels :: [(BIOLabel, Int)] -> [[(BIOLabel, Int)]]
 splitLabels [] = []
 splitLabels ((Outside, _):r) = splitLabels r
 splitLabels (e@(_, ix):r) =
  case splitLabels r of
    l@(((Beginning _ _, _):_):_) -> ([e]:l)
    (s@((Inside _ _, ix'):_):l) -> if ix' == ix + 1
                                    then
                                      ((e:s):l)
                                    else
                                      ([e]:(s:l))
    [] -> [[e]]
 bioSequenceParser :: Parser [BIOLabel]
 bioSequenceParser = sepByWhitespaces bioLabelParser
 bioLabelParser :: Parser BIOLabel
 bioLabelParser =
  (string "O" *> pure Outside) <|>
  (do
      labelType <- bioMarkerParser
      string "-"
      label <- takeWhile1 (\c -> not (isSpace c) && c /= '/')
      normalized <- (do
                       string "/"
                       normalized <- takeWhile1 (not . isSpace)
                       return $ Just normalized) <|> pure Nothing
      return $ labelType label normalized)
 bioMarkerParser :: Parser (T.Text -> Maybe T.Text -> BIOLabel)
 bioMarkerParser =
  (string "B" *> pure Beginning) <|> (string "I" *> pure Inside)
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -53,6 +53,7 @@ import qualified System.Directory as D
 import System.Posix
 import System.FilePath
 import Data.Maybe
 import Data.Tuple
 import qualified Data.List.Split as DLS
 import Control.Monad.IO.Class
@ -67,6 +68,7 @@ import GEval.PrecisionRecall
 import GEval.ClusteringMetrics
 import GEval.LogLossHashed
 import GEval.CharMatch
 import GEval.BIO
 import qualified Data.HashMap.Strict as M
@ -80,7 +82,7 @@ defaultLogLossHashedSize :: Word32
 defaultLogLossHashedSize = 10
 data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
-              | MAP | LogLoss
+              | MAP | LogLoss | BIOF1
              deriving (Eq)
 instance Show Metric where
@ -100,6 +102,7 @@ instance Show Metric where
  show CharMatch = "CharMatch"
  show MAP = "MAP"
  show LogLoss = "LogLoss"
  show BIOF1 = "BIO-F1"
 instance Read Metric where
  readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
@ -117,6 +120,7 @@ instance Read Metric where
  readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)]
  readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
  readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
  readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
 data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter
@ -132,6 +136,7 @@ getMetricOrdering (LogLossHashed _) = TheLowerTheBetter
 getMetricOrdering CharMatch = TheHigherTheBetter
 getMetricOrdering MAP = TheHigherTheBetter
 getMetricOrdering LogLoss = TheLowerTheBetter
 getMetricOrdering BIOF1 = TheHigherTheBetter
 defaultOutDirectory = "."
 defaultTestName = "test-A"
@ -381,6 +386,10 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource
   helper inputLineSource expectedLineSource outputLineSource = do
     gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
   step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
 gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
 countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
 countAgg = CC.foldl countFolder (0, 0, 0)
 parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@ -199,6 +199,18 @@ This a sample challenge for the log-loss metric.
 |] ++ (commonReadmeMDContents testName)
 readmeMDContents BIOF1 testName = [i|
 Tag and normalize names
 =======================
 Tag names in the tokenized text and normalized them.
 The output should be given in the BIO format with the normalized forms given after slashes (see
 `dev-0/expected.tsv` for an example).
 The metric is F1 counted on entities (not labels).
 |] ++ (commonReadmeMDContents testName)
 readmeMDContents _ testName = [i|
 GEval sample challenge
 ======================
@ -288,6 +300,10 @@ trainContents LogLoss = [hereLit|0.0	Hell, no!!!
 1.0	Lekker!!!
 0.0	Boring, boring, boring
 |]
 trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND	My name is Bond , James Bond
 O O O O O	There is no name here
 B-firstname/JOHN I-surname/VON I-surname/NEUMANN	John von Nueman
 |]
 trainContents _ = [hereLit|0.06	0.39	0	0.206
 1.00	1.00	1	0.017
 317.8	5.20	67	0.048
@ -323,6 +339,9 @@ devInContents LogLoss = [hereLit|Great stuff!
 Boring stuff
 That's good
 |]
 devInContents BIOF1 = [hereLit|Adam and Eve
 Mr Jan Kowalski
 |]
 devInContents _ = [hereLit|0.72	0	0.007
 9.54	62	0.054
 |]
@ -356,6 +375,9 @@ devExpectedContents LogLoss = [hereLit|1.0
 0.0
 1.0
 |]
 devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
 O B-firstname/JAN B-surname/KOWALSKI
 |]
 devExpectedContents _ = [hereLit|0.82
 95.2
 |]
@ -391,6 +413,9 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it!
 Super-duper!!
 That is incredibly boring.
 |]
 testInContents BIOF1 = [hereLit|Alan Tring
 No name here
 |]
 testInContents _ = [hereLit|1.52	2	0.093
 30.06	14	0.009
 |]
@ -426,6 +451,9 @@ testExpectedContents LogLoss = [hereLit|1.0
 1.0
 0.0
 |]
 testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
 O O O
 |]
 testExpectedContents _ = [hereLit|0.11
 17.2
 |]
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -100,7 +100,7 @@ metricReader = option auto
                 <> value defaultMetric
                 <> showDefault
                 <> metavar "METRIC"
-                 <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed or CharMatch" )
+                 <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, BIO-F1 or CharMatch" )
 runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
 runGEval args = do
--- a/src/GEval/PrecisionRecall.hs
+++ b/src/GEval/PrecisionRecall.hs
@ -3,7 +3,7 @@
 module GEval.PrecisionRecall(calculateMAPForOneResult,
                             fMeasure, f1Measure, f2Measure, precision, recall,
                             fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
-                             precisionAndRecall, precisionAndRecallFromCounts, maxMatch)
+                             precisionAndRecall, precisionAndRecallFromCounts, maxMatch, maxMatchOnOrdered)
       where
 import GEval.Common
@ -65,6 +65,17 @@ precision matchFun expected got = fst $ precisionAndRecall matchFun expected got
 recall :: (a -> b -> Bool) -> [a] -> [b] -> Double
 recall matchFun expected got = snd $ precisionAndRecall matchFun expected got
 maxMatchOnOrdered :: Eq a => (a -> a -> Bool) -> [a] -> [a] -> Int
 maxMatchOnOrdered laterThan expected got =
   let (matched, _) = foldl' step (0, expected) got
   in matched
         where step (matched, l@(h:t)) g
                | h == g = (matched+1, t)
                | h `laterThan` g  = (matched, l)
                | otherwise = step (matched, t) g
               step (matched, []) g = (matched, [])
 -- counting maximum match with maximum bipartite matching
 -- (we build an auxiliary graph and do a max-flow on this)
 maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int
@ -72,7 +83,6 @@ maxMatch matchFun expected got = mf
   where (b, e, g) = buildGraph matchFun expected got
         mf = maxFlow g (fst b) (fst e)
 buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int)
 buildGraph matchFun expected got = (b, e, g)
   where ((b, e), (_, g)) = buildGraph' matchFun expected got
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -8,6 +8,7 @@ import GEval.BLEU
 import GEval.ClippEU
 import GEval.PrecisionRecall
 import GEval.ClusteringMetrics
 import GEval.BIO
 import Data.Attoparsec.Text
 import Options.Applicative
 import Data.Text
@ -191,7 +192,63 @@ main = hspec $ do
      gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
                                  (LineInFile "stub2" 1 "3.4")
                                  (LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
-
+  describe "BIO format" $ do
    it "just parse" $ do
      let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
      r `shouldBe` [Outside,
                    Beginning "city" (Just "NEW_YORK"),
                    Inside "city" Nothing,
                    Beginning "city" (Just "KALISZ"),
                    Inside "city" Nothing,
                    Outside,
                    Beginning "name" Nothing]
    it "simplest entity" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-city"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing]
    it "multi-word entity" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-date I-date"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing]
    it "multi-word entity with normalized text" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")]
    it "simplest entity with something outside" $ do
      let (Right ents) = parseBioSequenceIntoEntities "O B-city"
      ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing]
    it "another simple case" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-city B-city"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing,
                       TaggedEntity (TaggedSpan 2 2) "city" Nothing]
    it "just parse into entities" $ do
      let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN"
      ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"),
                       TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"),
                       TaggedEntity (TaggedSpan 7 7) "name" (Nothing),
                       TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")]
    it "another entity parse" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"),
                       TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"),
                       TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")]
    it "another entity parse" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"),
                       TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")]
    it "parse entity" $ do
      let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH"
      ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
                       TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
    it "parse entity" $ do
      let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH"
      ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
    it "check counting" $ do
      gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
                                                                                   TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
    it "check F1 on a more complicated example" $ do
      runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
    it "calculate F1" $ do
      runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
    it "check perfect score" $ do
      runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
 neverMatch :: Char -> Int -> Bool
 neverMatch _ _ = False
--- a/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv
+++ b/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv
@ -0,0 +1,6 @@
 B-wrong
 B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
 B-surname/BROWN B-surname/SMITH
 B-month/JULY B-month/JULY O O B-foo/bar
 O B-class I-class I-class
 O B-wrong
--- a/test/bio-f1-complex/bio-f1-complex/config.txt
+++ b/test/bio-f1-complex/bio-f1-complex/config.txt
@ -0,0 +1 @@
 --metric BIO-F1
--- a/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv
+++ b/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv
@ -0,0 +1,6 @@
 O
 B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
 O B-surname/SMITH
 B-month/JULY O O O B-foo/bar
 O B-class I-class I-class
 O O
--- a/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv
+++ b/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv
@ -0,0 +1,4 @@
 O O O
 O B-city/NEW I-city/YORK I-city/CITY O B-month/July
 B-surname/SMITH
 B-city/LONDON B-city/PARIS
--- a/test/bio-f1-perfect/bio-f1-perfect/config.txt
+++ b/test/bio-f1-perfect/bio-f1-perfect/config.txt
@ -0,0 +1 @@
 --metric BIO-F1
--- a/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv
+++ b/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv
@ -0,0 +1,4 @@
 O O O
 O B-city/NEW I-city/YORK I-city/CITY O B-month/July
 B-surname/SMITH
 B-city/LONDON B-city/PARIS
--- a/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv
+++ b/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv
@ -0,0 +1,3 @@
 O O B-city/POZNAŃ O O B-date/MARCH I-date/12
 B-city/BUK O O O
 B-name/FOO O B-surname/KOWALSKI
--- a/test/bio-f1-simple/bio-f1-simple/config.txt
+++ b/test/bio-f1-simple/bio-f1-simple/config.txt
@ -0,0 +1 @@
 --metric BIO-F1
--- a/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv
+++ b/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv
@ -0,0 +1,3 @@
 O O B-city/POZNAŃ O O B-date/MARCH I-date/12
 O O O O
 O B-city/KONIN O