implement BIO-F1

This commit is contained in:
Filip Gralinski 2018-05-15 09:38:13 +02:00 committed by Filip Graliński
parent a1c357948e
commit 82e794ae3c
16 changed files with 239 additions and 6 deletions

View File

@ -26,6 +26,7 @@ library
, GEval.LogLossHashed , GEval.LogLossHashed
, GEval.CharMatch , GEval.CharMatch
, GEval.LineByLine , GEval.LineByLine
, GEval.BIO
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond
, conduit , conduit

99
src/GEval/BIO.hs Normal file
View File

@ -0,0 +1,99 @@
{-# LANGUAGE OverloadedStrings #-}
module GEval.BIO
(BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO)
where
import GEval.PrecisionRecall
import qualified Data.Text as T
import Data.Attoparsec.Text
import Data.Attoparsec.Combinator
import Control.Applicative
import Data.Char
import Data.Maybe (catMaybes)
import GEval.Common
data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text)
deriving (Eq, Show)
data TaggedSpan = TaggedSpan Int Int
deriving (Eq, Show)
data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text)
deriving (Eq, Show)
gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int)
gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got)
where
laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b
parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t)
labelsIntoEntities :: [BIOLabel] -> Either String [TaggedEntity]
labelsIntoEntities labels = labelsIntoEntities' $ zip labels [1..]
labelsIntoEntities' :: [(BIOLabel, Int)] -> Either String [TaggedEntity]
labelsIntoEntities' labelsWithPositions = mapM labelSplitToEntity labelsGathered
where labelsGathered = splitLabels labelsWithPositions
labelSplitToEntity :: [(BIOLabel, Int)] -> Either String TaggedEntity
labelSplitToEntity labs@(h@(_,begIx):t) = if isBeginning h && all (\tp -> isInside tp && tt tp == btp) t
then
Right $ TaggedEntity (TaggedSpan begIx lastItemIx) btp mNormalized
else
Left "something wrong with label sequence"
where isBeginning (Beginning _ _, _) = True
isBeginning _ = False
isInside (Inside _ _, _) = True
isInside _ = False
tt (Beginning t _, _) = t
tt (Inside t _, _) = t
btp = tt h
lastItemIx = case t of
[] -> begIx
_ -> let (_, ix) = last t
in ix
normalized (Beginning _ n, _) = n
normalized (Inside _ n, _) = n
mNormalized = if all (\tp -> normalized tp == Nothing) labs
then
Nothing
else
Just $ T.intercalate "_" $ catMaybes $ map normalized labs
splitLabels :: [(BIOLabel, Int)] -> [[(BIOLabel, Int)]]
splitLabels [] = []
splitLabels ((Outside, _):r) = splitLabels r
splitLabels (e@(_, ix):r) =
case splitLabels r of
l@(((Beginning _ _, _):_):_) -> ([e]:l)
(s@((Inside _ _, ix'):_):l) -> if ix' == ix + 1
then
((e:s):l)
else
([e]:(s:l))
[] -> [[e]]
bioSequenceParser :: Parser [BIOLabel]
bioSequenceParser = sepByWhitespaces bioLabelParser
bioLabelParser :: Parser BIOLabel
bioLabelParser =
(string "O" *> pure Outside) <|>
(do
labelType <- bioMarkerParser
string "-"
label <- takeWhile1 (\c -> not (isSpace c) && c /= '/')
normalized <- (do
string "/"
normalized <- takeWhile1 (not . isSpace)
return $ Just normalized) <|> pure Nothing
return $ labelType label normalized)
bioMarkerParser :: Parser (T.Text -> Maybe T.Text -> BIOLabel)
bioMarkerParser =
(string "B" *> pure Beginning) <|> (string "I" *> pure Inside)

View File

@ -53,6 +53,7 @@ import qualified System.Directory as D
import System.Posix import System.Posix
import System.FilePath import System.FilePath
import Data.Maybe import Data.Maybe
import Data.Tuple
import qualified Data.List.Split as DLS import qualified Data.List.Split as DLS
import Control.Monad.IO.Class import Control.Monad.IO.Class
@ -67,6 +68,7 @@ import GEval.PrecisionRecall
import GEval.ClusteringMetrics import GEval.ClusteringMetrics
import GEval.LogLossHashed import GEval.LogLossHashed
import GEval.CharMatch import GEval.CharMatch
import GEval.BIO
import qualified Data.HashMap.Strict as M import qualified Data.HashMap.Strict as M
@ -80,7 +82,7 @@ defaultLogLossHashedSize :: Word32
defaultLogLossHashedSize = 10 defaultLogLossHashedSize = 10
data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
| MAP | LogLoss | MAP | LogLoss | BIOF1
deriving (Eq) deriving (Eq)
instance Show Metric where instance Show Metric where
@ -100,6 +102,7 @@ instance Show Metric where
show CharMatch = "CharMatch" show CharMatch = "CharMatch"
show MAP = "MAP" show MAP = "MAP"
show LogLoss = "LogLoss" show LogLoss = "LogLoss"
show BIOF1 = "BIO-F1"
instance Read Metric where instance Read Metric where
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)] readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
@ -117,6 +120,7 @@ instance Read Metric where
readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)] readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)]
readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)] readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)] readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter
@ -132,6 +136,7 @@ getMetricOrdering (LogLossHashed _) = TheLowerTheBetter
getMetricOrdering CharMatch = TheHigherTheBetter getMetricOrdering CharMatch = TheHigherTheBetter
getMetricOrdering MAP = TheHigherTheBetter getMetricOrdering MAP = TheHigherTheBetter
getMetricOrdering LogLoss = TheLowerTheBetter getMetricOrdering LogLoss = TheLowerTheBetter
getMetricOrdering BIOF1 = TheHigherTheBetter
defaultOutDirectory = "." defaultOutDirectory = "."
defaultTestName = "test-A" defaultTestName = "test-A"
@ -381,6 +386,10 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource
helper inputLineSource expectedLineSource outputLineSource = do helper inputLineSource expectedLineSource outputLineSource = do
gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource) gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
countAgg = CC.foldl countFolder (0, 0, 0) countAgg = CC.foldl countFolder (0, 0, 0)
parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution

View File

@ -199,6 +199,18 @@ This a sample challenge for the log-loss metric.
|] ++ (commonReadmeMDContents testName) |] ++ (commonReadmeMDContents testName)
readmeMDContents BIOF1 testName = [i|
Tag and normalize names
=======================
Tag names in the tokenized text and normalized them.
The output should be given in the BIO format with the normalized forms given after slashes (see
`dev-0/expected.tsv` for an example).
The metric is F1 counted on entities (not labels).
|] ++ (commonReadmeMDContents testName)
readmeMDContents _ testName = [i| readmeMDContents _ testName = [i|
GEval sample challenge GEval sample challenge
====================== ======================
@ -288,6 +300,10 @@ trainContents LogLoss = [hereLit|0.0 Hell, no!!!
1.0 Lekker!!! 1.0 Lekker!!!
0.0 Boring, boring, boring 0.0 Boring, boring, boring
|] |]
trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND My name is Bond , James Bond
O O O O O There is no name here
B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman
|]
trainContents _ = [hereLit|0.06 0.39 0 0.206 trainContents _ = [hereLit|0.06 0.39 0 0.206
1.00 1.00 1 0.017 1.00 1.00 1 0.017
317.8 5.20 67 0.048 317.8 5.20 67 0.048
@ -323,6 +339,9 @@ devInContents LogLoss = [hereLit|Great stuff!
Boring stuff Boring stuff
That's good That's good
|] |]
devInContents BIOF1 = [hereLit|Adam and Eve
Mr Jan Kowalski
|]
devInContents _ = [hereLit|0.72 0 0.007 devInContents _ = [hereLit|0.72 0 0.007
9.54 62 0.054 9.54 62 0.054
|] |]
@ -356,6 +375,9 @@ devExpectedContents LogLoss = [hereLit|1.0
0.0 0.0
1.0 1.0
|] |]
devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
O B-firstname/JAN B-surname/KOWALSKI
|]
devExpectedContents _ = [hereLit|0.82 devExpectedContents _ = [hereLit|0.82
95.2 95.2
|] |]
@ -391,6 +413,9 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it!
Super-duper!! Super-duper!!
That is incredibly boring. That is incredibly boring.
|] |]
testInContents BIOF1 = [hereLit|Alan Tring
No name here
|]
testInContents _ = [hereLit|1.52 2 0.093 testInContents _ = [hereLit|1.52 2 0.093
30.06 14 0.009 30.06 14 0.009
|] |]
@ -426,6 +451,9 @@ testExpectedContents LogLoss = [hereLit|1.0
1.0 1.0
0.0 0.0
|] |]
testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
O O O
|]
testExpectedContents _ = [hereLit|0.11 testExpectedContents _ = [hereLit|0.11
17.2 17.2
|] |]

View File

@ -100,7 +100,7 @@ metricReader = option auto
<> value defaultMetric <> value defaultMetric
<> showDefault <> showDefault
<> metavar "METRIC" <> metavar "METRIC"
<> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed or CharMatch" ) <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, BIO-F1 or CharMatch" )
runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue)) runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
runGEval args = do runGEval args = do

View File

@ -3,7 +3,7 @@
module GEval.PrecisionRecall(calculateMAPForOneResult, module GEval.PrecisionRecall(calculateMAPForOneResult,
fMeasure, f1Measure, f2Measure, precision, recall, fMeasure, f1Measure, f2Measure, precision, recall,
fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder, fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
precisionAndRecall, precisionAndRecallFromCounts, maxMatch) precisionAndRecall, precisionAndRecallFromCounts, maxMatch, maxMatchOnOrdered)
where where
import GEval.Common import GEval.Common
@ -65,6 +65,17 @@ precision matchFun expected got = fst $ precisionAndRecall matchFun expected got
recall :: (a -> b -> Bool) -> [a] -> [b] -> Double recall :: (a -> b -> Bool) -> [a] -> [b] -> Double
recall matchFun expected got = snd $ precisionAndRecall matchFun expected got recall matchFun expected got = snd $ precisionAndRecall matchFun expected got
maxMatchOnOrdered :: Eq a => (a -> a -> Bool) -> [a] -> [a] -> Int
maxMatchOnOrdered laterThan expected got =
let (matched, _) = foldl' step (0, expected) got
in matched
where step (matched, l@(h:t)) g
| h == g = (matched+1, t)
| h `laterThan` g = (matched, l)
| otherwise = step (matched, t) g
step (matched, []) g = (matched, [])
-- counting maximum match with maximum bipartite matching -- counting maximum match with maximum bipartite matching
-- (we build an auxiliary graph and do a max-flow on this) -- (we build an auxiliary graph and do a max-flow on this)
maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int
@ -72,7 +83,6 @@ maxMatch matchFun expected got = mf
where (b, e, g) = buildGraph matchFun expected got where (b, e, g) = buildGraph matchFun expected got
mf = maxFlow g (fst b) (fst e) mf = maxFlow g (fst b) (fst e)
buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int) buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int)
buildGraph matchFun expected got = (b, e, g) buildGraph matchFun expected got = (b, e, g)
where ((b, e), (_, g)) = buildGraph' matchFun expected got where ((b, e), (_, g)) = buildGraph' matchFun expected got

View File

@ -8,6 +8,7 @@ import GEval.BLEU
import GEval.ClippEU import GEval.ClippEU
import GEval.PrecisionRecall import GEval.PrecisionRecall
import GEval.ClusteringMetrics import GEval.ClusteringMetrics
import GEval.BIO
import Data.Attoparsec.Text import Data.Attoparsec.Text
import Options.Applicative import Options.Applicative
import Data.Text import Data.Text
@ -191,7 +192,63 @@ main = hspec $ do
gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla") gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
(LineInFile "stub2" 1 "3.4") (LineInFile "stub2" 1 "3.4")
(LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8 (LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
describe "BIO format" $ do
it "just parse" $ do
let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
r `shouldBe` [Outside,
Beginning "city" (Just "NEW_YORK"),
Inside "city" Nothing,
Beginning "city" (Just "KALISZ"),
Inside "city" Nothing,
Outside,
Beginning "name" Nothing]
it "simplest entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing]
it "multi-word entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-date I-date"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing]
it "multi-word entity with normalized text" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")]
it "simplest entity with something outside" $ do
let (Right ents) = parseBioSequenceIntoEntities "O B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing]
it "another simple case" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing,
TaggedEntity (TaggedSpan 2 2) "city" Nothing]
it "just parse into entities" $ do
let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN"
ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"),
TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"),
TaggedEntity (TaggedSpan 7 7) "name" (Nothing),
TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")]
it "another entity parse" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"),
TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"),
TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")]
it "another entity parse" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"),
TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")]
it "parse entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
it "parse entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH"
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
it "check counting" $ do
gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
it "check F1 on a more complicated example" $ do
runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
it "calculate F1" $ do
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
it "check perfect score" $ do
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
neverMatch :: Char -> Int -> Bool neverMatch :: Char -> Int -> Bool
neverMatch _ _ = False neverMatch _ _ = False

View File

@ -0,0 +1,6 @@
B-wrong
B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
B-surname/BROWN B-surname/SMITH
B-month/JULY B-month/JULY O O B-foo/bar
O B-class I-class I-class
O B-wrong
1 B-wrong
2 B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
3 B-surname/BROWN B-surname/SMITH
4 B-month/JULY B-month/JULY O O B-foo/bar
5 O B-class I-class I-class
6 O B-wrong

View File

@ -0,0 +1 @@
--metric BIO-F1

View File

@ -0,0 +1,6 @@
O
B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
O B-surname/SMITH
B-month/JULY O O O B-foo/bar
O B-class I-class I-class
O O
1 O
2 B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
3 O B-surname/SMITH
4 B-month/JULY O O O B-foo/bar
5 O B-class I-class I-class
6 O O

View File

@ -0,0 +1,4 @@
O O O
O B-city/NEW I-city/YORK I-city/CITY O B-month/July
B-surname/SMITH
B-city/LONDON B-city/PARIS
1 O O O
2 O B-city/NEW I-city/YORK I-city/CITY O B-month/July
3 B-surname/SMITH
4 B-city/LONDON B-city/PARIS

View File

@ -0,0 +1 @@
--metric BIO-F1

View File

@ -0,0 +1,4 @@
O O O
O B-city/NEW I-city/YORK I-city/CITY O B-month/July
B-surname/SMITH
B-city/LONDON B-city/PARIS
1 O O O
2 O B-city/NEW I-city/YORK I-city/CITY O B-month/July
3 B-surname/SMITH
4 B-city/LONDON B-city/PARIS

View File

@ -0,0 +1,3 @@
O O B-city/POZNAŃ O O B-date/MARCH I-date/12
B-city/BUK O O O
B-name/FOO O B-surname/KOWALSKI
1 O O B-city/POZNAŃ O O B-date/MARCH I-date/12
2 B-city/BUK O O O
3 B-name/FOO O B-surname/KOWALSKI

View File

@ -0,0 +1 @@
--metric BIO-F1

View File

@ -0,0 +1,3 @@
O O B-city/POZNAŃ O O B-date/MARCH I-date/12
O O O O
O B-city/KONIN O
1 O O B-city/POZNAŃ O O B-date/MARCH I-date/12
2 O O O O
3 O B-city/KONIN O