implement BIO-F1

This commit is contained in:
Filip Gralinski 2018-05-15 09:38:13 +02:00 committed by Filip Graliński
parent a1c357948e
commit 82e794ae3c
16 changed files with 239 additions and 6 deletions

View File

@ -26,6 +26,7 @@ library
, GEval.LogLossHashed
, GEval.CharMatch
, GEval.LineByLine
, GEval.BIO
build-depends: base >= 4.7 && < 5
, cond
, conduit

99
src/GEval/BIO.hs Normal file
View File

@ -0,0 +1,99 @@
{-# LANGUAGE OverloadedStrings #-}
module GEval.BIO
(BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO)
where
import GEval.PrecisionRecall
import qualified Data.Text as T
import Data.Attoparsec.Text
import Data.Attoparsec.Combinator
import Control.Applicative
import Data.Char
import Data.Maybe (catMaybes)
import GEval.Common
data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text)
deriving (Eq, Show)
data TaggedSpan = TaggedSpan Int Int
deriving (Eq, Show)
data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text)
deriving (Eq, Show)
gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int)
gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got)
where
laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b
parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t)
labelsIntoEntities :: [BIOLabel] -> Either String [TaggedEntity]
labelsIntoEntities labels = labelsIntoEntities' $ zip labels [1..]
labelsIntoEntities' :: [(BIOLabel, Int)] -> Either String [TaggedEntity]
labelsIntoEntities' labelsWithPositions = mapM labelSplitToEntity labelsGathered
where labelsGathered = splitLabels labelsWithPositions
labelSplitToEntity :: [(BIOLabel, Int)] -> Either String TaggedEntity
labelSplitToEntity labs@(h@(_,begIx):t) = if isBeginning h && all (\tp -> isInside tp && tt tp == btp) t
then
Right $ TaggedEntity (TaggedSpan begIx lastItemIx) btp mNormalized
else
Left "something wrong with label sequence"
where isBeginning (Beginning _ _, _) = True
isBeginning _ = False
isInside (Inside _ _, _) = True
isInside _ = False
tt (Beginning t _, _) = t
tt (Inside t _, _) = t
btp = tt h
lastItemIx = case t of
[] -> begIx
_ -> let (_, ix) = last t
in ix
normalized (Beginning _ n, _) = n
normalized (Inside _ n, _) = n
mNormalized = if all (\tp -> normalized tp == Nothing) labs
then
Nothing
else
Just $ T.intercalate "_" $ catMaybes $ map normalized labs
splitLabels :: [(BIOLabel, Int)] -> [[(BIOLabel, Int)]]
splitLabels [] = []
splitLabels ((Outside, _):r) = splitLabels r
splitLabels (e@(_, ix):r) =
case splitLabels r of
l@(((Beginning _ _, _):_):_) -> ([e]:l)
(s@((Inside _ _, ix'):_):l) -> if ix' == ix + 1
then
((e:s):l)
else
([e]:(s:l))
[] -> [[e]]
bioSequenceParser :: Parser [BIOLabel]
bioSequenceParser = sepByWhitespaces bioLabelParser
bioLabelParser :: Parser BIOLabel
bioLabelParser =
(string "O" *> pure Outside) <|>
(do
labelType <- bioMarkerParser
string "-"
label <- takeWhile1 (\c -> not (isSpace c) && c /= '/')
normalized <- (do
string "/"
normalized <- takeWhile1 (not . isSpace)
return $ Just normalized) <|> pure Nothing
return $ labelType label normalized)
bioMarkerParser :: Parser (T.Text -> Maybe T.Text -> BIOLabel)
bioMarkerParser =
(string "B" *> pure Beginning) <|> (string "I" *> pure Inside)

View File

@ -53,6 +53,7 @@ import qualified System.Directory as D
import System.Posix
import System.FilePath
import Data.Maybe
import Data.Tuple
import qualified Data.List.Split as DLS
import Control.Monad.IO.Class
@ -67,6 +68,7 @@ import GEval.PrecisionRecall
import GEval.ClusteringMetrics
import GEval.LogLossHashed
import GEval.CharMatch
import GEval.BIO
import qualified Data.HashMap.Strict as M
@ -80,7 +82,7 @@ defaultLogLossHashedSize :: Word32
defaultLogLossHashedSize = 10
data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
| MAP | LogLoss
| MAP | LogLoss | BIOF1
deriving (Eq)
instance Show Metric where
@ -100,6 +102,7 @@ instance Show Metric where
show CharMatch = "CharMatch"
show MAP = "MAP"
show LogLoss = "LogLoss"
show BIOF1 = "BIO-F1"
instance Read Metric where
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
@ -117,6 +120,7 @@ instance Read Metric where
readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)]
readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter
@ -132,6 +136,7 @@ getMetricOrdering (LogLossHashed _) = TheLowerTheBetter
getMetricOrdering CharMatch = TheHigherTheBetter
getMetricOrdering MAP = TheHigherTheBetter
getMetricOrdering LogLoss = TheLowerTheBetter
getMetricOrdering BIOF1 = TheHigherTheBetter
defaultOutDirectory = "."
defaultTestName = "test-A"
@ -381,6 +386,10 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource
helper inputLineSource expectedLineSource outputLineSource = do
gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
countAgg = CC.foldl countFolder (0, 0, 0)
parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution

View File

@ -199,6 +199,18 @@ This a sample challenge for the log-loss metric.
|] ++ (commonReadmeMDContents testName)
readmeMDContents BIOF1 testName = [i|
Tag and normalize names
=======================
Tag names in the tokenized text and normalized them.
The output should be given in the BIO format with the normalized forms given after slashes (see
`dev-0/expected.tsv` for an example).
The metric is F1 counted on entities (not labels).
|] ++ (commonReadmeMDContents testName)
readmeMDContents _ testName = [i|
GEval sample challenge
======================
@ -288,6 +300,10 @@ trainContents LogLoss = [hereLit|0.0 Hell, no!!!
1.0 Lekker!!!
0.0 Boring, boring, boring
|]
trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND My name is Bond , James Bond
O O O O O There is no name here
B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman
|]
trainContents _ = [hereLit|0.06 0.39 0 0.206
1.00 1.00 1 0.017
317.8 5.20 67 0.048
@ -323,6 +339,9 @@ devInContents LogLoss = [hereLit|Great stuff!
Boring stuff
That's good
|]
devInContents BIOF1 = [hereLit|Adam and Eve
Mr Jan Kowalski
|]
devInContents _ = [hereLit|0.72 0 0.007
9.54 62 0.054
|]
@ -356,6 +375,9 @@ devExpectedContents LogLoss = [hereLit|1.0
0.0
1.0
|]
devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
O B-firstname/JAN B-surname/KOWALSKI
|]
devExpectedContents _ = [hereLit|0.82
95.2
|]
@ -391,6 +413,9 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it!
Super-duper!!
That is incredibly boring.
|]
testInContents BIOF1 = [hereLit|Alan Tring
No name here
|]
testInContents _ = [hereLit|1.52 2 0.093
30.06 14 0.009
|]
@ -426,6 +451,9 @@ testExpectedContents LogLoss = [hereLit|1.0
1.0
0.0
|]
testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
O O O
|]
testExpectedContents _ = [hereLit|0.11
17.2
|]

View File

@ -100,7 +100,7 @@ metricReader = option auto
<> value defaultMetric
<> showDefault
<> metavar "METRIC"
<> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed or CharMatch" )
<> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, BIO-F1 or CharMatch" )
runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
runGEval args = do

View File

@ -3,7 +3,7 @@
module GEval.PrecisionRecall(calculateMAPForOneResult,
fMeasure, f1Measure, f2Measure, precision, recall,
fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
precisionAndRecall, precisionAndRecallFromCounts, maxMatch)
precisionAndRecall, precisionAndRecallFromCounts, maxMatch, maxMatchOnOrdered)
where
import GEval.Common
@ -65,6 +65,17 @@ precision matchFun expected got = fst $ precisionAndRecall matchFun expected got
recall :: (a -> b -> Bool) -> [a] -> [b] -> Double
recall matchFun expected got = snd $ precisionAndRecall matchFun expected got
maxMatchOnOrdered :: Eq a => (a -> a -> Bool) -> [a] -> [a] -> Int
maxMatchOnOrdered laterThan expected got =
let (matched, _) = foldl' step (0, expected) got
in matched
where step (matched, l@(h:t)) g
| h == g = (matched+1, t)
| h `laterThan` g = (matched, l)
| otherwise = step (matched, t) g
step (matched, []) g = (matched, [])
-- counting maximum match with maximum bipartite matching
-- (we build an auxiliary graph and do a max-flow on this)
maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int
@ -72,7 +83,6 @@ maxMatch matchFun expected got = mf
where (b, e, g) = buildGraph matchFun expected got
mf = maxFlow g (fst b) (fst e)
buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int)
buildGraph matchFun expected got = (b, e, g)
where ((b, e), (_, g)) = buildGraph' matchFun expected got

View File

@ -8,6 +8,7 @@ import GEval.BLEU
import GEval.ClippEU
import GEval.PrecisionRecall
import GEval.ClusteringMetrics
import GEval.BIO
import Data.Attoparsec.Text
import Options.Applicative
import Data.Text
@ -191,7 +192,63 @@ main = hspec $ do
gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
(LineInFile "stub2" 1 "3.4")
(LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
describe "BIO format" $ do
it "just parse" $ do
let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
r `shouldBe` [Outside,
Beginning "city" (Just "NEW_YORK"),
Inside "city" Nothing,
Beginning "city" (Just "KALISZ"),
Inside "city" Nothing,
Outside,
Beginning "name" Nothing]
it "simplest entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing]
it "multi-word entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-date I-date"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing]
it "multi-word entity with normalized text" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")]
it "simplest entity with something outside" $ do
let (Right ents) = parseBioSequenceIntoEntities "O B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing]
it "another simple case" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city B-city"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing,
TaggedEntity (TaggedSpan 2 2) "city" Nothing]
it "just parse into entities" $ do
let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN"
ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"),
TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"),
TaggedEntity (TaggedSpan 7 7) "name" (Nothing),
TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")]
it "another entity parse" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"),
TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"),
TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")]
it "another entity parse" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"),
TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")]
it "parse entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH"
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
it "parse entity" $ do
let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH"
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
it "check counting" $ do
gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
it "check F1 on a more complicated example" $ do
runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
it "calculate F1" $ do
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
it "check perfect score" $ do
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
neverMatch :: Char -> Int -> Bool
neverMatch _ _ = False

View File

@ -0,0 +1,6 @@
B-wrong
B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
B-surname/BROWN B-surname/SMITH
B-month/JULY B-month/JULY O O B-foo/bar
O B-class I-class I-class
O B-wrong
1 B-wrong
2 B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
3 B-surname/BROWN B-surname/SMITH
4 B-month/JULY B-month/JULY O O B-foo/bar
5 O B-class I-class I-class
6 O B-wrong

View File

@ -0,0 +1 @@
--metric BIO-F1

View File

@ -0,0 +1,6 @@
O
B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
O B-surname/SMITH
B-month/JULY O O O B-foo/bar
O B-class I-class I-class
O O
1 O
2 B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
3 O B-surname/SMITH
4 B-month/JULY O O O B-foo/bar
5 O B-class I-class I-class
6 O O

View File

@ -0,0 +1,4 @@
O O O
O B-city/NEW I-city/YORK I-city/CITY O B-month/July
B-surname/SMITH
B-city/LONDON B-city/PARIS
1 O O O
2 O B-city/NEW I-city/YORK I-city/CITY O B-month/July
3 B-surname/SMITH
4 B-city/LONDON B-city/PARIS

View File

@ -0,0 +1 @@
--metric BIO-F1

View File

@ -0,0 +1,4 @@
O O O
O B-city/NEW I-city/YORK I-city/CITY O B-month/July
B-surname/SMITH
B-city/LONDON B-city/PARIS
1 O O O
2 O B-city/NEW I-city/YORK I-city/CITY O B-month/July
3 B-surname/SMITH
4 B-city/LONDON B-city/PARIS

View File

@ -0,0 +1,3 @@
O O B-city/POZNAŃ O O B-date/MARCH I-date/12
B-city/BUK O O O
B-name/FOO O B-surname/KOWALSKI
1 O O B-city/POZNAŃ O O B-date/MARCH I-date/12
2 B-city/BUK O O O
3 B-name/FOO O B-surname/KOWALSKI

View File

@ -0,0 +1 @@
--metric BIO-F1

View File

@ -0,0 +1,3 @@
O O B-city/POZNAŃ O O B-date/MARCH I-date/12
O O O O
O B-city/KONIN O
1 O O B-city/POZNAŃ O O B-date/MARCH I-date/12
2 O O O O
3 O B-city/KONIN O