add BIO-F1-Labels metric

This commit is contained in:
Filip Gralinski 2018-05-29 22:04:19 +02:00
parent 65e8d2562e
commit 4768931221
8 changed files with 37 additions and 3 deletions

View File

@ -1,7 +1,9 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module GEval.BIO module GEval.BIO
(BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO) (BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities,
TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO,
eraseNormalisation)
where where
import GEval.PrecisionRecall import GEval.PrecisionRecall
@ -32,6 +34,9 @@ data TaggedSpan = TaggedSpan Int Int
data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text) data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text)
deriving (Eq, Show) deriving (Eq, Show)
eraseNormalisation :: TaggedEntity -> TaggedEntity
eraseNormalisation (TaggedEntity span label normalized) = (TaggedEntity span label Nothing)
gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int) gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int)
gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got) gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got)
where where

View File

@ -84,7 +84,7 @@ defaultLogLossHashedSize :: Word32
defaultLogLossHashedSize = 10 defaultLogLossHashedSize = 10
data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
| MAP | LogLoss | Likelihood | BIOF1 | LikelihoodHashed Word32 | MAP | LogLoss | Likelihood | BIOF1 | BIOF1Labels | LikelihoodHashed Word32
deriving (Eq) deriving (Eq)
instance Show Metric where instance Show Metric where
@ -112,6 +112,7 @@ instance Show Metric where
show LogLoss = "LogLoss" show LogLoss = "LogLoss"
show Likelihood = "Likelihood" show Likelihood = "Likelihood"
show BIOF1 = "BIO-F1" show BIOF1 = "BIO-F1"
show BIOF1Labels = "BIO-F1-Labels"
instance Read Metric where instance Read Metric where
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)] readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
@ -133,6 +134,7 @@ instance Read Metric where
readsPrec _ ('L':'i':'k':'e':'l':'i':'h':'o':'o':'d':theRest) = [(Likelihood, theRest)] readsPrec _ ('L':'i':'k':'e':'l':'i':'h':'o':'o':'d':theRest) = [(Likelihood, theRest)]
readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)] readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)] readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)] readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter
@ -152,6 +154,7 @@ getMetricOrdering MAP = TheHigherTheBetter
getMetricOrdering LogLoss = TheLowerTheBetter getMetricOrdering LogLoss = TheLowerTheBetter
getMetricOrdering Likelihood = TheHigherTheBetter getMetricOrdering Likelihood = TheHigherTheBetter
getMetricOrdering BIOF1 = TheHigherTheBetter getMetricOrdering BIOF1 = TheHigherTheBetter
getMetricOrdering BIOF1Labels = TheHigherTheBetter
defaultOutDirectory = "." defaultOutDirectory = "."
defaultTestName = "test-A" defaultTestName = "test-A"
@ -437,6 +440,11 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource
gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
gevalCore' BIOF1Labels _ = gevalCoreWithoutInput parseBioSequenceIntoEntitiesWithoutNormalization parseBioSequenceIntoEntitiesWithoutNormalization (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
where parseBioSequenceIntoEntitiesWithoutNormalization s = do
entities <- parseBioSequenceIntoEntities s
return $ Prelude.map eraseNormalisation entities
countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int) countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
countAgg = CC.foldl countFolder (0, 0, 0) countAgg = CC.foldl countFolder (0, 0, 0)

View File

@ -215,6 +215,7 @@ This a sample challenge for the likelihood metric.
|] ++ (commonReadmeMDContents testName) |] ++ (commonReadmeMDContents testName)
readmeMDContents BIOF1Labels testName = readmeMDContents BIOF1 testName
readmeMDContents BIOF1 testName = [i| readmeMDContents BIOF1 testName = [i|
Tag and normalize names Tag and normalize names
======================= =======================
@ -318,6 +319,7 @@ trainContents LogLoss = [hereLit|0.0 Hell, no!!!
1.0 Lekker!!! 1.0 Lekker!!!
0.0 Boring, boring, boring 0.0 Boring, boring, boring
|] |]
trainContents BIOF1Labels = trainContents BIOF1
trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND My name is Bond , James Bond trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND My name is Bond , James Bond
O O O O O There is no name here O O O O O There is no name here
B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman
@ -359,6 +361,7 @@ devInContents LogLoss = [hereLit|Great stuff!
Boring stuff Boring stuff
That's good That's good
|] |]
devInContents BIOF1Labels = devInContents BIOF1
devInContents BIOF1 = [hereLit|Adam and Eve devInContents BIOF1 = [hereLit|Adam and Eve
Mr Jan Kowalski Mr Jan Kowalski
|] |]
@ -397,6 +400,7 @@ devExpectedContents LogLoss = [hereLit|1.0
0.0 0.0
1.0 1.0
|] |]
devExpectedContents BIOF1Labels = devExpectedContents BIOF1
devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
O B-firstname/JAN B-surname/KOWALSKI O B-firstname/JAN B-surname/KOWALSKI
|] |]
@ -437,6 +441,7 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it!
Super-duper!! Super-duper!!
That is incredibly boring. That is incredibly boring.
|] |]
testInContents BIOF1Labels = testInContents BIOF1
testInContents BIOF1 = [hereLit|Alan Tring testInContents BIOF1 = [hereLit|Alan Tring
No name here No name here
|] |]
@ -477,6 +482,7 @@ testExpectedContents LogLoss = [hereLit|1.0
1.0 1.0
0.0 0.0
|] |]
testExpectedContents BIOF1Labels = testExpectedContents BIOF1
testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
O O O O O O
|] |]

View File

@ -110,7 +110,7 @@ metricReader = option auto
<> value defaultMetric <> value defaultMetric
<> showDefault <> showDefault
<> metavar "METRIC" <> metavar "METRIC"
<> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1 or CharMatch" ) <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels or CharMatch" )
runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue)) runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
runGEval args = do runGEval args = do

View File

@ -257,6 +257,8 @@ main = hspec $ do
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2) TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
it "check F1 on a more complicated example" $ do it "check F1 on a more complicated example" $ do
runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625 runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
it "check F1 on labels only" $ do
runGEvalTest "bio-f1-complex-labels" `shouldReturnAlmost` 0.6666666666
it "calculate F1" $ do it "calculate F1" $ do
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5 runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
it "calculate F1 with underscores rather than minus signs" $ do it "calculate F1 with underscores rather than minus signs" $ do

View File

@ -0,0 +1,6 @@
B-wrong
B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK_CITY
B-surname/BROWN B-surname/SMIT
B-month B-month O O B-foo/bar
O B-class I-class I-class
O O
1 B-wrong
2 B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK_CITY
3 B-surname/BROWN B-surname/SMIT
4 B-month B-month O O B-foo/bar
5 O B-class I-class I-class
6 O O

View File

@ -0,0 +1 @@
--metric BIO-F1-Labels

View File

@ -0,0 +1,6 @@
O
B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
O B-surname/SMITH
B-month/JULY O O O B-foo/bar
O B-class I-class I-class
O O
1 O
2 B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
3 O B-surname/SMITH
4 B-month/JULY O O O B-foo/bar
5 O B-class I-class I-class
6 O O