implement BIO-F1
This commit is contained in:
parent
a1c357948e
commit
82e794ae3c
@ -26,6 +26,7 @@ library
|
|||||||
, GEval.LogLossHashed
|
, GEval.LogLossHashed
|
||||||
, GEval.CharMatch
|
, GEval.CharMatch
|
||||||
, GEval.LineByLine
|
, GEval.LineByLine
|
||||||
|
, GEval.BIO
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
, conduit
|
, conduit
|
||||||
|
99
src/GEval/BIO.hs
Normal file
99
src/GEval/BIO.hs
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
module GEval.BIO
|
||||||
|
(BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO)
|
||||||
|
where
|
||||||
|
|
||||||
|
import GEval.PrecisionRecall
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
|
||||||
|
import Data.Attoparsec.Text
|
||||||
|
import Data.Attoparsec.Combinator
|
||||||
|
import Control.Applicative
|
||||||
|
import Data.Char
|
||||||
|
import Data.Maybe (catMaybes)
|
||||||
|
|
||||||
|
import GEval.Common
|
||||||
|
|
||||||
|
data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text)
|
||||||
|
deriving (Eq, Show)
|
||||||
|
|
||||||
|
data TaggedSpan = TaggedSpan Int Int
|
||||||
|
deriving (Eq, Show)
|
||||||
|
|
||||||
|
data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text)
|
||||||
|
deriving (Eq, Show)
|
||||||
|
|
||||||
|
gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int)
|
||||||
|
gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got)
|
||||||
|
where
|
||||||
|
laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b
|
||||||
|
|
||||||
|
parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity]
|
||||||
|
parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t)
|
||||||
|
|
||||||
|
labelsIntoEntities :: [BIOLabel] -> Either String [TaggedEntity]
|
||||||
|
labelsIntoEntities labels = labelsIntoEntities' $ zip labels [1..]
|
||||||
|
|
||||||
|
labelsIntoEntities' :: [(BIOLabel, Int)] -> Either String [TaggedEntity]
|
||||||
|
labelsIntoEntities' labelsWithPositions = mapM labelSplitToEntity labelsGathered
|
||||||
|
where labelsGathered = splitLabels labelsWithPositions
|
||||||
|
|
||||||
|
labelSplitToEntity :: [(BIOLabel, Int)] -> Either String TaggedEntity
|
||||||
|
labelSplitToEntity labs@(h@(_,begIx):t) = if isBeginning h && all (\tp -> isInside tp && tt tp == btp) t
|
||||||
|
then
|
||||||
|
Right $ TaggedEntity (TaggedSpan begIx lastItemIx) btp mNormalized
|
||||||
|
else
|
||||||
|
Left "something wrong with label sequence"
|
||||||
|
where isBeginning (Beginning _ _, _) = True
|
||||||
|
isBeginning _ = False
|
||||||
|
isInside (Inside _ _, _) = True
|
||||||
|
isInside _ = False
|
||||||
|
tt (Beginning t _, _) = t
|
||||||
|
tt (Inside t _, _) = t
|
||||||
|
btp = tt h
|
||||||
|
lastItemIx = case t of
|
||||||
|
[] -> begIx
|
||||||
|
_ -> let (_, ix) = last t
|
||||||
|
in ix
|
||||||
|
normalized (Beginning _ n, _) = n
|
||||||
|
normalized (Inside _ n, _) = n
|
||||||
|
mNormalized = if all (\tp -> normalized tp == Nothing) labs
|
||||||
|
then
|
||||||
|
Nothing
|
||||||
|
else
|
||||||
|
Just $ T.intercalate "_" $ catMaybes $ map normalized labs
|
||||||
|
|
||||||
|
splitLabels :: [(BIOLabel, Int)] -> [[(BIOLabel, Int)]]
|
||||||
|
splitLabels [] = []
|
||||||
|
splitLabels ((Outside, _):r) = splitLabels r
|
||||||
|
splitLabels (e@(_, ix):r) =
|
||||||
|
case splitLabels r of
|
||||||
|
l@(((Beginning _ _, _):_):_) -> ([e]:l)
|
||||||
|
(s@((Inside _ _, ix'):_):l) -> if ix' == ix + 1
|
||||||
|
then
|
||||||
|
((e:s):l)
|
||||||
|
else
|
||||||
|
([e]:(s:l))
|
||||||
|
[] -> [[e]]
|
||||||
|
|
||||||
|
bioSequenceParser :: Parser [BIOLabel]
|
||||||
|
bioSequenceParser = sepByWhitespaces bioLabelParser
|
||||||
|
|
||||||
|
bioLabelParser :: Parser BIOLabel
|
||||||
|
bioLabelParser =
|
||||||
|
(string "O" *> pure Outside) <|>
|
||||||
|
(do
|
||||||
|
labelType <- bioMarkerParser
|
||||||
|
string "-"
|
||||||
|
label <- takeWhile1 (\c -> not (isSpace c) && c /= '/')
|
||||||
|
normalized <- (do
|
||||||
|
string "/"
|
||||||
|
normalized <- takeWhile1 (not . isSpace)
|
||||||
|
return $ Just normalized) <|> pure Nothing
|
||||||
|
return $ labelType label normalized)
|
||||||
|
|
||||||
|
bioMarkerParser :: Parser (T.Text -> Maybe T.Text -> BIOLabel)
|
||||||
|
bioMarkerParser =
|
||||||
|
(string "B" *> pure Beginning) <|> (string "I" *> pure Inside)
|
@ -53,6 +53,7 @@ import qualified System.Directory as D
|
|||||||
import System.Posix
|
import System.Posix
|
||||||
import System.FilePath
|
import System.FilePath
|
||||||
import Data.Maybe
|
import Data.Maybe
|
||||||
|
import Data.Tuple
|
||||||
import qualified Data.List.Split as DLS
|
import qualified Data.List.Split as DLS
|
||||||
|
|
||||||
import Control.Monad.IO.Class
|
import Control.Monad.IO.Class
|
||||||
@ -67,6 +68,7 @@ import GEval.PrecisionRecall
|
|||||||
import GEval.ClusteringMetrics
|
import GEval.ClusteringMetrics
|
||||||
import GEval.LogLossHashed
|
import GEval.LogLossHashed
|
||||||
import GEval.CharMatch
|
import GEval.CharMatch
|
||||||
|
import GEval.BIO
|
||||||
|
|
||||||
import qualified Data.HashMap.Strict as M
|
import qualified Data.HashMap.Strict as M
|
||||||
|
|
||||||
@ -80,7 +82,7 @@ defaultLogLossHashedSize :: Word32
|
|||||||
defaultLogLossHashedSize = 10
|
defaultLogLossHashedSize = 10
|
||||||
|
|
||||||
data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
|
data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch
|
||||||
| MAP | LogLoss
|
| MAP | LogLoss | BIOF1
|
||||||
deriving (Eq)
|
deriving (Eq)
|
||||||
|
|
||||||
instance Show Metric where
|
instance Show Metric where
|
||||||
@ -100,6 +102,7 @@ instance Show Metric where
|
|||||||
show CharMatch = "CharMatch"
|
show CharMatch = "CharMatch"
|
||||||
show MAP = "MAP"
|
show MAP = "MAP"
|
||||||
show LogLoss = "LogLoss"
|
show LogLoss = "LogLoss"
|
||||||
|
show BIOF1 = "BIO-F1"
|
||||||
|
|
||||||
instance Read Metric where
|
instance Read Metric where
|
||||||
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
|
readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
|
||||||
@ -117,6 +120,7 @@ instance Read Metric where
|
|||||||
readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)]
|
readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)]
|
||||||
readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
|
readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)]
|
||||||
readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
|
readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)]
|
||||||
|
readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
|
||||||
|
|
||||||
data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter
|
data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter
|
||||||
|
|
||||||
@ -132,6 +136,7 @@ getMetricOrdering (LogLossHashed _) = TheLowerTheBetter
|
|||||||
getMetricOrdering CharMatch = TheHigherTheBetter
|
getMetricOrdering CharMatch = TheHigherTheBetter
|
||||||
getMetricOrdering MAP = TheHigherTheBetter
|
getMetricOrdering MAP = TheHigherTheBetter
|
||||||
getMetricOrdering LogLoss = TheLowerTheBetter
|
getMetricOrdering LogLoss = TheLowerTheBetter
|
||||||
|
getMetricOrdering BIOF1 = TheHigherTheBetter
|
||||||
|
|
||||||
defaultOutDirectory = "."
|
defaultOutDirectory = "."
|
||||||
defaultTestName = "test-A"
|
defaultTestName = "test-A"
|
||||||
@ -381,7 +386,11 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource
|
|||||||
helper inputLineSource expectedLineSource outputLineSource = do
|
helper inputLineSource expectedLineSource outputLineSource = do
|
||||||
gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
|
gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
|
||||||
step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
|
step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
|
||||||
countAgg = CC.foldl countFolder (0, 0, 0)
|
|
||||||
|
gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
|
||||||
|
|
||||||
|
countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
|
||||||
|
countAgg = CC.foldl countFolder (0, 0, 0)
|
||||||
|
|
||||||
parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution
|
parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution
|
||||||
parseDistributionWrapper nbOfBits seed distroSpec = case parseDistribution nbOfBits seed distroSpec of
|
parseDistributionWrapper nbOfBits seed distroSpec = case parseDistribution nbOfBits seed distroSpec of
|
||||||
|
@ -199,6 +199,18 @@ This a sample challenge for the log-loss metric.
|
|||||||
|
|
||||||
|] ++ (commonReadmeMDContents testName)
|
|] ++ (commonReadmeMDContents testName)
|
||||||
|
|
||||||
|
readmeMDContents BIOF1 testName = [i|
|
||||||
|
Tag and normalize names
|
||||||
|
=======================
|
||||||
|
|
||||||
|
Tag names in the tokenized text and normalized them.
|
||||||
|
|
||||||
|
The output should be given in the BIO format with the normalized forms given after slashes (see
|
||||||
|
`dev-0/expected.tsv` for an example).
|
||||||
|
|
||||||
|
The metric is F1 counted on entities (not labels).
|
||||||
|
|] ++ (commonReadmeMDContents testName)
|
||||||
|
|
||||||
readmeMDContents _ testName = [i|
|
readmeMDContents _ testName = [i|
|
||||||
GEval sample challenge
|
GEval sample challenge
|
||||||
======================
|
======================
|
||||||
@ -288,6 +300,10 @@ trainContents LogLoss = [hereLit|0.0 Hell, no!!!
|
|||||||
1.0 Lekker!!!
|
1.0 Lekker!!!
|
||||||
0.0 Boring, boring, boring
|
0.0 Boring, boring, boring
|
||||||
|]
|
|]
|
||||||
|
trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND My name is Bond , James Bond
|
||||||
|
O O O O O There is no name here
|
||||||
|
B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman
|
||||||
|
|]
|
||||||
trainContents _ = [hereLit|0.06 0.39 0 0.206
|
trainContents _ = [hereLit|0.06 0.39 0 0.206
|
||||||
1.00 1.00 1 0.017
|
1.00 1.00 1 0.017
|
||||||
317.8 5.20 67 0.048
|
317.8 5.20 67 0.048
|
||||||
@ -323,6 +339,9 @@ devInContents LogLoss = [hereLit|Great stuff!
|
|||||||
Boring stuff
|
Boring stuff
|
||||||
That's good
|
That's good
|
||||||
|]
|
|]
|
||||||
|
devInContents BIOF1 = [hereLit|Adam and Eve
|
||||||
|
Mr Jan Kowalski
|
||||||
|
|]
|
||||||
devInContents _ = [hereLit|0.72 0 0.007
|
devInContents _ = [hereLit|0.72 0 0.007
|
||||||
9.54 62 0.054
|
9.54 62 0.054
|
||||||
|]
|
|]
|
||||||
@ -356,6 +375,9 @@ devExpectedContents LogLoss = [hereLit|1.0
|
|||||||
0.0
|
0.0
|
||||||
1.0
|
1.0
|
||||||
|]
|
|]
|
||||||
|
devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE
|
||||||
|
O B-firstname/JAN B-surname/KOWALSKI
|
||||||
|
|]
|
||||||
devExpectedContents _ = [hereLit|0.82
|
devExpectedContents _ = [hereLit|0.82
|
||||||
95.2
|
95.2
|
||||||
|]
|
|]
|
||||||
@ -391,6 +413,9 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it!
|
|||||||
Super-duper!!
|
Super-duper!!
|
||||||
That is incredibly boring.
|
That is incredibly boring.
|
||||||
|]
|
|]
|
||||||
|
testInContents BIOF1 = [hereLit|Alan Tring
|
||||||
|
No name here
|
||||||
|
|]
|
||||||
testInContents _ = [hereLit|1.52 2 0.093
|
testInContents _ = [hereLit|1.52 2 0.093
|
||||||
30.06 14 0.009
|
30.06 14 0.009
|
||||||
|]
|
|]
|
||||||
@ -426,6 +451,9 @@ testExpectedContents LogLoss = [hereLit|1.0
|
|||||||
1.0
|
1.0
|
||||||
0.0
|
0.0
|
||||||
|]
|
|]
|
||||||
|
testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING
|
||||||
|
O O O
|
||||||
|
|]
|
||||||
testExpectedContents _ = [hereLit|0.11
|
testExpectedContents _ = [hereLit|0.11
|
||||||
17.2
|
17.2
|
||||||
|]
|
|]
|
||||||
|
@ -100,7 +100,7 @@ metricReader = option auto
|
|||||||
<> value defaultMetric
|
<> value defaultMetric
|
||||||
<> showDefault
|
<> showDefault
|
||||||
<> metavar "METRIC"
|
<> metavar "METRIC"
|
||||||
<> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed or CharMatch" )
|
<> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, BIO-F1 or CharMatch" )
|
||||||
|
|
||||||
runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
|
runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue))
|
||||||
runGEval args = do
|
runGEval args = do
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
module GEval.PrecisionRecall(calculateMAPForOneResult,
|
module GEval.PrecisionRecall(calculateMAPForOneResult,
|
||||||
fMeasure, f1Measure, f2Measure, precision, recall,
|
fMeasure, f1Measure, f2Measure, precision, recall,
|
||||||
fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
|
fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder,
|
||||||
precisionAndRecall, precisionAndRecallFromCounts, maxMatch)
|
precisionAndRecall, precisionAndRecallFromCounts, maxMatch, maxMatchOnOrdered)
|
||||||
where
|
where
|
||||||
|
|
||||||
import GEval.Common
|
import GEval.Common
|
||||||
@ -65,6 +65,17 @@ precision matchFun expected got = fst $ precisionAndRecall matchFun expected got
|
|||||||
recall :: (a -> b -> Bool) -> [a] -> [b] -> Double
|
recall :: (a -> b -> Bool) -> [a] -> [b] -> Double
|
||||||
recall matchFun expected got = snd $ precisionAndRecall matchFun expected got
|
recall matchFun expected got = snd $ precisionAndRecall matchFun expected got
|
||||||
|
|
||||||
|
|
||||||
|
maxMatchOnOrdered :: Eq a => (a -> a -> Bool) -> [a] -> [a] -> Int
|
||||||
|
maxMatchOnOrdered laterThan expected got =
|
||||||
|
let (matched, _) = foldl' step (0, expected) got
|
||||||
|
in matched
|
||||||
|
where step (matched, l@(h:t)) g
|
||||||
|
| h == g = (matched+1, t)
|
||||||
|
| h `laterThan` g = (matched, l)
|
||||||
|
| otherwise = step (matched, t) g
|
||||||
|
step (matched, []) g = (matched, [])
|
||||||
|
|
||||||
-- counting maximum match with maximum bipartite matching
|
-- counting maximum match with maximum bipartite matching
|
||||||
-- (we build an auxiliary graph and do a max-flow on this)
|
-- (we build an auxiliary graph and do a max-flow on this)
|
||||||
maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int
|
maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int
|
||||||
@ -72,7 +83,6 @@ maxMatch matchFun expected got = mf
|
|||||||
where (b, e, g) = buildGraph matchFun expected got
|
where (b, e, g) = buildGraph matchFun expected got
|
||||||
mf = maxFlow g (fst b) (fst e)
|
mf = maxFlow g (fst b) (fst e)
|
||||||
|
|
||||||
|
|
||||||
buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int)
|
buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int)
|
||||||
buildGraph matchFun expected got = (b, e, g)
|
buildGraph matchFun expected got = (b, e, g)
|
||||||
where ((b, e), (_, g)) = buildGraph' matchFun expected got
|
where ((b, e), (_, g)) = buildGraph' matchFun expected got
|
||||||
|
59
test/Spec.hs
59
test/Spec.hs
@ -8,6 +8,7 @@ import GEval.BLEU
|
|||||||
import GEval.ClippEU
|
import GEval.ClippEU
|
||||||
import GEval.PrecisionRecall
|
import GEval.PrecisionRecall
|
||||||
import GEval.ClusteringMetrics
|
import GEval.ClusteringMetrics
|
||||||
|
import GEval.BIO
|
||||||
import Data.Attoparsec.Text
|
import Data.Attoparsec.Text
|
||||||
import Options.Applicative
|
import Options.Applicative
|
||||||
import Data.Text
|
import Data.Text
|
||||||
@ -191,7 +192,63 @@ main = hspec $ do
|
|||||||
gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
|
gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
|
||||||
(LineInFile "stub2" 1 "3.4")
|
(LineInFile "stub2" 1 "3.4")
|
||||||
(LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
|
(LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
|
||||||
|
describe "BIO format" $ do
|
||||||
|
it "just parse" $ do
|
||||||
|
let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
|
||||||
|
r `shouldBe` [Outside,
|
||||||
|
Beginning "city" (Just "NEW_YORK"),
|
||||||
|
Inside "city" Nothing,
|
||||||
|
Beginning "city" (Just "KALISZ"),
|
||||||
|
Inside "city" Nothing,
|
||||||
|
Outside,
|
||||||
|
Beginning "name" Nothing]
|
||||||
|
it "simplest entity" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-city"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing]
|
||||||
|
it "multi-word entity" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-date I-date"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing]
|
||||||
|
it "multi-word entity with normalized text" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")]
|
||||||
|
it "simplest entity with something outside" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "O B-city"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing]
|
||||||
|
it "another simple case" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-city B-city"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing,
|
||||||
|
TaggedEntity (TaggedSpan 2 2) "city" Nothing]
|
||||||
|
it "just parse into entities" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"),
|
||||||
|
TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"),
|
||||||
|
TaggedEntity (TaggedSpan 7 7) "name" (Nothing),
|
||||||
|
TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")]
|
||||||
|
it "another entity parse" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"),
|
||||||
|
TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"),
|
||||||
|
TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")]
|
||||||
|
it "another entity parse" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"),
|
||||||
|
TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")]
|
||||||
|
it "parse entity" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
|
||||||
|
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
|
||||||
|
it "parse entity" $ do
|
||||||
|
let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH"
|
||||||
|
ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")]
|
||||||
|
it "check counting" $ do
|
||||||
|
gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"),
|
||||||
|
TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2)
|
||||||
|
it "check F1 on a more complicated example" $ do
|
||||||
|
runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625
|
||||||
|
it "calculate F1" $ do
|
||||||
|
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
|
||||||
|
it "check perfect score" $ do
|
||||||
|
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
|
||||||
|
|
||||||
neverMatch :: Char -> Int -> Bool
|
neverMatch :: Char -> Int -> Bool
|
||||||
neverMatch _ _ = False
|
neverMatch _ _ = False
|
||||||
|
@ -0,0 +1,6 @@
|
|||||||
|
B-wrong
|
||||||
|
B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK
|
||||||
|
B-surname/BROWN B-surname/SMITH
|
||||||
|
B-month/JULY B-month/JULY O O B-foo/bar
|
||||||
|
O B-class I-class I-class
|
||||||
|
O B-wrong
|
|
1
test/bio-f1-complex/bio-f1-complex/config.txt
Normal file
1
test/bio-f1-complex/bio-f1-complex/config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric BIO-F1
|
6
test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv
Normal file
6
test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
O
|
||||||
|
B-city/LOS I-city/ANGELES O B-city/NEW_YORK O
|
||||||
|
O B-surname/SMITH
|
||||||
|
B-month/JULY O O O B-foo/bar
|
||||||
|
O B-class I-class I-class
|
||||||
|
O O
|
|
@ -0,0 +1,4 @@
|
|||||||
|
O O O
|
||||||
|
O B-city/NEW I-city/YORK I-city/CITY O B-month/July
|
||||||
|
B-surname/SMITH
|
||||||
|
B-city/LONDON B-city/PARIS
|
|
1
test/bio-f1-perfect/bio-f1-perfect/config.txt
Normal file
1
test/bio-f1-perfect/bio-f1-perfect/config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric BIO-F1
|
4
test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv
Normal file
4
test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
O O O
|
||||||
|
O B-city/NEW I-city/YORK I-city/CITY O B-month/July
|
||||||
|
B-surname/SMITH
|
||||||
|
B-city/LONDON B-city/PARIS
|
|
3
test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv
Normal file
3
test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
O O B-city/POZNAŃ O O B-date/MARCH I-date/12
|
||||||
|
B-city/BUK O O O
|
||||||
|
B-name/FOO O B-surname/KOWALSKI
|
|
1
test/bio-f1-simple/bio-f1-simple/config.txt
Normal file
1
test/bio-f1-simple/bio-f1-simple/config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric BIO-F1
|
3
test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv
Normal file
3
test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
O O B-city/POZNAŃ O O B-date/MARCH I-date/12
|
||||||
|
O O O O
|
||||||
|
O B-city/KONIN O
|
|
Loading…
Reference in New Issue
Block a user