diff --git a/geval.cabal b/geval.cabal index 8cfc841..23e3672 100644 --- a/geval.cabal +++ b/geval.cabal @@ -26,6 +26,7 @@ library , GEval.LogLossHashed , GEval.CharMatch , GEval.LineByLine + , GEval.BIO build-depends: base >= 4.7 && < 5 , cond , conduit diff --git a/src/GEval/BIO.hs b/src/GEval/BIO.hs new file mode 100644 index 0000000..a3e8819 --- /dev/null +++ b/src/GEval/BIO.hs @@ -0,0 +1,99 @@ +{-# LANGUAGE OverloadedStrings #-} + +module GEval.BIO + (BIOLabel(..), bioSequenceParser, parseBioSequenceIntoEntities, TaggedSpan(..), TaggedEntity(..), gatherCountsForBIO) + where + +import GEval.PrecisionRecall + +import qualified Data.Text as T + +import Data.Attoparsec.Text +import Data.Attoparsec.Combinator +import Control.Applicative +import Data.Char +import Data.Maybe (catMaybes) + +import GEval.Common + +data BIOLabel = Outside | Beginning T.Text (Maybe T.Text) | Inside T.Text (Maybe T.Text) + deriving (Eq, Show) + +data TaggedSpan = TaggedSpan Int Int + deriving (Eq, Show) + +data TaggedEntity = TaggedEntity TaggedSpan T.Text (Maybe T.Text) + deriving (Eq, Show) + +gatherCountsForBIO :: [TaggedEntity] -> [TaggedEntity] -> (Int, Int, Int) +gatherCountsForBIO expected got = (maxMatchOnOrdered laterThan expected got, length expected, length got) + where + laterThan (TaggedEntity (TaggedSpan a _) _ _) (TaggedEntity (TaggedSpan b _) _ _) = a > b + +parseBioSequenceIntoEntities :: T.Text -> Either String [TaggedEntity] +parseBioSequenceIntoEntities t = labelsIntoEntities =<< (parseOnly (bioSequenceParser <* endOfInput) t) + +labelsIntoEntities :: [BIOLabel] -> Either String [TaggedEntity] +labelsIntoEntities labels = labelsIntoEntities' $ zip labels [1..] + +labelsIntoEntities' :: [(BIOLabel, Int)] -> Either String [TaggedEntity] +labelsIntoEntities' labelsWithPositions = mapM labelSplitToEntity labelsGathered + where labelsGathered = splitLabels labelsWithPositions + +labelSplitToEntity :: [(BIOLabel, Int)] -> Either String TaggedEntity +labelSplitToEntity labs@(h@(_,begIx):t) = if isBeginning h && all (\tp -> isInside tp && tt tp == btp) t + then + Right $ TaggedEntity (TaggedSpan begIx lastItemIx) btp mNormalized + else + Left "something wrong with label sequence" + where isBeginning (Beginning _ _, _) = True + isBeginning _ = False + isInside (Inside _ _, _) = True + isInside _ = False + tt (Beginning t _, _) = t + tt (Inside t _, _) = t + btp = tt h + lastItemIx = case t of + [] -> begIx + _ -> let (_, ix) = last t + in ix + normalized (Beginning _ n, _) = n + normalized (Inside _ n, _) = n + mNormalized = if all (\tp -> normalized tp == Nothing) labs + then + Nothing + else + Just $ T.intercalate "_" $ catMaybes $ map normalized labs + +splitLabels :: [(BIOLabel, Int)] -> [[(BIOLabel, Int)]] +splitLabels [] = [] +splitLabels ((Outside, _):r) = splitLabels r +splitLabels (e@(_, ix):r) = + case splitLabels r of + l@(((Beginning _ _, _):_):_) -> ([e]:l) + (s@((Inside _ _, ix'):_):l) -> if ix' == ix + 1 + then + ((e:s):l) + else + ([e]:(s:l)) + [] -> [[e]] + +bioSequenceParser :: Parser [BIOLabel] +bioSequenceParser = sepByWhitespaces bioLabelParser + +bioLabelParser :: Parser BIOLabel +bioLabelParser = + (string "O" *> pure Outside) <|> + (do + labelType <- bioMarkerParser + string "-" + label <- takeWhile1 (\c -> not (isSpace c) && c /= '/') + normalized <- (do + string "/" + normalized <- takeWhile1 (not . isSpace) + return $ Just normalized) <|> pure Nothing + return $ labelType label normalized) + +bioMarkerParser :: Parser (T.Text -> Maybe T.Text -> BIOLabel) +bioMarkerParser = + (string "B" *> pure Beginning) <|> (string "I" *> pure Inside) diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index 6b29f7e..2ba9b27 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -53,6 +53,7 @@ import qualified System.Directory as D import System.Posix import System.FilePath import Data.Maybe +import Data.Tuple import qualified Data.List.Split as DLS import Control.Monad.IO.Class @@ -67,6 +68,7 @@ import GEval.PrecisionRecall import GEval.ClusteringMetrics import GEval.LogLossHashed import GEval.CharMatch +import GEval.BIO import qualified Data.HashMap.Strict as M @@ -80,7 +82,7 @@ defaultLogLossHashedSize :: Word32 defaultLogLossHashedSize = 10 data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI | LogLossHashed Word32 | CharMatch - | MAP | LogLoss + | MAP | LogLoss | BIOF1 deriving (Eq) instance Show Metric where @@ -100,6 +102,7 @@ instance Show Metric where show CharMatch = "CharMatch" show MAP = "MAP" show LogLoss = "LogLoss" + show BIOF1 = "BIO-F1" instance Read Metric where readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)] @@ -117,6 +120,7 @@ instance Read Metric where readsPrec _ ('L':'o':'g':'L':'o':'s':'s':theRest) = [(LogLoss, theRest)] readsPrec p ('C':'h':'a':'r':'M':'a':'t':'c':'h':theRest) = [(CharMatch, theRest)] readsPrec _ ('M':'A':'P':theRest) = [(MAP, theRest)] + readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)] data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter @@ -132,6 +136,7 @@ getMetricOrdering (LogLossHashed _) = TheLowerTheBetter getMetricOrdering CharMatch = TheHigherTheBetter getMetricOrdering MAP = TheHigherTheBetter getMetricOrdering LogLoss = TheLowerTheBetter +getMetricOrdering BIOF1 = TheHigherTheBetter defaultOutDirectory = "." defaultTestName = "test-A" @@ -381,7 +386,11 @@ gevalCore' CharMatch inputLineSource = helper inputLineSource helper inputLineSource expectedLineSource outputLineSource = do gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource) step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out - countAgg = CC.foldl countFolder (0, 0, 0) + +gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts + +countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int) +countAgg = CC.foldl countFolder (0, 0, 0) parseDistributionWrapper :: Word32 -> Word32 -> Text -> HashedDistribution parseDistributionWrapper nbOfBits seed distroSpec = case parseDistribution nbOfBits seed distroSpec of diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs index 3b31b3f..09a85e4 100644 --- a/src/GEval/CreateChallenge.hs +++ b/src/GEval/CreateChallenge.hs @@ -199,6 +199,18 @@ This a sample challenge for the log-loss metric. |] ++ (commonReadmeMDContents testName) +readmeMDContents BIOF1 testName = [i| +Tag and normalize names +======================= + +Tag names in the tokenized text and normalized them. + +The output should be given in the BIO format with the normalized forms given after slashes (see +`dev-0/expected.tsv` for an example). + +The metric is F1 counted on entities (not labels). +|] ++ (commonReadmeMDContents testName) + readmeMDContents _ testName = [i| GEval sample challenge ====================== @@ -288,6 +300,10 @@ trainContents LogLoss = [hereLit|0.0 Hell, no!!! 1.0 Lekker!!! 0.0 Boring, boring, boring |] +trainContents BIOF1 = [hereLit|O O O B-surname/BOND O B-firstname/JAMES B-surname/BOND My name is Bond , James Bond +O O O O O There is no name here +B-firstname/JOHN I-surname/VON I-surname/NEUMANN John von Nueman +|] trainContents _ = [hereLit|0.06 0.39 0 0.206 1.00 1.00 1 0.017 317.8 5.20 67 0.048 @@ -323,6 +339,9 @@ devInContents LogLoss = [hereLit|Great stuff! Boring stuff That's good |] +devInContents BIOF1 = [hereLit|Adam and Eve +Mr Jan Kowalski +|] devInContents _ = [hereLit|0.72 0 0.007 9.54 62 0.054 |] @@ -356,6 +375,9 @@ devExpectedContents LogLoss = [hereLit|1.0 0.0 1.0 |] +devExpectedContents BIOF1 = [hereLit|B-firstname/ADAM O B-firstname/EVE +O B-firstname/JAN B-surname/KOWALSKI +|] devExpectedContents _ = [hereLit|0.82 95.2 |] @@ -391,6 +413,9 @@ testInContents LogLoss = [hereLit|That's great, ha, ha, I love it! Super-duper!! That is incredibly boring. |] +testInContents BIOF1 = [hereLit|Alan Tring +No name here +|] testInContents _ = [hereLit|1.52 2 0.093 30.06 14 0.009 |] @@ -426,6 +451,9 @@ testExpectedContents LogLoss = [hereLit|1.0 1.0 0.0 |] +testExpectedContents BIOF1 = [hereLit|B-firstname/ALAN B-surname/TURING +O O O +|] testExpectedContents _ = [hereLit|0.11 17.2 |] diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index d5fe2f8..bad86fe 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -100,7 +100,7 @@ metricReader = option auto <> value defaultMetric <> showDefault <> metavar "METRIC" - <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed or CharMatch" ) + <> help "Metric to be used - RMSE, MSE, Accuracy, LogLoss, F-measure (specify as F1, F2, F0.25, etc.), MAP, BLEU, NMI, ClippEU, LogLossHashed, BIO-F1 or CharMatch" ) runGEval :: [String] -> IO (Either (ParserResult GEvalOptions) (Maybe MetricValue)) runGEval args = do diff --git a/src/GEval/PrecisionRecall.hs b/src/GEval/PrecisionRecall.hs index 8781914..16879b2 100644 --- a/src/GEval/PrecisionRecall.hs +++ b/src/GEval/PrecisionRecall.hs @@ -3,7 +3,7 @@ module GEval.PrecisionRecall(calculateMAPForOneResult, fMeasure, f1Measure, f2Measure, precision, recall, fMeasureOnCounts, f1MeasureOnCounts, f2MeasureOnCounts, countFolder, - precisionAndRecall, precisionAndRecallFromCounts, maxMatch) + precisionAndRecall, precisionAndRecallFromCounts, maxMatch, maxMatchOnOrdered) where import GEval.Common @@ -65,6 +65,17 @@ precision matchFun expected got = fst $ precisionAndRecall matchFun expected got recall :: (a -> b -> Bool) -> [a] -> [b] -> Double recall matchFun expected got = snd $ precisionAndRecall matchFun expected got + +maxMatchOnOrdered :: Eq a => (a -> a -> Bool) -> [a] -> [a] -> Int +maxMatchOnOrdered laterThan expected got = + let (matched, _) = foldl' step (0, expected) got + in matched + where step (matched, l@(h:t)) g + | h == g = (matched+1, t) + | h `laterThan` g = (matched, l) + | otherwise = step (matched, t) g + step (matched, []) g = (matched, []) + -- counting maximum match with maximum bipartite matching -- (we build an auxiliary graph and do a max-flow on this) maxMatch :: (a -> b -> Bool) -> [a] -> [b] -> Int @@ -72,7 +83,6 @@ maxMatch matchFun expected got = mf where (b, e, g) = buildGraph matchFun expected got mf = maxFlow g (fst b) (fst e) - buildGraph :: (a -> b -> Bool) -> [a] -> [b] -> (LNode Int, LNode Int, Gr Int Int) buildGraph matchFun expected got = (b, e, g) where ((b, e), (_, g)) = buildGraph' matchFun expected got diff --git a/test/Spec.hs b/test/Spec.hs index 27b6653..8cd27bf 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -8,6 +8,7 @@ import GEval.BLEU import GEval.ClippEU import GEval.PrecisionRecall import GEval.ClusteringMetrics +import GEval.BIO import Data.Attoparsec.Text import Options.Applicative import Data.Text @@ -191,7 +192,63 @@ main = hspec $ do gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla") (LineInFile "stub2" 1 "3.4") (LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8 - + describe "BIO format" $ do + it "just parse" $ do + let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name" + r `shouldBe` [Outside, + Beginning "city" (Just "NEW_YORK"), + Inside "city" Nothing, + Beginning "city" (Just "KALISZ"), + Inside "city" Nothing, + Outside, + Beginning "name" Nothing] + it "simplest entity" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-city" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing] + it "multi-word entity" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-date I-date" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" Nothing] + it "multi-word entity with normalized text" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-date/FOO I-date/BAR" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "date" (Just "FOO_BAR")] + it "simplest entity with something outside" $ do + let (Right ents) = parseBioSequenceIntoEntities "O B-city" + ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "city" Nothing] + it "another simple case" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-city B-city" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "city" Nothing, + TaggedEntity (TaggedSpan 2 2) "city" Nothing] + it "just parse into entities" $ do + let (Right ents) = parseBioSequenceIntoEntities "O O B-city/LOS_ANGELES I-city B-city/KLUCZBORK O B-name O B-person/JOHN I-person/VON I-person/NEUMANN" + ents `shouldBe` [TaggedEntity (TaggedSpan 3 4) "city" (Just "LOS_ANGELES"), + TaggedEntity (TaggedSpan 5 5) "city" (Just "KLUCZBORK"), + TaggedEntity (TaggedSpan 7 7) "name" (Nothing), + TaggedEntity (TaggedSpan 9 11) "person" (Just "JOHN_VON_NEUMANN")] + it "another entity parse" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-month/JULY B-month/JULY O O B-foo/bar" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "month" (Just "JULY"), + TaggedEntity (TaggedSpan 2 2) "month" (Just "JULY"), + TaggedEntity (TaggedSpan 5 5) "foo" (Just "bar")] + it "another entity parse" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 2) "city" (Just "LOS_ANGELES"), + TaggedEntity (TaggedSpan 4 5) "city" (Just "NEW_YORK")] + it "parse entity" $ do + let (Right ents) = parseBioSequenceIntoEntities "B-surname/BROWN B-surname/SMITH" + ents `shouldBe` [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"), + TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] + it "parse entity" $ do + let (Right ents) = parseBioSequenceIntoEntities "O B-surname/SMITH" + ents `shouldBe` [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] + it "check counting" $ do + gatherCountsForBIO [TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] [TaggedEntity (TaggedSpan 1 1) "surname" (Just "BROWN"), + TaggedEntity (TaggedSpan 2 2) "surname" (Just "SMITH")] `shouldBe` (1, 1, 2) + it "check F1 on a more complicated example" $ do + runGEvalTest "bio-f1-complex" `shouldReturnAlmost` 0.625 + it "calculate F1" $ do + runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5 + it "check perfect score" $ do + runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0 neverMatch :: Char -> Int -> Bool neverMatch _ _ = False diff --git a/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv b/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv new file mode 100644 index 0000000..f84fa1d --- /dev/null +++ b/test/bio-f1-complex/bio-f1-complex-solution/test-A/out.tsv @@ -0,0 +1,6 @@ +B-wrong +B-city/LOS I-city/ANGELES O B-city/NEW I-city/YORK +B-surname/BROWN B-surname/SMITH +B-month/JULY B-month/JULY O O B-foo/bar +O B-class I-class I-class +O B-wrong diff --git a/test/bio-f1-complex/bio-f1-complex/config.txt b/test/bio-f1-complex/bio-f1-complex/config.txt new file mode 100644 index 0000000..70977e1 --- /dev/null +++ b/test/bio-f1-complex/bio-f1-complex/config.txt @@ -0,0 +1 @@ +--metric BIO-F1 diff --git a/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv b/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv new file mode 100644 index 0000000..c6e719d --- /dev/null +++ b/test/bio-f1-complex/bio-f1-complex/test-A/expected.tsv @@ -0,0 +1,6 @@ +O +B-city/LOS I-city/ANGELES O B-city/NEW_YORK O +O B-surname/SMITH +B-month/JULY O O O B-foo/bar +O B-class I-class I-class +O O diff --git a/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv b/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv new file mode 100644 index 0000000..e0fa64b --- /dev/null +++ b/test/bio-f1-perfect/bio-f1-perfect-solution/test-A/out.tsv @@ -0,0 +1,4 @@ +O O O +O B-city/NEW I-city/YORK I-city/CITY O B-month/July +B-surname/SMITH +B-city/LONDON B-city/PARIS diff --git a/test/bio-f1-perfect/bio-f1-perfect/config.txt b/test/bio-f1-perfect/bio-f1-perfect/config.txt new file mode 100644 index 0000000..70977e1 --- /dev/null +++ b/test/bio-f1-perfect/bio-f1-perfect/config.txt @@ -0,0 +1 @@ +--metric BIO-F1 diff --git a/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv b/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv new file mode 100644 index 0000000..e0fa64b --- /dev/null +++ b/test/bio-f1-perfect/bio-f1-perfect/test-A/expected.tsv @@ -0,0 +1,4 @@ +O O O +O B-city/NEW I-city/YORK I-city/CITY O B-month/July +B-surname/SMITH +B-city/LONDON B-city/PARIS diff --git a/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv b/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv new file mode 100644 index 0000000..0666d27 --- /dev/null +++ b/test/bio-f1-simple/bio-f1-simple-solution/test-A/out.tsv @@ -0,0 +1,3 @@ +O O B-city/POZNAŃ O O B-date/MARCH I-date/12 +B-city/BUK O O O +B-name/FOO O B-surname/KOWALSKI diff --git a/test/bio-f1-simple/bio-f1-simple/config.txt b/test/bio-f1-simple/bio-f1-simple/config.txt new file mode 100644 index 0000000..70977e1 --- /dev/null +++ b/test/bio-f1-simple/bio-f1-simple/config.txt @@ -0,0 +1 @@ +--metric BIO-F1 diff --git a/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv b/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv new file mode 100644 index 0000000..659986b --- /dev/null +++ b/test/bio-f1-simple/bio-f1-simple/test-A/expected.tsv @@ -0,0 +1,3 @@ +O O B-city/POZNAŃ O O B-date/MARCH I-date/12 +O O O O +O B-city/KONIN O