MultiLabel-F1 works on labels given with probs now

2018-08-09 14:08:54 +02:00 · 2018-08-09 14:08:54 +02:00 · bd2bfde287
commit bd2bfde287
parent 82bdf70031
10 changed files with 94 additions and 2 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -28,6 +28,7 @@ library
                     , GEval.LineByLine
                     , GEval.BIO
                     , GEval.ParseParams
                     , GEval.ProbList
                     , Data.Conduit.AutoDecompress
                     , Data.Conduit.SmartSource
                     , Data.Conduit.Rank
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -78,6 +78,7 @@ import GEval.ClusteringMetrics
 import GEval.LogLossHashed
 import GEval.CharMatch
 import GEval.BIO
 import GEval.ProbList
 import Data.Conduit.AutoDecompress
 import qualified Data.HashMap.Strict as M
@ -572,11 +573,14 @@ gevalCore' BIOF1Labels _ = gevalCoreWithoutInput parseBioSequenceIntoEntitiesWit
           return $ Prelude.map eraseNormalisation entities
 gevalCore' (MultiLabelFMeasure beta) _ = gevalCoreWithoutInput intoWords
-                                                               intoWords
+                                                               getWords
                                                               (getCounts (==))
                                                               countAgg
                                                               (fMeasureOnCounts beta)
-    where intoWords = Right . (Prelude.map unpack) . Data.Text.words
+    where
      getWords = Right . (Prelude.map unpack) . selectByStandardThreshold . parseIntoProbList
      intoWords = Right . (Prelude.map unpack) . Data.Text.words
 countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
 countAgg = CC.foldl countFolder (0, 0, 0)
--- a/src/GEval/ProbList.hs
+++ b/src/GEval/ProbList.hs
@ -0,0 +1,65 @@
 {-# LANGUAGE OverloadedStrings #-}
 module GEval.ProbList
       (parseIntoProbList, selectByStandardThreshold)
       where
 import qualified Data.Text as T
 import GEval.Common
 newtype Probability = P { getP :: Double }
    deriving (Eq,Ord,Show)
 isProbability :: Double -> Bool
 isProbability p = 0.0 <= p && p <= 1.0
 mkProbability :: Double -> Probability
 mkProbability p
  | isProbability p = P p
  | otherwise = error $ show p ++ " is not in [0, 1]"
 probabilityOne :: Probability
 probabilityOne = mkProbability 1.0
 data ProbList = ProbList [WordWithProb]
  deriving (Show)
 data WordWithProb = WordWithProb T.Text Probability
  deriving (Show)
 parseIntoWordWithProb :: T.Text -> WordWithProb
 parseIntoWordWithProb t =
  -- If anything is wrong the whole word is treated as a label,
  -- even if it contains colons, digits, dots, etc.
  if T.null wordSpecPart
  then wordWithoutProb
  else
    if "." `T.isInfixOf` numberPart
    then case textToDouble numberPart of
       Right p -> if isProbability p
                   then WordWithProb wordSpecPart (mkProbability p)
                   else wordWithoutProb
       Left _ -> wordWithoutProb
    else wordWithoutProb
  where
    (wordSpecPart', numberPart) = T.breakOnEnd ":" t
    wordWithoutProb = WordWithProb t probabilityOne
    wordSpecPart = if T.null wordSpecPart'
                   then wordSpecPart'
                   else T.init wordSpecPart'
 parseIntoProbList :: T.Text -> ProbList
 parseIntoProbList = ProbList . map parseIntoWordWithProb . T.words
 selectByThreshold :: Probability -> ProbList -> [T.Text]
 selectByThreshold threshold (ProbList l) =
  map (\(WordWithProb w _) -> w)
  $ filter (\(WordWithProb _ p) -> p >= threshold) l
 standardThreshold :: Double
 standardThreshold = 0.5
 selectByStandardThreshold :: ProbList -> [T.Text]
 selectByStandardThreshold = selectByThreshold (mkProbability standardThreshold)
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -220,6 +220,10 @@ main = hspec $ do
      runGEvalTest "multilabel-f1-simple" `shouldReturnAlmost` 0.66666666666
    it "simple F2" $ do
      runGEvalTest "multilabel-f2-simple" `shouldReturnAlmost` 0.441176470588235
    it "labels given with probs" $ do
      runGEvalTest "multilabel-f1-with-probs" `shouldReturnAlmost` 0.615384615384615
    it "labels given with probs and numbers" $ do
      runGEvalTest "multilabel-f1-with-probs-and-numbers" `shouldReturnAlmost` 0.6666666666666
  describe "evaluating single lines" $ do
    it "RMSE" $ do
      gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla")
--- a/test/multilabel-f1-with-probs-and-numbers/multilabel-f1-with-probs-and-numbers-solution/test-A/out.tsv
+++ b/test/multilabel-f1-with-probs-and-numbers/multilabel-f1-with-probs-and-numbers-solution/test-A/out.tsv
@ -0,0 +1,3 @@
 foo:1 bar:2:0.7
 foo
--- a/test/multilabel-f1-with-probs-and-numbers/multilabel-f1-with-probs-and-numbers/config.txt
+++ b/test/multilabel-f1-with-probs-and-numbers/multilabel-f1-with-probs-and-numbers/config.txt
@ -0,0 +1 @@
 --metric MultiLabel-F1
--- a/test/multilabel-f1-with-probs-and-numbers/multilabel-f1-with-probs-and-numbers/test-A/expected.tsv
+++ b/test/multilabel-f1-with-probs-and-numbers/multilabel-f1-with-probs-and-numbers/test-A/expected.tsv
@ -0,0 +1,3 @@
 foo:1 bar:2
 foo:0
--- a/test/multilabel-f1-with-probs/multilabel-f1-with-probs-solution/test-A/out.tsv
+++ b/test/multilabel-f1-with-probs/multilabel-f1-with-probs-solution/test-A/out.tsv
@ -0,0 +1,5 @@
 foo 90:0.751 bar:0.3
 91:0.4 90:0.6
 90:0.99
 nic:0.3 baz:0.6
 foo:0.87 baz:0.8111
--- a/test/multilabel-f1-with-probs/multilabel-f1-with-probs/config.txt
+++ b/test/multilabel-f1-with-probs/multilabel-f1-with-probs/config.txt
@ -0,0 +1 @@
 --metric MultiLabel-F1
--- a/test/multilabel-f1-with-probs/multilabel-f1-with-probs/test-A/expected.tsv
+++ b/test/multilabel-f1-with-probs/multilabel-f1-with-probs/test-A/expected.tsv
@ -0,0 +1,5 @@
 foo 90
 91
 90 bar
 foo
+foo 90
+bar
+foo