MultiLabel-F1 works on labels given with probs now

This commit is contained in:
Filip Graliński 2018-08-09 14:08:54 +02:00
parent 82bdf70031
commit bd2bfde287
10 changed files with 94 additions and 2 deletions

View File

@ -28,6 +28,7 @@ library
, GEval.LineByLine
, GEval.BIO
, GEval.ParseParams
, GEval.ProbList
, Data.Conduit.AutoDecompress
, Data.Conduit.SmartSource
, Data.Conduit.Rank

View File

@ -78,6 +78,7 @@ import GEval.ClusteringMetrics
import GEval.LogLossHashed
import GEval.CharMatch
import GEval.BIO
import GEval.ProbList
import Data.Conduit.AutoDecompress
import qualified Data.HashMap.Strict as M
@ -572,11 +573,14 @@ gevalCore' BIOF1Labels _ = gevalCoreWithoutInput parseBioSequenceIntoEntitiesWit
return $ Prelude.map eraseNormalisation entities
gevalCore' (MultiLabelFMeasure beta) _ = gevalCoreWithoutInput intoWords
intoWords
getWords
(getCounts (==))
countAgg
(fMeasureOnCounts beta)
where intoWords = Right . (Prelude.map unpack) . Data.Text.words
where
getWords = Right . (Prelude.map unpack) . selectByStandardThreshold . parseIntoProbList
intoWords = Right . (Prelude.map unpack) . Data.Text.words
countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
countAgg = CC.foldl countFolder (0, 0, 0)

65
src/GEval/ProbList.hs Normal file
View File

@ -0,0 +1,65 @@
{-# LANGUAGE OverloadedStrings #-}
module GEval.ProbList
(parseIntoProbList, selectByStandardThreshold)
where
import qualified Data.Text as T
import GEval.Common
newtype Probability = P { getP :: Double }
deriving (Eq,Ord,Show)
isProbability :: Double -> Bool
isProbability p = 0.0 <= p && p <= 1.0
mkProbability :: Double -> Probability
mkProbability p
| isProbability p = P p
| otherwise = error $ show p ++ " is not in [0, 1]"
probabilityOne :: Probability
probabilityOne = mkProbability 1.0
data ProbList = ProbList [WordWithProb]
deriving (Show)
data WordWithProb = WordWithProb T.Text Probability
deriving (Show)
parseIntoWordWithProb :: T.Text -> WordWithProb
parseIntoWordWithProb t =
-- If anything is wrong the whole word is treated as a label,
-- even if it contains colons, digits, dots, etc.
if T.null wordSpecPart
then wordWithoutProb
else
if "." `T.isInfixOf` numberPart
then case textToDouble numberPart of
Right p -> if isProbability p
then WordWithProb wordSpecPart (mkProbability p)
else wordWithoutProb
Left _ -> wordWithoutProb
else wordWithoutProb
where
(wordSpecPart', numberPart) = T.breakOnEnd ":" t
wordWithoutProb = WordWithProb t probabilityOne
wordSpecPart = if T.null wordSpecPart'
then wordSpecPart'
else T.init wordSpecPart'
parseIntoProbList :: T.Text -> ProbList
parseIntoProbList = ProbList . map parseIntoWordWithProb . T.words
selectByThreshold :: Probability -> ProbList -> [T.Text]
selectByThreshold threshold (ProbList l) =
map (\(WordWithProb w _) -> w)
$ filter (\(WordWithProb _ p) -> p >= threshold) l
standardThreshold :: Double
standardThreshold = 0.5
selectByStandardThreshold :: ProbList -> [T.Text]
selectByStandardThreshold = selectByThreshold (mkProbability standardThreshold)

View File

@ -220,6 +220,10 @@ main = hspec $ do
runGEvalTest "multilabel-f1-simple" `shouldReturnAlmost` 0.66666666666
it "simple F2" $ do
runGEvalTest "multilabel-f2-simple" `shouldReturnAlmost` 0.441176470588235
it "labels given with probs" $ do
runGEvalTest "multilabel-f1-with-probs" `shouldReturnAlmost` 0.615384615384615
it "labels given with probs and numbers" $ do
runGEvalTest "multilabel-f1-with-probs-and-numbers" `shouldReturnAlmost` 0.6666666666666
describe "evaluating single lines" $ do
it "RMSE" $ do
gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla")

View File

@ -0,0 +1,3 @@
foo:1 bar:2:0.7
foo
1 foo:1 bar:2:0.7
2 foo

View File

@ -0,0 +1 @@
--metric MultiLabel-F1

View File

@ -0,0 +1,3 @@
foo:1 bar:2
foo:0
1 foo:1 bar:2
2 foo:0

View File

@ -0,0 +1,5 @@
foo 90:0.751 bar:0.3
91:0.4 90:0.6
90:0.99
nic:0.3 baz:0.6
foo:0.87 baz:0.8111
1 foo 90:0.751 bar:0.3
2 91:0.4 90:0.6
3 90:0.99
4 nic:0.3 baz:0.6
5 foo:0.87 baz:0.8111

View File

@ -0,0 +1 @@
--metric MultiLabel-F1

View File

@ -0,0 +1,5 @@
foo 90
91
90 bar
foo
1 foo 90
2 91
3 90 bar
4 foo