implement MultiLabel-LogLoss and MultiLabel-Likelihood

2018-08-09 16:00:19 +02:00 · 2018-08-09 16:00:19 +02:00 · efcceae26a
commit efcceae26a
parent bd2bfde287
6 changed files with 53 additions and 1 deletions
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -98,6 +98,7 @@ defaultLogLossHashedSize = 10
 data Metric = RMSE | MSE | BLEU | Accuracy | ClippEU | FMeasure Double | NMI
              | LogLossHashed Word32 | CharMatch | MAP | LogLoss | Likelihood
              | BIOF1 | BIOF1Labels | LikelihoodHashed Word32 | MAE | MultiLabelFMeasure Double
+              | MultiLabelLogLoss | MultiLabelLikelihood
              deriving (Eq)

 instance Show Metric where
@ -128,6 +129,8 @@ instance Show Metric where
  show BIOF1Labels = "BIO-F1-Labels"
  show MAE = "MAE"
  show (MultiLabelFMeasure beta) = "MultiLabel-F" ++ (show beta)
+  show MultiLabelLogLoss = "MultiLabel-LogLoss" -- same spelling as the Read instance accepts
+  show MultiLabelLikelihood = "MultiLabel-Likelihood"

 instance Read Metric where
  readsPrec _ ('R':'M':'S':'E':theRest) = [(RMSE, theRest)]
@ -155,6 +158,10 @@ instance Read Metric where
  readsPrec _ ('B':'I':'O':'-':'F':'1':'-':'L':'a':'b':'e':'l':'s':theRest) = [(BIOF1Labels, theRest)]
  readsPrec _ ('B':'I':'O':'-':'F':'1':theRest) = [(BIOF1, theRest)]
  readsPrec _ ('M':'A':'E':theRest) = [(MAE, theRest)]
+  readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'o':'g':'L':'o':'s':'s':theRest) = [(MultiLabelLogLoss, theRest)]
+  readsPrec _ ('M':'u':'l':'t':'i':'L':'a':'b':'e':'l':'-':'L':'i':'k':'e':'l':'i':'h':'o':'o':'d':theRest) = [(MultiLabelLikelihood, theRest)]
+
+

 data MetricOrdering = TheLowerTheBetter | TheHigherTheBetter

@ -177,6 +184,9 @@ getMetricOrdering BIOF1 = TheHigherTheBetter
 getMetricOrdering BIOF1Labels = TheHigherTheBetter
 getMetricOrdering MAE = TheLowerTheBetter
 getMetricOrdering (MultiLabelFMeasure _) = TheHigherTheBetter
+getMetricOrdering MultiLabelLogLoss = TheLowerTheBetter
+getMetricOrdering MultiLabelLikelihood = TheHigherTheBetter
+

 isInputNeeded :: Metric -> Bool
 isInputNeeded CharMatch = True
@ -448,6 +458,10 @@ gevalCoreOnSources (LikelihoodHashed b) inputLineSource expectedLineSource outLi
  logLoss <- gevalCoreOnSources (LogLossHashed b) inputLineSource expectedLineSource outLineSource
  return $ logLossToLikehood logLoss

+-- MultiLabel-Likelihood: evaluate MultiLabel-LogLoss on the same sources, then convert with logLossToLikehood.
+gevalCoreOnSources MultiLabelLikelihood inputLineSource expectedLineSource outLineSource =
+  logLossToLikehood <$> gevalCoreOnSources MultiLabelLogLoss inputLineSource expectedLineSource outLineSource
+
 gevalCoreOnSources metric inputLineSource expectedLineSource outLineSource = do
  gevalCore' metric inputLineSource expectedLineSource outLineSource

@ -581,6 +595,13 @@ gevalCore' (MultiLabelFMeasure beta) _ = gevalCoreWithoutInput intoWords
      getWords = Right . (Prelude.map unpack) . selectByStandardThreshold . parseIntoProbList
      intoWords = Right . (Prelude.map unpack) . Data.Text.words

+-- MultiLabel-LogLoss: expected lines are split into labels, output lines are parsed as ProbLists;
+-- each line is scored with countLogLossOnProbList and the scores are combined by averageC.
+gevalCore' MultiLabelLogLoss _ =
+  gevalCoreWithoutInput expectedLabels outputProbs lineLoss averageC id
+  where expectedLabels = Right . Data.Text.words
+        outputProbs = Right . parseIntoProbList
+        lineLoss = uncurry countLogLossOnProbList

 countAgg :: Monad m => ConduitM (Int, Int, Int) o m (Int, Int, Int)
 countAgg = CC.foldl countFolder (0, 0, 0)
--- a/src/GEval/ProbList.hs
+++ b/src/GEval/ProbList.hs
@ -1,7 +1,7 @@
 {-# LANGUAGE OverloadedStrings #-}

 module GEval.ProbList
-       (parseIntoProbList, selectByStandardThreshold)
+       (parseIntoProbList, selectByStandardThreshold, countLogLossOnProbList)
       where

 import qualified Data.Text as T
@ -23,6 +23,9 @@ mkProbability p
 probabilityOne :: Probability
 probabilityOne = mkProbability 1.0

+probabilityZero :: Probability
+probabilityZero = mkProbability 0.0 -- what findProb returns for a label missing from a ProbList
+
 data ProbList = ProbList [WordWithProb]
  deriving (Show)

@ -63,3 +66,19 @@ standardThreshold = 0.5

 selectByStandardThreshold :: ProbList -> [T.Text]
 selectByStandardThreshold = selectByThreshold (mkProbability standardThreshold)
+
+-- Probability carried by the first entry whose word equals target; probabilityZero when none matches.
+findProb :: ProbList -> T.Text -> Probability
+findProb (ProbList entries) target = firstOrZero [p | WordWithProb w p <- entries, w == target]
+  where firstOrZero (p:_) = p
+        firstOrZero [] = probabilityZero
+
+-- | Log loss for one line: each expected label contributes log of its probability from findProb,
+-- every listed label not in expected contributes log (1 - p); log 0 makes the loss infinite.
+countLogLossOnProbList :: [T.Text] -> ProbList -> Double
+countLogLossOnProbList expected probList@(ProbList l) =
+  negate (logLossForCorrectOnes + logLossForIncorrectOnes)
+  where
+    logLossForCorrectOnes = sum [log (getP (findProb probList ew)) | ew <- expected]
+    logLossForIncorrectOnes =
+      sum [log (1.0 - getP p) | WordWithProb w p <- l, w `notElem` expected]
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -224,6 +224,9 @@ main = hspec $ do
      runGEvalTest "multilabel-f1-with-probs" `shouldReturnAlmost` 0.615384615384615
    it "labels given with probs and numbers" $ do
      runGEvalTest "multilabel-f1-with-probs-and-numbers" `shouldReturnAlmost` 0.6666666666666
+  describe "MultiLabel-Likelihood" $ do
+    it "simple" $ do
+      runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827
  describe "evaluating single lines" $ do
    it "RMSE" $ do
      gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla")
--- a/test/multilabel-likelihood-simple/multilabel-likelihood-simple-solution/test-A/out.tsv
+++ b/test/multilabel-likelihood-simple/multilabel-likelihood-simple-solution/test-A/out.tsv
@ -0,0 +1,4 @@
+foo:0.3 bar
+foo:0.9
+baz:1.0
+bar:0.8 baz:0.3 foo:0.1
--- a/test/multilabel-likelihood-simple/multilabel-likelihood-simple/config.txt
+++ b/test/multilabel-likelihood-simple/multilabel-likelihood-simple/config.txt
@ -0,0 +1 @@
+--metric MultiLabel-Likelihood
--- a/test/multilabel-likelihood-simple/multilabel-likelihood-simple/test-A/expected.tsv
+++ b/test/multilabel-likelihood-simple/multilabel-likelihood-simple/test-A/expected.tsv
@ -0,0 +1,4 @@
+foo bar
+
+baz
+foo baz