add auxiliary function for calculating purity

This commit is contained in:
Filip Gralinski 2017-03-24 08:50:57 +01:00 committed by Filip Gralinski
parent 422d7c63d9
commit 065a3ce9cd
3 changed files with 40 additions and 1 deletions

View File

@ -1,5 +1,5 @@
name: geval name: geval
version: 0.2.6.0 version: 0.3.0.0
synopsis: Machine learning evaluation tools synopsis: Machine learning evaluation tools
description: Please see README.md description: Please see README.md
homepage: http://github.com/name/project homepage: http://github.com/name/project
@ -21,6 +21,7 @@ library
, GEval.BLEU , GEval.BLEU
, GEval.ClippEU , GEval.ClippEU
, GEval.PrecisionRecall , GEval.PrecisionRecall
, GEval.Purity
, GEval.Common , GEval.Common
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond
@ -38,6 +39,8 @@ library
, unix , unix
, fgl , fgl
, attoparsec , attoparsec
, unordered-containers
, hashable
default-language: Haskell2010 default-language: Haskell2010
executable geval executable geval

28
src/GEval/Purity.hs Normal file
View File

@ -0,0 +1,28 @@
module GEval.Purity
(purity, purityFromConfusionMap, updateConfusionMap)
where
import GEval.Common
import qualified Data.HashMap.Strict as M
import Data.Hashable
import Data.List
purity :: (Hashable a, Eq a, Hashable b, Eq b) => [(a, b)] -> Double
purity pL = purityFromConfusionMap cM
where cM = confusionMap pL
purityFromConfusionMap :: (Hashable a, Eq a, Hashable b, Eq b) => M.HashMap a (M.HashMap b Int) -> Double
purityFromConfusionMap cM = numberOfMajorityItems /. numberOfItems
where numberOfItems = sum $ map fst classCounts
numberOfMajorityItems = sum $ map snd classCounts
classCounts = map getClassCount $ M.toList cM
getClassCount (_, sh) = foldl' (\(s, m) (_, c) -> (s + c, max m c)) (0, 0) $ M.toList sh
confusionMap :: (Hashable a, Eq a, Hashable b, Eq b) => [(a, b)] -> M.HashMap a (M.HashMap b Int)
confusionMap = foldl' updateConfusionMap M.empty
updateConfusionMap :: (Hashable a, Eq a, Hashable b, Eq b) => M.HashMap a (M.HashMap b Int) -> (a, b) -> M.HashMap a (M.HashMap b Int)
updateConfusionMap h (e, g) = M.insertWith updateSubHash e (unitHash g) h
where unitHash k = M.singleton k 1
updateSubHash uh sh = M.unionWith (+) uh sh

View File

@ -7,6 +7,7 @@ import GEval.OptionsParser
import GEval.BLEU import GEval.BLEU
import GEval.ClippEU import GEval.ClippEU
import GEval.PrecisionRecall import GEval.PrecisionRecall
import GEval.Purity
import Data.Attoparsec.Text import Data.Attoparsec.Text
import Options.Applicative import Options.Applicative
import Data.Text import Data.Text
@ -50,6 +51,13 @@ main = hspec $ do
precisionCount [["bar", "bar", "bar", "bar", "foo", "xyz", "foo"]] ["foo", "bar", "foo", "baz", "bar", "foo"] `shouldBe` 4 precisionCount [["bar", "bar", "bar", "bar", "foo", "xyz", "foo"]] ["foo", "bar", "foo", "baz", "bar", "foo"] `shouldBe` 4
it "multiple refs" $ do it "multiple refs" $ do
precisionCount [["foo", "baz"], ["bar"], ["baz", "xyz"]] ["foo", "bar", "foo"] `shouldBe` 2 precisionCount [["foo", "baz"], ["bar"], ["baz", "xyz"]] ["foo", "bar", "foo"] `shouldBe` 2
describe "purity (in flat clustering)" $ do
it "the example from Information Retrieval Book" $ do
purity [(2, "o") :: (Int, String), (2, "o"), (2, "d"), (3, "x"), (3, "d"),
(1, "x"), (1, "o"), (1, "x"), (1, "x"), (1, "x"), (1, "x"),
(2, "x"), (2, "o"), (2, "o"),
(3, "x"), (3, "d"), (3, "d")] `shouldBeAlmost` 0.70588
describe "reading options" $ do describe "reading options" $ do
it "can get the metric" $ do it "can get the metric" $ do
extractMetric "bleu-complex" `shouldReturn` (Just BLEU) extractMetric "bleu-complex" `shouldReturn` (Just BLEU)