From 065a3ce9cd06225632cebcba395200e9abfae985 Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Fri, 24 Mar 2017 08:50:57 +0100 Subject: [PATCH] add auxiliary function for calculating purity --- geval.cabal | 5 ++++- src/GEval/Purity.hs | 28 ++++++++++++++++++++++++++++ test/Spec.hs | 8 ++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 src/GEval/Purity.hs diff --git a/geval.cabal b/geval.cabal index 0df7320..c7b6408 100644 --- a/geval.cabal +++ b/geval.cabal @@ -1,5 +1,5 @@ name: geval -version: 0.2.6.0 +version: 0.3.0.0 synopsis: Machine learning evaluation tools description: Please see README.md homepage: http://github.com/name/project @@ -21,6 +21,7 @@ library , GEval.BLEU , GEval.ClippEU , GEval.PrecisionRecall + , GEval.Purity , GEval.Common build-depends: base >= 4.7 && < 5 , cond @@ -38,6 +39,8 @@ library , unix , fgl , attoparsec + , unordered-containers + , hashable default-language: Haskell2010 executable geval diff --git a/src/GEval/Purity.hs b/src/GEval/Purity.hs new file mode 100644 index 0000000..d12fa35 --- /dev/null +++ b/src/GEval/Purity.hs @@ -0,0 +1,28 @@ +module GEval.Purity + (purity, purityFromConfusionMap, updateConfusionMap) + where + +import GEval.Common + +import qualified Data.HashMap.Strict as M +import Data.Hashable +import Data.List + +purity :: (Hashable a, Eq a, Hashable b, Eq b) => [(a, b)] -> Double +purity pL = purityFromConfusionMap cM + where cM = confusionMap pL + +purityFromConfusionMap :: (Hashable a, Eq a, Hashable b, Eq b) => M.HashMap a (M.HashMap b Int) -> Double +purityFromConfusionMap cM = numberOfMajorityItems /. numberOfItems + where numberOfItems = sum $ map fst classCounts + numberOfMajorityItems = sum $ map snd classCounts + classCounts = map getClassCount $ M.toList cM + getClassCount (_, sh) = foldl' (\(s, m) (_, c) -> (s + c, max m c)) (0, 0) $ M.toList sh + +confusionMap :: (Hashable a, Eq a, Hashable b, Eq b) => [(a, b)] -> M.HashMap a (M.HashMap b Int) +confusionMap = foldl' updateConfusionMap M.empty + +updateConfusionMap :: (Hashable a, Eq a, Hashable b, Eq b) => M.HashMap a (M.HashMap b Int) -> (a, b) -> M.HashMap a (M.HashMap b Int) +updateConfusionMap h (e, g) = M.insertWith updateSubHash e (unitHash g) h + where unitHash k = M.singleton k 1 + updateSubHash uh sh = M.unionWith (+) uh sh diff --git a/test/Spec.hs b/test/Spec.hs index 639f048..664b6ab 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -7,6 +7,7 @@ import GEval.OptionsParser import GEval.BLEU import GEval.ClippEU import GEval.PrecisionRecall +import GEval.Purity import Data.Attoparsec.Text import Options.Applicative import Data.Text @@ -50,6 +51,13 @@ main = hspec $ do precisionCount [["bar", "bar", "bar", "bar", "foo", "xyz", "foo"]] ["foo", "bar", "foo", "baz", "bar", "foo"] `shouldBe` 4 it "multiple refs" $ do precisionCount [["foo", "baz"], ["bar"], ["baz", "xyz"]] ["foo", "bar", "foo"] `shouldBe` 2 + describe "purity (in flat clustering)" $ do + it "the example from Information Retrieval Book" $ do + purity [(2, "o") :: (Int, String), (2, "o"), (2, "d"), (3, "x"), (3, "d"), + (1, "x"), (1, "o"), (1, "x"), (1, "x"), (1, "x"), (1, "x"), + (2, "x"), (2, "o"), (2, "o"), + (3, "x"), (3, "d"), (3, "d")] `shouldBeAlmost` 0.70588 + describe "reading options" $ do it "can get the metric" $ do extractMetric "bleu-complex" `shouldReturn` (Just BLEU)