diff --git a/geval.cabal b/geval.cabal index 61d1c72..3bd22a3 100644 --- a/geval.cabal +++ b/geval.cabal @@ -1,5 +1,5 @@ name: geval -version: 1.13.0.0 +version: 1.14.0.0 synopsis: Machine learning evaluation tools description: Please see README.md homepage: http://github.com/name/project @@ -39,6 +39,7 @@ library , GEval.Annotation , GEval.BlackBoxDebugging , Text.WordShape + , Data.Statistics.Kendall , Paths_geval build-depends: base >= 4.7 && < 5 , cond @@ -80,6 +81,7 @@ library , MissingH , array , Munkres + , vector-algorithms default-language: Haskell2010 executable geval @@ -117,6 +119,8 @@ test-suite geval-test , directory , temporary , silently + , vector + , statistics ghc-options: -threaded -rtsopts -with-rtsopts=-N default-language: Haskell2010 diff --git a/src/Data/Statistics/Kendall.hs b/src/Data/Statistics/Kendall.hs new file mode 100644 index 0000000..347ae1f --- /dev/null +++ b/src/Data/Statistics/Kendall.hs @@ -0,0 +1,178 @@ +{-# LANGUAGE BangPatterns, CPP, FlexibleContexts, ScopedTypeVariables #-} +-- | +-- (Taken from http://hackage.haskell.org/package/statistics-0.15.0.0/docs/src/Statistics.Correlation.Kendall.html) +-- +-- Module : Statistics.Correlation.Kendall +-- +-- Fast O(NlogN) implementation of +-- . +-- +-- This module implements Kendall's tau form b which allows ties in the data. +-- This is the same formula used by other statistical packages, e.g., R, matlab. +-- +-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}} +-- +-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantify, +-- n_2 = number of pairs tied for the second quantify, +-- n_c = number of concordant pairs$, n_d = number of discordant pairs. + +module Data.Statistics.Kendall + ( kendall, + kendallZ + -- * References + -- $references + ) where + +import Control.Monad.ST (ST, runST) +import Data.Bits (shiftR) +import Data.Function (on) +import Data.STRef +import qualified Data.Vector.Algorithms.Intro as I +import qualified Data.Vector.Generic as G +import qualified Data.Vector.Generic.Mutable as GM + +-- | /O(nlogn)/ Compute the Kendall's tau from a vector of paired data. +-- Return NaN when number of pairs <= 1. +kendall :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double +kendall xy' + | G.length xy' <= 1 = 0/0 + | otherwise = runST $ do + xy <- G.thaw xy' + let n = GM.length xy + n_dRef <- newSTRef 0 + I.sort xy + tieX <- numOfTiesBy ((==) `on` fst) xy + tieXY <- numOfTiesBy (==) xy + tmp <- GM.new n + mergeSort (compare `on` snd) xy tmp n_dRef + tieY <- numOfTiesBy ((==) `on` snd) xy + n_d <- readSTRef n_dRef + let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer + n_c = n_0 - n_d - tieX - tieY + tieXY + return $ fromIntegral (n_c - n_d) / + (sqrt.fromIntegral) ((n_0 - tieX) * (n_0 - tieY)) +{-# INLINE kendall #-} + +kendallZ :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double +kendallZ xy' + | G.length xy' <= 1 = 0/0 + | otherwise = runST $ do + xy <- G.thaw xy' + let n = GM.length xy + let vfun x = x * (x - 1) * (2*x + 5) + let tttfun x = x * (x - 1) * (x - 2) + n_dRef <- newSTRef 0 + I.sort xy + tieX <- numOfTiesBy ((==) `on` fst) xy + tieXY <- numOfTiesBy (==) xy + vt <- numOfTiesByGeneralized vfun ((==) `on` fst) xy + tttX <- numOfTiesByGeneralized tttfun ((==) `on` fst) xy + tmp <- GM.new n + mergeSort (compare `on` snd) xy tmp n_dRef + tieY <- numOfTiesBy ((==) `on` snd) xy + vu <- numOfTiesByGeneralized vfun ((==) `on` snd) xy + tttY <- numOfTiesByGeneralized tttfun ((==) `on` snd) xy + n_d <- readSTRef n_dRef + let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer + n_c = n_0 - n_d - tieX - tieY + tieXY + v0 = vfun (fromIntegral n) + v1 = 2.0 * (fromIntegral tieX) * (fromIntegral tieY) / (fromIntegral (n * (n-1))) + v2 = (fromIntegral tttX) * (fromIntegral tttY) / (fromIntegral (9 * (tttfun n))) + v = (fromIntegral (v0 - vt - vu)) / 18.0 + v1 + v2 + return $ (fromIntegral (n_c - n_d)) / sqrt v +{-# INLINE kendallZ #-} + +-- calculate number of tied pairs in a sorted vector +numOfTiesBy :: GM.MVector v a + => (a -> a -> Bool) -> v s a -> ST s Integer +numOfTiesBy f xs = numOfTiesByGeneralized (\x -> (x * (x - 1)) `shiftR` 1) f xs + +numOfTiesByGeneralized :: GM.MVector v a + => (Int -> Int) -> (a -> a -> Bool) -> v s a -> ST s Integer +numOfTiesByGeneralized op f xs = do count <- newSTRef (0::Integer) + loop count (1::Int) (0::Int) + readSTRef count + where + n = GM.length xs + loop c !acc !i | i >= n - 1 = modifySTRef' c (+ g acc) + | otherwise = do + x1 <- GM.unsafeRead xs i + x2 <- GM.unsafeRead xs (i+1) + if f x1 x2 + then loop c (acc+1) (i+1) + else modifySTRef' c (+ g acc) >> loop c 1 (i+1) + g x = fromIntegral $ op x +{-# INLINE numOfTiesByGeneralized #-} + +-- Implementation of Knight's merge sort (adapted from vector-algorithm). This +-- function is used to count the number of discordant pairs. +mergeSort :: GM.MVector v e + => (e -> e -> Ordering) + -> v s e + -> v s e + -> STRef s Integer + -> ST s () +mergeSort cmp src buf count = loop 0 (GM.length src - 1) + where + loop l u + | u == l = return () + | u - l == 1 = do + eL <- GM.unsafeRead src l + eU <- GM.unsafeRead src u + case cmp eL eU of + GT -> do GM.unsafeWrite src l eU + GM.unsafeWrite src u eL + modifySTRef' count (+1) + _ -> return () + | otherwise = do + let mid = (u + l) `shiftR` 1 + loop l mid + loop mid u + merge cmp (GM.unsafeSlice l (u-l+1) src) buf (mid - l) count +{-# INLINE mergeSort #-} + +merge :: GM.MVector v e + => (e -> e -> Ordering) + -> v s e + -> v s e + -> Int + -> STRef s Integer + -> ST s () +merge cmp src buf mid count = do GM.unsafeCopy tmp lower + eTmp <- GM.unsafeRead tmp 0 + eUpp <- GM.unsafeRead upper 0 + loop tmp 0 eTmp upper 0 eUpp 0 + where + lower = GM.unsafeSlice 0 mid src + upper = GM.unsafeSlice mid (GM.length src - mid) src + tmp = GM.unsafeSlice 0 mid buf + wroteHigh low iLow eLow high iHigh iIns + | iHigh >= GM.length high = + GM.unsafeCopy (GM.unsafeSlice iIns (GM.length low - iLow) src) + (GM.unsafeSlice iLow (GM.length low - iLow) low) + | otherwise = do eHigh <- GM.unsafeRead high iHigh + loop low iLow eLow high iHigh eHigh iIns + + wroteLow low iLow high iHigh eHigh iIns + | iLow >= GM.length low = return () + | otherwise = do eLow <- GM.unsafeRead low iLow + loop low iLow eLow high iHigh eHigh iIns + + loop !low !iLow !eLow !high !iHigh !eHigh !iIns = case cmp eHigh eLow of + LT -> do GM.unsafeWrite src iIns eHigh + modifySTRef' count (+ fromIntegral (GM.length low - iLow)) + wroteHigh low iLow eLow high (iHigh+1) (iIns+1) + _ -> do GM.unsafeWrite src iIns eLow + wroteLow low (iLow+1) high iHigh eHigh (iIns+1) +{-# INLINE merge #-} + +#if !MIN_VERSION_base(4,6,0) +modifySTRef' :: STRef s a -> (a -> a) -> ST s () +modifySTRef' = modifySTRef +#endif + +-- $references +-- +-- * William R. Knight. (1966) A computer method for calculating Kendall's Tau +-- with ungrouped data. /Journal of the American Statistical Association/, +-- Vol. 61, No. 314, Part 1, pp. 436-439. diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index 6489ddc..c83bad0 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -4,6 +4,10 @@ module GEval.FeatureExtractor (extractFactors, extractFactorsFromTabbed, cartesianFeatures, + Feature(..), + NumericalType(..), + NumericalDirection(..), + Featuroid(..), LineWithFactors(..), LineWithPeggedFactors(..), PeggedFactor(..), @@ -26,15 +30,55 @@ import GEval.BlackBoxDebugging import GEval.Common import Text.Read (readMaybe) +data Feature = UnaryFeature PeggedExistentialFactor + | CartesianFeature PeggedExistentialFactor PeggedExistentialFactor + | NumericalFeature FeatureNamespace NumericalType NumericalDirection + deriving (Eq, Ord) + +instance Show Feature where + show (UnaryFeature p) = show p + show (CartesianFeature pA pB) = formatCartesian pA pB + show (NumericalFeature namespace ntype direction) = (show namespace) ++ ":" ++ (show ntype) ++ (show direction) + +data NumericalType = DirectValue | LengthOf + deriving (Eq, Ord) + +instance Show NumericalType where + show DirectValue = "=" + show LengthOf = "=#" + +data NumericalDirection = Big | Small + deriving (Eq, Ord) + +instance Show NumericalDirection where + show Big = "+" + show Small = "-" + +-- | Featuroid is something between a factor and a feature, i.e. for numerical factors +-- it's not a single value, but still without the direction. +data Featuroid = UnaryFeaturoid PeggedExistentialFactor + | CartesianFeaturoid PeggedExistentialFactor PeggedExistentialFactor + | NumericalFeaturoid FeatureNamespace + deriving (Eq, Ord) + +instance Show Featuroid where + show (UnaryFeaturoid p) = show p + show (CartesianFeaturoid pA pB) = formatCartesian pA pB + show (NumericalFeaturoid namespace) = (show namespace) ++ ":=" + data LineWithFactors = LineWithFactors Double MetricValue [Factor] deriving (Eq, Ord) +-- | A factor extracted from a single item (its input, expected output or actual output). data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor deriving (Eq, Ord) instance Show Factor where show (UnaryFactor factor) = show factor - show (CartesianFactor factorA factorB) = (show factorA) ++ "~~" ++ (show factorB) + show (CartesianFactor factorA factorB) = formatCartesian factorA factorB + +formatCartesian :: PeggedExistentialFactor -> PeggedExistentialFactor -> String +formatCartesian factorA factorB = (show factorA) ++ "~~" ++ (show factorB) data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor] deriving (Eq, Ord) diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index 1b10a28..098a408 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -34,7 +34,9 @@ import Data.Text.Encoding import Data.Conduit.Rank import Data.Maybe (fromMaybe) -import Data.List (sortBy, sort, concat) +import qualified Data.Vector as V + +import Data.List (sortBy, sortOn, sort, concat) import Control.Monad.IO.Class import Control.Monad.Trans.Resource @@ -56,6 +58,7 @@ import System.FilePath import Statistics.Distribution (cumulative) import Statistics.Distribution.Normal (normalDistr) +import Data.Statistics.Kendall (kendallZ) import qualified Data.Map.Strict as M import qualified Data.Set as S @@ -125,7 +128,7 @@ extractFeaturesAndPValues spec bbdo = data RankedFactor = RankedFactor Factor Double MetricValue deriving (Show) -data FeatureWithPValue = FeatureWithPValue Factor -- ^ feature itself +data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself Double -- ^ p-value MetricValue -- ^ average metric value Integer -- ^ count @@ -184,11 +187,12 @@ finalFeatures True minFreq = do filtreCartesian False = CC.map id filtreCartesian True = CC.concatMapAccum step S.empty - where step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f]) - step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f]) - step f@(FeatureWithPValue (CartesianFactor pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp - then [] - else [f]) + where step f@(FeatureWithPValue (UnaryFeature fac) _ _ _) mp = (S.insert fac mp, [f]) + step f@(FeatureWithPValue (CartesianFeature pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp + then [] + else [f]) + step f@(FeatureWithPValue (NumericalFeature _ _ _) _ _ _) mp = (mp, [f]) + peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs) @@ -200,33 +204,69 @@ getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = extractFactorsFromTabbed mTokenizer bbdo "in" inLine, extractFactors mTokenizer bbdo "out" outLine] +data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer + | NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue] + | LengthAggregate [Double] [MetricValue] [Int] + +aggreggate :: FeatureAggregate -> FeatureAggregate -> FeatureAggregate +aggreggate (ExistentialFactorAggregate r1 s1 c1) (ExistentialFactorAggregate r2 s2 c2) = + ExistentialFactorAggregate (r1 + r2) (s1 + s2) (c1 + c2) +aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 values1) (NumericalValueAggregate ranks2 scores2 lengths2 values2) = + NumericalValueAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) (values1 ++ values2) +aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 _) (LengthAggregate ranks2 scores2 lengths2) = + LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) +aggreggate (LengthAggregate ranks1 scores1 lengths1) (NumericalValueAggregate ranks2 scores2 lengths2 _) = + LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) +aggreggate (LengthAggregate ranks1 scores1 lengths1) (LengthAggregate ranks2 scores2 lengths2) = + LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) +aggreggate _ _ = error "Mismatched aggregates!" + +initAggregate :: RankedFactor -> (Featuroid, FeatureAggregate) +initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor Nothing l))) r s) = + (NumericalFeaturoid namespace, LengthAggregate [r] [s] [l]) +initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor (Just v) l))) r s) = + (NumericalFeaturoid namespace, NumericalValueAggregate [r] [s] [l] [v]) +initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor f))) r s) = + (UnaryFeaturoid (PeggedExistentialFactor namespace f), ExistentialFactorAggregate r s 1) +initAggregate (RankedFactor (CartesianFactor pA pB) r s) = + (CartesianFeaturoid pA pB, ExistentialFactorAggregate r s 1) + +filterAggregateByFreq :: Integer -> (Maybe Integer) -> FeatureAggregate -> Bool +filterAggregateByFreq minFreq Nothing (ExistentialFactorAggregate _ _ c) = c >= minFreq +filterAggregateByFreq minFreq (Just total) (ExistentialFactorAggregate _ _ c) = c >= minFreq && total - c >= minFreq +filterAggregateByFreq _ _ _ = True + uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) () -uScoresCounter minFreq = CC.map (\(RankedFactor feature r score) -> (feature, (r, score, 1))) +uScoresCounter minFreq = CC.map initAggregate .| gobbleAndDo countUScores .| lowerFreqFiltre .| pValueCalculator minFreq where countUScores l = M.toList - $ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l - lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq) + $ M.fromListWith aggreggate l + lowerFreqFiltre = CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq Nothing fAgg) -pValueCalculator :: Monad m => Integer -> ConduitT (Factor, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) () +pValueCalculator :: Monad m => Integer -> ConduitT (Featuroid, FeatureAggregate) FeatureWithPValue (StateT Integer m) () pValueCalculator minFreq = do firstVal <- await case firstVal of - Just i@(_, (_, _, c)) -> do + Just i@(_, fAgg) -> do total <- lift get - if total - c >= minFreq + if filterAggregateByFreq minFreq (Just total) fAgg then yield $ calculatePValue total i else return () - CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total) + CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq (Just total) fAgg) .| CC.map (calculatePValue total) Nothing -> return () -calculatePValue :: Integer -> (Factor, (Double, MetricValue, Integer)) -> FeatureWithPValue -calculatePValue total (f, (r, s, c)) = FeatureWithPValue f - (pvalue (r - minusR c) c (total - c)) - (s / (fromIntegral c)) - c +calculatePValue :: Integer -> (Featuroid, FeatureAggregate) -> FeatureWithPValue +calculatePValue _ (NumericalFeaturoid namespace, NumericalValueAggregate ranks scores _ values) = + kendallPValueFeature namespace DirectValue ranks scores values +calculatePValue _ (NumericalFeaturoid namespace, LengthAggregate ranks scores lens) = + kendallPValueFeature namespace LengthOf ranks scores lens +calculatePValue total (f, ExistentialFactorAggregate r s c) = FeatureWithPValue (featoroidToFeature f) + (pvalue (r - minusR c) c (total - c)) + (s / (fromIntegral c)) + c where minusR c = (c' * (c' + 1)) / 2.0 where c' = fromIntegral c -- calulating p-value from Mann–Whitney U test @@ -237,6 +277,26 @@ calculatePValue total (f, (r, s, c)) = FeatureWithPValue f sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12 z = (u - mean) / sigma in cumulative (normalDistr 0.0 1.0) z + featoroidToFeature (UnaryFeaturoid fac) = UnaryFeature fac + featoroidToFeature (CartesianFeaturoid facA facB) = (CartesianFeature facA facB) + + +kendallPValueFeature :: Ord a => FeatureNamespace -> NumericalType -> [Double] -> [MetricValue] -> [a] -> FeatureWithPValue +kendallPValueFeature namespace ntype ranks scores values = FeatureWithPValue (NumericalFeature namespace ntype ndirection) + pv + ((sum selectedScores) / (fromIntegral selected)) + (fromIntegral selected) + where z = kendallZ (V.fromList $ Prelude.zip ranks values) + pv = 2 * (cumulative (normalDistr 0.0 1.0) (- (abs z))) + ndirection = if z > 0 + then Small + else Big + selected = (Prelude.length scores) `div` 4 + + selectedScores = Prelude.take selected $ Prelude.map snd $ turner $ sortOn fst $ Prelude.zip values scores + turner = case ndirection of + Small -> id + Big -> Prelude.reverse totalCounter :: Monad m => ConduitT a a (StateT Integer m) () diff --git a/test/Spec.hs b/test/Spec.hs index dbb3f2f..06370f5 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -40,6 +40,7 @@ import Data.List (sort) import qualified Test.HUnit as HU import qualified Data.IntSet as IS +import qualified Data.Vector as V import Data.Conduit.SmartSource import Data.Conduit.Rank @@ -49,6 +50,10 @@ import Control.Monad.Trans.Resource import qualified Data.Conduit.List as CL import qualified Data.Conduit.Combinators as CC +import Statistics.Distribution (cumulative) +import Statistics.Distribution.Normal (normalDistr) +import Data.Statistics.Kendall (kendall, kendallZ) + informationRetrievalBookExample :: [(String, Int)] informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3), ("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1), @@ -541,6 +546,14 @@ main = hspec $ do (SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))), PeggedFactor (FeatureTabbedNamespace "in" 3) (NumericalFactor Nothing 5) ] + describe "Kendall's tau" $ do + it "tau" $ do + kendall (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-0.47140452079103173) + it "z" $ do + kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742) + it "p-value" $ do + (2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827 + checkConduitPure conduit inList expList = do let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList