Add numerical features
This commit is contained in:
parent
d5a8908599
commit
abcce9bf68
@ -1,5 +1,5 @@
|
|||||||
name: geval
|
name: geval
|
||||||
version: 1.13.0.0
|
version: 1.14.0.0
|
||||||
synopsis: Machine learning evaluation tools
|
synopsis: Machine learning evaluation tools
|
||||||
description: Please see README.md
|
description: Please see README.md
|
||||||
homepage: http://github.com/name/project
|
homepage: http://github.com/name/project
|
||||||
@ -39,6 +39,7 @@ library
|
|||||||
, GEval.Annotation
|
, GEval.Annotation
|
||||||
, GEval.BlackBoxDebugging
|
, GEval.BlackBoxDebugging
|
||||||
, Text.WordShape
|
, Text.WordShape
|
||||||
|
, Data.Statistics.Kendall
|
||||||
, Paths_geval
|
, Paths_geval
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
@ -80,6 +81,7 @@ library
|
|||||||
, MissingH
|
, MissingH
|
||||||
, array
|
, array
|
||||||
, Munkres
|
, Munkres
|
||||||
|
, vector-algorithms
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable geval
|
executable geval
|
||||||
@ -117,6 +119,8 @@ test-suite geval-test
|
|||||||
, directory
|
, directory
|
||||||
, temporary
|
, temporary
|
||||||
, silently
|
, silently
|
||||||
|
, vector
|
||||||
|
, statistics
|
||||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
178
src/Data/Statistics/Kendall.hs
Normal file
178
src/Data/Statistics/Kendall.hs
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
{-# LANGUAGE BangPatterns, CPP, FlexibleContexts, ScopedTypeVariables #-}
|
||||||
|
-- |
|
||||||
|
-- (Taken from http://hackage.haskell.org/package/statistics-0.15.0.0/docs/src/Statistics.Correlation.Kendall.html)
|
||||||
|
--
|
||||||
|
-- Module : Statistics.Correlation.Kendall
|
||||||
|
--
|
||||||
|
-- Fast O(NlogN) implementation of
|
||||||
|
-- <http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient Kendall's tau>.
|
||||||
|
--
|
||||||
|
-- This module implements Kendall's tau form b which allows ties in the data.
|
||||||
|
-- This is the same formula used by other statistical packages, e.g., R, matlab.
|
||||||
|
--
|
||||||
|
-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}
|
||||||
|
--
|
||||||
|
-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantify,
|
||||||
|
-- n_2 = number of pairs tied for the second quantify,
|
||||||
|
-- n_c = number of concordant pairs$, n_d = number of discordant pairs.
|
||||||
|
|
||||||
|
module Data.Statistics.Kendall
|
||||||
|
( kendall,
|
||||||
|
kendallZ
|
||||||
|
-- * References
|
||||||
|
-- $references
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.ST (ST, runST)
|
||||||
|
import Data.Bits (shiftR)
|
||||||
|
import Data.Function (on)
|
||||||
|
import Data.STRef
|
||||||
|
import qualified Data.Vector.Algorithms.Intro as I
|
||||||
|
import qualified Data.Vector.Generic as G
|
||||||
|
import qualified Data.Vector.Generic.Mutable as GM
|
||||||
|
|
||||||
|
-- | /O(nlogn)/ Compute the Kendall's tau from a vector of paired data.
|
||||||
|
-- Return NaN when number of pairs <= 1.
|
||||||
|
kendall :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
|
||||||
|
kendall xy'
|
||||||
|
| G.length xy' <= 1 = 0/0
|
||||||
|
| otherwise = runST $ do
|
||||||
|
xy <- G.thaw xy'
|
||||||
|
let n = GM.length xy
|
||||||
|
n_dRef <- newSTRef 0
|
||||||
|
I.sort xy
|
||||||
|
tieX <- numOfTiesBy ((==) `on` fst) xy
|
||||||
|
tieXY <- numOfTiesBy (==) xy
|
||||||
|
tmp <- GM.new n
|
||||||
|
mergeSort (compare `on` snd) xy tmp n_dRef
|
||||||
|
tieY <- numOfTiesBy ((==) `on` snd) xy
|
||||||
|
n_d <- readSTRef n_dRef
|
||||||
|
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
|
||||||
|
n_c = n_0 - n_d - tieX - tieY + tieXY
|
||||||
|
return $ fromIntegral (n_c - n_d) /
|
||||||
|
(sqrt.fromIntegral) ((n_0 - tieX) * (n_0 - tieY))
|
||||||
|
{-# INLINE kendall #-}
|
||||||
|
|
||||||
|
kendallZ :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
|
||||||
|
kendallZ xy'
|
||||||
|
| G.length xy' <= 1 = 0/0
|
||||||
|
| otherwise = runST $ do
|
||||||
|
xy <- G.thaw xy'
|
||||||
|
let n = GM.length xy
|
||||||
|
let vfun x = x * (x - 1) * (2*x + 5)
|
||||||
|
let tttfun x = x * (x - 1) * (x - 2)
|
||||||
|
n_dRef <- newSTRef 0
|
||||||
|
I.sort xy
|
||||||
|
tieX <- numOfTiesBy ((==) `on` fst) xy
|
||||||
|
tieXY <- numOfTiesBy (==) xy
|
||||||
|
vt <- numOfTiesByGeneralized vfun ((==) `on` fst) xy
|
||||||
|
tttX <- numOfTiesByGeneralized tttfun ((==) `on` fst) xy
|
||||||
|
tmp <- GM.new n
|
||||||
|
mergeSort (compare `on` snd) xy tmp n_dRef
|
||||||
|
tieY <- numOfTiesBy ((==) `on` snd) xy
|
||||||
|
vu <- numOfTiesByGeneralized vfun ((==) `on` snd) xy
|
||||||
|
tttY <- numOfTiesByGeneralized tttfun ((==) `on` snd) xy
|
||||||
|
n_d <- readSTRef n_dRef
|
||||||
|
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
|
||||||
|
n_c = n_0 - n_d - tieX - tieY + tieXY
|
||||||
|
v0 = vfun (fromIntegral n)
|
||||||
|
v1 = 2.0 * (fromIntegral tieX) * (fromIntegral tieY) / (fromIntegral (n * (n-1)))
|
||||||
|
v2 = (fromIntegral tttX) * (fromIntegral tttY) / (fromIntegral (9 * (tttfun n)))
|
||||||
|
v = (fromIntegral (v0 - vt - vu)) / 18.0 + v1 + v2
|
||||||
|
return $ (fromIntegral (n_c - n_d)) / sqrt v
|
||||||
|
{-# INLINE kendallZ #-}
|
||||||
|
|
||||||
|
-- calculate number of tied pairs in a sorted vector
|
||||||
|
numOfTiesBy :: GM.MVector v a
|
||||||
|
=> (a -> a -> Bool) -> v s a -> ST s Integer
|
||||||
|
numOfTiesBy f xs = numOfTiesByGeneralized (\x -> (x * (x - 1)) `shiftR` 1) f xs
|
||||||
|
|
||||||
|
numOfTiesByGeneralized :: GM.MVector v a
|
||||||
|
=> (Int -> Int) -> (a -> a -> Bool) -> v s a -> ST s Integer
|
||||||
|
numOfTiesByGeneralized op f xs = do count <- newSTRef (0::Integer)
|
||||||
|
loop count (1::Int) (0::Int)
|
||||||
|
readSTRef count
|
||||||
|
where
|
||||||
|
n = GM.length xs
|
||||||
|
loop c !acc !i | i >= n - 1 = modifySTRef' c (+ g acc)
|
||||||
|
| otherwise = do
|
||||||
|
x1 <- GM.unsafeRead xs i
|
||||||
|
x2 <- GM.unsafeRead xs (i+1)
|
||||||
|
if f x1 x2
|
||||||
|
then loop c (acc+1) (i+1)
|
||||||
|
else modifySTRef' c (+ g acc) >> loop c 1 (i+1)
|
||||||
|
g x = fromIntegral $ op x
|
||||||
|
{-# INLINE numOfTiesByGeneralized #-}
|
||||||
|
|
||||||
|
-- Implementation of Knight's merge sort (adapted from vector-algorithm). This
|
||||||
|
-- function is used to count the number of discordant pairs.
|
||||||
|
mergeSort :: GM.MVector v e
|
||||||
|
=> (e -> e -> Ordering)
|
||||||
|
-> v s e
|
||||||
|
-> v s e
|
||||||
|
-> STRef s Integer
|
||||||
|
-> ST s ()
|
||||||
|
mergeSort cmp src buf count = loop 0 (GM.length src - 1)
|
||||||
|
where
|
||||||
|
loop l u
|
||||||
|
| u == l = return ()
|
||||||
|
| u - l == 1 = do
|
||||||
|
eL <- GM.unsafeRead src l
|
||||||
|
eU <- GM.unsafeRead src u
|
||||||
|
case cmp eL eU of
|
||||||
|
GT -> do GM.unsafeWrite src l eU
|
||||||
|
GM.unsafeWrite src u eL
|
||||||
|
modifySTRef' count (+1)
|
||||||
|
_ -> return ()
|
||||||
|
| otherwise = do
|
||||||
|
let mid = (u + l) `shiftR` 1
|
||||||
|
loop l mid
|
||||||
|
loop mid u
|
||||||
|
merge cmp (GM.unsafeSlice l (u-l+1) src) buf (mid - l) count
|
||||||
|
{-# INLINE mergeSort #-}
|
||||||
|
|
||||||
|
merge :: GM.MVector v e
|
||||||
|
=> (e -> e -> Ordering)
|
||||||
|
-> v s e
|
||||||
|
-> v s e
|
||||||
|
-> Int
|
||||||
|
-> STRef s Integer
|
||||||
|
-> ST s ()
|
||||||
|
merge cmp src buf mid count = do GM.unsafeCopy tmp lower
|
||||||
|
eTmp <- GM.unsafeRead tmp 0
|
||||||
|
eUpp <- GM.unsafeRead upper 0
|
||||||
|
loop tmp 0 eTmp upper 0 eUpp 0
|
||||||
|
where
|
||||||
|
lower = GM.unsafeSlice 0 mid src
|
||||||
|
upper = GM.unsafeSlice mid (GM.length src - mid) src
|
||||||
|
tmp = GM.unsafeSlice 0 mid buf
|
||||||
|
wroteHigh low iLow eLow high iHigh iIns
|
||||||
|
| iHigh >= GM.length high =
|
||||||
|
GM.unsafeCopy (GM.unsafeSlice iIns (GM.length low - iLow) src)
|
||||||
|
(GM.unsafeSlice iLow (GM.length low - iLow) low)
|
||||||
|
| otherwise = do eHigh <- GM.unsafeRead high iHigh
|
||||||
|
loop low iLow eLow high iHigh eHigh iIns
|
||||||
|
|
||||||
|
wroteLow low iLow high iHigh eHigh iIns
|
||||||
|
| iLow >= GM.length low = return ()
|
||||||
|
| otherwise = do eLow <- GM.unsafeRead low iLow
|
||||||
|
loop low iLow eLow high iHigh eHigh iIns
|
||||||
|
|
||||||
|
loop !low !iLow !eLow !high !iHigh !eHigh !iIns = case cmp eHigh eLow of
|
||||||
|
LT -> do GM.unsafeWrite src iIns eHigh
|
||||||
|
modifySTRef' count (+ fromIntegral (GM.length low - iLow))
|
||||||
|
wroteHigh low iLow eLow high (iHigh+1) (iIns+1)
|
||||||
|
_ -> do GM.unsafeWrite src iIns eLow
|
||||||
|
wroteLow low (iLow+1) high iHigh eHigh (iIns+1)
|
||||||
|
{-# INLINE merge #-}
|
||||||
|
|
||||||
|
#if !MIN_VERSION_base(4,6,0)
|
||||||
|
modifySTRef' :: STRef s a -> (a -> a) -> ST s ()
|
||||||
|
modifySTRef' = modifySTRef
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-- $references
|
||||||
|
--
|
||||||
|
-- * William R. Knight. (1966) A computer method for calculating Kendall's Tau
|
||||||
|
-- with ungrouped data. /Journal of the American Statistical Association/,
|
||||||
|
-- Vol. 61, No. 314, Part 1, pp. 436-439. <http://www.jstor.org/pss/2282833>
|
@ -4,6 +4,10 @@ module GEval.FeatureExtractor
|
|||||||
(extractFactors,
|
(extractFactors,
|
||||||
extractFactorsFromTabbed,
|
extractFactorsFromTabbed,
|
||||||
cartesianFeatures,
|
cartesianFeatures,
|
||||||
|
Feature(..),
|
||||||
|
NumericalType(..),
|
||||||
|
NumericalDirection(..),
|
||||||
|
Featuroid(..),
|
||||||
LineWithFactors(..),
|
LineWithFactors(..),
|
||||||
LineWithPeggedFactors(..),
|
LineWithPeggedFactors(..),
|
||||||
PeggedFactor(..),
|
PeggedFactor(..),
|
||||||
@ -26,15 +30,55 @@ import GEval.BlackBoxDebugging
|
|||||||
import GEval.Common
|
import GEval.Common
|
||||||
import Text.Read (readMaybe)
|
import Text.Read (readMaybe)
|
||||||
|
|
||||||
|
data Feature = UnaryFeature PeggedExistentialFactor
|
||||||
|
| CartesianFeature PeggedExistentialFactor PeggedExistentialFactor
|
||||||
|
| NumericalFeature FeatureNamespace NumericalType NumericalDirection
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show Feature where
|
||||||
|
show (UnaryFeature p) = show p
|
||||||
|
show (CartesianFeature pA pB) = formatCartesian pA pB
|
||||||
|
show (NumericalFeature namespace ntype direction) = (show namespace) ++ ":" ++ (show ntype) ++ (show direction)
|
||||||
|
|
||||||
|
data NumericalType = DirectValue | LengthOf
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show NumericalType where
|
||||||
|
show DirectValue = "="
|
||||||
|
show LengthOf = "=#"
|
||||||
|
|
||||||
|
data NumericalDirection = Big | Small
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show NumericalDirection where
|
||||||
|
show Big = "+"
|
||||||
|
show Small = "-"
|
||||||
|
|
||||||
|
-- | Featuroid is something between a factor and a feature, i.e. for numerical factors
|
||||||
|
-- it's not a single value, but still without the direction.
|
||||||
|
data Featuroid = UnaryFeaturoid PeggedExistentialFactor
|
||||||
|
| CartesianFeaturoid PeggedExistentialFactor PeggedExistentialFactor
|
||||||
|
| NumericalFeaturoid FeatureNamespace
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show Featuroid where
|
||||||
|
show (UnaryFeaturoid p) = show p
|
||||||
|
show (CartesianFeaturoid pA pB) = formatCartesian pA pB
|
||||||
|
show (NumericalFeaturoid namespace) = (show namespace) ++ ":="
|
||||||
|
|
||||||
data LineWithFactors = LineWithFactors Double MetricValue [Factor]
|
data LineWithFactors = LineWithFactors Double MetricValue [Factor]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
-- | A factor extracted from a single item (its input, expected output or actual output).
|
||||||
data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor
|
data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show Factor where
|
instance Show Factor where
|
||||||
show (UnaryFactor factor) = show factor
|
show (UnaryFactor factor) = show factor
|
||||||
show (CartesianFactor factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
|
show (CartesianFactor factorA factorB) = formatCartesian factorA factorB
|
||||||
|
|
||||||
|
formatCartesian :: PeggedExistentialFactor -> PeggedExistentialFactor -> String
|
||||||
|
formatCartesian factorA factorB = (show factorA) ++ "~~" ++ (show factorB)
|
||||||
|
|
||||||
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
|
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
@ -34,7 +34,9 @@ import Data.Text.Encoding
|
|||||||
import Data.Conduit.Rank
|
import Data.Conduit.Rank
|
||||||
import Data.Maybe (fromMaybe)
|
import Data.Maybe (fromMaybe)
|
||||||
|
|
||||||
import Data.List (sortBy, sort, concat)
|
import qualified Data.Vector as V
|
||||||
|
|
||||||
|
import Data.List (sortBy, sortOn, sort, concat)
|
||||||
|
|
||||||
import Control.Monad.IO.Class
|
import Control.Monad.IO.Class
|
||||||
import Control.Monad.Trans.Resource
|
import Control.Monad.Trans.Resource
|
||||||
@ -56,6 +58,7 @@ import System.FilePath
|
|||||||
|
|
||||||
import Statistics.Distribution (cumulative)
|
import Statistics.Distribution (cumulative)
|
||||||
import Statistics.Distribution.Normal (normalDistr)
|
import Statistics.Distribution.Normal (normalDistr)
|
||||||
|
import Data.Statistics.Kendall (kendallZ)
|
||||||
|
|
||||||
import qualified Data.Map.Strict as M
|
import qualified Data.Map.Strict as M
|
||||||
import qualified Data.Set as S
|
import qualified Data.Set as S
|
||||||
@ -125,7 +128,7 @@ extractFeaturesAndPValues spec bbdo =
|
|||||||
data RankedFactor = RankedFactor Factor Double MetricValue
|
data RankedFactor = RankedFactor Factor Double MetricValue
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
|
||||||
data FeatureWithPValue = FeatureWithPValue Factor -- ^ feature itself
|
data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
|
||||||
Double -- ^ p-value
|
Double -- ^ p-value
|
||||||
MetricValue -- ^ average metric value
|
MetricValue -- ^ average metric value
|
||||||
Integer -- ^ count
|
Integer -- ^ count
|
||||||
@ -184,11 +187,12 @@ finalFeatures True minFreq = do
|
|||||||
|
|
||||||
filtreCartesian False = CC.map id
|
filtreCartesian False = CC.map id
|
||||||
filtreCartesian True = CC.concatMapAccum step S.empty
|
filtreCartesian True = CC.concatMapAccum step S.empty
|
||||||
where step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f])
|
where step f@(FeatureWithPValue (UnaryFeature fac) _ _ _) mp = (S.insert fac mp, [f])
|
||||||
step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f])
|
step f@(FeatureWithPValue (CartesianFeature pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
|
||||||
step f@(FeatureWithPValue (CartesianFactor pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
|
then []
|
||||||
then []
|
else [f])
|
||||||
else [f])
|
step f@(FeatureWithPValue (NumericalFeature _ _ _) _ _ _) mp = (mp, [f])
|
||||||
|
|
||||||
|
|
||||||
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
||||||
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
||||||
@ -200,33 +204,69 @@ getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
|||||||
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
|
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
|
||||||
extractFactors mTokenizer bbdo "out" outLine]
|
extractFactors mTokenizer bbdo "out" outLine]
|
||||||
|
|
||||||
|
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
|
||||||
|
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
|
||||||
|
| LengthAggregate [Double] [MetricValue] [Int]
|
||||||
|
|
||||||
|
aggreggate :: FeatureAggregate -> FeatureAggregate -> FeatureAggregate
|
||||||
|
aggreggate (ExistentialFactorAggregate r1 s1 c1) (ExistentialFactorAggregate r2 s2 c2) =
|
||||||
|
ExistentialFactorAggregate (r1 + r2) (s1 + s2) (c1 + c2)
|
||||||
|
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 values1) (NumericalValueAggregate ranks2 scores2 lengths2 values2) =
|
||||||
|
NumericalValueAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) (values1 ++ values2)
|
||||||
|
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 _) (LengthAggregate ranks2 scores2 lengths2) =
|
||||||
|
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
|
||||||
|
aggreggate (LengthAggregate ranks1 scores1 lengths1) (NumericalValueAggregate ranks2 scores2 lengths2 _) =
|
||||||
|
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
|
||||||
|
aggreggate (LengthAggregate ranks1 scores1 lengths1) (LengthAggregate ranks2 scores2 lengths2) =
|
||||||
|
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
|
||||||
|
aggreggate _ _ = error "Mismatched aggregates!"
|
||||||
|
|
||||||
|
initAggregate :: RankedFactor -> (Featuroid, FeatureAggregate)
|
||||||
|
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor Nothing l))) r s) =
|
||||||
|
(NumericalFeaturoid namespace, LengthAggregate [r] [s] [l])
|
||||||
|
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor (Just v) l))) r s) =
|
||||||
|
(NumericalFeaturoid namespace, NumericalValueAggregate [r] [s] [l] [v])
|
||||||
|
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor f))) r s) =
|
||||||
|
(UnaryFeaturoid (PeggedExistentialFactor namespace f), ExistentialFactorAggregate r s 1)
|
||||||
|
initAggregate (RankedFactor (CartesianFactor pA pB) r s) =
|
||||||
|
(CartesianFeaturoid pA pB, ExistentialFactorAggregate r s 1)
|
||||||
|
|
||||||
|
filterAggregateByFreq :: Integer -> (Maybe Integer) -> FeatureAggregate -> Bool
|
||||||
|
filterAggregateByFreq minFreq Nothing (ExistentialFactorAggregate _ _ c) = c >= minFreq
|
||||||
|
filterAggregateByFreq minFreq (Just total) (ExistentialFactorAggregate _ _ c) = c >= minFreq && total - c >= minFreq
|
||||||
|
filterAggregateByFreq _ _ _ = True
|
||||||
|
|
||||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) ()
|
uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) ()
|
||||||
uScoresCounter minFreq = CC.map (\(RankedFactor feature r score) -> (feature, (r, score, 1)))
|
uScoresCounter minFreq = CC.map initAggregate
|
||||||
.| gobbleAndDo countUScores
|
.| gobbleAndDo countUScores
|
||||||
.| lowerFreqFiltre
|
.| lowerFreqFiltre
|
||||||
.| pValueCalculator minFreq
|
.| pValueCalculator minFreq
|
||||||
where countUScores l =
|
where countUScores l =
|
||||||
M.toList
|
M.toList
|
||||||
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
$ M.fromListWith aggreggate l
|
||||||
lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq)
|
lowerFreqFiltre = CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq Nothing fAgg)
|
||||||
|
|
||||||
pValueCalculator :: Monad m => Integer -> ConduitT (Factor, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
pValueCalculator :: Monad m => Integer -> ConduitT (Featuroid, FeatureAggregate) FeatureWithPValue (StateT Integer m) ()
|
||||||
pValueCalculator minFreq = do
|
pValueCalculator minFreq = do
|
||||||
firstVal <- await
|
firstVal <- await
|
||||||
case firstVal of
|
case firstVal of
|
||||||
Just i@(_, (_, _, c)) -> do
|
Just i@(_, fAgg) -> do
|
||||||
total <- lift get
|
total <- lift get
|
||||||
if total - c >= minFreq
|
if filterAggregateByFreq minFreq (Just total) fAgg
|
||||||
then yield $ calculatePValue total i
|
then yield $ calculatePValue total i
|
||||||
else return ()
|
else return ()
|
||||||
CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total)
|
CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq (Just total) fAgg) .| CC.map (calculatePValue total)
|
||||||
Nothing -> return ()
|
Nothing -> return ()
|
||||||
|
|
||||||
calculatePValue :: Integer -> (Factor, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
calculatePValue :: Integer -> (Featuroid, FeatureAggregate) -> FeatureWithPValue
|
||||||
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
calculatePValue _ (NumericalFeaturoid namespace, NumericalValueAggregate ranks scores _ values) =
|
||||||
(pvalue (r - minusR c) c (total - c))
|
kendallPValueFeature namespace DirectValue ranks scores values
|
||||||
(s / (fromIntegral c))
|
calculatePValue _ (NumericalFeaturoid namespace, LengthAggregate ranks scores lens) =
|
||||||
c
|
kendallPValueFeature namespace LengthOf ranks scores lens
|
||||||
|
calculatePValue total (f, ExistentialFactorAggregate r s c) = FeatureWithPValue (featoroidToFeature f)
|
||||||
|
(pvalue (r - minusR c) c (total - c))
|
||||||
|
(s / (fromIntegral c))
|
||||||
|
c
|
||||||
where minusR c = (c' * (c' + 1)) / 2.0
|
where minusR c = (c' * (c' + 1)) / 2.0
|
||||||
where c' = fromIntegral c
|
where c' = fromIntegral c
|
||||||
-- calulating p-value from Mann–Whitney U test
|
-- calulating p-value from Mann–Whitney U test
|
||||||
@ -237,6 +277,26 @@ calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
|||||||
sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12
|
sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12
|
||||||
z = (u - mean) / sigma
|
z = (u - mean) / sigma
|
||||||
in cumulative (normalDistr 0.0 1.0) z
|
in cumulative (normalDistr 0.0 1.0) z
|
||||||
|
featoroidToFeature (UnaryFeaturoid fac) = UnaryFeature fac
|
||||||
|
featoroidToFeature (CartesianFeaturoid facA facB) = (CartesianFeature facA facB)
|
||||||
|
|
||||||
|
|
||||||
|
kendallPValueFeature :: Ord a => FeatureNamespace -> NumericalType -> [Double] -> [MetricValue] -> [a] -> FeatureWithPValue
|
||||||
|
kendallPValueFeature namespace ntype ranks scores values = FeatureWithPValue (NumericalFeature namespace ntype ndirection)
|
||||||
|
pv
|
||||||
|
((sum selectedScores) / (fromIntegral selected))
|
||||||
|
(fromIntegral selected)
|
||||||
|
where z = kendallZ (V.fromList $ Prelude.zip ranks values)
|
||||||
|
pv = 2 * (cumulative (normalDistr 0.0 1.0) (- (abs z)))
|
||||||
|
ndirection = if z > 0
|
||||||
|
then Small
|
||||||
|
else Big
|
||||||
|
selected = (Prelude.length scores) `div` 4
|
||||||
|
|
||||||
|
selectedScores = Prelude.take selected $ Prelude.map snd $ turner $ sortOn fst $ Prelude.zip values scores
|
||||||
|
turner = case ndirection of
|
||||||
|
Small -> id
|
||||||
|
Big -> Prelude.reverse
|
||||||
|
|
||||||
|
|
||||||
totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()
|
totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()
|
||||||
|
13
test/Spec.hs
13
test/Spec.hs
@ -40,6 +40,7 @@ import Data.List (sort)
|
|||||||
import qualified Test.HUnit as HU
|
import qualified Test.HUnit as HU
|
||||||
|
|
||||||
import qualified Data.IntSet as IS
|
import qualified Data.IntSet as IS
|
||||||
|
import qualified Data.Vector as V
|
||||||
|
|
||||||
import Data.Conduit.SmartSource
|
import Data.Conduit.SmartSource
|
||||||
import Data.Conduit.Rank
|
import Data.Conduit.Rank
|
||||||
@ -49,6 +50,10 @@ import Control.Monad.Trans.Resource
|
|||||||
import qualified Data.Conduit.List as CL
|
import qualified Data.Conduit.List as CL
|
||||||
import qualified Data.Conduit.Combinators as CC
|
import qualified Data.Conduit.Combinators as CC
|
||||||
|
|
||||||
|
import Statistics.Distribution (cumulative)
|
||||||
|
import Statistics.Distribution.Normal (normalDistr)
|
||||||
|
import Data.Statistics.Kendall (kendall, kendallZ)
|
||||||
|
|
||||||
informationRetrievalBookExample :: [(String, Int)]
|
informationRetrievalBookExample :: [(String, Int)]
|
||||||
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
|
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
|
||||||
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
|
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
|
||||||
@ -541,6 +546,14 @@ main = hspec $ do
|
|||||||
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
|
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
|
||||||
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
||||||
(NumericalFactor Nothing 5) ]
|
(NumericalFactor Nothing 5) ]
|
||||||
|
describe "Kendall's tau" $ do
|
||||||
|
it "tau" $ do
|
||||||
|
kendall (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-0.47140452079103173)
|
||||||
|
it "z" $ do
|
||||||
|
kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
|
||||||
|
it "p-value" $ do
|
||||||
|
(2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
|
||||||
|
|
||||||
|
|
||||||
checkConduitPure conduit inList expList = do
|
checkConduitPure conduit inList expList = do
|
||||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||||
|
Loading…
Reference in New Issue
Block a user