Add numerical features
This commit is contained in:
parent
d5a8908599
commit
abcce9bf68
@ -1,5 +1,5 @@
|
||||
name: geval
|
||||
version: 1.13.0.0
|
||||
version: 1.14.0.0
|
||||
synopsis: Machine learning evaluation tools
|
||||
description: Please see README.md
|
||||
homepage: http://github.com/name/project
|
||||
@ -39,6 +39,7 @@ library
|
||||
, GEval.Annotation
|
||||
, GEval.BlackBoxDebugging
|
||||
, Text.WordShape
|
||||
, Data.Statistics.Kendall
|
||||
, Paths_geval
|
||||
build-depends: base >= 4.7 && < 5
|
||||
, cond
|
||||
@ -80,6 +81,7 @@ library
|
||||
, MissingH
|
||||
, array
|
||||
, Munkres
|
||||
, vector-algorithms
|
||||
default-language: Haskell2010
|
||||
|
||||
executable geval
|
||||
@ -117,6 +119,8 @@ test-suite geval-test
|
||||
, directory
|
||||
, temporary
|
||||
, silently
|
||||
, vector
|
||||
, statistics
|
||||
ghc-options: -threaded -rtsopts -with-rtsopts=-N
|
||||
default-language: Haskell2010
|
||||
|
||||
|
178
src/Data/Statistics/Kendall.hs
Normal file
178
src/Data/Statistics/Kendall.hs
Normal file
@ -0,0 +1,178 @@
|
||||
{-# LANGUAGE BangPatterns, CPP, FlexibleContexts, ScopedTypeVariables #-}
|
||||
-- |
|
||||
-- (Taken from http://hackage.haskell.org/package/statistics-0.15.0.0/docs/src/Statistics.Correlation.Kendall.html)
|
||||
--
|
||||
-- Module : Statistics.Correlation.Kendall
|
||||
--
|
||||
-- Fast O(NlogN) implementation of
|
||||
-- <http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient Kendall's tau>.
|
||||
--
|
||||
-- This module implements Kendall's tau form b which allows ties in the data.
|
||||
-- This is the same formula used by other statistical packages, e.g., R, matlab.
|
||||
--
|
||||
-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}
|
||||
--
|
||||
-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantify,
|
||||
-- n_2 = number of pairs tied for the second quantify,
|
||||
-- n_c = number of concordant pairs$, n_d = number of discordant pairs.
|
||||
|
||||
module Data.Statistics.Kendall
|
||||
( kendall,
|
||||
kendallZ
|
||||
-- * References
|
||||
-- $references
|
||||
) where
|
||||
|
||||
import Control.Monad.ST (ST, runST)
|
||||
import Data.Bits (shiftR)
|
||||
import Data.Function (on)
|
||||
import Data.STRef
|
||||
import qualified Data.Vector.Algorithms.Intro as I
|
||||
import qualified Data.Vector.Generic as G
|
||||
import qualified Data.Vector.Generic.Mutable as GM
|
||||
|
||||
-- | /O(nlogn)/ Compute the Kendall's tau from a vector of paired data.
|
||||
-- Return NaN when number of pairs <= 1.
|
||||
kendall :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
|
||||
kendall xy'
|
||||
| G.length xy' <= 1 = 0/0
|
||||
| otherwise = runST $ do
|
||||
xy <- G.thaw xy'
|
||||
let n = GM.length xy
|
||||
n_dRef <- newSTRef 0
|
||||
I.sort xy
|
||||
tieX <- numOfTiesBy ((==) `on` fst) xy
|
||||
tieXY <- numOfTiesBy (==) xy
|
||||
tmp <- GM.new n
|
||||
mergeSort (compare `on` snd) xy tmp n_dRef
|
||||
tieY <- numOfTiesBy ((==) `on` snd) xy
|
||||
n_d <- readSTRef n_dRef
|
||||
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
|
||||
n_c = n_0 - n_d - tieX - tieY + tieXY
|
||||
return $ fromIntegral (n_c - n_d) /
|
||||
(sqrt.fromIntegral) ((n_0 - tieX) * (n_0 - tieY))
|
||||
{-# INLINE kendall #-}
|
||||
|
||||
kendallZ :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
|
||||
kendallZ xy'
|
||||
| G.length xy' <= 1 = 0/0
|
||||
| otherwise = runST $ do
|
||||
xy <- G.thaw xy'
|
||||
let n = GM.length xy
|
||||
let vfun x = x * (x - 1) * (2*x + 5)
|
||||
let tttfun x = x * (x - 1) * (x - 2)
|
||||
n_dRef <- newSTRef 0
|
||||
I.sort xy
|
||||
tieX <- numOfTiesBy ((==) `on` fst) xy
|
||||
tieXY <- numOfTiesBy (==) xy
|
||||
vt <- numOfTiesByGeneralized vfun ((==) `on` fst) xy
|
||||
tttX <- numOfTiesByGeneralized tttfun ((==) `on` fst) xy
|
||||
tmp <- GM.new n
|
||||
mergeSort (compare `on` snd) xy tmp n_dRef
|
||||
tieY <- numOfTiesBy ((==) `on` snd) xy
|
||||
vu <- numOfTiesByGeneralized vfun ((==) `on` snd) xy
|
||||
tttY <- numOfTiesByGeneralized tttfun ((==) `on` snd) xy
|
||||
n_d <- readSTRef n_dRef
|
||||
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
|
||||
n_c = n_0 - n_d - tieX - tieY + tieXY
|
||||
v0 = vfun (fromIntegral n)
|
||||
v1 = 2.0 * (fromIntegral tieX) * (fromIntegral tieY) / (fromIntegral (n * (n-1)))
|
||||
v2 = (fromIntegral tttX) * (fromIntegral tttY) / (fromIntegral (9 * (tttfun n)))
|
||||
v = (fromIntegral (v0 - vt - vu)) / 18.0 + v1 + v2
|
||||
return $ (fromIntegral (n_c - n_d)) / sqrt v
|
||||
{-# INLINE kendallZ #-}
|
||||
|
||||
-- calculate number of tied pairs in a sorted vector
|
||||
numOfTiesBy :: GM.MVector v a
|
||||
=> (a -> a -> Bool) -> v s a -> ST s Integer
|
||||
numOfTiesBy f xs = numOfTiesByGeneralized (\x -> (x * (x - 1)) `shiftR` 1) f xs
|
||||
|
||||
numOfTiesByGeneralized :: GM.MVector v a
|
||||
=> (Int -> Int) -> (a -> a -> Bool) -> v s a -> ST s Integer
|
||||
numOfTiesByGeneralized op f xs = do count <- newSTRef (0::Integer)
|
||||
loop count (1::Int) (0::Int)
|
||||
readSTRef count
|
||||
where
|
||||
n = GM.length xs
|
||||
loop c !acc !i | i >= n - 1 = modifySTRef' c (+ g acc)
|
||||
| otherwise = do
|
||||
x1 <- GM.unsafeRead xs i
|
||||
x2 <- GM.unsafeRead xs (i+1)
|
||||
if f x1 x2
|
||||
then loop c (acc+1) (i+1)
|
||||
else modifySTRef' c (+ g acc) >> loop c 1 (i+1)
|
||||
g x = fromIntegral $ op x
|
||||
{-# INLINE numOfTiesByGeneralized #-}
|
||||
|
||||
-- Implementation of Knight's merge sort (adapted from vector-algorithm). This
|
||||
-- function is used to count the number of discordant pairs.
|
||||
mergeSort :: GM.MVector v e
|
||||
=> (e -> e -> Ordering)
|
||||
-> v s e
|
||||
-> v s e
|
||||
-> STRef s Integer
|
||||
-> ST s ()
|
||||
mergeSort cmp src buf count = loop 0 (GM.length src - 1)
|
||||
where
|
||||
loop l u
|
||||
| u == l = return ()
|
||||
| u - l == 1 = do
|
||||
eL <- GM.unsafeRead src l
|
||||
eU <- GM.unsafeRead src u
|
||||
case cmp eL eU of
|
||||
GT -> do GM.unsafeWrite src l eU
|
||||
GM.unsafeWrite src u eL
|
||||
modifySTRef' count (+1)
|
||||
_ -> return ()
|
||||
| otherwise = do
|
||||
let mid = (u + l) `shiftR` 1
|
||||
loop l mid
|
||||
loop mid u
|
||||
merge cmp (GM.unsafeSlice l (u-l+1) src) buf (mid - l) count
|
||||
{-# INLINE mergeSort #-}
|
||||
|
||||
merge :: GM.MVector v e
|
||||
=> (e -> e -> Ordering)
|
||||
-> v s e
|
||||
-> v s e
|
||||
-> Int
|
||||
-> STRef s Integer
|
||||
-> ST s ()
|
||||
merge cmp src buf mid count = do GM.unsafeCopy tmp lower
|
||||
eTmp <- GM.unsafeRead tmp 0
|
||||
eUpp <- GM.unsafeRead upper 0
|
||||
loop tmp 0 eTmp upper 0 eUpp 0
|
||||
where
|
||||
lower = GM.unsafeSlice 0 mid src
|
||||
upper = GM.unsafeSlice mid (GM.length src - mid) src
|
||||
tmp = GM.unsafeSlice 0 mid buf
|
||||
wroteHigh low iLow eLow high iHigh iIns
|
||||
| iHigh >= GM.length high =
|
||||
GM.unsafeCopy (GM.unsafeSlice iIns (GM.length low - iLow) src)
|
||||
(GM.unsafeSlice iLow (GM.length low - iLow) low)
|
||||
| otherwise = do eHigh <- GM.unsafeRead high iHigh
|
||||
loop low iLow eLow high iHigh eHigh iIns
|
||||
|
||||
wroteLow low iLow high iHigh eHigh iIns
|
||||
| iLow >= GM.length low = return ()
|
||||
| otherwise = do eLow <- GM.unsafeRead low iLow
|
||||
loop low iLow eLow high iHigh eHigh iIns
|
||||
|
||||
loop !low !iLow !eLow !high !iHigh !eHigh !iIns = case cmp eHigh eLow of
|
||||
LT -> do GM.unsafeWrite src iIns eHigh
|
||||
modifySTRef' count (+ fromIntegral (GM.length low - iLow))
|
||||
wroteHigh low iLow eLow high (iHigh+1) (iIns+1)
|
||||
_ -> do GM.unsafeWrite src iIns eLow
|
||||
wroteLow low (iLow+1) high iHigh eHigh (iIns+1)
|
||||
{-# INLINE merge #-}
|
||||
|
||||
#if !MIN_VERSION_base(4,6,0)
|
||||
modifySTRef' :: STRef s a -> (a -> a) -> ST s ()
|
||||
modifySTRef' = modifySTRef
|
||||
#endif
|
||||
|
||||
-- $references
|
||||
--
|
||||
-- * William R. Knight. (1966) A computer method for calculating Kendall's Tau
|
||||
-- with ungrouped data. /Journal of the American Statistical Association/,
|
||||
-- Vol. 61, No. 314, Part 1, pp. 436-439. <http://www.jstor.org/pss/2282833>
|
@ -4,6 +4,10 @@ module GEval.FeatureExtractor
|
||||
(extractFactors,
|
||||
extractFactorsFromTabbed,
|
||||
cartesianFeatures,
|
||||
Feature(..),
|
||||
NumericalType(..),
|
||||
NumericalDirection(..),
|
||||
Featuroid(..),
|
||||
LineWithFactors(..),
|
||||
LineWithPeggedFactors(..),
|
||||
PeggedFactor(..),
|
||||
@ -26,15 +30,55 @@ import GEval.BlackBoxDebugging
|
||||
import GEval.Common
|
||||
import Text.Read (readMaybe)
|
||||
|
||||
data Feature = UnaryFeature PeggedExistentialFactor
|
||||
| CartesianFeature PeggedExistentialFactor PeggedExistentialFactor
|
||||
| NumericalFeature FeatureNamespace NumericalType NumericalDirection
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show Feature where
|
||||
show (UnaryFeature p) = show p
|
||||
show (CartesianFeature pA pB) = formatCartesian pA pB
|
||||
show (NumericalFeature namespace ntype direction) = (show namespace) ++ ":" ++ (show ntype) ++ (show direction)
|
||||
|
||||
data NumericalType = DirectValue | LengthOf
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show NumericalType where
|
||||
show DirectValue = "="
|
||||
show LengthOf = "=#"
|
||||
|
||||
data NumericalDirection = Big | Small
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show NumericalDirection where
|
||||
show Big = "+"
|
||||
show Small = "-"
|
||||
|
||||
-- | Featuroid is something between a factor and a feature, i.e. for numerical factors
|
||||
-- it's not a single value, but still without the direction.
|
||||
data Featuroid = UnaryFeaturoid PeggedExistentialFactor
|
||||
| CartesianFeaturoid PeggedExistentialFactor PeggedExistentialFactor
|
||||
| NumericalFeaturoid FeatureNamespace
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show Featuroid where
|
||||
show (UnaryFeaturoid p) = show p
|
||||
show (CartesianFeaturoid pA pB) = formatCartesian pA pB
|
||||
show (NumericalFeaturoid namespace) = (show namespace) ++ ":="
|
||||
|
||||
data LineWithFactors = LineWithFactors Double MetricValue [Factor]
|
||||
deriving (Eq, Ord)
|
||||
|
||||
-- | A factor extracted from a single item (its input, expected output or actual output).
|
||||
data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show Factor where
|
||||
show (UnaryFactor factor) = show factor
|
||||
show (CartesianFactor factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
|
||||
show (CartesianFactor factorA factorB) = formatCartesian factorA factorB
|
||||
|
||||
formatCartesian :: PeggedExistentialFactor -> PeggedExistentialFactor -> String
|
||||
formatCartesian factorA factorB = (show factorA) ++ "~~" ++ (show factorB)
|
||||
|
||||
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
|
||||
deriving (Eq, Ord)
|
||||
|
@ -34,7 +34,9 @@ import Data.Text.Encoding
|
||||
import Data.Conduit.Rank
|
||||
import Data.Maybe (fromMaybe)
|
||||
|
||||
import Data.List (sortBy, sort, concat)
|
||||
import qualified Data.Vector as V
|
||||
|
||||
import Data.List (sortBy, sortOn, sort, concat)
|
||||
|
||||
import Control.Monad.IO.Class
|
||||
import Control.Monad.Trans.Resource
|
||||
@ -56,6 +58,7 @@ import System.FilePath
|
||||
|
||||
import Statistics.Distribution (cumulative)
|
||||
import Statistics.Distribution.Normal (normalDistr)
|
||||
import Data.Statistics.Kendall (kendallZ)
|
||||
|
||||
import qualified Data.Map.Strict as M
|
||||
import qualified Data.Set as S
|
||||
@ -125,7 +128,7 @@ extractFeaturesAndPValues spec bbdo =
|
||||
data RankedFactor = RankedFactor Factor Double MetricValue
|
||||
deriving (Show)
|
||||
|
||||
data FeatureWithPValue = FeatureWithPValue Factor -- ^ feature itself
|
||||
data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
|
||||
Double -- ^ p-value
|
||||
MetricValue -- ^ average metric value
|
||||
Integer -- ^ count
|
||||
@ -184,11 +187,12 @@ finalFeatures True minFreq = do
|
||||
|
||||
filtreCartesian False = CC.map id
|
||||
filtreCartesian True = CC.concatMapAccum step S.empty
|
||||
where step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f])
|
||||
step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f])
|
||||
step f@(FeatureWithPValue (CartesianFactor pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
|
||||
then []
|
||||
else [f])
|
||||
where step f@(FeatureWithPValue (UnaryFeature fac) _ _ _) mp = (S.insert fac mp, [f])
|
||||
step f@(FeatureWithPValue (CartesianFeature pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
|
||||
then []
|
||||
else [f])
|
||||
step f@(FeatureWithPValue (NumericalFeature _ _ _) _ _ _) mp = (mp, [f])
|
||||
|
||||
|
||||
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
||||
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
||||
@ -200,33 +204,69 @@ getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
||||
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
|
||||
extractFactors mTokenizer bbdo "out" outLine]
|
||||
|
||||
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
|
||||
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
|
||||
| LengthAggregate [Double] [MetricValue] [Int]
|
||||
|
||||
aggreggate :: FeatureAggregate -> FeatureAggregate -> FeatureAggregate
|
||||
aggreggate (ExistentialFactorAggregate r1 s1 c1) (ExistentialFactorAggregate r2 s2 c2) =
|
||||
ExistentialFactorAggregate (r1 + r2) (s1 + s2) (c1 + c2)
|
||||
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 values1) (NumericalValueAggregate ranks2 scores2 lengths2 values2) =
|
||||
NumericalValueAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) (values1 ++ values2)
|
||||
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 _) (LengthAggregate ranks2 scores2 lengths2) =
|
||||
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
|
||||
aggreggate (LengthAggregate ranks1 scores1 lengths1) (NumericalValueAggregate ranks2 scores2 lengths2 _) =
|
||||
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
|
||||
aggreggate (LengthAggregate ranks1 scores1 lengths1) (LengthAggregate ranks2 scores2 lengths2) =
|
||||
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
|
||||
aggreggate _ _ = error "Mismatched aggregates!"
|
||||
|
||||
initAggregate :: RankedFactor -> (Featuroid, FeatureAggregate)
|
||||
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor Nothing l))) r s) =
|
||||
(NumericalFeaturoid namespace, LengthAggregate [r] [s] [l])
|
||||
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor (Just v) l))) r s) =
|
||||
(NumericalFeaturoid namespace, NumericalValueAggregate [r] [s] [l] [v])
|
||||
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor f))) r s) =
|
||||
(UnaryFeaturoid (PeggedExistentialFactor namespace f), ExistentialFactorAggregate r s 1)
|
||||
initAggregate (RankedFactor (CartesianFactor pA pB) r s) =
|
||||
(CartesianFeaturoid pA pB, ExistentialFactorAggregate r s 1)
|
||||
|
||||
filterAggregateByFreq :: Integer -> (Maybe Integer) -> FeatureAggregate -> Bool
|
||||
filterAggregateByFreq minFreq Nothing (ExistentialFactorAggregate _ _ c) = c >= minFreq
|
||||
filterAggregateByFreq minFreq (Just total) (ExistentialFactorAggregate _ _ c) = c >= minFreq && total - c >= minFreq
|
||||
filterAggregateByFreq _ _ _ = True
|
||||
|
||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) ()
|
||||
uScoresCounter minFreq = CC.map (\(RankedFactor feature r score) -> (feature, (r, score, 1)))
|
||||
uScoresCounter minFreq = CC.map initAggregate
|
||||
.| gobbleAndDo countUScores
|
||||
.| lowerFreqFiltre
|
||||
.| pValueCalculator minFreq
|
||||
where countUScores l =
|
||||
M.toList
|
||||
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
||||
lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq)
|
||||
$ M.fromListWith aggreggate l
|
||||
lowerFreqFiltre = CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq Nothing fAgg)
|
||||
|
||||
pValueCalculator :: Monad m => Integer -> ConduitT (Factor, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
||||
pValueCalculator :: Monad m => Integer -> ConduitT (Featuroid, FeatureAggregate) FeatureWithPValue (StateT Integer m) ()
|
||||
pValueCalculator minFreq = do
|
||||
firstVal <- await
|
||||
case firstVal of
|
||||
Just i@(_, (_, _, c)) -> do
|
||||
Just i@(_, fAgg) -> do
|
||||
total <- lift get
|
||||
if total - c >= minFreq
|
||||
if filterAggregateByFreq minFreq (Just total) fAgg
|
||||
then yield $ calculatePValue total i
|
||||
else return ()
|
||||
CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total)
|
||||
CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq (Just total) fAgg) .| CC.map (calculatePValue total)
|
||||
Nothing -> return ()
|
||||
|
||||
calculatePValue :: Integer -> (Factor, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
||||
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
||||
(pvalue (r - minusR c) c (total - c))
|
||||
(s / (fromIntegral c))
|
||||
c
|
||||
calculatePValue :: Integer -> (Featuroid, FeatureAggregate) -> FeatureWithPValue
|
||||
calculatePValue _ (NumericalFeaturoid namespace, NumericalValueAggregate ranks scores _ values) =
|
||||
kendallPValueFeature namespace DirectValue ranks scores values
|
||||
calculatePValue _ (NumericalFeaturoid namespace, LengthAggregate ranks scores lens) =
|
||||
kendallPValueFeature namespace LengthOf ranks scores lens
|
||||
calculatePValue total (f, ExistentialFactorAggregate r s c) = FeatureWithPValue (featoroidToFeature f)
|
||||
(pvalue (r - minusR c) c (total - c))
|
||||
(s / (fromIntegral c))
|
||||
c
|
||||
where minusR c = (c' * (c' + 1)) / 2.0
|
||||
where c' = fromIntegral c
|
||||
-- calulating p-value from Mann–Whitney U test
|
||||
@ -237,6 +277,26 @@ calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
||||
sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12
|
||||
z = (u - mean) / sigma
|
||||
in cumulative (normalDistr 0.0 1.0) z
|
||||
featoroidToFeature (UnaryFeaturoid fac) = UnaryFeature fac
|
||||
featoroidToFeature (CartesianFeaturoid facA facB) = (CartesianFeature facA facB)
|
||||
|
||||
|
||||
kendallPValueFeature :: Ord a => FeatureNamespace -> NumericalType -> [Double] -> [MetricValue] -> [a] -> FeatureWithPValue
|
||||
kendallPValueFeature namespace ntype ranks scores values = FeatureWithPValue (NumericalFeature namespace ntype ndirection)
|
||||
pv
|
||||
((sum selectedScores) / (fromIntegral selected))
|
||||
(fromIntegral selected)
|
||||
where z = kendallZ (V.fromList $ Prelude.zip ranks values)
|
||||
pv = 2 * (cumulative (normalDistr 0.0 1.0) (- (abs z)))
|
||||
ndirection = if z > 0
|
||||
then Small
|
||||
else Big
|
||||
selected = (Prelude.length scores) `div` 4
|
||||
|
||||
selectedScores = Prelude.take selected $ Prelude.map snd $ turner $ sortOn fst $ Prelude.zip values scores
|
||||
turner = case ndirection of
|
||||
Small -> id
|
||||
Big -> Prelude.reverse
|
||||
|
||||
|
||||
totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()
|
||||
|
13
test/Spec.hs
13
test/Spec.hs
@ -40,6 +40,7 @@ import Data.List (sort)
|
||||
import qualified Test.HUnit as HU
|
||||
|
||||
import qualified Data.IntSet as IS
|
||||
import qualified Data.Vector as V
|
||||
|
||||
import Data.Conduit.SmartSource
|
||||
import Data.Conduit.Rank
|
||||
@ -49,6 +50,10 @@ import Control.Monad.Trans.Resource
|
||||
import qualified Data.Conduit.List as CL
|
||||
import qualified Data.Conduit.Combinators as CC
|
||||
|
||||
import Statistics.Distribution (cumulative)
|
||||
import Statistics.Distribution.Normal (normalDistr)
|
||||
import Data.Statistics.Kendall (kendall, kendallZ)
|
||||
|
||||
informationRetrievalBookExample :: [(String, Int)]
|
||||
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
|
||||
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
|
||||
@ -541,6 +546,14 @@ main = hspec $ do
|
||||
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
||||
(NumericalFactor Nothing 5) ]
|
||||
describe "Kendall's tau" $ do
|
||||
it "tau" $ do
|
||||
kendall (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-0.47140452079103173)
|
||||
it "z" $ do
|
||||
kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
|
||||
it "p-value" $ do
|
||||
(2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
|
||||
|
||||
|
||||
checkConduitPure conduit inList expList = do
|
||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||
|
Loading…
Reference in New Issue
Block a user