Add numerical features

This commit is contained in:
Filip Gralinski 2019-01-27 20:18:39 +01:00 committed by Filip Graliński
parent d5a8908599
commit abcce9bf68
5 changed files with 320 additions and 21 deletions

View File

@ -1,5 +1,5 @@
name: geval name: geval
version: 1.13.0.0 version: 1.14.0.0
synopsis: Machine learning evaluation tools synopsis: Machine learning evaluation tools
description: Please see README.md description: Please see README.md
homepage: http://github.com/name/project homepage: http://github.com/name/project
@ -39,6 +39,7 @@ library
, GEval.Annotation , GEval.Annotation
, GEval.BlackBoxDebugging , GEval.BlackBoxDebugging
, Text.WordShape , Text.WordShape
, Data.Statistics.Kendall
, Paths_geval , Paths_geval
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond
@ -80,6 +81,7 @@ library
, MissingH , MissingH
, array , array
, Munkres , Munkres
, vector-algorithms
default-language: Haskell2010 default-language: Haskell2010
executable geval executable geval
@ -117,6 +119,8 @@ test-suite geval-test
, directory , directory
, temporary , temporary
, silently , silently
, vector
, statistics
ghc-options: -threaded -rtsopts -with-rtsopts=-N ghc-options: -threaded -rtsopts -with-rtsopts=-N
default-language: Haskell2010 default-language: Haskell2010

View File

@ -0,0 +1,178 @@
{-# LANGUAGE BangPatterns, CPP, FlexibleContexts, ScopedTypeVariables #-}
-- |
-- (Taken from http://hackage.haskell.org/package/statistics-0.15.0.0/docs/src/Statistics.Correlation.Kendall.html)
--
-- Module : Statistics.Correlation.Kendall
--
-- Fast O(NlogN) implementation of
-- <http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient Kendall's tau>.
--
-- This module implements Kendall's tau form b which allows ties in the data.
-- This is the same formula used by other statistical packages, e.g., R, matlab.
--
-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}
--
-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantify,
-- n_2 = number of pairs tied for the second quantify,
-- n_c = number of concordant pairs$, n_d = number of discordant pairs.
module Data.Statistics.Kendall
( kendall,
kendallZ
-- * References
-- $references
) where
import Control.Monad.ST (ST, runST)
import Data.Bits (shiftR)
import Data.Function (on)
import Data.STRef
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM
-- | /O(nlogn)/ Compute the Kendall's tau from a vector of paired data.
-- Return NaN when number of pairs <= 1.
kendall :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
kendall xy'
| G.length xy' <= 1 = 0/0
| otherwise = runST $ do
xy <- G.thaw xy'
let n = GM.length xy
n_dRef <- newSTRef 0
I.sort xy
tieX <- numOfTiesBy ((==) `on` fst) xy
tieXY <- numOfTiesBy (==) xy
tmp <- GM.new n
mergeSort (compare `on` snd) xy tmp n_dRef
tieY <- numOfTiesBy ((==) `on` snd) xy
n_d <- readSTRef n_dRef
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
n_c = n_0 - n_d - tieX - tieY + tieXY
return $ fromIntegral (n_c - n_d) /
(sqrt.fromIntegral) ((n_0 - tieX) * (n_0 - tieY))
{-# INLINE kendall #-}
kendallZ :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
kendallZ xy'
| G.length xy' <= 1 = 0/0
| otherwise = runST $ do
xy <- G.thaw xy'
let n = GM.length xy
let vfun x = x * (x - 1) * (2*x + 5)
let tttfun x = x * (x - 1) * (x - 2)
n_dRef <- newSTRef 0
I.sort xy
tieX <- numOfTiesBy ((==) `on` fst) xy
tieXY <- numOfTiesBy (==) xy
vt <- numOfTiesByGeneralized vfun ((==) `on` fst) xy
tttX <- numOfTiesByGeneralized tttfun ((==) `on` fst) xy
tmp <- GM.new n
mergeSort (compare `on` snd) xy tmp n_dRef
tieY <- numOfTiesBy ((==) `on` snd) xy
vu <- numOfTiesByGeneralized vfun ((==) `on` snd) xy
tttY <- numOfTiesByGeneralized tttfun ((==) `on` snd) xy
n_d <- readSTRef n_dRef
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
n_c = n_0 - n_d - tieX - tieY + tieXY
v0 = vfun (fromIntegral n)
v1 = 2.0 * (fromIntegral tieX) * (fromIntegral tieY) / (fromIntegral (n * (n-1)))
v2 = (fromIntegral tttX) * (fromIntegral tttY) / (fromIntegral (9 * (tttfun n)))
v = (fromIntegral (v0 - vt - vu)) / 18.0 + v1 + v2
return $ (fromIntegral (n_c - n_d)) / sqrt v
{-# INLINE kendallZ #-}
-- calculate number of tied pairs in a sorted vector
numOfTiesBy :: GM.MVector v a
=> (a -> a -> Bool) -> v s a -> ST s Integer
numOfTiesBy f xs = numOfTiesByGeneralized (\x -> (x * (x - 1)) `shiftR` 1) f xs
numOfTiesByGeneralized :: GM.MVector v a
=> (Int -> Int) -> (a -> a -> Bool) -> v s a -> ST s Integer
numOfTiesByGeneralized op f xs = do count <- newSTRef (0::Integer)
loop count (1::Int) (0::Int)
readSTRef count
where
n = GM.length xs
loop c !acc !i | i >= n - 1 = modifySTRef' c (+ g acc)
| otherwise = do
x1 <- GM.unsafeRead xs i
x2 <- GM.unsafeRead xs (i+1)
if f x1 x2
then loop c (acc+1) (i+1)
else modifySTRef' c (+ g acc) >> loop c 1 (i+1)
g x = fromIntegral $ op x
{-# INLINE numOfTiesByGeneralized #-}
-- Implementation of Knight's merge sort (adapted from vector-algorithm). This
-- function is used to count the number of discordant pairs.
mergeSort :: GM.MVector v e
=> (e -> e -> Ordering)
-> v s e
-> v s e
-> STRef s Integer
-> ST s ()
mergeSort cmp src buf count = loop 0 (GM.length src - 1)
where
loop l u
| u == l = return ()
| u - l == 1 = do
eL <- GM.unsafeRead src l
eU <- GM.unsafeRead src u
case cmp eL eU of
GT -> do GM.unsafeWrite src l eU
GM.unsafeWrite src u eL
modifySTRef' count (+1)
_ -> return ()
| otherwise = do
let mid = (u + l) `shiftR` 1
loop l mid
loop mid u
merge cmp (GM.unsafeSlice l (u-l+1) src) buf (mid - l) count
{-# INLINE mergeSort #-}
merge :: GM.MVector v e
=> (e -> e -> Ordering)
-> v s e
-> v s e
-> Int
-> STRef s Integer
-> ST s ()
merge cmp src buf mid count = do GM.unsafeCopy tmp lower
eTmp <- GM.unsafeRead tmp 0
eUpp <- GM.unsafeRead upper 0
loop tmp 0 eTmp upper 0 eUpp 0
where
lower = GM.unsafeSlice 0 mid src
upper = GM.unsafeSlice mid (GM.length src - mid) src
tmp = GM.unsafeSlice 0 mid buf
wroteHigh low iLow eLow high iHigh iIns
| iHigh >= GM.length high =
GM.unsafeCopy (GM.unsafeSlice iIns (GM.length low - iLow) src)
(GM.unsafeSlice iLow (GM.length low - iLow) low)
| otherwise = do eHigh <- GM.unsafeRead high iHigh
loop low iLow eLow high iHigh eHigh iIns
wroteLow low iLow high iHigh eHigh iIns
| iLow >= GM.length low = return ()
| otherwise = do eLow <- GM.unsafeRead low iLow
loop low iLow eLow high iHigh eHigh iIns
loop !low !iLow !eLow !high !iHigh !eHigh !iIns = case cmp eHigh eLow of
LT -> do GM.unsafeWrite src iIns eHigh
modifySTRef' count (+ fromIntegral (GM.length low - iLow))
wroteHigh low iLow eLow high (iHigh+1) (iIns+1)
_ -> do GM.unsafeWrite src iIns eLow
wroteLow low (iLow+1) high iHigh eHigh (iIns+1)
{-# INLINE merge #-}
#if !MIN_VERSION_base(4,6,0)
modifySTRef' :: STRef s a -> (a -> a) -> ST s ()
modifySTRef' = modifySTRef
#endif
-- $references
--
-- * William R. Knight. (1966) A computer method for calculating Kendall's Tau
-- with ungrouped data. /Journal of the American Statistical Association/,
-- Vol. 61, No. 314, Part 1, pp. 436-439. <http://www.jstor.org/pss/2282833>

View File

@ -4,6 +4,10 @@ module GEval.FeatureExtractor
(extractFactors, (extractFactors,
extractFactorsFromTabbed, extractFactorsFromTabbed,
cartesianFeatures, cartesianFeatures,
Feature(..),
NumericalType(..),
NumericalDirection(..),
Featuroid(..),
LineWithFactors(..), LineWithFactors(..),
LineWithPeggedFactors(..), LineWithPeggedFactors(..),
PeggedFactor(..), PeggedFactor(..),
@ -26,15 +30,55 @@ import GEval.BlackBoxDebugging
import GEval.Common import GEval.Common
import Text.Read (readMaybe) import Text.Read (readMaybe)
data Feature = UnaryFeature PeggedExistentialFactor
| CartesianFeature PeggedExistentialFactor PeggedExistentialFactor
| NumericalFeature FeatureNamespace NumericalType NumericalDirection
deriving (Eq, Ord)
instance Show Feature where
show (UnaryFeature p) = show p
show (CartesianFeature pA pB) = formatCartesian pA pB
show (NumericalFeature namespace ntype direction) = (show namespace) ++ ":" ++ (show ntype) ++ (show direction)
data NumericalType = DirectValue | LengthOf
deriving (Eq, Ord)
instance Show NumericalType where
show DirectValue = "="
show LengthOf = "=#"
data NumericalDirection = Big | Small
deriving (Eq, Ord)
instance Show NumericalDirection where
show Big = "+"
show Small = "-"
-- | Featuroid is something between a factor and a feature, i.e. for numerical factors
-- it's not a single value, but still without the direction.
data Featuroid = UnaryFeaturoid PeggedExistentialFactor
| CartesianFeaturoid PeggedExistentialFactor PeggedExistentialFactor
| NumericalFeaturoid FeatureNamespace
deriving (Eq, Ord)
instance Show Featuroid where
show (UnaryFeaturoid p) = show p
show (CartesianFeaturoid pA pB) = formatCartesian pA pB
show (NumericalFeaturoid namespace) = (show namespace) ++ ":="
data LineWithFactors = LineWithFactors Double MetricValue [Factor] data LineWithFactors = LineWithFactors Double MetricValue [Factor]
deriving (Eq, Ord) deriving (Eq, Ord)
-- | A factor extracted from a single item (its input, expected output or actual output).
data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor
deriving (Eq, Ord) deriving (Eq, Ord)
instance Show Factor where instance Show Factor where
show (UnaryFactor factor) = show factor show (UnaryFactor factor) = show factor
show (CartesianFactor factorA factorB) = (show factorA) ++ "~~" ++ (show factorB) show (CartesianFactor factorA factorB) = formatCartesian factorA factorB
formatCartesian :: PeggedExistentialFactor -> PeggedExistentialFactor -> String
formatCartesian factorA factorB = (show factorA) ++ "~~" ++ (show factorB)
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor] data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
deriving (Eq, Ord) deriving (Eq, Ord)

View File

@ -34,7 +34,9 @@ import Data.Text.Encoding
import Data.Conduit.Rank import Data.Conduit.Rank
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.List (sortBy, sort, concat) import qualified Data.Vector as V
import Data.List (sortBy, sortOn, sort, concat)
import Control.Monad.IO.Class import Control.Monad.IO.Class
import Control.Monad.Trans.Resource import Control.Monad.Trans.Resource
@ -56,6 +58,7 @@ import System.FilePath
import Statistics.Distribution (cumulative) import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr) import Statistics.Distribution.Normal (normalDistr)
import Data.Statistics.Kendall (kendallZ)
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
import qualified Data.Set as S import qualified Data.Set as S
@ -125,7 +128,7 @@ extractFeaturesAndPValues spec bbdo =
data RankedFactor = RankedFactor Factor Double MetricValue data RankedFactor = RankedFactor Factor Double MetricValue
deriving (Show) deriving (Show)
data FeatureWithPValue = FeatureWithPValue Factor -- ^ feature itself data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
Double -- ^ p-value Double -- ^ p-value
MetricValue -- ^ average metric value MetricValue -- ^ average metric value
Integer -- ^ count Integer -- ^ count
@ -184,11 +187,12 @@ finalFeatures True minFreq = do
filtreCartesian False = CC.map id filtreCartesian False = CC.map id
filtreCartesian True = CC.concatMapAccum step S.empty filtreCartesian True = CC.concatMapAccum step S.empty
where step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f]) where step f@(FeatureWithPValue (UnaryFeature fac) _ _ _) mp = (S.insert fac mp, [f])
step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f]) step f@(FeatureWithPValue (CartesianFeature pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
step f@(FeatureWithPValue (CartesianFactor pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp then []
then [] else [f])
else [f]) step f@(FeatureWithPValue (NumericalFeature _ _ _) _ _ _) mp = (mp, [f])
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs) peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
@ -200,33 +204,69 @@ getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
extractFactorsFromTabbed mTokenizer bbdo "in" inLine, extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
extractFactors mTokenizer bbdo "out" outLine] extractFactors mTokenizer bbdo "out" outLine]
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
| LengthAggregate [Double] [MetricValue] [Int]
aggreggate :: FeatureAggregate -> FeatureAggregate -> FeatureAggregate
aggreggate (ExistentialFactorAggregate r1 s1 c1) (ExistentialFactorAggregate r2 s2 c2) =
ExistentialFactorAggregate (r1 + r2) (s1 + s2) (c1 + c2)
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 values1) (NumericalValueAggregate ranks2 scores2 lengths2 values2) =
NumericalValueAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) (values1 ++ values2)
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 _) (LengthAggregate ranks2 scores2 lengths2) =
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
aggreggate (LengthAggregate ranks1 scores1 lengths1) (NumericalValueAggregate ranks2 scores2 lengths2 _) =
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
aggreggate (LengthAggregate ranks1 scores1 lengths1) (LengthAggregate ranks2 scores2 lengths2) =
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
aggreggate _ _ = error "Mismatched aggregates!"
initAggregate :: RankedFactor -> (Featuroid, FeatureAggregate)
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor Nothing l))) r s) =
(NumericalFeaturoid namespace, LengthAggregate [r] [s] [l])
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor (Just v) l))) r s) =
(NumericalFeaturoid namespace, NumericalValueAggregate [r] [s] [l] [v])
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor f))) r s) =
(UnaryFeaturoid (PeggedExistentialFactor namespace f), ExistentialFactorAggregate r s 1)
initAggregate (RankedFactor (CartesianFactor pA pB) r s) =
(CartesianFeaturoid pA pB, ExistentialFactorAggregate r s 1)
filterAggregateByFreq :: Integer -> (Maybe Integer) -> FeatureAggregate -> Bool
filterAggregateByFreq minFreq Nothing (ExistentialFactorAggregate _ _ c) = c >= minFreq
filterAggregateByFreq minFreq (Just total) (ExistentialFactorAggregate _ _ c) = c >= minFreq && total - c >= minFreq
filterAggregateByFreq _ _ _ = True
uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) () uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) ()
uScoresCounter minFreq = CC.map (\(RankedFactor feature r score) -> (feature, (r, score, 1))) uScoresCounter minFreq = CC.map initAggregate
.| gobbleAndDo countUScores .| gobbleAndDo countUScores
.| lowerFreqFiltre .| lowerFreqFiltre
.| pValueCalculator minFreq .| pValueCalculator minFreq
where countUScores l = where countUScores l =
M.toList M.toList
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l $ M.fromListWith aggreggate l
lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq) lowerFreqFiltre = CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq Nothing fAgg)
pValueCalculator :: Monad m => Integer -> ConduitT (Factor, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) () pValueCalculator :: Monad m => Integer -> ConduitT (Featuroid, FeatureAggregate) FeatureWithPValue (StateT Integer m) ()
pValueCalculator minFreq = do pValueCalculator minFreq = do
firstVal <- await firstVal <- await
case firstVal of case firstVal of
Just i@(_, (_, _, c)) -> do Just i@(_, fAgg) -> do
total <- lift get total <- lift get
if total - c >= minFreq if filterAggregateByFreq minFreq (Just total) fAgg
then yield $ calculatePValue total i then yield $ calculatePValue total i
else return () else return ()
CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total) CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq (Just total) fAgg) .| CC.map (calculatePValue total)
Nothing -> return () Nothing -> return ()
calculatePValue :: Integer -> (Factor, (Double, MetricValue, Integer)) -> FeatureWithPValue calculatePValue :: Integer -> (Featuroid, FeatureAggregate) -> FeatureWithPValue
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f calculatePValue _ (NumericalFeaturoid namespace, NumericalValueAggregate ranks scores _ values) =
(pvalue (r - minusR c) c (total - c)) kendallPValueFeature namespace DirectValue ranks scores values
(s / (fromIntegral c)) calculatePValue _ (NumericalFeaturoid namespace, LengthAggregate ranks scores lens) =
c kendallPValueFeature namespace LengthOf ranks scores lens
calculatePValue total (f, ExistentialFactorAggregate r s c) = FeatureWithPValue (featoroidToFeature f)
(pvalue (r - minusR c) c (total - c))
(s / (fromIntegral c))
c
where minusR c = (c' * (c' + 1)) / 2.0 where minusR c = (c' * (c' + 1)) / 2.0
where c' = fromIntegral c where c' = fromIntegral c
-- calulating p-value from MannWhitney U test -- calulating p-value from MannWhitney U test
@ -237,6 +277,26 @@ calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12 sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12
z = (u - mean) / sigma z = (u - mean) / sigma
in cumulative (normalDistr 0.0 1.0) z in cumulative (normalDistr 0.0 1.0) z
featoroidToFeature (UnaryFeaturoid fac) = UnaryFeature fac
featoroidToFeature (CartesianFeaturoid facA facB) = (CartesianFeature facA facB)
kendallPValueFeature :: Ord a => FeatureNamespace -> NumericalType -> [Double] -> [MetricValue] -> [a] -> FeatureWithPValue
kendallPValueFeature namespace ntype ranks scores values = FeatureWithPValue (NumericalFeature namespace ntype ndirection)
pv
((sum selectedScores) / (fromIntegral selected))
(fromIntegral selected)
where z = kendallZ (V.fromList $ Prelude.zip ranks values)
pv = 2 * (cumulative (normalDistr 0.0 1.0) (- (abs z)))
ndirection = if z > 0
then Small
else Big
selected = (Prelude.length scores) `div` 4
selectedScores = Prelude.take selected $ Prelude.map snd $ turner $ sortOn fst $ Prelude.zip values scores
turner = case ndirection of
Small -> id
Big -> Prelude.reverse
totalCounter :: Monad m => ConduitT a a (StateT Integer m) () totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()

View File

@ -40,6 +40,7 @@ import Data.List (sort)
import qualified Test.HUnit as HU import qualified Test.HUnit as HU
import qualified Data.IntSet as IS import qualified Data.IntSet as IS
import qualified Data.Vector as V
import Data.Conduit.SmartSource import Data.Conduit.SmartSource
import Data.Conduit.Rank import Data.Conduit.Rank
@ -49,6 +50,10 @@ import Control.Monad.Trans.Resource
import qualified Data.Conduit.List as CL import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Combinators as CC import qualified Data.Conduit.Combinators as CC
import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr)
import Data.Statistics.Kendall (kendall, kendallZ)
informationRetrievalBookExample :: [(String, Int)] informationRetrievalBookExample :: [(String, Int)]
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3), informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1), ("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
@ -541,6 +546,14 @@ main = hspec $ do
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))), (SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
PeggedFactor (FeatureTabbedNamespace "in" 3) PeggedFactor (FeatureTabbedNamespace "in" 3)
(NumericalFactor Nothing 5) ] (NumericalFactor Nothing 5) ]
describe "Kendall's tau" $ do
it "tau" $ do
kendall (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-0.47140452079103173)
it "z" $ do
kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
it "p-value" $ do
(2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
checkConduitPure conduit inList expList = do checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList