Add numerical features

This commit is contained in:
Filip Gralinski 2019-01-27 20:18:39 +01:00 committed by Filip Graliński
parent d5a8908599
commit abcce9bf68
5 changed files with 320 additions and 21 deletions

View File

@ -1,5 +1,5 @@
name: geval
version: 1.13.0.0
version: 1.14.0.0
synopsis: Machine learning evaluation tools
description: Please see README.md
homepage: http://github.com/name/project
@ -39,6 +39,7 @@ library
, GEval.Annotation
, GEval.BlackBoxDebugging
, Text.WordShape
, Data.Statistics.Kendall
, Paths_geval
build-depends: base >= 4.7 && < 5
, cond
@ -80,6 +81,7 @@ library
, MissingH
, array
, Munkres
, vector-algorithms
default-language: Haskell2010
executable geval
@ -117,6 +119,8 @@ test-suite geval-test
, directory
, temporary
, silently
, vector
, statistics
ghc-options: -threaded -rtsopts -with-rtsopts=-N
default-language: Haskell2010

View File

@ -0,0 +1,178 @@
{-# LANGUAGE BangPatterns, CPP, FlexibleContexts, ScopedTypeVariables #-}
-- |
-- (Taken from http://hackage.haskell.org/package/statistics-0.15.0.0/docs/src/Statistics.Correlation.Kendall.html)
--
-- Module : Statistics.Correlation.Kendall
--
-- Fast O(NlogN) implementation of
-- <http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient Kendall's tau>.
--
-- This module implements Kendall's tau form b which allows ties in the data.
-- This is the same formula used by other statistical packages, e.g., R, matlab.
--
-- > \tau = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}
--
-- where n_0 = n(n-1)\/2, n_1 = number of pairs tied for the first quantify,
-- n_2 = number of pairs tied for the second quantify,
-- n_c = number of concordant pairs$, n_d = number of discordant pairs.
module Data.Statistics.Kendall
( kendall,
kendallZ
-- * References
-- $references
) where
import Control.Monad.ST (ST, runST)
import Data.Bits (shiftR)
import Data.Function (on)
import Data.STRef
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM
-- | /O(nlogn)/ Compute the Kendall's tau from a vector of paired data.
-- Return NaN when number of pairs <= 1.
kendall :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
kendall xy'
| G.length xy' <= 1 = 0/0
| otherwise = runST $ do
xy <- G.thaw xy'
let n = GM.length xy
n_dRef <- newSTRef 0
I.sort xy
tieX <- numOfTiesBy ((==) `on` fst) xy
tieXY <- numOfTiesBy (==) xy
tmp <- GM.new n
mergeSort (compare `on` snd) xy tmp n_dRef
tieY <- numOfTiesBy ((==) `on` snd) xy
n_d <- readSTRef n_dRef
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
n_c = n_0 - n_d - tieX - tieY + tieXY
return $ fromIntegral (n_c - n_d) /
(sqrt.fromIntegral) ((n_0 - tieX) * (n_0 - tieY))
{-# INLINE kendall #-}
kendallZ :: (Ord a, Ord b, G.Vector v (a, b)) => v (a, b) -> Double
kendallZ xy'
| G.length xy' <= 1 = 0/0
| otherwise = runST $ do
xy <- G.thaw xy'
let n = GM.length xy
let vfun x = x * (x - 1) * (2*x + 5)
let tttfun x = x * (x - 1) * (x - 2)
n_dRef <- newSTRef 0
I.sort xy
tieX <- numOfTiesBy ((==) `on` fst) xy
tieXY <- numOfTiesBy (==) xy
vt <- numOfTiesByGeneralized vfun ((==) `on` fst) xy
tttX <- numOfTiesByGeneralized tttfun ((==) `on` fst) xy
tmp <- GM.new n
mergeSort (compare `on` snd) xy tmp n_dRef
tieY <- numOfTiesBy ((==) `on` snd) xy
vu <- numOfTiesByGeneralized vfun ((==) `on` snd) xy
tttY <- numOfTiesByGeneralized tttfun ((==) `on` snd) xy
n_d <- readSTRef n_dRef
let n_0 = (fromIntegral n * (fromIntegral n-1)) `shiftR` 1 :: Integer
n_c = n_0 - n_d - tieX - tieY + tieXY
v0 = vfun (fromIntegral n)
v1 = 2.0 * (fromIntegral tieX) * (fromIntegral tieY) / (fromIntegral (n * (n-1)))
v2 = (fromIntegral tttX) * (fromIntegral tttY) / (fromIntegral (9 * (tttfun n)))
v = (fromIntegral (v0 - vt - vu)) / 18.0 + v1 + v2
return $ (fromIntegral (n_c - n_d)) / sqrt v
{-# INLINE kendallZ #-}
-- calculate number of tied pairs in a sorted vector
numOfTiesBy :: GM.MVector v a
=> (a -> a -> Bool) -> v s a -> ST s Integer
numOfTiesBy f xs = numOfTiesByGeneralized (\x -> (x * (x - 1)) `shiftR` 1) f xs
numOfTiesByGeneralized :: GM.MVector v a
=> (Int -> Int) -> (a -> a -> Bool) -> v s a -> ST s Integer
numOfTiesByGeneralized op f xs = do count <- newSTRef (0::Integer)
loop count (1::Int) (0::Int)
readSTRef count
where
n = GM.length xs
loop c !acc !i | i >= n - 1 = modifySTRef' c (+ g acc)
| otherwise = do
x1 <- GM.unsafeRead xs i
x2 <- GM.unsafeRead xs (i+1)
if f x1 x2
then loop c (acc+1) (i+1)
else modifySTRef' c (+ g acc) >> loop c 1 (i+1)
g x = fromIntegral $ op x
{-# INLINE numOfTiesByGeneralized #-}
-- Implementation of Knight's merge sort (adapted from vector-algorithm). This
-- function is used to count the number of discordant pairs.
mergeSort :: GM.MVector v e
=> (e -> e -> Ordering)
-> v s e
-> v s e
-> STRef s Integer
-> ST s ()
mergeSort cmp src buf count = loop 0 (GM.length src - 1)
where
loop l u
| u == l = return ()
| u - l == 1 = do
eL <- GM.unsafeRead src l
eU <- GM.unsafeRead src u
case cmp eL eU of
GT -> do GM.unsafeWrite src l eU
GM.unsafeWrite src u eL
modifySTRef' count (+1)
_ -> return ()
| otherwise = do
let mid = (u + l) `shiftR` 1
loop l mid
loop mid u
merge cmp (GM.unsafeSlice l (u-l+1) src) buf (mid - l) count
{-# INLINE mergeSort #-}
merge :: GM.MVector v e
=> (e -> e -> Ordering)
-> v s e
-> v s e
-> Int
-> STRef s Integer
-> ST s ()
merge cmp src buf mid count = do GM.unsafeCopy tmp lower
eTmp <- GM.unsafeRead tmp 0
eUpp <- GM.unsafeRead upper 0
loop tmp 0 eTmp upper 0 eUpp 0
where
lower = GM.unsafeSlice 0 mid src
upper = GM.unsafeSlice mid (GM.length src - mid) src
tmp = GM.unsafeSlice 0 mid buf
wroteHigh low iLow eLow high iHigh iIns
| iHigh >= GM.length high =
GM.unsafeCopy (GM.unsafeSlice iIns (GM.length low - iLow) src)
(GM.unsafeSlice iLow (GM.length low - iLow) low)
| otherwise = do eHigh <- GM.unsafeRead high iHigh
loop low iLow eLow high iHigh eHigh iIns
wroteLow low iLow high iHigh eHigh iIns
| iLow >= GM.length low = return ()
| otherwise = do eLow <- GM.unsafeRead low iLow
loop low iLow eLow high iHigh eHigh iIns
loop !low !iLow !eLow !high !iHigh !eHigh !iIns = case cmp eHigh eLow of
LT -> do GM.unsafeWrite src iIns eHigh
modifySTRef' count (+ fromIntegral (GM.length low - iLow))
wroteHigh low iLow eLow high (iHigh+1) (iIns+1)
_ -> do GM.unsafeWrite src iIns eLow
wroteLow low (iLow+1) high iHigh eHigh (iIns+1)
{-# INLINE merge #-}
#if !MIN_VERSION_base(4,6,0)
modifySTRef' :: STRef s a -> (a -> a) -> ST s ()
modifySTRef' = modifySTRef
#endif
-- $references
--
-- * William R. Knight. (1966) A computer method for calculating Kendall's Tau
-- with ungrouped data. /Journal of the American Statistical Association/,
-- Vol. 61, No. 314, Part 1, pp. 436-439. <http://www.jstor.org/pss/2282833>

View File

@ -4,6 +4,10 @@ module GEval.FeatureExtractor
(extractFactors,
extractFactorsFromTabbed,
cartesianFeatures,
Feature(..),
NumericalType(..),
NumericalDirection(..),
Featuroid(..),
LineWithFactors(..),
LineWithPeggedFactors(..),
PeggedFactor(..),
@ -26,15 +30,55 @@ import GEval.BlackBoxDebugging
import GEval.Common
import Text.Read (readMaybe)
data Feature = UnaryFeature PeggedExistentialFactor
| CartesianFeature PeggedExistentialFactor PeggedExistentialFactor
| NumericalFeature FeatureNamespace NumericalType NumericalDirection
deriving (Eq, Ord)
instance Show Feature where
show (UnaryFeature p) = show p
show (CartesianFeature pA pB) = formatCartesian pA pB
show (NumericalFeature namespace ntype direction) = (show namespace) ++ ":" ++ (show ntype) ++ (show direction)
data NumericalType = DirectValue | LengthOf
deriving (Eq, Ord)
instance Show NumericalType where
show DirectValue = "="
show LengthOf = "=#"
data NumericalDirection = Big | Small
deriving (Eq, Ord)
instance Show NumericalDirection where
show Big = "+"
show Small = "-"
-- | Featuroid is something between a factor and a feature, i.e. for numerical factors
-- it's not a single value, but still without the direction.
data Featuroid = UnaryFeaturoid PeggedExistentialFactor
| CartesianFeaturoid PeggedExistentialFactor PeggedExistentialFactor
| NumericalFeaturoid FeatureNamespace
deriving (Eq, Ord)
instance Show Featuroid where
show (UnaryFeaturoid p) = show p
show (CartesianFeaturoid pA pB) = formatCartesian pA pB
show (NumericalFeaturoid namespace) = (show namespace) ++ ":="
data LineWithFactors = LineWithFactors Double MetricValue [Factor]
deriving (Eq, Ord)
-- | A factor extracted from a single item (its input, expected output or actual output).
data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor
deriving (Eq, Ord)
instance Show Factor where
show (UnaryFactor factor) = show factor
show (CartesianFactor factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
show (CartesianFactor factorA factorB) = formatCartesian factorA factorB
formatCartesian :: PeggedExistentialFactor -> PeggedExistentialFactor -> String
formatCartesian factorA factorB = (show factorA) ++ "~~" ++ (show factorB)
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
deriving (Eq, Ord)

View File

@ -34,7 +34,9 @@ import Data.Text.Encoding
import Data.Conduit.Rank
import Data.Maybe (fromMaybe)
import Data.List (sortBy, sort, concat)
import qualified Data.Vector as V
import Data.List (sortBy, sortOn, sort, concat)
import Control.Monad.IO.Class
import Control.Monad.Trans.Resource
@ -56,6 +58,7 @@ import System.FilePath
import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr)
import Data.Statistics.Kendall (kendallZ)
import qualified Data.Map.Strict as M
import qualified Data.Set as S
@ -125,7 +128,7 @@ extractFeaturesAndPValues spec bbdo =
data RankedFactor = RankedFactor Factor Double MetricValue
deriving (Show)
data FeatureWithPValue = FeatureWithPValue Factor -- ^ feature itself
data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
Double -- ^ p-value
MetricValue -- ^ average metric value
Integer -- ^ count
@ -184,11 +187,12 @@ finalFeatures True minFreq = do
filtreCartesian False = CC.map id
filtreCartesian True = CC.concatMapAccum step S.empty
where step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f])
step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f])
step f@(FeatureWithPValue (CartesianFactor pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
then []
else [f])
where step f@(FeatureWithPValue (UnaryFeature fac) _ _ _) mp = (S.insert fac mp, [f])
step f@(FeatureWithPValue (CartesianFeature pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
then []
else [f])
step f@(FeatureWithPValue (NumericalFeature _ _ _) _ _ _) mp = (mp, [f])
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
@ -200,33 +204,69 @@ getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
extractFactors mTokenizer bbdo "out" outLine]
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
| LengthAggregate [Double] [MetricValue] [Int]
aggreggate :: FeatureAggregate -> FeatureAggregate -> FeatureAggregate
aggreggate (ExistentialFactorAggregate r1 s1 c1) (ExistentialFactorAggregate r2 s2 c2) =
ExistentialFactorAggregate (r1 + r2) (s1 + s2) (c1 + c2)
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 values1) (NumericalValueAggregate ranks2 scores2 lengths2 values2) =
NumericalValueAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2) (values1 ++ values2)
aggreggate (NumericalValueAggregate ranks1 scores1 lengths1 _) (LengthAggregate ranks2 scores2 lengths2) =
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
aggreggate (LengthAggregate ranks1 scores1 lengths1) (NumericalValueAggregate ranks2 scores2 lengths2 _) =
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
aggreggate (LengthAggregate ranks1 scores1 lengths1) (LengthAggregate ranks2 scores2 lengths2) =
LengthAggregate (ranks1 ++ ranks2) (scores1 ++ scores2) (lengths1 ++ lengths2)
aggreggate _ _ = error "Mismatched aggregates!"
initAggregate :: RankedFactor -> (Featuroid, FeatureAggregate)
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor Nothing l))) r s) =
(NumericalFeaturoid namespace, LengthAggregate [r] [s] [l])
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (NumericalFactor (Just v) l))) r s) =
(NumericalFeaturoid namespace, NumericalValueAggregate [r] [s] [l] [v])
initAggregate (RankedFactor (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor f))) r s) =
(UnaryFeaturoid (PeggedExistentialFactor namespace f), ExistentialFactorAggregate r s 1)
initAggregate (RankedFactor (CartesianFactor pA pB) r s) =
(CartesianFeaturoid pA pB, ExistentialFactorAggregate r s 1)
filterAggregateByFreq :: Integer -> (Maybe Integer) -> FeatureAggregate -> Bool
filterAggregateByFreq minFreq Nothing (ExistentialFactorAggregate _ _ c) = c >= minFreq
filterAggregateByFreq minFreq (Just total) (ExistentialFactorAggregate _ _ c) = c >= minFreq && total - c >= minFreq
filterAggregateByFreq _ _ _ = True
uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) ()
uScoresCounter minFreq = CC.map (\(RankedFactor feature r score) -> (feature, (r, score, 1)))
uScoresCounter minFreq = CC.map initAggregate
.| gobbleAndDo countUScores
.| lowerFreqFiltre
.| pValueCalculator minFreq
where countUScores l =
M.toList
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq)
$ M.fromListWith aggreggate l
lowerFreqFiltre = CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq Nothing fAgg)
pValueCalculator :: Monad m => Integer -> ConduitT (Factor, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
pValueCalculator :: Monad m => Integer -> ConduitT (Featuroid, FeatureAggregate) FeatureWithPValue (StateT Integer m) ()
pValueCalculator minFreq = do
firstVal <- await
case firstVal of
Just i@(_, (_, _, c)) -> do
Just i@(_, fAgg) -> do
total <- lift get
if total - c >= minFreq
if filterAggregateByFreq minFreq (Just total) fAgg
then yield $ calculatePValue total i
else return ()
CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total)
CC.filter (\(_, fAgg) -> filterAggregateByFreq minFreq (Just total) fAgg) .| CC.map (calculatePValue total)
Nothing -> return ()
calculatePValue :: Integer -> (Factor, (Double, MetricValue, Integer)) -> FeatureWithPValue
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
(pvalue (r - minusR c) c (total - c))
(s / (fromIntegral c))
c
calculatePValue :: Integer -> (Featuroid, FeatureAggregate) -> FeatureWithPValue
calculatePValue _ (NumericalFeaturoid namespace, NumericalValueAggregate ranks scores _ values) =
kendallPValueFeature namespace DirectValue ranks scores values
calculatePValue _ (NumericalFeaturoid namespace, LengthAggregate ranks scores lens) =
kendallPValueFeature namespace LengthOf ranks scores lens
calculatePValue total (f, ExistentialFactorAggregate r s c) = FeatureWithPValue (featoroidToFeature f)
(pvalue (r - minusR c) c (total - c))
(s / (fromIntegral c))
c
where minusR c = (c' * (c' + 1)) / 2.0
where c' = fromIntegral c
-- calulating p-value from MannWhitney U test
@ -237,6 +277,26 @@ calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12
z = (u - mean) / sigma
in cumulative (normalDistr 0.0 1.0) z
featoroidToFeature (UnaryFeaturoid fac) = UnaryFeature fac
featoroidToFeature (CartesianFeaturoid facA facB) = (CartesianFeature facA facB)
kendallPValueFeature :: Ord a => FeatureNamespace -> NumericalType -> [Double] -> [MetricValue] -> [a] -> FeatureWithPValue
kendallPValueFeature namespace ntype ranks scores values = FeatureWithPValue (NumericalFeature namespace ntype ndirection)
pv
((sum selectedScores) / (fromIntegral selected))
(fromIntegral selected)
where z = kendallZ (V.fromList $ Prelude.zip ranks values)
pv = 2 * (cumulative (normalDistr 0.0 1.0) (- (abs z)))
ndirection = if z > 0
then Small
else Big
selected = (Prelude.length scores) `div` 4
selectedScores = Prelude.take selected $ Prelude.map snd $ turner $ sortOn fst $ Prelude.zip values scores
turner = case ndirection of
Small -> id
Big -> Prelude.reverse
totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()

View File

@ -40,6 +40,7 @@ import Data.List (sort)
import qualified Test.HUnit as HU
import qualified Data.IntSet as IS
import qualified Data.Vector as V
import Data.Conduit.SmartSource
import Data.Conduit.Rank
@ -49,6 +50,10 @@ import Control.Monad.Trans.Resource
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Combinators as CC
import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr)
import Data.Statistics.Kendall (kendall, kendallZ)
informationRetrievalBookExample :: [(String, Int)]
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
@ -541,6 +546,14 @@ main = hspec $ do
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
PeggedFactor (FeatureTabbedNamespace "in" 3)
(NumericalFactor Nothing 5) ]
describe "Kendall's tau" $ do
it "tau" $ do
kendall (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-0.47140452079103173)
it "z" $ do
kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
it "p-value" $ do
(2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList