count the number of lines correctly
This commit is contained in:
parent
a3f5f25f69
commit
51abed6fa4
@ -32,6 +32,8 @@ import Data.List (sortBy, sort, concat)
|
|||||||
|
|
||||||
import Control.Monad.IO.Class
|
import Control.Monad.IO.Class
|
||||||
import Control.Monad.Trans.Resource
|
import Control.Monad.Trans.Resource
|
||||||
|
import Data.Conduit.Lift
|
||||||
|
import Control.Monad.State.Strict
|
||||||
|
|
||||||
import Data.Monoid ((<>))
|
import Data.Monoid ((<>))
|
||||||
|
|
||||||
@ -69,8 +71,7 @@ runWorstFeatures :: ResultOrdering -> GEvalSpecification -> IO ()
|
|||||||
runWorstFeatures ordering spec = runLineByLineGeneralized ordering spec consum
|
runWorstFeatures ordering spec = runLineByLineGeneralized ordering spec consum
|
||||||
where consum :: ConduitT LineRecord Void (ResourceT IO) ()
|
where consum :: ConduitT LineRecord Void (ResourceT IO) ()
|
||||||
consum = (rank (lessByMetric $ gesMainMetric spec)
|
consum = (rank (lessByMetric $ gesMainMetric spec)
|
||||||
.| featureExtractor
|
.| evalStateC 0 extractFeaturesAndPValues
|
||||||
.| uScoresCounter
|
|
||||||
.| CL.map (encodeUtf8 . formatFeatureWithPValue)
|
.| CL.map (encodeUtf8 . formatFeatureWithPValue)
|
||||||
.| CC.unlinesAscii
|
.| CC.unlinesAscii
|
||||||
.| CC.stdout)
|
.| CC.stdout)
|
||||||
@ -82,10 +83,17 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering spec consum
|
|||||||
formatScore :: MetricValue -> Text
|
formatScore :: MetricValue -> Text
|
||||||
formatScore = Data.Text.pack . printf "%f"
|
formatScore = Data.Text.pack . printf "%f"
|
||||||
|
|
||||||
|
extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
|
||||||
|
extractFeaturesAndPValues =
|
||||||
|
totalCounter
|
||||||
|
.| featureExtractor
|
||||||
|
.| uScoresCounter
|
||||||
|
|
||||||
|
|
||||||
data RankedFeature = RankedFeature Text Double
|
data RankedFeature = RankedFeature Text Double
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
|
||||||
data FeatureWithPValue = FeatureWithPValue Text Double Int
|
data FeatureWithPValue = FeatureWithPValue Text Double Integer
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
|
||||||
formatFeatureWithPValue :: FeatureWithPValue -> Text
|
formatFeatureWithPValue :: FeatureWithPValue -> Text
|
||||||
@ -101,15 +109,27 @@ featureExtractor = CC.map extract .| CC.concat
|
|||||||
extractUnigramFeaturesFromTabbed "in" inLine,
|
extractUnigramFeaturesFromTabbed "in" inLine,
|
||||||
extractUnigramFeatures "out" outLine]
|
extractUnigramFeatures "out" outLine]
|
||||||
|
|
||||||
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue m ()
|
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||||
uScoresCounter = CC.map (\(RankedFeature feature r) -> (feature, (r, 1)))
|
uScoresCounter = CC.map (\(RankedFeature feature r) -> (feature, (r, 1)))
|
||||||
.| gobbleDoAndContinueWithCount
|
.| gobbleAndDo countUScores
|
||||||
countUScores
|
.| pValueCalculator
|
||||||
(\total -> CC.map (\(f, (r, c)) -> FeatureWithPValue f (pvalue (r - minusR c) c (total - c)) c))
|
|
||||||
where countUScores l =
|
where countUScores l =
|
||||||
M.toList
|
M.toList
|
||||||
$ M.fromListWith (\(r1, c1) (r2, c2) -> ((r1 + r2), (c1 + c2))) l
|
$ M.fromListWith (\(r1, c1) (r2, c2) -> ((r1 + r2), (c1 + c2))) l
|
||||||
minusR c = (c' * (c' + 1)) / 2.0
|
|
||||||
|
pValueCalculator :: Monad m => ConduitT (Text, (Double, Integer)) FeatureWithPValue (StateT Integer m) ()
|
||||||
|
pValueCalculator = do
|
||||||
|
firstVal <- await
|
||||||
|
case firstVal of
|
||||||
|
Just i -> do
|
||||||
|
total <- lift get
|
||||||
|
yield $ calculatePValue total i
|
||||||
|
CC.map $ calculatePValue total
|
||||||
|
Nothing -> return ()
|
||||||
|
|
||||||
|
calculatePValue :: Integer -> (Text, (Double, Integer)) -> FeatureWithPValue
|
||||||
|
calculatePValue total (f, (r, c)) = FeatureWithPValue f (pvalue (r - minusR c) c (total - c)) c
|
||||||
|
where minusR c = (c' * (c' + 1)) / 2.0
|
||||||
where c' = fromIntegral c
|
where c' = fromIntegral c
|
||||||
-- calulating p-value from Mann–Whitney U test
|
-- calulating p-value from Mann–Whitney U test
|
||||||
-- (normal approximation is used)
|
-- (normal approximation is used)
|
||||||
@ -120,6 +140,18 @@ uScoresCounter = CC.map (\(RankedFeature feature r) -> (feature, (r, 1)))
|
|||||||
z = (u - mean) / sigma
|
z = (u - mean) / sigma
|
||||||
in cumulative (normalDistr 0.0 1.0) z
|
in cumulative (normalDistr 0.0 1.0) z
|
||||||
|
|
||||||
|
|
||||||
|
totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()
|
||||||
|
totalCounter = do
|
||||||
|
m <- await
|
||||||
|
case m of
|
||||||
|
Just x -> do
|
||||||
|
i <- lift get
|
||||||
|
lift $ put $ i + 1
|
||||||
|
yield x
|
||||||
|
totalCounter
|
||||||
|
Nothing -> return ()
|
||||||
|
|
||||||
lessByMetric :: Metric -> (LineRecord -> LineRecord -> Bool)
|
lessByMetric :: Metric -> (LineRecord -> LineRecord -> Bool)
|
||||||
lessByMetric metric = lessByMetric' (getMetricOrdering metric)
|
lessByMetric metric = lessByMetric' (getMetricOrdering metric)
|
||||||
where lessByMetric' TheHigherTheBetter = (\(LineRecord _ _ _ _ scoreA) (LineRecord _ _ _ _ scoreB) ->
|
where lessByMetric' TheHigherTheBetter = (\(LineRecord _ _ _ _ scoreA) (LineRecord _ _ _ _ scoreB) ->
|
||||||
@ -144,13 +176,6 @@ gobbleAndDo fun = do
|
|||||||
l <- CC.sinkList
|
l <- CC.sinkList
|
||||||
CC.yieldMany $ fun l
|
CC.yieldMany $ fun l
|
||||||
|
|
||||||
gobbleDoAndContinueWithCount :: Monad m => ([a] -> [b]) -> (Int -> ConduitT b c m ()) -> ConduitT a c m ()
|
|
||||||
gobbleDoAndContinueWithCount fun continuation = do
|
|
||||||
l <- CC.sinkList
|
|
||||||
let lc = Prelude.length l
|
|
||||||
(CC.yieldMany $ fun l) .| (continuation lc)
|
|
||||||
|
|
||||||
|
|
||||||
runDiff :: ResultOrdering -> FilePath -> GEvalSpecification -> IO ()
|
runDiff :: ResultOrdering -> FilePath -> GEvalSpecification -> IO ()
|
||||||
runDiff ordering otherOut spec = runDiffGeneralized ordering otherOut spec consum
|
runDiff ordering otherOut spec = runDiffGeneralized ordering otherOut spec consum
|
||||||
where consum :: ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
|
where consum :: ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
|
||||||
|
Loading…
Reference in New Issue
Block a user