count the number of lines correctly

This commit is contained in:
Filip Graliński 2018-08-03 11:16:28 +02:00
parent a3f5f25f69
commit 51abed6fa4

View File

@ -32,6 +32,8 @@ import Data.List (sortBy, sort, concat)
import Control.Monad.IO.Class
import Control.Monad.Trans.Resource
import Data.Conduit.Lift
import Control.Monad.State.Strict
import Data.Monoid ((<>))
@ -69,8 +71,7 @@ runWorstFeatures :: ResultOrdering -> GEvalSpecification -> IO ()
runWorstFeatures ordering spec = runLineByLineGeneralized ordering spec consum
where consum :: ConduitT LineRecord Void (ResourceT IO) ()
consum = (rank (lessByMetric $ gesMainMetric spec)
.| featureExtractor
.| uScoresCounter
.| evalStateC 0 extractFeaturesAndPValues
.| CL.map (encodeUtf8 . formatFeatureWithPValue)
.| CC.unlinesAscii
.| CC.stdout)
@ -82,10 +83,17 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering spec consum
formatScore :: MetricValue -> Text
formatScore = Data.Text.pack . printf "%f"
extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues =
totalCounter
.| featureExtractor
.| uScoresCounter
data RankedFeature = RankedFeature Text Double
deriving (Show)
data FeatureWithPValue = FeatureWithPValue Text Double Int
data FeatureWithPValue = FeatureWithPValue Text Double Integer
deriving (Show)
formatFeatureWithPValue :: FeatureWithPValue -> Text
@ -101,15 +109,27 @@ featureExtractor = CC.map extract .| CC.concat
extractUnigramFeaturesFromTabbed "in" inLine,
extractUnigramFeatures "out" outLine]
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue m ()
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
uScoresCounter = CC.map (\(RankedFeature feature r) -> (feature, (r, 1)))
.| gobbleDoAndContinueWithCount
countUScores
(\total -> CC.map (\(f, (r, c)) -> FeatureWithPValue f (pvalue (r - minusR c) c (total - c)) c))
.| gobbleAndDo countUScores
.| pValueCalculator
where countUScores l =
M.toList
$ M.fromListWith (\(r1, c1) (r2, c2) -> ((r1 + r2), (c1 + c2))) l
minusR c = (c' * (c' + 1)) / 2.0
pValueCalculator :: Monad m => ConduitT (Text, (Double, Integer)) FeatureWithPValue (StateT Integer m) ()
pValueCalculator = do
firstVal <- await
case firstVal of
Just i -> do
total <- lift get
yield $ calculatePValue total i
CC.map $ calculatePValue total
Nothing -> return ()
calculatePValue :: Integer -> (Text, (Double, Integer)) -> FeatureWithPValue
calculatePValue total (f, (r, c)) = FeatureWithPValue f (pvalue (r - minusR c) c (total - c)) c
where minusR c = (c' * (c' + 1)) / 2.0
where c' = fromIntegral c
-- calulating p-value from MannWhitney U test
-- (normal approximation is used)
@ -120,6 +140,18 @@ uScoresCounter = CC.map (\(RankedFeature feature r) -> (feature, (r, 1)))
z = (u - mean) / sigma
in cumulative (normalDistr 0.0 1.0) z
totalCounter :: Monad m => ConduitT a a (StateT Integer m) ()
totalCounter = do
m <- await
case m of
Just x -> do
i <- lift get
lift $ put $ i + 1
yield x
totalCounter
Nothing -> return ()
lessByMetric :: Metric -> (LineRecord -> LineRecord -> Bool)
lessByMetric metric = lessByMetric' (getMetricOrdering metric)
where lessByMetric' TheHigherTheBetter = (\(LineRecord _ _ _ _ scoreA) (LineRecord _ _ _ _ scoreB) ->
@ -144,13 +176,6 @@ gobbleAndDo fun = do
l <- CC.sinkList
CC.yieldMany $ fun l
gobbleDoAndContinueWithCount :: Monad m => ([a] -> [b]) -> (Int -> ConduitT b c m ()) -> ConduitT a c m ()
gobbleDoAndContinueWithCount fun continuation = do
l <- CC.sinkList
let lc = Prelude.length l
(CC.yieldMany $ fun l) .| (continuation lc)
runDiff :: ResultOrdering -> FilePath -> GEvalSpecification -> IO ()
runDiff ordering otherOut spec = runDiffGeneralized ordering otherOut spec consum
where consum :: ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()