p-value for features counted

2018-08-02 12:50:13 +02:00 · 2018-08-02 12:50:13 +02:00 · 020b93ccf8
commit 020b93ccf8
parent f8418894fb
5 changed files with 107 additions and 2 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -31,6 +31,7 @@ library
                     , Data.Conduit.AutoDecompress
                     , Data.Conduit.SmartSource
                     , Data.Conduit.Rank
                     , GEval.FeatureExtractor
                     , Paths_geval
  build-depends:       base >= 4.7 && < 5
                     , cond
@ -65,6 +66,7 @@ library
                     , Glob
                     , naturalcomp
                     , containers
                     , statistics
  default-language:    Haskell2010
 executable geval
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -213,7 +213,7 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
 -- Directory holding the expected data: the explicitly configured expected
 -- directory when the specification has one, the output directory otherwise.
 getExpectedDirectory spec =
   fromMaybe (gesOutDirectory spec) (gesExpectedDirectory spec)
-data GEvalSpecialCommand = Init | LineByLine | Diff FilePath | PrintVersion
+data GEvalSpecialCommand = Init | LineByLine | WorstFeatures | Diff FilePath | PrintVersion
 -- Ordering mode accepted by the line-by-line style runners.
 -- NOTE(review): the constructor names suggest "keep input order", "worst
 -- items first" and "best items first"; the code that interprets them is not
 -- part of this hunk, so confirm against the consumer.
 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
--- a/src/GEval/FeatureExtractor.hs
+++ b/src/GEval/FeatureExtractor.hs
@ -0,0 +1,25 @@
 {-# LANGUAGE OverloadedStrings #-}
 module GEval.FeatureExtractor
  (extractUnigramFeatures,
   extractUnigramFeaturesFromTabbed)
  where
 import Data.Text
 import Data.List
 import Data.Monoid ((<>))
 -- | Unigram features of a record.
 --
 -- The record is tokenized, duplicate tokens are dropped (the first occurrence
 -- keeps its position) and every remaining token is prefixed with the
 -- namespace followed by a colon, e.g. @"exp:token"@.
 extractUnigramFeatures :: Text -> Text -> [Text]
 extractUnigramFeatures namespace record =
   [ qualify token | token <- nub (tokenize record) ]
   where qualify token = namespace <> ":" <> token
 -- | Splits a text into tokens on spaces, tabs and colons, discarding the
 -- empty pieces produced by leading, trailing or consecutive separators.
 tokenize :: Text -> [Text]
 tokenize = Data.List.filter isNonEmpty . split isSeparator
   where isNonEmpty = not . Data.Text.null
         -- Colons separate tokens too, so a token can never contain the
         -- character used to join namespace and token in a feature name.
         isSeparator ' '  = True
         isSeparator '\t' = True
         isSeparator ':'  = True
         isSeparator _    = False
 -- | Unigram features of a tab-separated record.
 --
 -- The record is cut into fields on every tab; the features of the n-th field
 -- (counting from 1) are extracted under the namespace @namespace<n>@, and the
 -- per-field results are concatenated in field order.
 extractUnigramFeaturesFromTabbed :: Text -> Text -> [Text]
 extractUnigramFeaturesFromTabbed namespace record =
   Data.List.concat $ Prelude.zipWith fieldFeatures [(1 :: Integer) ..] fields
   where fields = splitOn "\t" record
         fieldFeatures idx field = extractUnigramFeatures (fieldNamespace idx) field
         fieldNamespace idx = namespace <> "<" <> pack (show idx) <> ">"
--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@ -9,6 +9,7 @@
 module GEval.LineByLine
       (runLineByLine,
        runWorstFeatures,
        runLineByLineGeneralized,
        runDiff,
        runDiffGeneralized,
@ -25,12 +26,17 @@ import qualified Data.Conduit.List as CL
 import qualified Data.Conduit.Combinators as CC
 import Data.Text
 import Data.Text.Encoding
 import Data.Conduit.Rank
-import Data.List (sortBy, sort)
+import Data.List (sortBy, sort, concat)
 import Control.Monad.IO.Class
 import Control.Monad.Trans.Resource
 import Data.Monoid ((<>))
 import GEval.FeatureExtractor
 import Data.Word
 import Text.Printf
@ -39,6 +45,11 @@ import Data.Conduit.SmartSource
 import System.FilePath
 import Statistics.Distribution (cumulative)
 import Statistics.Distribution.Normal (normalDistr)
 import qualified Data.Map.Strict as M
 -- | One evaluated line. Going by how the constructor is pattern-matched in
 -- this module, the fields are, in order: the input line, the expected line,
 -- the actual output line, a 'Word32' that the visible code never inspects
 -- (presumably the line number -- TODO confirm) and the metric value computed
 -- for this line.
 data LineRecord = LineRecord Text Text Text Word32 MetricValue
                  deriving (Eq, Show)
@ -54,6 +65,65 @@ runLineByLine ordering spec = runLineByLineGeneralized ordering spec consum
         formatScore :: MetricValue -> Text
         formatScore = Data.Text.pack . printf "%f"
 -- | Prints one line per feature found in the evaluated data set to standard
 -- output.
 --
 -- The line records delivered by 'runLineByLineGeneralized' are ranked with
 -- 'rank' using the ordering of the specification's main metric, split into
 -- features by 'featureExtractor', aggregated per feature by 'uScoresCounter'
 -- and rendered with 'formatFeatureWithZScore'.
 --
 -- The @ordering@ argument is passed through unchanged to
 -- 'runLineByLineGeneralized'.
 runWorstFeatures :: ResultOrdering -> GEvalSpecification -> IO ()
 runWorstFeatures ordering spec = runLineByLineGeneralized ordering spec consum
   where consum :: ConduitT LineRecord Void (ResourceT IO) ()
         consum = rank (lessByMetric $ gesMainMetric spec)
                  .| featureExtractor
                  .| uScoresCounter
                  .| CL.map (encodeUtf8 . formatFeatureWithZScore)
                  .| CC.unlinesAscii
                  .| CC.stdout
 -- | A single occurrence of a feature (first field) together with the rank
 -- (second field) of the line record it was extracted from; produced by
 -- 'featureExtractor'.
 data RankedFeature = RankedFeature Text Double
                     deriving (Show)
 -- | Per-feature summary produced by 'uScoresCounter': the feature, a
 -- 'Double' statistic and the number of occurrences that were aggregated.
 --
 -- NOTE(review): despite the type's name, 'uScoresCounter' stores the value of
 -- the standard normal cumulative distribution function at the z-score, not
 -- the raw z-score itself.
 data FeatureWithZScore = FeatureWithZScore Text Double Int
                     deriving (Show)
 -- | Renders a feature summary as three space-separated columns: the feature,
 -- its occurrence count and the statistic printed with 20 decimal places.
 formatFeatureWithZScore :: FeatureWithZScore -> Text
 formatFeatureWithZScore (FeatureWithZScore feature statistic occurrences) =
   Data.Text.intercalate " " [feature, occurrencesField, statisticField]
   where occurrencesField = pack (show occurrences)
         statisticField = pack (printf "%0.20f" statistic)
 -- | Explodes every ranked line record into its features, each tagged with
 -- the rank of the record it came from.
 --
 -- Features are emitted in this order: unigrams of the expected line
 -- (namespace @exp@), per-column unigrams of the tab-separated input line
 -- (namespace @in@), then unigrams of the output line (namespace @out@).
 featureExtractor :: Monad m => ConduitT (Double, LineRecord) RankedFeature m ()
 featureExtractor = CC.concatMap rankedFeatures
   where rankedFeatures (r, LineRecord inputLine expectedLine outputLine _ _) =
           [ RankedFeature feature r
           | feature <- extractUnigramFeatures "exp" expectedLine
                        ++ extractUnigramFeaturesFromTabbed "in" inputLine
                        ++ extractUnigramFeatures "out" outputLine ]
 -- | Aggregates the stream of ranked features into one 'FeatureWithZScore'
 -- per distinct feature, assuming that exactly 2942 items were ranked.
 --
 -- NOTE(review): 2942 is a hard-coded total that only fits a data set of that
 -- size; for any other data set the second sample size (and hence the
 -- statistic) is wrong. Callers that know the real number of ranked items
 -- should use 'uScoresCounterWithTotal' instead.
 uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithZScore m ()
 uScoresCounter = uScoresCounterWithTotal 2942

 -- | Same as 'uScoresCounter', but the total number of ranked items is given
 -- explicitly instead of being hard-coded.
 --
 -- For every distinct feature the ranks of its occurrences are summed and the
 -- occurrences counted. With rank sum @r@ and count @c@ the statistic
 -- @u = r - c * (c + 1) / 2@ is computed (this has the form of the
 -- Mann-Whitney U statistic), standardised with mean @n1 * n2 / 2@ and
 -- standard deviation @sqrt (n1 * n2 * (n1 + n2 + 1) / 12)@, where @n1 = c@
 -- and @n2 = totalItems - c@, and finally mapped through the standard normal
 -- cumulative distribution function.
 --
 -- When @totalItems - c@ is zero the standard deviation is zero and the
 -- statistic is not a meaningful number; this mirrors the behaviour of the
 -- fixed-total version.
 uScoresCounterWithTotal :: Monad m => Int -> ConduitT RankedFeature FeatureWithZScore m ()
 uScoresCounterWithTotal totalItems =
   CC.map (\(RankedFeature feature r) -> (feature, (r, 1)))
   .| gobbleAndDo countUScores
   .| CC.map (\(f, (r, c)) -> FeatureWithZScore f (normalCdfOfU (r - minusR c) c (totalItems - c)) c)
   where -- Sum the ranks and the occurrence counts of each feature.
         countUScores l =
           M.toList
           $ M.fromListWith (\(r1, c1) (r2, c2) -> ((r1 + r2), (c1 + c2))) l
         -- Smallest possible rank sum of c items: 1 + 2 + ... + c.
         minusR c = (c' * (c' + 1)) / 2.0
           where c' = fromIntegral c
         -- Normal approximation of the distribution of u for sample sizes
         -- n1 and n2 (no correction for ties is applied).
         normalCdfOfU u n1 n2 =
           let n1' = fromIntegral n1
               n2' = fromIntegral n2
               mean = n1' * n2' / 2
               sigma = sqrt $ n1' * n2' * (n1' + n2' + 1) / 12
               z = (u - mean) / sigma
           in cumulative (normalDistr 0.0 1.0) z
 -- | Comparison of two line records by score, oriented by the metric's
 -- ordering: for a the-higher-the-better metric the first record is "less"
 -- when its score is lower, for a the-lower-the-better metric when its score
 -- is higher.
 lessByMetric :: Metric -> (LineRecord -> LineRecord -> Bool)
 lessByMetric metric =
   case getMetricOrdering metric of
     TheHigherTheBetter -> compareScores (<)
     TheLowerTheBetter  -> compareScores (>)
   where compareScores cmp (LineRecord _ _ _ _ scoreA) (LineRecord _ _ _ _ scoreB) =
           cmp scoreA scoreB
 runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
 runLineByLineGeneralized ordering spec consum = do
  (inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -46,6 +46,11 @@ optionsParser = GEvalOptions
                   <> short 'l'
                   <> help "Give scores for each line rather than the whole test set" ))
                 <|>
                 (flag' WorstFeatures
                 ( long "worst-features"
                   <> short 'w'
                   <> help "Print a ranking of worst features, i.e. features that worsen the score significantly" ))
                 <|>
                 (Diff <$> strOption
                    ( long "diff"
                      <> short 'd'
@ -194,6 +199,9 @@ runGEval''' (Just PrintVersion) _ _ = do
 runGEval''' (Just LineByLine) ordering spec = do
  runLineByLine ordering spec
  return Nothing
 runGEval''' (Just WorstFeatures) ordering spec = do
  runWorstFeatures ordering spec
  return Nothing
 runGEval''' (Just (Diff otherOut)) ordering spec = do
  runDiff ordering otherOut spec
  return Nothing