Add Bootstrap facilities

This commit is contained in:
Filip Gralinski 2020-01-25 22:05:11 +01:00
parent bfcd5aa631
commit deb14c6702
3 changed files with 42 additions and 0 deletions

View File

@ -51,6 +51,7 @@ library
, Data.Statistics.Calibration , Data.Statistics.Calibration
, Data.CartesianStrings , Data.CartesianStrings
, Data.SplitIntoCrossTabs , Data.SplitIntoCrossTabs
, Data.Conduit.Bootstrap
, Paths_geval , Paths_geval
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond
@ -105,6 +106,7 @@ library
, utf8-string , utf8-string
, singletons , singletons
, ordered-containers , ordered-containers
, random
default-language: Haskell2010 default-language: Haskell2010
executable geval executable geval

View File

@ -0,0 +1,28 @@
{-# LANGUAGE ScopedTypeVariables #-}
-- Bootstrap re-sampling
module Data.Conduit.Bootstrap
(bootstrapC)
where
import Data.Conduit
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Combinators as CC
import Control.Monad.Trans.Resource
import Data.Vector.Unboxed
import qualified Data.Vector.Generic as VG
import Debug.Trace
import System.Random (RandomGen, mkStdGen, randomRs)
-- | Run a sink over bootstrap re-samples of the incoming stream.
--
-- The whole upstream is first collected into memory (via 'CC.sinkList') and
-- stored in an unboxed vector. Then, for every @i@ in @[1..numberOfSamples]@,
-- a re-sample of the same length as the input is drawn with replacement and
-- fed to @final@; the results are returned in that order.
--
-- The random generator for sample @i@ is @mkStdGen i@, so the re-samples
-- (and hence the results) are reproducible from run to run.
--
-- A @numberOfSamples@ of zero or less yields an empty list; the upstream is
-- still consumed in that case.
bootstrapC :: (Unbox c, Monad m) => Int -> ConduitT c Void (ResourceT m) f -> ConduitT c Void (ResourceT m) [f]
bootstrapC numberOfSamples final = do
  l <- CC.sinkList
  let v = fromList l
  -- Each iteration builds an independent source from the buffered vector and
  -- fuses it with @final@; the fused conduit does not read from upstream.
  Prelude.mapM (\i -> CC.yieldMany (resampleVector (mkStdGen i) v) .| final) [1..numberOfSamples]
-- | Draw a re-sample with replacement from a vector.
--
-- Produces a list with as many elements as the vector has, each picked at an
-- index generated by 'randomRs' in the inclusive range @(0, n - 1)@ using the
-- given generator. For an empty vector the result is the empty list (no
-- index is ever taken, because zero indices are requested).
resampleVector :: (RandomGen g, VG.Vector v a) => g -> v a -> [a]
resampleVector gen v = Prelude.map (v VG.!) $ Prelude.take n $ randomRs (0, n - 1) gen
  where
    n = VG.length v

View File

@ -1,5 +1,6 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
import Test.Hspec import Test.Hspec
@ -29,6 +30,7 @@ import GEval.FeatureExtractor
import GEval.Selector import GEval.Selector
import GEval.CreateChallenge import GEval.CreateChallenge
import GEval.Validation import GEval.Validation
import Data.Conduit.Bootstrap
import Data.Map.Strict import Data.Map.Strict
import Data.Conduit.List (consume) import Data.Conduit.List (consume)
@ -539,6 +541,16 @@ main = hspec $ do
(1.5, 3.0), (1.5, 3.0),
(3.0, 2.0), (3.0, 2.0),
(4.0, 1.0)] (4.0, 1.0)]
describe "bootstrap conduit" $ do
it "sanity test" $ do
let nbOfSamples = 1000
let listChecked :: [Int] = [0..10]
(runResourceT $ runConduit (CL.sourceList listChecked .| CC.product)) `shouldReturn` 0
results <- runResourceT $ runConduit (CL.sourceList listChecked .| bootstrapC nbOfSamples CC.product)
Prelude.length results `shouldBe` nbOfSamples
(Prelude.length (Prelude.filter (> 0) results)) `shouldNotBe` 0
describe "tokenizer" $ do describe "tokenizer" $ do
it "simple utterance with '13a' tokenizer" $ do it "simple utterance with '13a' tokenizer" $ do
tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe` tokenize (Just V13a) "To be or not to be, that's the question." `shouldBe`