automatic decompression
This commit is contained in:
parent
bab4f7d94c
commit
438f013914
@ -27,6 +27,7 @@ library
|
||||
, GEval.CharMatch
|
||||
, GEval.LineByLine
|
||||
, GEval.BIO
|
||||
, Data.Conduit.AutoDecompress
|
||||
build-depends: base >= 4.7 && < 5
|
||||
, cond
|
||||
, conduit
|
||||
@ -49,6 +50,12 @@ library
|
||||
, vector
|
||||
, mtl
|
||||
, edit-distance
|
||||
, bytestring
|
||||
, word8
|
||||
, primitive
|
||||
, transformers-base
|
||||
, bzlib-conduit
|
||||
, lzma-conduit
|
||||
default-language: Haskell2010
|
||||
|
||||
executable geval
|
||||
|
44
src/Data/Conduit/AutoDecompress.hs
Normal file
44
src/Data/Conduit/AutoDecompress.hs
Normal file
@ -0,0 +1,44 @@
|
||||
{-# LANGUAGE AllowAmbiguousTypes #-}
|
||||
|
||||
module Data.Conduit.AutoDecompress
|
||||
(autoDecompress)
|
||||
where
|
||||
|
||||
import Data.Conduit
|
||||
import Data.Conduit.Combinators
|
||||
import Data.ByteString
|
||||
import Data.Conduit.Zlib
|
||||
import Data.Word8
|
||||
import Control.Monad.Trans.Resource (MonadThrow, MonadResource)
|
||||
import Control.Monad.Primitive (PrimMonad)
|
||||
import Control.Monad.Base (MonadBase)
|
||||
import qualified Data.Conduit.Lzma as XZ
|
||||
import qualified Data.Conduit.BZlib as BZ
|
||||
|
||||
autoDecompress :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => ConduitM ByteString ByteString m ()
|
||||
autoDecompress = do
|
||||
f <- await
|
||||
case f of
|
||||
Just chunk -> if Data.ByteString.length chunk > 1
|
||||
then
|
||||
do
|
||||
let firstByte = Data.ByteString.head chunk
|
||||
let secondByte = Data.ByteString.index chunk 1
|
||||
leftover chunk
|
||||
lookAtMagicNumbers (firstByte, secondByte)
|
||||
else
|
||||
do
|
||||
leftover chunk
|
||||
doNothing
|
||||
|
||||
Nothing -> return ()
|
||||
|
||||
|
||||
lookAtMagicNumbers :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => (Word8, Word8) -> Conduit ByteString m ByteString
|
||||
lookAtMagicNumbers (31, 139) = ungzip
|
||||
lookAtMagicNumbers (66, 90) = BZ.bunzip2
|
||||
lookAtMagicNumbers (253, 55) = XZ.decompress Nothing
|
||||
lookAtMagicNumbers _ = doNothing
|
||||
|
||||
doNothing :: Monad m => Conduit ByteString m ByteString
|
||||
doNothing = Data.Conduit.Combinators.filter (const True)
|
@ -57,7 +57,7 @@ import Data.Tuple
|
||||
import qualified Data.List.Split as DLS
|
||||
|
||||
import Control.Monad.IO.Class
|
||||
import Control.Monad ((<=<))
|
||||
import Control.Monad ((<=<), filterM)
|
||||
|
||||
import Data.Attoparsec.Text (parseOnly)
|
||||
|
||||
@ -69,6 +69,7 @@ import GEval.ClusteringMetrics
|
||||
import GEval.LogLossHashed
|
||||
import GEval.CharMatch
|
||||
import GEval.BIO
|
||||
import Data.Conduit.AutoDecompress
|
||||
|
||||
import qualified Data.HashMap.Strict as M
|
||||
|
||||
@ -239,11 +240,14 @@ checkAndGetFiles gevalSpec = do
|
||||
unlessM (D.doesDirectoryExist expectedDirectory) $ throwM $ NoExpectedDirectory expectedDirectory
|
||||
unlessM (D.doesDirectoryExist outTestDirectory) $ throwM $ NoOutTestDirectory outTestDirectory
|
||||
unlessM (D.doesDirectoryExist expectedTestDirectory) $ throwM $ NoExpectedTestDirectory expectedTestDirectory
|
||||
inputFilePath <- lookForCompressedFiles inputFilePath'
|
||||
expectedFilePath <- lookForCompressedFiles expectedFilePath'
|
||||
outFilePath <- lookForCompressedFiles outFilePath'
|
||||
checkInputFileIfNeeded metric inputFilePath
|
||||
return (inputFilePath, expectedFilePath, outFilePath)
|
||||
where expectedFilePath = expectedTestDirectory </> (gesExpectedFile gevalSpec)
|
||||
outFilePath = getOutFile gevalSpec (gesOutFile gevalSpec)
|
||||
inputFilePath = expectedTestDirectory </> (gesInputFile gevalSpec)
|
||||
where expectedFilePath' = expectedTestDirectory </> (gesExpectedFile gevalSpec)
|
||||
outFilePath' = getOutFile gevalSpec (gesOutFile gevalSpec)
|
||||
inputFilePath' = expectedTestDirectory </> (gesInputFile gevalSpec)
|
||||
expectedTestDirectory = expectedDirectory </> testName
|
||||
outTestDirectory = outDirectory </> testName
|
||||
expectedDirectory = getExpectedDirectory gevalSpec
|
||||
@ -251,6 +255,24 @@ checkAndGetFiles gevalSpec = do
|
||||
testName = gesTestName gevalSpec
|
||||
metric = gesMetric gevalSpec
|
||||
|
||||
lookForCompressedFiles :: FilePath -> IO FilePath
|
||||
lookForCompressedFiles = lookForAlternativeFiles [".gz", ".xz", ".bz2"]
|
||||
|
||||
lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath
|
||||
lookForAlternativeFiles suffixes filePath
|
||||
| takeExtension filePath `Prelude.elem` suffixes = return filePath
|
||||
| otherwise = do
|
||||
fileIsThere <- D.doesFileExist filePath
|
||||
if fileIsThere
|
||||
then
|
||||
return filePath
|
||||
else
|
||||
do
|
||||
found <- Control.Monad.filterM D.doesFileExist $ Prelude.map (filePath <.>) suffixes
|
||||
return $ case found of
|
||||
[fp] -> fp
|
||||
_ -> filePath
|
||||
|
||||
getOutFile :: GEvalSpecification -> FilePath -> FilePath
|
||||
getOutFile gevalSpec out = outDirectory </> testName </> out
|
||||
where outDirectory = gesOutDirectory gevalSpec
|
||||
@ -264,7 +286,7 @@ checkInputFileIfNeeded _ _ = return ()
|
||||
|
||||
fileAsLineSource :: FilePath -> LineSource (ResourceT IO)
|
||||
fileAsLineSource filePath =
|
||||
LineSource (CB.sourceFile filePath $= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
|
||||
LineSource (CB.sourceFile filePath $= autoDecompress =$= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
|
||||
|
||||
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
|
||||
gevalCoreOnSingleLines metric inpLine expLine outLine =
|
||||
|
@ -252,6 +252,10 @@ main = hspec $ do
|
||||
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
|
||||
it "check perfect score" $ do
|
||||
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
|
||||
describe "automatic decompression" $ do
|
||||
it "more complex test" $ do
|
||||
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
|
||||
|
||||
|
||||
neverMatch :: Char -> Int -> Bool
|
||||
neverMatch _ _ = False
|
||||
|
Binary file not shown.
@ -0,0 +1 @@
|
||||
--metric CharMatch
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user