automatic decompression

This commit is contained in:
Filip Gralinski 2018-05-17 08:26:57 +02:00
parent bab4f7d94c
commit 438f013914
8 changed files with 83 additions and 5 deletions

View File

@ -27,6 +27,7 @@ library
, GEval.CharMatch
, GEval.LineByLine
, GEval.BIO
, Data.Conduit.AutoDecompress
build-depends: base >= 4.7 && < 5
, cond
, conduit
@ -49,6 +50,12 @@ library
, vector
, mtl
, edit-distance
, bytestring
, word8
, primitive
, transformers-base
, bzlib-conduit
, lzma-conduit
default-language: Haskell2010
executable geval

View File

@ -0,0 +1,44 @@
{-# LANGUAGE AllowAmbiguousTypes #-}
module Data.Conduit.AutoDecompress
(autoDecompress)
where
import Data.Conduit
import Data.Conduit.Combinators
import Data.ByteString
import Data.Conduit.Zlib
import Data.Word8
import Control.Monad.Trans.Resource (MonadThrow, MonadResource)
import Control.Monad.Primitive (PrimMonad)
import Control.Monad.Base (MonadBase)
import qualified Data.Conduit.Lzma as XZ
import qualified Data.Conduit.BZlib as BZ
autoDecompress :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => ConduitM ByteString ByteString m ()
autoDecompress = do
f <- await
case f of
Just chunk -> if Data.ByteString.length chunk > 1
then
do
let firstByte = Data.ByteString.head chunk
let secondByte = Data.ByteString.index chunk 1
leftover chunk
lookAtMagicNumbers (firstByte, secondByte)
else
do
leftover chunk
doNothing
Nothing -> return ()
lookAtMagicNumbers :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => (Word8, Word8) -> Conduit ByteString m ByteString
lookAtMagicNumbers (31, 139) = ungzip
lookAtMagicNumbers (66, 90) = BZ.bunzip2
lookAtMagicNumbers (253, 55) = XZ.decompress Nothing
lookAtMagicNumbers _ = doNothing
doNothing :: Monad m => Conduit ByteString m ByteString
doNothing = Data.Conduit.Combinators.filter (const True)

View File

@ -57,7 +57,7 @@ import Data.Tuple
import qualified Data.List.Split as DLS
import Control.Monad.IO.Class
import Control.Monad ((<=<))
import Control.Monad ((<=<), filterM)
import Data.Attoparsec.Text (parseOnly)
@ -69,6 +69,7 @@ import GEval.ClusteringMetrics
import GEval.LogLossHashed
import GEval.CharMatch
import GEval.BIO
import Data.Conduit.AutoDecompress
import qualified Data.HashMap.Strict as M
@ -239,11 +240,14 @@ checkAndGetFiles gevalSpec = do
unlessM (D.doesDirectoryExist expectedDirectory) $ throwM $ NoExpectedDirectory expectedDirectory
unlessM (D.doesDirectoryExist outTestDirectory) $ throwM $ NoOutTestDirectory outTestDirectory
unlessM (D.doesDirectoryExist expectedTestDirectory) $ throwM $ NoExpectedTestDirectory expectedTestDirectory
inputFilePath <- lookForCompressedFiles inputFilePath'
expectedFilePath <- lookForCompressedFiles expectedFilePath'
outFilePath <- lookForCompressedFiles outFilePath'
checkInputFileIfNeeded metric inputFilePath
return (inputFilePath, expectedFilePath, outFilePath)
where expectedFilePath = expectedTestDirectory </> (gesExpectedFile gevalSpec)
outFilePath = getOutFile gevalSpec (gesOutFile gevalSpec)
inputFilePath = expectedTestDirectory </> (gesInputFile gevalSpec)
where expectedFilePath' = expectedTestDirectory </> (gesExpectedFile gevalSpec)
outFilePath' = getOutFile gevalSpec (gesOutFile gevalSpec)
inputFilePath' = expectedTestDirectory </> (gesInputFile gevalSpec)
expectedTestDirectory = expectedDirectory </> testName
outTestDirectory = outDirectory </> testName
expectedDirectory = getExpectedDirectory gevalSpec
@ -251,6 +255,24 @@ checkAndGetFiles gevalSpec = do
testName = gesTestName gevalSpec
metric = gesMetric gevalSpec
lookForCompressedFiles :: FilePath -> IO FilePath
lookForCompressedFiles = lookForAlternativeFiles [".gz", ".xz", ".bz2"]
lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath
lookForAlternativeFiles suffixes filePath
| takeExtension filePath `Prelude.elem` suffixes = return filePath
| otherwise = do
fileIsThere <- D.doesFileExist filePath
if fileIsThere
then
return filePath
else
do
found <- Control.Monad.filterM D.doesFileExist $ Prelude.map (filePath <.>) suffixes
return $ case found of
[fp] -> fp
_ -> filePath
getOutFile :: GEvalSpecification -> FilePath -> FilePath
getOutFile gevalSpec out = outDirectory </> testName </> out
where outDirectory = gesOutDirectory gevalSpec
@ -264,7 +286,7 @@ checkInputFileIfNeeded _ _ = return ()
fileAsLineSource :: FilePath -> LineSource (ResourceT IO)
fileAsLineSource filePath =
LineSource (CB.sourceFile filePath $= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
LineSource (CB.sourceFile filePath $= autoDecompress =$= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric inpLine expLine outLine =

View File

@ -252,6 +252,10 @@ main = hspec $ do
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
it "check perfect score" $ do
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
describe "automatic decompression" $ do
it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
neverMatch :: Char -> Int -> Bool
neverMatch _ _ = False

View File

@ -0,0 +1 @@
--metric CharMatch