automatic decompression

This commit is contained in:
Filip Gralinski 2018-05-17 08:26:57 +02:00
parent bab4f7d94c
commit 438f013914
8 changed files with 83 additions and 5 deletions

View File

@ -27,6 +27,7 @@ library
, GEval.CharMatch , GEval.CharMatch
, GEval.LineByLine , GEval.LineByLine
, GEval.BIO , GEval.BIO
, Data.Conduit.AutoDecompress
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond
, conduit , conduit
@ -49,6 +50,12 @@ library
, vector , vector
, mtl , mtl
, edit-distance , edit-distance
, bytestring
, word8
, primitive
, transformers-base
, bzlib-conduit
, lzma-conduit
default-language: Haskell2010 default-language: Haskell2010
executable geval executable geval

View File

@ -0,0 +1,44 @@
{-# LANGUAGE AllowAmbiguousTypes #-}
module Data.Conduit.AutoDecompress
(autoDecompress)
where
import Data.Conduit
import Data.Conduit.Combinators
import Data.ByteString
import Data.Conduit.Zlib
import Data.Word8
import Control.Monad.Trans.Resource (MonadThrow, MonadResource)
import Control.Monad.Primitive (PrimMonad)
import Control.Monad.Base (MonadBase)
import qualified Data.Conduit.Lzma as XZ
import qualified Data.Conduit.BZlib as BZ
autoDecompress :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => ConduitM ByteString ByteString m ()
autoDecompress = do
f <- await
case f of
Just chunk -> if Data.ByteString.length chunk > 1
then
do
let firstByte = Data.ByteString.head chunk
let secondByte = Data.ByteString.index chunk 1
leftover chunk
lookAtMagicNumbers (firstByte, secondByte)
else
do
leftover chunk
doNothing
Nothing -> return ()
lookAtMagicNumbers :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => (Word8, Word8) -> Conduit ByteString m ByteString
lookAtMagicNumbers (31, 139) = ungzip
lookAtMagicNumbers (66, 90) = BZ.bunzip2
lookAtMagicNumbers (253, 55) = XZ.decompress Nothing
lookAtMagicNumbers _ = doNothing
doNothing :: Monad m => Conduit ByteString m ByteString
doNothing = Data.Conduit.Combinators.filter (const True)

View File

@ -57,7 +57,7 @@ import Data.Tuple
import qualified Data.List.Split as DLS import qualified Data.List.Split as DLS
import Control.Monad.IO.Class import Control.Monad.IO.Class
import Control.Monad ((<=<)) import Control.Monad ((<=<), filterM)
import Data.Attoparsec.Text (parseOnly) import Data.Attoparsec.Text (parseOnly)
@ -69,6 +69,7 @@ import GEval.ClusteringMetrics
import GEval.LogLossHashed import GEval.LogLossHashed
import GEval.CharMatch import GEval.CharMatch
import GEval.BIO import GEval.BIO
import Data.Conduit.AutoDecompress
import qualified Data.HashMap.Strict as M import qualified Data.HashMap.Strict as M
@ -239,11 +240,14 @@ checkAndGetFiles gevalSpec = do
unlessM (D.doesDirectoryExist expectedDirectory) $ throwM $ NoExpectedDirectory expectedDirectory unlessM (D.doesDirectoryExist expectedDirectory) $ throwM $ NoExpectedDirectory expectedDirectory
unlessM (D.doesDirectoryExist outTestDirectory) $ throwM $ NoOutTestDirectory outTestDirectory unlessM (D.doesDirectoryExist outTestDirectory) $ throwM $ NoOutTestDirectory outTestDirectory
unlessM (D.doesDirectoryExist expectedTestDirectory) $ throwM $ NoExpectedTestDirectory expectedTestDirectory unlessM (D.doesDirectoryExist expectedTestDirectory) $ throwM $ NoExpectedTestDirectory expectedTestDirectory
inputFilePath <- lookForCompressedFiles inputFilePath'
expectedFilePath <- lookForCompressedFiles expectedFilePath'
outFilePath <- lookForCompressedFiles outFilePath'
checkInputFileIfNeeded metric inputFilePath checkInputFileIfNeeded metric inputFilePath
return (inputFilePath, expectedFilePath, outFilePath) return (inputFilePath, expectedFilePath, outFilePath)
where expectedFilePath = expectedTestDirectory </> (gesExpectedFile gevalSpec) where expectedFilePath' = expectedTestDirectory </> (gesExpectedFile gevalSpec)
outFilePath = getOutFile gevalSpec (gesOutFile gevalSpec) outFilePath' = getOutFile gevalSpec (gesOutFile gevalSpec)
inputFilePath = expectedTestDirectory </> (gesInputFile gevalSpec) inputFilePath' = expectedTestDirectory </> (gesInputFile gevalSpec)
expectedTestDirectory = expectedDirectory </> testName expectedTestDirectory = expectedDirectory </> testName
outTestDirectory = outDirectory </> testName outTestDirectory = outDirectory </> testName
expectedDirectory = getExpectedDirectory gevalSpec expectedDirectory = getExpectedDirectory gevalSpec
@ -251,6 +255,24 @@ checkAndGetFiles gevalSpec = do
testName = gesTestName gevalSpec testName = gesTestName gevalSpec
metric = gesMetric gevalSpec metric = gesMetric gevalSpec
lookForCompressedFiles :: FilePath -> IO FilePath
lookForCompressedFiles = lookForAlternativeFiles [".gz", ".xz", ".bz2"]
lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath
lookForAlternativeFiles suffixes filePath
| takeExtension filePath `Prelude.elem` suffixes = return filePath
| otherwise = do
fileIsThere <- D.doesFileExist filePath
if fileIsThere
then
return filePath
else
do
found <- Control.Monad.filterM D.doesFileExist $ Prelude.map (filePath <.>) suffixes
return $ case found of
[fp] -> fp
_ -> filePath
getOutFile :: GEvalSpecification -> FilePath -> FilePath getOutFile :: GEvalSpecification -> FilePath -> FilePath
getOutFile gevalSpec out = outDirectory </> testName </> out getOutFile gevalSpec out = outDirectory </> testName </> out
where outDirectory = gesOutDirectory gevalSpec where outDirectory = gesOutDirectory gevalSpec
@ -264,7 +286,7 @@ checkInputFileIfNeeded _ _ = return ()
fileAsLineSource :: FilePath -> LineSource (ResourceT IO) fileAsLineSource :: FilePath -> LineSource (ResourceT IO)
fileAsLineSource filePath = fileAsLineSource filePath =
LineSource (CB.sourceFile filePath $= CT.decodeUtf8Lenient =$= CT.lines) filePath 1 LineSource (CB.sourceFile filePath $= autoDecompress =$= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue) gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric inpLine expLine outLine = gevalCoreOnSingleLines metric inpLine expLine outLine =

View File

@ -252,6 +252,10 @@ main = hspec $ do
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5 runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
it "check perfect score" $ do it "check perfect score" $ do
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0 runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
describe "automatic decompression" $ do
it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
neverMatch :: Char -> Int -> Bool neverMatch :: Char -> Int -> Bool
neverMatch _ _ = False neverMatch _ _ = False

View File

@ -0,0 +1 @@
--metric CharMatch