automatic decompression
This commit is contained in:
parent
bab4f7d94c
commit
438f013914
@ -27,6 +27,7 @@ library
|
|||||||
, GEval.CharMatch
|
, GEval.CharMatch
|
||||||
, GEval.LineByLine
|
, GEval.LineByLine
|
||||||
, GEval.BIO
|
, GEval.BIO
|
||||||
|
, Data.Conduit.AutoDecompress
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
, conduit
|
, conduit
|
||||||
@ -49,6 +50,12 @@ library
|
|||||||
, vector
|
, vector
|
||||||
, mtl
|
, mtl
|
||||||
, edit-distance
|
, edit-distance
|
||||||
|
, bytestring
|
||||||
|
, word8
|
||||||
|
, primitive
|
||||||
|
, transformers-base
|
||||||
|
, bzlib-conduit
|
||||||
|
, lzma-conduit
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
executable geval
|
executable geval
|
||||||
|
44
src/Data/Conduit/AutoDecompress.hs
Normal file
44
src/Data/Conduit/AutoDecompress.hs
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
{-# LANGUAGE AllowAmbiguousTypes #-}
|
||||||
|
|
||||||
|
module Data.Conduit.AutoDecompress
|
||||||
|
(autoDecompress)
|
||||||
|
where
|
||||||
|
|
||||||
|
import Data.Conduit
|
||||||
|
import Data.Conduit.Combinators
|
||||||
|
import Data.ByteString
|
||||||
|
import Data.Conduit.Zlib
|
||||||
|
import Data.Word8
|
||||||
|
import Control.Monad.Trans.Resource (MonadThrow, MonadResource)
|
||||||
|
import Control.Monad.Primitive (PrimMonad)
|
||||||
|
import Control.Monad.Base (MonadBase)
|
||||||
|
import qualified Data.Conduit.Lzma as XZ
|
||||||
|
import qualified Data.Conduit.BZlib as BZ
|
||||||
|
|
||||||
|
autoDecompress :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => ConduitM ByteString ByteString m ()
|
||||||
|
autoDecompress = do
|
||||||
|
f <- await
|
||||||
|
case f of
|
||||||
|
Just chunk -> if Data.ByteString.length chunk > 1
|
||||||
|
then
|
||||||
|
do
|
||||||
|
let firstByte = Data.ByteString.head chunk
|
||||||
|
let secondByte = Data.ByteString.index chunk 1
|
||||||
|
leftover chunk
|
||||||
|
lookAtMagicNumbers (firstByte, secondByte)
|
||||||
|
else
|
||||||
|
do
|
||||||
|
leftover chunk
|
||||||
|
doNothing
|
||||||
|
|
||||||
|
Nothing -> return ()
|
||||||
|
|
||||||
|
|
||||||
|
lookAtMagicNumbers :: (MonadResource m, MonadBase base m, MonadThrow m, PrimMonad base) => (Word8, Word8) -> Conduit ByteString m ByteString
|
||||||
|
lookAtMagicNumbers (31, 139) = ungzip
|
||||||
|
lookAtMagicNumbers (66, 90) = BZ.bunzip2
|
||||||
|
lookAtMagicNumbers (253, 55) = XZ.decompress Nothing
|
||||||
|
lookAtMagicNumbers _ = doNothing
|
||||||
|
|
||||||
|
doNothing :: Monad m => Conduit ByteString m ByteString
|
||||||
|
doNothing = Data.Conduit.Combinators.filter (const True)
|
@ -57,7 +57,7 @@ import Data.Tuple
|
|||||||
import qualified Data.List.Split as DLS
|
import qualified Data.List.Split as DLS
|
||||||
|
|
||||||
import Control.Monad.IO.Class
|
import Control.Monad.IO.Class
|
||||||
import Control.Monad ((<=<))
|
import Control.Monad ((<=<), filterM)
|
||||||
|
|
||||||
import Data.Attoparsec.Text (parseOnly)
|
import Data.Attoparsec.Text (parseOnly)
|
||||||
|
|
||||||
@ -69,6 +69,7 @@ import GEval.ClusteringMetrics
|
|||||||
import GEval.LogLossHashed
|
import GEval.LogLossHashed
|
||||||
import GEval.CharMatch
|
import GEval.CharMatch
|
||||||
import GEval.BIO
|
import GEval.BIO
|
||||||
|
import Data.Conduit.AutoDecompress
|
||||||
|
|
||||||
import qualified Data.HashMap.Strict as M
|
import qualified Data.HashMap.Strict as M
|
||||||
|
|
||||||
@ -239,11 +240,14 @@ checkAndGetFiles gevalSpec = do
|
|||||||
unlessM (D.doesDirectoryExist expectedDirectory) $ throwM $ NoExpectedDirectory expectedDirectory
|
unlessM (D.doesDirectoryExist expectedDirectory) $ throwM $ NoExpectedDirectory expectedDirectory
|
||||||
unlessM (D.doesDirectoryExist outTestDirectory) $ throwM $ NoOutTestDirectory outTestDirectory
|
unlessM (D.doesDirectoryExist outTestDirectory) $ throwM $ NoOutTestDirectory outTestDirectory
|
||||||
unlessM (D.doesDirectoryExist expectedTestDirectory) $ throwM $ NoExpectedTestDirectory expectedTestDirectory
|
unlessM (D.doesDirectoryExist expectedTestDirectory) $ throwM $ NoExpectedTestDirectory expectedTestDirectory
|
||||||
|
inputFilePath <- lookForCompressedFiles inputFilePath'
|
||||||
|
expectedFilePath <- lookForCompressedFiles expectedFilePath'
|
||||||
|
outFilePath <- lookForCompressedFiles outFilePath'
|
||||||
checkInputFileIfNeeded metric inputFilePath
|
checkInputFileIfNeeded metric inputFilePath
|
||||||
return (inputFilePath, expectedFilePath, outFilePath)
|
return (inputFilePath, expectedFilePath, outFilePath)
|
||||||
where expectedFilePath = expectedTestDirectory </> (gesExpectedFile gevalSpec)
|
where expectedFilePath' = expectedTestDirectory </> (gesExpectedFile gevalSpec)
|
||||||
outFilePath = getOutFile gevalSpec (gesOutFile gevalSpec)
|
outFilePath' = getOutFile gevalSpec (gesOutFile gevalSpec)
|
||||||
inputFilePath = expectedTestDirectory </> (gesInputFile gevalSpec)
|
inputFilePath' = expectedTestDirectory </> (gesInputFile gevalSpec)
|
||||||
expectedTestDirectory = expectedDirectory </> testName
|
expectedTestDirectory = expectedDirectory </> testName
|
||||||
outTestDirectory = outDirectory </> testName
|
outTestDirectory = outDirectory </> testName
|
||||||
expectedDirectory = getExpectedDirectory gevalSpec
|
expectedDirectory = getExpectedDirectory gevalSpec
|
||||||
@ -251,6 +255,24 @@ checkAndGetFiles gevalSpec = do
|
|||||||
testName = gesTestName gevalSpec
|
testName = gesTestName gevalSpec
|
||||||
metric = gesMetric gevalSpec
|
metric = gesMetric gevalSpec
|
||||||
|
|
||||||
|
lookForCompressedFiles :: FilePath -> IO FilePath
|
||||||
|
lookForCompressedFiles = lookForAlternativeFiles [".gz", ".xz", ".bz2"]
|
||||||
|
|
||||||
|
lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath
|
||||||
|
lookForAlternativeFiles suffixes filePath
|
||||||
|
| takeExtension filePath `Prelude.elem` suffixes = return filePath
|
||||||
|
| otherwise = do
|
||||||
|
fileIsThere <- D.doesFileExist filePath
|
||||||
|
if fileIsThere
|
||||||
|
then
|
||||||
|
return filePath
|
||||||
|
else
|
||||||
|
do
|
||||||
|
found <- Control.Monad.filterM D.doesFileExist $ Prelude.map (filePath <.>) suffixes
|
||||||
|
return $ case found of
|
||||||
|
[fp] -> fp
|
||||||
|
_ -> filePath
|
||||||
|
|
||||||
getOutFile :: GEvalSpecification -> FilePath -> FilePath
|
getOutFile :: GEvalSpecification -> FilePath -> FilePath
|
||||||
getOutFile gevalSpec out = outDirectory </> testName </> out
|
getOutFile gevalSpec out = outDirectory </> testName </> out
|
||||||
where outDirectory = gesOutDirectory gevalSpec
|
where outDirectory = gesOutDirectory gevalSpec
|
||||||
@ -264,7 +286,7 @@ checkInputFileIfNeeded _ _ = return ()
|
|||||||
|
|
||||||
fileAsLineSource :: FilePath -> LineSource (ResourceT IO)
|
fileAsLineSource :: FilePath -> LineSource (ResourceT IO)
|
||||||
fileAsLineSource filePath =
|
fileAsLineSource filePath =
|
||||||
LineSource (CB.sourceFile filePath $= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
|
LineSource (CB.sourceFile filePath $= autoDecompress =$= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
|
||||||
|
|
||||||
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
|
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
|
||||||
gevalCoreOnSingleLines metric inpLine expLine outLine =
|
gevalCoreOnSingleLines metric inpLine expLine outLine =
|
||||||
|
@ -252,6 +252,10 @@ main = hspec $ do
|
|||||||
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
|
runGEvalTest "bio-f1-simple" `shouldReturnAlmost` 0.5
|
||||||
it "check perfect score" $ do
|
it "check perfect score" $ do
|
||||||
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
|
runGEvalTest "bio-f1-perfect" `shouldReturnAlmost` 1.0
|
||||||
|
describe "automatic decompression" $ do
|
||||||
|
it "more complex test" $ do
|
||||||
|
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
|
||||||
|
|
||||||
|
|
||||||
neverMatch :: Char -> Int -> Bool
|
neverMatch :: Char -> Int -> Bool
|
||||||
neverMatch _ _ = False
|
neverMatch _ _ = False
|
||||||
|
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
--metric CharMatch
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user