This commit is contained in:
Filip Gralinski 2018-05-12 10:53:21 +02:00
parent cea084c789
commit bdcd26cddc
5 changed files with 157 additions and 1 deletions

View File

@ -26,6 +26,7 @@ library
, GEval.LogLossHashed
, GEval.CharMatch
, GEval.LineByLine
, Data.Conduit.SmartSource
build-depends: base >= 4.7 && < 5
, cond
, conduit
@ -48,6 +49,9 @@ library
, vector
, mtl
, edit-distance
, bytestring
, http-conduit
, transformers
default-language: Haskell2010
executable geval
@ -72,6 +76,9 @@ test-suite geval-test
, text
, attoparsec
, edit-distance
, resourcet
, conduit
, conduit-extra
ghc-options: -threaded -rtsopts -with-rtsopts=-N
default-language: Haskell2010

View File

@ -0,0 +1,108 @@
{-# LANGUAGE RankNTypes #-}
module Data.Conduit.SmartSource
where
import Data.List (isInfixOf, isPrefixOf, isSuffixOf, elemIndex, elem)
import Data.Char (ord)
import System.FilePath
import Control.Monad.Trans.Resource (MonadResource)
import Data.Conduit
import qualified Data.ByteString as S
import Data.Conduit.Binary (sourceFile)
import Network.HTTP.Conduit
import Control.Monad.IO.Class
import Control.Monad.Trans.Class (lift)
data SmartSpec = NoSpec
| Stdin
| FileNameSpec FilePath
| FilePathSpec FilePath
| Http String
| Https String
| GitSpec String (Maybe FilePath)
| PossiblyGitSpec String
deriving (Eq, Show)
--smartSource :: (MonadIO m, MonadResource m) => [FilePath] -> Maybe FilePath -> SmartSpec -> Producer m S.ByteString
smartSource defaultDirs defaultFile spec = pureSmartSource defaultDirs spec
--pureSmartSource :: (MonadIO m, MonadResource m) => [FilePath] -> SmartSpec -> Producer m S.ByteString
pureSmartSource _ NoSpec = error "No source specification given"
pureSmartSource _ (FileNameSpec fileName) = sourceFile fileName
pureSmartSource _ (FilePathSpec fileName) = sourceFile fileName
pureSmartSource [] (PossiblyGitSpec spec) = sourceFile spec
pureSmartSource (firstDir:_) (PossiblyGitSpec spec) = sourceFile (firstDir </> spec)
pureSmartSource _ (Https url) = httpSource url
pureSmartSource _ (Http url) = httpSource url
httpSource :: MonadResource m => String -> ConduitM () S.ByteString m ()
httpSource url = do
request <- liftIO $ parseRequest url
manager <- liftIO $ newManager tlsManagerSettings
response <- lift $ http request manager
(httpsource, finalizer) <- lift $ unwrapResumable (responseBody response)
httpsource
lift finalizer
parseSmartSpec :: FilePath -> SmartSpec
parseSmartSpec "" = NoSpec
parseSmartSpec "-" = Stdin
parseSmartSpec spec
| "http://" `isPrefixOf` spec = Http spec
| "https://" `isPrefixOf` spec = Https spec
| otherwise = case elemIndex ':' spec of
Just ix -> let ref = take ix spec in
if checkRefFormat ref
then
GitSpec ref (if ix == length spec - 1
then
Nothing
else
Just $ drop (ix+1) spec)
else
fileSpec
Nothing -> if checkRefFormat spec && not ('/' `elem` spec) && not ('.' `elem` spec)
then
PossiblyGitSpec spec
else
fileSpec
where fileSpec = (if '/' `elem` spec then FilePathSpec else FileNameSpec) spec
parseSmartSpecInContext :: [FilePath] -> Maybe FilePath -> String -> Maybe SmartSpec
parseSmartSpecInContext defaultDirs defaultFile spec = parseSmartSpecInContext' defaultDirs defaultFile $ parseSmartSpec spec
where parseSmartSpecInContext' _ Nothing NoSpec = Nothing
parseSmartSpecInContext' [] (Just defaultFile) NoSpec = Just $ FileNameSpec defaultFile
parseSmartSpecInContext' (firstDir:_) (Just defaultFile) NoSpec = Just $ FilePathSpec (firstDir </> defaultFile)
parseSmartSpecInContext' (firstDir:_) _ (FileNameSpec fileName) = Just $ FilePathSpec (firstDir </> fileName)
parseSmartSpecInContext' _ Nothing (GitSpec branch Nothing) = Nothing
parseSmartSpecInContext' [] (Just defaultFile) (GitSpec branch Nothing) = Just $ GitSpec branch $ Just defaultFile
parseSmartSpecInContext' (firstDir:_) (Just defaultFile) (GitSpec branch Nothing)
= Just $ GitSpec branch $ Just (firstDir </> defaultFile)
parseSmartSpecInContext' _ _ parsedSpec = Just parsedSpec
checkRefFormat :: String -> Bool
checkRefFormat ref =
not ("./" `isInfixOf` ref) &&
not (".lock" `isSuffixOf` ref) &&
not (".lock/" `isInfixOf` ref) &&
not (".." `isInfixOf` ref) &&
not (any isUnwantedChar ref) &&
not ("/" `isSuffixOf` ref) &&
not ("//" `isInfixOf` ref) &&
not ("@{" `isInfixOf` ref) &&
ref /= "@"
where isUnwantedChar ':' = True
isUnwantedChar '?' = True
isUnwantedChar '*' = True
isUnwantedChar '[' = True
isUnwantedChar '~' = True
isUnwantedChar '^' = True
isUnwantedChar '\\' = True
isUnwantedChar '\177' = True
isUnwantedChar c = ord c < 32

View File

@ -60,6 +60,8 @@ import Control.Monad ((<=<))
import Data.Attoparsec.Text (parseOnly)
import Data.Conduit.SmartSource
import GEval.BLEU
import GEval.Common
import GEval.ClippEU
@ -259,7 +261,7 @@ checkInputFileIfNeeded _ _ = return ()
fileAsLineSource :: FilePath -> LineSource (ResourceT IO)
fileAsLineSource filePath =
LineSource (CB.sourceFile filePath $= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
LineSource (smartSource [] Nothing (parseSmartSpec filePath) $= CT.decodeUtf8Lenient =$= CT.lines) filePath 1
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric inpLine expLine outLine =

View File

@ -15,6 +15,12 @@ import Text.EditDistance
import qualified Test.HUnit as HU
import Data.Conduit.SmartSource
import qualified Data.Conduit.Text as CT
import Data.Conduit
import Control.Monad.Trans.Resource
import qualified Data.Conduit.List as CL
informationRetrievalBookExample :: [(String, Int)]
informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
("x", 1), ("o", 1), ("x", 1), ( "x", 1), ("x", 1), ("x", 1),
@ -187,7 +193,39 @@ main = hspec $ do
gevalCoreOnSingleLines RMSE (LineInFile "stub1" 1 "blabla")
(LineInFile "stub2" 1 "3.4")
(LineInFile "stub3" 1 "2.6") `shouldReturnAlmost` 0.8
describe "smart sources" $ do
it "smart specs are parsed" $ do
parseSmartSpec "" `shouldBe` NoSpec
parseSmartSpec "-" `shouldBe` Stdin
parseSmartSpec "http://gonito.net/foo" `shouldBe` Http "http://gonito.net/foo"
parseSmartSpec "https://gonito.net" `shouldBe` Https "https://gonito.net"
parseSmartSpec "branch:" `shouldBe` GitSpec "branch" Nothing
parseSmartSpec "37be:foo/bar.tsv" `shouldBe` GitSpec "37be" (Just "foo/bar.tsv")
parseSmartSpec "bla/xyz:foo/bar.tsv" `shouldBe` GitSpec "bla/xyz" (Just "foo/bar.tsv")
parseSmartSpec "out.tsv" `shouldBe` FileNameSpec "out.tsv"
parseSmartSpec "dev-1/out.tsv" `shouldBe` FilePathSpec "dev-1/out.tsv"
parseSmartSpec "../out.tsv" `shouldBe` FilePathSpec "../out.tsv"
parseSmartSpec "4a5f" `shouldBe` PossiblyGitSpec "4a5f"
parseSmartSpec "!!" `shouldBe` PossiblyGitSpec "!!"
parseSmartSpec "branch" `shouldBe` PossiblyGitSpec "branch"
it "smart specs are parsed in context" $ do
parseSmartSpecInContext [] Nothing "xyz" `shouldBe` Just (PossiblyGitSpec "xyz")
parseSmartSpecInContext ["foo", "bar"] Nothing "out.tsv" `shouldBe` Just (FilePathSpec "foo/out.tsv")
parseSmartSpecInContext [] (Just "default") "" `shouldBe` Just (FileNameSpec "default")
parseSmartSpecInContext ["foo"] (Just "default") "" `shouldBe` Just (FilePathSpec "foo/default")
parseSmartSpecInContext ["foo/bar"] (Just "default") "http://gonito.net" `shouldBe` Just (Http "http://gonito.net")
parseSmartSpecInContext ["foo/bar"] Nothing "" `shouldBe` Nothing
it "sources are accessed" $ do
readFromSmartSource [] Nothing "test/files/foo.txt" `shouldReturn` ["foo\n"]
readFromSmartSource [] Nothing "https://httpbin.org/robots.txt" `shouldReturn`
["User-agent: *\nDisallow: /deny\n"]
readFromSmartSource :: [FilePath] -> Maybe FilePath -> String -> IO [String]
readFromSmartSource defaultDirs defaultFile specS = do
let (Just spec) = parseSmartSpecInContext defaultDirs defaultFile specS
let source = smartSource defaultDirs defaultFile spec
contents <- runResourceT (source $$ CT.decodeUtf8Lenient =$ CL.consume)
return $ Prelude.map unpack contents
neverMatch :: Char -> Int -> Bool
neverMatch _ _ = False

1
test/files/foo.txt Normal file
View File

@ -0,0 +1 @@
foo