, Data.SplitIntoCrossTabs
, Data.Conduit.Bootstrap
, GEval.Formatting
, Data.Conduit.Header
, Paths_geval
build-depends: base >= 4.7 && < 5
, cond
Normal file
Normal file
{-# LANGUAGE OverloadedStrings #-}
module Data.Conduit.Header
(processHeader, TabularHeader, readHeaderFile)
import Data.Text
import Data.Conduit
import Data.Conduit.AutoDecompress
import qualified System.Directory as D
-- | Header of a tabular (TSV) file: the list of its field names, in column order.
--
-- A @newtype@ is used because the type merely wraps one list of field names.
newtype TabularHeader = TabularHeader [Text]
-- | Conduit stage that drops a header line from a stream of text lines.
--
-- * With no header specification (or one without any fields) every line is
--   passed through by 'doNothing'.
-- * Otherwise the first line is inspected: when its first tab-separated field
--   equals the first field of the header, the line is taken to be the header
--   and is dropped; any other first line is ordinary data and is re-emitted.
--   All remaining lines are then forwarded.
processHeader :: Monad m => Maybe TabularHeader -> ConduitT Text Text m ()
processHeader Nothing = doNothing
-- A header with no fields gives nothing to compare against; treat it as absent
-- instead of failing with a pattern-match error.
processHeader (Just (TabularHeader [])) = doNothing
processHeader (Just (TabularHeader (firstField:_))) = do
  mLine <- await
  case mLine of
    -- Empty input: nothing to inspect and nothing to forward.
    Nothing -> return ()
    Just line -> do
      case splitIntoFields line of
        (firstField':_) | firstField' == firstField -> return ()
        _ -> yield line
      -- Only the first line may be a header; the rest of the stream must
      -- still reach downstream consumers.
      doNothing
-- | Break one TSV line into its fields, using the tab character as separator.
splitIntoFields :: Text -> [Text]
splitIntoFields line = "\t" `splitOn` line
-- | Read a header specification from the first line of a TSV file.
--
-- Returns 'Nothing' when the file does not exist or contains no lines at all;
-- otherwise the tab-separated fields of the first line become the header.
readHeaderFile :: FilePath -> IO (Maybe TabularHeader)
readHeaderFile headerFilePath = do
  fileExists <- D.doesFileExist headerFilePath
  if fileExists
    then do
      content <- readFile headerFilePath
      -- Match totally on the list of lines: an empty file has no first line,
      -- and a partial @(firstLine:_)@ binding would fail at run time.
      return $ case Prelude.lines content of
        (firstLine:_) -> Just $ TabularHeader $ splitIntoFields $ pack firstLine
        [] -> Nothing
    else return Nothing
| UnexpectedData Word32 String
| UnexpectedMultipleOutputs
| OtherException String
| NoHeaderFile FilePath
deriving (Eq)
instance Exception GEvalException
@ -147,6 +148,7 @@ instance Show GEvalException where
show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
show (OtherException message) = message
show (NoHeaderFile filePath) = somethingWrongWithFilesMessage "No file with header specification" filePath
somethingWrongWithFilesMessage :: String -> FilePath -> String
somethingWrongWithFilesMessage msg filePath = Prelude.concat
) where
import Debug.Trace
@ -82,6 +86,7 @@ import Control.Monad ((<=<), filterM)
import Data.Attoparsec.Text (parseOnly)
import Data.Conduit.SmartSource
import Data.Conduit.Header
import qualified Data.IntSet as IS
gesToken :: Maybe String,
gesGonitoGitAnnexRemote :: Maybe String,
gesReferences :: Maybe String,
gesBootstrapResampling :: Maybe Int }
gesBootstrapResampling :: Maybe Int,
gesInHeader :: Maybe String,
gesOutHeader :: Maybe String }
gesMainMetric :: GEvalSpecification -> Metric
@ -192,6 +199,16 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
where outDirectory = gesOutDirectory spec
-- | Path of the header file for input files (taken from 'gesInHeader'),
-- resolved by 'getHeader'; 'Nothing' when no input header was specified.
getInHeader :: GEvalSpecification -> Maybe FilePath
getInHeader spec = spec `getHeader` gesInHeader
-- | Path of the header file for output/expected files (taken from
-- 'gesOutHeader'), resolved by 'getHeader'; 'Nothing' when none was specified.
getOutHeader :: GEvalSpecification -> Maybe FilePath
getOutHeader spec = spec `getHeader` gesOutHeader
-- | Resolve the header file picked by @selector@ against the expected
-- directory of the specification; 'Nothing' when the selector yields none.
getHeader :: GEvalSpecification -> (GEvalSpecification -> Maybe FilePath) -> Maybe FilePath
getHeader spec selector = fmap (getExpectedDirectory spec </>) (selector spec)
-- | Special command, not just running the regular evaluation.
-- See OptionsParser.hs for more information.
data GEvalSpecialCommand = Init
gesToken = Nothing,
gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing,
gesBootstrapResampling = Nothing }
gesBootstrapResampling = Nothing,
gesInHeader = Nothing,
gesOutHeader = Nothing }
isEmptyFile :: FilePath -> IO (Bool)
isEmptyFile path = do
@ -255,9 +274,13 @@ noGraph = const Nothing
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
mInHeader <- readHeaderFileWrapper $ getInHeader gevalSpec
mOutHeader <- readHeaderFileWrapper $ getOutHeader gevalSpec
vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme)
(preprocess . applyPreprocessingOperations scheme)
(gesBootstrapResampling gevalSpec)
@ -267,6 +290,15 @@ gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
preprocess = gesPreprocess gevalSpec
mSelector = gesSelector gevalSpec
-- | Load a header from the given file, if a header file was requested.
--
-- No path means no header. When a path is given but 'readHeaderFile' yields
-- nothing, a 'NoHeaderFile' exception carrying that path is thrown.
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
readHeaderFileWrapper mHeaderFilePath = case mHeaderFilePath of
  Nothing -> return Nothing
  Just headerFilePath ->
    readHeaderFile headerFilePath
      >>= maybe (throwM $ NoHeaderFile headerFilePath) (return . Just)
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
checkAndGetFilesSingleOut forceInput gevalSpec = do
res <- checkAndGetFiles forceInput gevalSpec
@ -369,9 +401,18 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
Right sourceSpec -> return sourceSpec
| otherwise = return NoSource
fileAsLineSource spec mSelector preprocess =
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines) (select (getDataFormat spec) mSelector) preprocess spec 1
-- | Options controlling how a file is turned into a line source
-- (see 'fileAsLineSource').
data FileProcessingOptions = FileProcessingOptions
  { fileProcessingOptionsSelector :: Maybe Selector
    -- ^ optional selector, passed on when building the data decoder
  , fileProcessingOptionsPreprocess :: Text -> Text
    -- ^ preprocessing function stored in the resulting line source
  , fileProcessingOptionsHeader :: Maybe TabularHeader
    -- ^ optional header, handed to 'processHeader'
  }
-- | Build a 'LineSource' for the given source specification.
--
-- The content is piped through 'autoDecompress', decoded leniently as UTF-8,
-- split into lines and finally run through 'processHeader' with the header
-- from the options. The last constructor argument is the constant @1@
-- (presumably the initial line number -- confirm against 'LineSource').
fileAsLineSource :: SourceSpec -> FileProcessingOptions -> LineSource (ResourceT IO)
fileAsLineSource spec options =
  LineSource textLines decodeItem (fileProcessingOptionsPreprocess options) spec 1
  where
    textLines =
      smartSource spec
        .| autoDecompress
        .| CT.decodeUtf8Lenient
        .| CT.lines
        .| processHeader (fileProcessingOptionsHeader options)
    decodeItem = select (getDataFormat spec) (fileProcessingOptionsSelector options)
-- | Extract the data decoder (the second field) from a 'LineSource'.
getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
getDataDecoder lineSource = case lineSource of
  LineSource _ decoder _ _ _ -> decoder
@ -429,22 +470,32 @@ handleBootstrap _ = True
-> Maybe Selector -- ^ selector to be used
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> (Maybe TabularHeader) -- ^ header for input
-> (Maybe TabularHeader) -- ^ header for output/expected files
-> (Maybe Int) -- ^ number of bootstrap samples
-> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output
-> IO (MetricOutput) -- ^ metric value for the output against the expected output
gevalCore metric mSelector preprocess mBootstrapResampling inputSource expectedSource outSource = do
gevalCore metric mSelector preprocess mInHeader mOutHeader mBootstrapResampling inputSource expectedSource outSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
go metric
(fileAsLineSource inputSource mSelector preprocess)
(fileAsLineSource expectedSource mSelector preprocess)
(fileAsLineSource outSource mSelector preprocess)
(fileAsLineSource inputSource inOptions)
(fileAsLineSource expectedSource outOptions)
(fileAsLineSource outSource outOptions)
where go = case mBootstrapResampling of
Nothing -> gevalCoreOnSources
Just bootstrapResampling -> if handleBootstrap metric
then gevalBootstrapOnSources bootstrapResampling
else gevalCoreOnSources
outOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = preprocess,
fileProcessingOptionsHeader = mOutHeader }
inOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = preprocess,
fileProcessingOptionsHeader = mInHeader }
isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
createFile (expectedDirectory </> ".gitignore") $ gitignoreContents
createFile (expectedDirectory </> "README.md") $ readmeMDContents metric testName
createFile (expectedDirectory </> configFileName) $ configContents metrics precision testName
createHeaderFile expectedDirectory "in-header.tsv" $ inHeaderContents metric
createHeaderFile expectedDirectory "out-header.tsv" $ outHeaderContents metric
if withDataFiles
@ -53,6 +55,10 @@ createChallenge withDataFiles expectedDirectory spec = do
testDirectory = expectedDirectory </> testName
expectedFile = gesExpectedFile spec
-- | Write a one-line, tab-separated header file into the given directory.
-- Nothing is written when no field list is supplied.
createHeaderFile expectedDirectory headerFile mFields = case mFields of
  Nothing -> return ()
  Just fields ->
    createFile (expectedDirectory </> headerFile) (intercalate "\t" fields ++ "\n")
createTrainFiles :: Metric -> FilePath -> FilePath -> IO ()
createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
" --test-name " ++ testName
"") ++
(precisionOpt precision)
(precisionOpt precision) ++
inHeaderOpts ++
where precisionOpt Nothing = ""
precisionOpt (Just p) = " --precision " ++ (show p)
((EvaluationScheme mainMetric _):_) = schemes
inHeaderOpts = getHeaderOpts "in-header" inHeaderContents
outHeaderOpts = getHeaderOpts "out-header" outHeaderContents
getHeaderOpts opt selector = case selector mainMetric of
Just _ -> " --" ++ opt ++ " " ++ (opt <.> "tsv")
Nothing -> ""
-- Originally the train content was in one file; to avoid large changes,
-- we are using the original function for the time being.
-- | Example field names for the input-file header of a newly created
-- challenge, chosen per metric. The result is written to @in-header.tsv@ by
-- the challenge-creation code; 'Nothing' means that no input header is
-- generated for that metric.
--
-- Wrapper and variant metrics (e.g. 'Mean', the probabilistic and likelihood
-- variants) reuse the header of the metric they delegate to. Any metric not
-- listed explicitly falls through to the final catch-all clause.
inHeaderContents :: Metric -> Maybe [String]
inHeaderContents (Mean metric) = inHeaderContents metric
inHeaderContents GLEU = Nothing
inHeaderContents BLEU = Nothing
inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
inHeaderContents (FMeasure _) = Just ["seismic",
inHeaderContents (MacroFMeasure _) = Just ["FirstName"]
inHeaderContents (ProbabilisticSoftFMeasure b) = inHeaderContents (SoftFMeasure b)
inHeaderContents (SoftFMeasure _) = Just ["Text"]
inHeaderContents NMI = Just ["Utterance"]
inHeaderContents (LikelihoodHashed b) = inHeaderContents (LogLossHashed b)
inHeaderContents (LogLossHashed _) = Just ["LeftContext", "RightContext"]
inHeaderContents CharMatch = Just ["Text"]
inHeaderContents MAP = Just ["Dialect", "PolishPhrase"]
inHeaderContents Likelihood = inHeaderContents LogLoss
inHeaderContents LogLoss = Just ["Text"]
inHeaderContents BIOF1Labels = inHeaderContents BIOF1
inHeaderContents BIOF1 = Just ["Text"]
inHeaderContents TokenAccuracy = Just ["TokenizedText"]
inHeaderContents SegmentAccuracy = Just ["Segment"]
inHeaderContents (ProbabilisticMultiLabelFMeasure beta) = inHeaderContents (MultiLabelFMeasure beta)
inHeaderContents (MultiLabelFMeasure _) = Just ["Text"]
inHeaderContents MultiLabelLikelihood = inHeaderContents MultiLabelLogLoss
inHeaderContents MultiLabelLogLoss = Just ["Utterance"]
inHeaderContents (Soft2DFMeasure _) = inHeaderContents ClippEU
inHeaderContents ClippEU = Just ["DjvuFilePath"]
-- Catch-all for every metric without a dedicated example header.
inHeaderContents _ = Just ["OrbitalPeriod", "OrbitalEccentricity", "NumberOfMoons"]
-- | Example field names for the output/expected-file header of a newly
-- created challenge, chosen per metric. 'Nothing' means that no output header
-- is generated for that metric.
--
-- Alternatives are tried top to bottom; metrics without a dedicated entry
-- fall through to the final wildcard.
outHeaderContents :: Metric -> Maybe [String]
outHeaderContents metric = case metric of
  Mean inner -> outHeaderContents inner
  BLEU -> Nothing
  GLEU -> Nothing
  Accuracy -> Just ["ShouldYouKidForWalk"]
  FMeasure _ -> Just ["IsSeismicBump"]
  MacroFMeasure _ -> Just ["LanguageCode"]
  ProbabilisticSoftFMeasure b -> outHeaderContents (SoftFMeasure b)
  SoftFMeasure _ -> Just ["NamesFound"]
  NMI -> Just ["LanguageCode"]
  LikelihoodHashed b -> outHeaderContents (LogLossHashed b)
  LogLossHashed _ -> Just ["GuessedWord"]
  CharMatch -> Just ["NormalizedText"]
  MAP -> Nothing
  Likelihood -> outHeaderContents LogLoss
  LogLoss -> Just ["Probability"]
  BIOF1Labels -> outHeaderContents BIOF1
  BIOF1 -> Just ["BIOOutput"]
  TokenAccuracy -> Just ["PartsOfSpeech"]
  SegmentAccuracy -> Just ["PartsOfSpeech"]
  ProbabilisticMultiLabelFMeasure beta -> outHeaderContents (MultiLabelFMeasure beta)
  MultiLabelFMeasure _ -> Just ["Entities"]
  MultiLabelLikelihood -> outHeaderContents MultiLabelLogLoss
  MultiLabelLogLoss -> Just ["Emotion"]
  Soft2DFMeasure _ -> Just ["Rectangle"]
  ClippEU -> Just ["Rectangle"]
  _ -> Just ["Mass"]
gitignoreContents :: String
gitignoreContents = [hereLit|
import Data.Conduit.Binary (sourceFile)
import Data.Conduit.Header
import qualified Data.HashMap.Strict as H
import qualified Data.Map.Strict as M
import qualified Data.Set as S
@ -463,7 +465,10 @@ runLineByLineGeneralized ordering spec consum = do
return $ Just references
Nothing -> return Nothing
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
-- Read both header specifications and pass them on unchanged. The out header
-- must not be rebound afterwards: shadowing it with Nothing would leave the
-- header line of the expected/out files in the data in line-by-line mode.
mInHeader <- readHeaderFileWrapper $ getInHeader spec
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
where metric = gesMainMetric spec
scheme = gesMainScheme spec
mSelector = gesSelector spec
@ -517,7 +522,9 @@ runMultiOutputGeneralized spec consum = do
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
let altSourceSpecs = rights altSourceSpecs'
let sourceSpecs = (outSource:altSourceSpecs)
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess inputSource expectedSource) sourceSpecs
mInHeader <- readHeaderFileWrapper $ getInHeader spec
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource) sourceSpecs
runResourceT $ runConduit $
(sequenceSources sources .| consum)
where metric = gesMainMetric spec
runDiffGeneralized ordering otherOut spec consum = do
(inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut
mInHeader <- readHeaderFileWrapper $ getInHeader spec
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
case ooss of
Left NoSpecGiven -> throwM $ NoOutFile otherOut
Left (NoFile fp) -> throwM $ NoOutFile fp
Left (NoDirectory d) -> throwM $ NoOutFile otherOut
Right otherOutSource -> do
let sourceA = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource
let sourceA = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource
runResourceT $ runConduit $
((getZipSource $ (,)
<$> ZipSource sourceA
-- | Make tab characters visible by replacing each one with the literal
-- text @<tab>@.
escapeTabs :: Text -> Text
escapeTabs text = Data.Text.replace "\t" "<tab>" text
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric mSelector preprocess inputSource expectedSource outSource consum =
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> (Maybe TabularHeader) -> (Maybe TabularHeader) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource consum =
runResourceT $ runConduit $
((gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource) .| consum)
((gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource) .| consum)
gevalLineByLineSource :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource =
gevalLineByLineSource :: Metric
-> Maybe Selector
-> (Text -> Text)
-> (Maybe TabularHeader)
-> (Maybe TabularHeader)
-> SourceSpec
-> SourceSpec
-> SourceSpec
-> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource =
(getZipSource $ (,)
<$> ZipSource (CL.sourceList [1..])
<*> (ZipSource $ threeLineSource context)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
where context = (WithInput inputLineSource expectedLineSource outputLineSource)
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
inputLineSource = fileAsLineSource inputSource mSelector id
expectedLineSource = fileAsLineSource expectedSource mSelector id
outputLineSource = fileAsLineSource outSource mSelector id
inputLineSource = fileAsLineSource inputSource inOptions
expectedLineSource = fileAsLineSource expectedSource outOptions
outputLineSource = fileAsLineSource outSource outOptions
justLine (LineInFile _ _ l) = l
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
(getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
(getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo (extractSimpleRunValue $ getMetricValue s)
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
outOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = id,
fileProcessingOptionsHeader = mOutHeader }
inOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = id,
fileProcessingOptionsHeader = mInHeader }
justTokenize :: Maybe Tokenizer -> IO ()
justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"
<> short 'B'
<> metavar "NUMBER-OF-SAMPLES"
<> help "Tests on NUMBER-OF-SAMPLES bootstrap samples rather than just on the whole test set" ))
<*> ( optional . strOption $
( long "in-header"
<> metavar "FILE"
<> help "One-line TSV file specifying a list of field names for input files"))
<*> ( optional . strOption $
( long "out-header"
<> metavar "FILE"
<> help "One-line TSV file specifying a list of field names for output and expected files"))
-- | Default minimum frequency.
-- NOTE(review): presumably the default for a command-line option defined
-- nearby -- confirm which option consumes it.
defaultMinFrequency :: Integer
defaultMinFrequency = 1
describe "automatic decompression" $ do
it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
describe "headers" $ do
it "simple" $ do
runGEvalTest "mse-simple-headers" `shouldReturnAlmost` 0.4166666666666667
describe "handling jsonl format" $ do
it "simple test" $
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428
@ -467,7 +470,9 @@ main = hspec $ do
gesToken = Nothing,
gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing,
gesBootstrapResampling = Nothing }
gesBootstrapResampling = Nothing,
gesInHeader = Nothing,
gesOutHeader = Nothing }
it "simple test" $ do
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",
--metric MSE --out-header out-header.tsv
@ -0,0 +1,4 @@
