Handle headers

This commit is contained in:
Filip Gralinski 2020-02-17 22:29:27 +01:00
parent ccd6d919da
commit 6d586c7238
12 changed files with 255 additions and 26 deletions

View File

@ -53,6 +53,7 @@ library
, Data.SplitIntoCrossTabs , Data.SplitIntoCrossTabs
, Data.Conduit.Bootstrap , Data.Conduit.Bootstrap
, GEval.Formatting , GEval.Formatting
, Data.Conduit.Header
, Paths_geval , Paths_geval
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond

View File

@ -0,0 +1,41 @@
{-# LANGUAGE OverloadedStrings #-}
module Data.Conduit.Header
(processHeader, TabularHeader, readHeaderFile)
where
import Data.Text
import Data.Conduit
import Data.Conduit.AutoDecompress
import qualified System.Directory as D
data TabularHeader = TabularHeader [Text]
processHeader :: Monad m => Maybe TabularHeader -> ConduitT Text Text m ()
processHeader Nothing = doNothing
processHeader (Just (TabularHeader (firstField:_))) = do
mLine <- await
case mLine of
Just line -> case splitIntoFields line of
(firstField':_) -> do
if firstField' == firstField
then return ()
else yield line
doNothing
Nothing -> return ()
splitIntoFields :: Text -> [Text]
splitIntoFields = splitOn "\t"
readHeaderFile :: FilePath -> IO (Maybe TabularHeader)
readHeaderFile headerFilePath = do
fileExists <- (D.doesFileExist headerFilePath)
if fileExists
then
do
content <- readFile headerFilePath
let (firstLine:_) = Prelude.lines content
return $ Just $ TabularHeader $ splitIntoFields $ pack firstLine
else
return Nothing

View File

@ -126,6 +126,7 @@ data GEvalException = NoExpectedFile FilePath
| UnexpectedData Word32 String | UnexpectedData Word32 String
| UnexpectedMultipleOutputs | UnexpectedMultipleOutputs
| OtherException String | OtherException String
| NoHeaderFile FilePath
deriving (Eq) deriving (Eq)
instance Exception GEvalException instance Exception GEvalException
@ -147,6 +148,7 @@ instance Show GEvalException where
show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]" show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file" show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
show (OtherException message) = message show (OtherException message) = message
show (NoHeaderFile filePath) = somethingWrongWithFilesMessage "No file with header specification" filePath
somethingWrongWithFilesMessage :: String -> FilePath -> String somethingWrongWithFilesMessage :: String -> FilePath -> String
somethingWrongWithFilesMessage msg filePath = Prelude.concat somethingWrongWithFilesMessage msg filePath = Prelude.concat

View File

@ -44,7 +44,11 @@ module GEval.Core
getDataDecoder, getDataDecoder,
threeLineSource, threeLineSource,
extensionsHandled, extensionsHandled,
isEmptyFile isEmptyFile,
FileProcessingOptions(..),
readHeaderFileWrapper,
getInHeader,
getOutHeader
) where ) where
import Debug.Trace import Debug.Trace
@ -82,6 +86,7 @@ import Control.Monad ((<=<), filterM)
import Data.Attoparsec.Text (parseOnly) import Data.Attoparsec.Text (parseOnly)
import Data.Conduit.SmartSource import Data.Conduit.SmartSource
import Data.Conduit.Header
import qualified Data.IntSet as IS import qualified Data.IntSet as IS
@ -172,7 +177,9 @@ data GEvalSpecification = GEvalSpecification
gesToken :: Maybe String, gesToken :: Maybe String,
gesGonitoGitAnnexRemote :: Maybe String, gesGonitoGitAnnexRemote :: Maybe String,
gesReferences :: Maybe String, gesReferences :: Maybe String,
gesBootstrapResampling :: Maybe Int } gesBootstrapResampling :: Maybe Int,
gesInHeader :: Maybe String,
gesOutHeader :: Maybe String }
gesMainMetric :: GEvalSpecification -> Metric gesMainMetric :: GEvalSpecification -> Metric
@ -192,6 +199,16 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
where outDirectory = gesOutDirectory spec where outDirectory = gesOutDirectory spec
getInHeader :: GEvalSpecification -> Maybe FilePath
getInHeader spec = getHeader spec gesInHeader
getOutHeader :: GEvalSpecification -> Maybe FilePath
getOutHeader spec = getHeader spec gesOutHeader
getHeader spec selector = case selector spec of
Just headerFile -> Just $ getExpectedDirectory spec </> headerFile
Nothing -> Nothing
-- | Special command, not just running the regular evaluation. -- | Special command, not just running the regular evaluation.
-- See OptionsParser.hs for more information. -- See OptionsParser.hs for more information.
data GEvalSpecialCommand = Init data GEvalSpecialCommand = Init
@ -229,7 +246,9 @@ defaultGEvalSpecification = GEvalSpecification {
gesToken = Nothing, gesToken = Nothing,
gesGonitoGitAnnexRemote = Nothing, gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing, gesReferences = Nothing,
gesBootstrapResampling = Nothing } gesBootstrapResampling = Nothing,
gesInHeader = Nothing,
gesOutHeader = Nothing }
isEmptyFile :: FilePath -> IO (Bool) isEmptyFile :: FilePath -> IO (Bool)
isEmptyFile path = do isEmptyFile path = do
@ -255,9 +274,13 @@ noGraph = const Nothing
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput]) gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
mInHeader <- readHeaderFileWrapper $ getInHeader gevalSpec
mOutHeader <- readHeaderFileWrapper $ getOutHeader gevalSpec
vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme) vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme)
mSelector mSelector
(preprocess . applyPreprocessingOperations scheme) (preprocess . applyPreprocessingOperations scheme)
mInHeader
mOutHeader
(gesBootstrapResampling gevalSpec) (gesBootstrapResampling gevalSpec)
inputSource inputSource
expectedSource expectedSource
@ -267,6 +290,15 @@ gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
preprocess = gesPreprocess gevalSpec preprocess = gesPreprocess gevalSpec
mSelector = gesSelector gevalSpec mSelector = gesSelector gevalSpec
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
readHeaderFileWrapper Nothing = return Nothing
readHeaderFileWrapper (Just headerFilePath) = do
mHeader <- readHeaderFile headerFilePath
case mHeader of
Just header -> return $ Just header
Nothing -> throwM $ NoHeaderFile headerFilePath
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec) checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
checkAndGetFilesSingleOut forceInput gevalSpec = do checkAndGetFilesSingleOut forceInput gevalSpec = do
res <- checkAndGetFiles forceInput gevalSpec res <- checkAndGetFiles forceInput gevalSpec
@ -369,9 +401,18 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
Right sourceSpec -> return sourceSpec Right sourceSpec -> return sourceSpec
| otherwise = return NoSource | otherwise = return NoSource
fileAsLineSource :: SourceSpec -> Maybe Selector -> (Text -> Text) -> LineSource (ResourceT IO) data FileProcessingOptions = FileProcessingOptions {
fileAsLineSource spec mSelector preprocess = fileProcessingOptionsSelector :: Maybe Selector,
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines) (select (getDataFormat spec) mSelector) preprocess spec 1 fileProcessingOptionsPreprocess :: (Text -> Text),
fileProcessingOptionsHeader :: Maybe TabularHeader }
fileAsLineSource :: SourceSpec -> FileProcessingOptions -> LineSource (ResourceT IO)
fileAsLineSource spec options =
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines .| processHeader mHeader) (select (getDataFormat spec) mSelector) preprocess spec 1
where mSelector = fileProcessingOptionsSelector options
preprocess = fileProcessingOptionsPreprocess options
mHeader = fileProcessingOptionsHeader options
getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget) getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
getDataDecoder (LineSource _ dd _ _ _) = dd getDataDecoder (LineSource _ dd _ _ _) = dd
@ -429,22 +470,32 @@ handleBootstrap _ = True
gevalCore :: Metric -- ^ evaluation metric gevalCore :: Metric -- ^ evaluation metric
-> Maybe Selector -- ^ selector to be used -> Maybe Selector -- ^ selector to be used
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization) -> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> (Maybe TabularHeader) -- ^ header for input
-> (Maybe TabularHeader) -- ^ header for output/expected files
-> (Maybe Int) -- ^ number of bootstrap samples -> (Maybe Int) -- ^ number of bootstrap samples
-> SourceSpec -- ^ source specification for the input values -> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output -> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output -> SourceSpec -- ^ source specification for the output
-> IO (MetricOutput) -- ^ metric value for the output against the expected output -> IO (MetricOutput) -- ^ metric value for the output against the expected output
gevalCore metric mSelector preprocess mBootstrapResampling inputSource expectedSource outSource = do gevalCore metric mSelector preprocess mInHeader mOutHeader mBootstrapResampling inputSource expectedSource outSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
go metric go metric
(fileAsLineSource inputSource mSelector preprocess) (fileAsLineSource inputSource inOptions)
(fileAsLineSource expectedSource mSelector preprocess) (fileAsLineSource expectedSource outOptions)
(fileAsLineSource outSource mSelector preprocess) (fileAsLineSource outSource outOptions)
where go = case mBootstrapResampling of where go = case mBootstrapResampling of
Nothing -> gevalCoreOnSources Nothing -> gevalCoreOnSources
Just bootstrapResampling -> if handleBootstrap metric Just bootstrapResampling -> if handleBootstrap metric
then gevalBootstrapOnSources bootstrapResampling then gevalBootstrapOnSources bootstrapResampling
else gevalCoreOnSources else gevalCoreOnSources
outOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = preprocess,
fileProcessingOptionsHeader = mOutHeader }
inOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = preprocess,
fileProcessingOptionsHeader = mInHeader }
isEmptyFileSource :: SourceSpec -> IO Bool isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath

View File

@ -31,6 +31,8 @@ createChallenge withDataFiles expectedDirectory spec = do
createFile (expectedDirectory </> ".gitignore") $ gitignoreContents createFile (expectedDirectory </> ".gitignore") $ gitignoreContents
createFile (expectedDirectory </> "README.md") $ readmeMDContents metric testName createFile (expectedDirectory </> "README.md") $ readmeMDContents metric testName
createFile (expectedDirectory </> configFileName) $ configContents metrics precision testName createFile (expectedDirectory </> configFileName) $ configContents metrics precision testName
createHeaderFile expectedDirectory "in-header.tsv" $ inHeaderContents metric
createHeaderFile expectedDirectory "out-header.tsv" $ outHeaderContents metric
if withDataFiles if withDataFiles
then then
do do
@ -53,6 +55,10 @@ createChallenge withDataFiles expectedDirectory spec = do
testDirectory = expectedDirectory </> testName testDirectory = expectedDirectory </> testName
expectedFile = gesExpectedFile spec expectedFile = gesExpectedFile spec
createHeaderFile _ _ Nothing = return ()
createHeaderFile expectedDirectory headerFile (Just fields) = do
createFile (expectedDirectory </> headerFile) $ (intercalate "\t" fields) ++ "\n"
createTrainFiles :: Metric -> FilePath -> FilePath -> IO () createTrainFiles :: Metric -> FilePath -> FilePath -> IO ()
createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
@ -423,9 +429,17 @@ configContents schemes precision testName = unwords (Prelude.map (\scheme -> ("-
" --test-name " ++ testName " --test-name " ++ testName
else else
"") ++ "") ++
(precisionOpt precision) (precisionOpt precision) ++
inHeaderOpts ++
outHeaderOpts
where precisionOpt Nothing = "" where precisionOpt Nothing = ""
precisionOpt (Just p) = " --precision " ++ (show p) precisionOpt (Just p) = " --precision " ++ (show p)
((EvaluationScheme mainMetric _):_) = schemes
inHeaderOpts = getHeaderOpts "in-header" inHeaderContents
outHeaderOpts = getHeaderOpts "out-header" outHeaderContents
getHeaderOpts opt selector = case selector mainMetric of
Just _ -> " --" ++ opt ++ " " ++ (opt <.> "tsv")
Nothing -> ""
-- Originally train content was in one file, to avoid large changes -- Originally train content was in one file, to avoid large changes
-- for the time being we are using the original function. -- for the time being we are using the original function.
@ -823,6 +837,79 @@ testExpectedContents _ = [hereLit|0.11
17.2 17.2
|] |]
inHeaderContents :: Metric -> Maybe [String]
inHeaderContents (Mean metric) = inHeaderContents metric
inHeaderContents GLEU = Nothing
inHeaderContents BLEU = Nothing
inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
inHeaderContents (FMeasure _) = Just ["seismic",
"seismoacoustic",
"shift",
"genergy",
"gpuls",
"gdenergy",
"gdpuls",
"ghazard",
"nbumps",
"nbumps2",
"nbumps3",
"nbumps4",
"nbumps5",
"nbumps6",
"nbumps7",
"nbumps89",
"energy",
"maxenergy",
"class"]
inHeaderContents (MacroFMeasure _) = Just ["FirstName"]
inHeaderContents (ProbabilisticSoftFMeasure b) = inHeaderContents (SoftFMeasure b)
inHeaderContents (SoftFMeasure _) = Just ["Text"]
inHeaderContents NMI = Just ["Utterance"]
inHeaderContents (LikelihoodHashed b) = inHeaderContents (LogLossHashed b)
inHeaderContents (LogLossHashed _) = Just ["LeftContext", "RightContext"]
inHeaderContents CharMatch = Just ["Text"]
inHeaderContents MAP = Just ["Dialect", "PolishPhrase"]
inHeaderContents Likelihood = inHeaderContents LogLoss
inHeaderContents LogLoss = Just ["Text"]
inHeaderContents BIOF1Labels = inHeaderContents BIOF1
inHeaderContents BIOF1 = Just ["Text"]
inHeaderContents TokenAccuracy = Just ["TokenizedText"]
inHeaderContents SegmentAccuracy = Just ["Segment"]
inHeaderContents (ProbabilisticMultiLabelFMeasure beta) = inHeaderContents (MultiLabelFMeasure beta)
inHeaderContents (MultiLabelFMeasure _) = Just ["Text"]
inHeaderContents MultiLabelLikelihood = inHeaderContents MultiLabelLogLoss
inHeaderContents MultiLabelLogLoss = Just ["Utterance"]
inHeaderContents (Soft2DFMeasure _) = inHeaderContents ClippEU
inHeaderContents ClippEU = Just ["DjvuFilePath"]
inHeaderContents _ = Just ["OrbitalPeriod", "OrbitalEccentricity", "NumberOfMoons"]
outHeaderContents :: Metric -> Maybe [String]
outHeaderContents (Mean metric) = outHeaderContents metric
outHeaderContents BLEU = Nothing
outHeaderContents GLEU = Nothing
outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
outHeaderContents (ProbabilisticSoftFMeasure b) = outHeaderContents (SoftFMeasure b)
outHeaderContents (SoftFMeasure _) = Just ["NamesFound"]
outHeaderContents NMI = Just ["LanguageCode"]
outHeaderContents (LikelihoodHashed b) = outHeaderContents (LogLossHashed b)
outHeaderContents (LogLossHashed _) = Just ["GuessedWord"]
outHeaderContents CharMatch = Just ["NormalizedText"]
outHeaderContents MAP = Nothing
outHeaderContents Likelihood = outHeaderContents LogLoss
outHeaderContents LogLoss = Just ["Probability"]
outHeaderContents BIOF1Labels = outHeaderContents BIOF1
outHeaderContents BIOF1 = Just ["BIOOutput"]
outHeaderContents TokenAccuracy = Just ["PartsOfSpeech"]
outHeaderContents SegmentAccuracy = Just ["PartsOfSpeech"]
outHeaderContents (ProbabilisticMultiLabelFMeasure beta) = outHeaderContents (MultiLabelFMeasure beta)
outHeaderContents (MultiLabelFMeasure _) = Just ["Entities"]
outHeaderContents MultiLabelLikelihood = outHeaderContents MultiLabelLogLoss
outHeaderContents MultiLabelLogLoss = Just ["Emotion"]
outHeaderContents (Soft2DFMeasure _) = Just ["Rectangle"]
outHeaderContents ClippEU = Just ["Rectangle"]
outHeaderContents _ = Just ["Mass"]
gitignoreContents :: String gitignoreContents :: String
gitignoreContents = [hereLit| gitignoreContents = [hereLit|

View File

@ -73,6 +73,8 @@ import Data.Statistics.Kendall (kendallZ)
import Data.Conduit.Binary (sourceFile) import Data.Conduit.Binary (sourceFile)
import Data.Conduit.Header
import qualified Data.HashMap.Strict as H import qualified Data.HashMap.Strict as H
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
import qualified Data.Set as S import qualified Data.Set as S
@ -463,7 +465,10 @@ runLineByLineGeneralized ordering spec consum = do
return $ Just references return $ Just references
Nothing -> return Nothing Nothing -> return Nothing
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec (inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences) mInHeader <- readHeaderFileWrapper $ getInHeader spec
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
let mOutHeader = Nothing
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
where metric = gesMainMetric spec where metric = gesMainMetric spec
scheme = gesMainScheme spec scheme = gesMainScheme spec
mSelector = gesSelector spec mSelector = gesSelector spec
@ -517,7 +522,9 @@ runMultiOutputGeneralized spec consum = do
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
let altSourceSpecs = rights altSourceSpecs' let altSourceSpecs = rights altSourceSpecs'
let sourceSpecs = (outSource:altSourceSpecs) let sourceSpecs = (outSource:altSourceSpecs)
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess inputSource expectedSource) sourceSpecs mInHeader <- readHeaderFileWrapper $ getInHeader spec
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource) sourceSpecs
runResourceT $ runConduit $ runResourceT $ runConduit $
(sequenceSources sources .| consum) (sequenceSources sources .| consum)
where metric = gesMainMetric spec where metric = gesMainMetric spec
@ -545,13 +552,15 @@ runDiffGeneralized :: ResultOrdering -> FilePath -> GEvalSpecification -> (Maybe
runDiffGeneralized ordering otherOut spec consum = do runDiffGeneralized ordering otherOut spec consum = do
(inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec (inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut
mInHeader <- readHeaderFileWrapper $ getInHeader spec
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
case ooss of case ooss of
Left NoSpecGiven -> throwM $ NoOutFile otherOut Left NoSpecGiven -> throwM $ NoOutFile otherOut
Left (NoFile fp) -> throwM $ NoOutFile fp Left (NoFile fp) -> throwM $ NoOutFile fp
Left (NoDirectory d) -> throwM $ NoOutFile otherOut Left (NoDirectory d) -> throwM $ NoOutFile otherOut
Right otherOutSource -> do Right otherOutSource -> do
let sourceA = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource otherOutSource let sourceA = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource let sourceB = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource
runResourceT $ runConduit $ runResourceT $ runConduit $
((getZipSource $ (,) ((getZipSource $ (,)
<$> ZipSource sourceA <$> ZipSource sourceA
@ -573,27 +582,44 @@ runDiffGeneralized ordering otherOut spec consum = do
escapeTabs :: Text -> Text escapeTabs :: Text -> Text
escapeTabs = Data.Text.replace "\t" "<tab>" escapeTabs = Data.Text.replace "\t" "<tab>"
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> (Maybe TabularHeader) -> (Maybe TabularHeader) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric mSelector preprocess inputSource expectedSource outSource consum = gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource consum =
runResourceT $ runConduit $ runResourceT $ runConduit $
((gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource) .| consum) ((gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource) .| consum)
gevalLineByLineSource :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) () gevalLineByLineSource :: Metric
gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource = -> Maybe Selector
-> (Text -> Text)
-> (Maybe TabularHeader)
-> (Maybe TabularHeader)
-> SourceSpec
-> SourceSpec
-> SourceSpec
-> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource =
(getZipSource $ (,) (getZipSource $ (,)
<$> ZipSource (CL.sourceList [1..]) <$> ZipSource (CL.sourceList [1..])
<*> (ZipSource $ threeLineSource context)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes <*> (ZipSource $ threeLineSource context)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
where context = (WithInput inputLineSource expectedLineSource outputLineSource) where context = (WithInput inputLineSource expectedLineSource outputLineSource)
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing) inputLineSource = fileAsLineSource inputSource inOptions
inputLineSource = fileAsLineSource inputSource mSelector id expectedLineSource = fileAsLineSource expectedSource outOptions
expectedLineSource = fileAsLineSource expectedSource mSelector id outputLineSource = fileAsLineSource outSource outOptions
outputLineSource = fileAsLineSource outSource mSelector id
justLine (LineInFile _ _ l) = l justLine (LineInFile _ _ l) = l
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp) s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
(getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp) (getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
(getDataDecoder outputLineSource) (LineInFile outSource lineNo out) (getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo (extractSimpleRunValue $ getMetricValue s) return $ LineRecord inp exp out lineNo (extractSimpleRunValue $ getMetricValue s)
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
outOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = id,
fileProcessingOptionsHeader = mOutHeader }
inOptions = FileProcessingOptions {
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = id,
fileProcessingOptionsHeader = mInHeader }
justTokenize :: Maybe Tokenizer -> IO () justTokenize :: Maybe Tokenizer -> IO ()
justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option" justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"

View File

@ -214,7 +214,14 @@ specParser = GEvalSpecification
<> short 'B' <> short 'B'
<> metavar "NUMBER-OF-SAMPLES" <> metavar "NUMBER-OF-SAMPLES"
<> help "Tests on NUMBER-OF-SAMPLES bootstrap samples rather than just on the whole test set" )) <> help "Tests on NUMBER-OF-SAMPLES bootstrap samples rather than just on the whole test set" ))
<*> ( optional . strOption $
( long "in-header"
<> metavar "FILE"
<> help "One-line TSV file specifying a list of field names for input files"))
<*> ( optional . strOption $
( long "out-header"
<> metavar "FILE"
<> help "One-line TSV file specifying a list of field names for output and expected files"))
defaultMinFrequency :: Integer defaultMinFrequency :: Integer
defaultMinFrequency = 1 defaultMinFrequency = 1

View File

@ -446,6 +446,9 @@ main = hspec $ do
describe "automatic decompression" $ do describe "automatic decompression" $ do
it "more complex test" $ do it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923 runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
describe "headers" $ do
it "simple" $ do
runGEvalTest "mse-simple-headers" `shouldReturnAlmost` 0.4166666666666667
describe "handling jsonl format" $ do describe "handling jsonl format" $ do
it "simple test" $ it "simple test" $
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428 runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428
@ -467,7 +470,9 @@ main = hspec $ do
gesToken = Nothing, gesToken = Nothing,
gesGonitoGitAnnexRemote = Nothing, gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing, gesReferences = Nothing,
gesBootstrapResampling = Nothing } gesBootstrapResampling = Nothing,
gesInHeader = Nothing,
gesOutHeader = Nothing }
it "simple test" $ do it "simple test" $ do
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume) results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo", Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",

View File

@ -0,0 +1,3 @@
3.0
3.0
2.0
1 3.0
2 3.0
3 2.0

View File

@ -0,0 +1 @@
--metric MSE --out-header out-header.tsv

View File

@ -0,0 +1 @@
SomeValue
1 SomeValue

View File

@ -0,0 +1,4 @@
SomeValue
2.0
3.0
1.5
1 SomeValue
2 2.0
3 3.0
4 1.5