Handle headers
This commit is contained in:
parent
ccd6d919da
commit
6d586c7238
@ -53,6 +53,7 @@ library
|
||||
, Data.SplitIntoCrossTabs
|
||||
, Data.Conduit.Bootstrap
|
||||
, GEval.Formatting
|
||||
, Data.Conduit.Header
|
||||
, Paths_geval
|
||||
build-depends: base >= 4.7 && < 5
|
||||
, cond
|
||||
|
41
src/Data/Conduit/Header.hs
Normal file
41
src/Data/Conduit/Header.hs
Normal file
@ -0,0 +1,41 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Data.Conduit.Header
|
||||
(processHeader, TabularHeader, readHeaderFile)
|
||||
where
|
||||
|
||||
import Data.Text
|
||||
import Data.Conduit
|
||||
import Data.Conduit.AutoDecompress
|
||||
|
||||
import qualified System.Directory as D
|
||||
|
||||
data TabularHeader = TabularHeader [Text]
|
||||
|
||||
processHeader :: Monad m => Maybe TabularHeader -> ConduitT Text Text m ()
|
||||
processHeader Nothing = doNothing
|
||||
processHeader (Just (TabularHeader (firstField:_))) = do
|
||||
mLine <- await
|
||||
case mLine of
|
||||
Just line -> case splitIntoFields line of
|
||||
(firstField':_) -> do
|
||||
if firstField' == firstField
|
||||
then return ()
|
||||
else yield line
|
||||
doNothing
|
||||
Nothing -> return ()
|
||||
|
||||
splitIntoFields :: Text -> [Text]
|
||||
splitIntoFields = splitOn "\t"
|
||||
|
||||
readHeaderFile :: FilePath -> IO (Maybe TabularHeader)
|
||||
readHeaderFile headerFilePath = do
|
||||
fileExists <- (D.doesFileExist headerFilePath)
|
||||
if fileExists
|
||||
then
|
||||
do
|
||||
content <- readFile headerFilePath
|
||||
let (firstLine:_) = Prelude.lines content
|
||||
return $ Just $ TabularHeader $ splitIntoFields $ pack firstLine
|
||||
else
|
||||
return Nothing
|
@ -126,6 +126,7 @@ data GEvalException = NoExpectedFile FilePath
|
||||
| UnexpectedData Word32 String
|
||||
| UnexpectedMultipleOutputs
|
||||
| OtherException String
|
||||
| NoHeaderFile FilePath
|
||||
deriving (Eq)
|
||||
|
||||
instance Exception GEvalException
|
||||
@ -147,6 +148,7 @@ instance Show GEvalException where
|
||||
show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
|
||||
show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
|
||||
show (OtherException message) = message
|
||||
show (NoHeaderFile filePath) = somethingWrongWithFilesMessage "No file with header specification" filePath
|
||||
|
||||
somethingWrongWithFilesMessage :: String -> FilePath -> String
|
||||
somethingWrongWithFilesMessage msg filePath = Prelude.concat
|
||||
|
@ -44,7 +44,11 @@ module GEval.Core
|
||||
getDataDecoder,
|
||||
threeLineSource,
|
||||
extensionsHandled,
|
||||
isEmptyFile
|
||||
isEmptyFile,
|
||||
FileProcessingOptions(..),
|
||||
readHeaderFileWrapper,
|
||||
getInHeader,
|
||||
getOutHeader
|
||||
) where
|
||||
|
||||
import Debug.Trace
|
||||
@ -82,6 +86,7 @@ import Control.Monad ((<=<), filterM)
|
||||
import Data.Attoparsec.Text (parseOnly)
|
||||
|
||||
import Data.Conduit.SmartSource
|
||||
import Data.Conduit.Header
|
||||
|
||||
import qualified Data.IntSet as IS
|
||||
|
||||
@ -172,7 +177,9 @@ data GEvalSpecification = GEvalSpecification
|
||||
gesToken :: Maybe String,
|
||||
gesGonitoGitAnnexRemote :: Maybe String,
|
||||
gesReferences :: Maybe String,
|
||||
gesBootstrapResampling :: Maybe Int }
|
||||
gesBootstrapResampling :: Maybe Int,
|
||||
gesInHeader :: Maybe String,
|
||||
gesOutHeader :: Maybe String }
|
||||
|
||||
|
||||
gesMainMetric :: GEvalSpecification -> Metric
|
||||
@ -192,6 +199,16 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
|
||||
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
|
||||
where outDirectory = gesOutDirectory spec
|
||||
|
||||
getInHeader :: GEvalSpecification -> Maybe FilePath
|
||||
getInHeader spec = getHeader spec gesInHeader
|
||||
|
||||
getOutHeader :: GEvalSpecification -> Maybe FilePath
|
||||
getOutHeader spec = getHeader spec gesOutHeader
|
||||
|
||||
getHeader spec selector = case selector spec of
|
||||
Just headerFile -> Just $ getExpectedDirectory spec </> headerFile
|
||||
Nothing -> Nothing
|
||||
|
||||
-- | Special command, not just running the regular evaluation.
|
||||
-- See OptionsParser.hs for more information.
|
||||
data GEvalSpecialCommand = Init
|
||||
@ -229,7 +246,9 @@ defaultGEvalSpecification = GEvalSpecification {
|
||||
gesToken = Nothing,
|
||||
gesGonitoGitAnnexRemote = Nothing,
|
||||
gesReferences = Nothing,
|
||||
gesBootstrapResampling = Nothing }
|
||||
gesBootstrapResampling = Nothing,
|
||||
gesInHeader = Nothing,
|
||||
gesOutHeader = Nothing }
|
||||
|
||||
isEmptyFile :: FilePath -> IO (Bool)
|
||||
isEmptyFile path = do
|
||||
@ -255,9 +274,13 @@ noGraph = const Nothing
|
||||
|
||||
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
|
||||
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
|
||||
mInHeader <- readHeaderFileWrapper $ getInHeader gevalSpec
|
||||
mOutHeader <- readHeaderFileWrapper $ getOutHeader gevalSpec
|
||||
vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme)
|
||||
mSelector
|
||||
(preprocess . applyPreprocessingOperations scheme)
|
||||
mInHeader
|
||||
mOutHeader
|
||||
(gesBootstrapResampling gevalSpec)
|
||||
inputSource
|
||||
expectedSource
|
||||
@ -267,6 +290,15 @@ gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
|
||||
preprocess = gesPreprocess gevalSpec
|
||||
mSelector = gesSelector gevalSpec
|
||||
|
||||
|
||||
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
|
||||
readHeaderFileWrapper Nothing = return Nothing
|
||||
readHeaderFileWrapper (Just headerFilePath) = do
|
||||
mHeader <- readHeaderFile headerFilePath
|
||||
case mHeader of
|
||||
Just header -> return $ Just header
|
||||
Nothing -> throwM $ NoHeaderFile headerFilePath
|
||||
|
||||
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
|
||||
checkAndGetFilesSingleOut forceInput gevalSpec = do
|
||||
res <- checkAndGetFiles forceInput gevalSpec
|
||||
@ -369,9 +401,18 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
|
||||
Right sourceSpec -> return sourceSpec
|
||||
| otherwise = return NoSource
|
||||
|
||||
fileAsLineSource :: SourceSpec -> Maybe Selector -> (Text -> Text) -> LineSource (ResourceT IO)
|
||||
fileAsLineSource spec mSelector preprocess =
|
||||
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines) (select (getDataFormat spec) mSelector) preprocess spec 1
|
||||
data FileProcessingOptions = FileProcessingOptions {
|
||||
fileProcessingOptionsSelector :: Maybe Selector,
|
||||
fileProcessingOptionsPreprocess :: (Text -> Text),
|
||||
fileProcessingOptionsHeader :: Maybe TabularHeader }
|
||||
|
||||
|
||||
fileAsLineSource :: SourceSpec -> FileProcessingOptions -> LineSource (ResourceT IO)
|
||||
fileAsLineSource spec options =
|
||||
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines .| processHeader mHeader) (select (getDataFormat spec) mSelector) preprocess spec 1
|
||||
where mSelector = fileProcessingOptionsSelector options
|
||||
preprocess = fileProcessingOptionsPreprocess options
|
||||
mHeader = fileProcessingOptionsHeader options
|
||||
|
||||
getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
|
||||
getDataDecoder (LineSource _ dd _ _ _) = dd
|
||||
@ -429,22 +470,32 @@ handleBootstrap _ = True
|
||||
gevalCore :: Metric -- ^ evaluation metric
|
||||
-> Maybe Selector -- ^ selector to be used
|
||||
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
|
||||
-> (Maybe TabularHeader) -- ^ header for input
|
||||
-> (Maybe TabularHeader) -- ^ header for output/expected files
|
||||
-> (Maybe Int) -- ^ number of bootstrap samples
|
||||
-> SourceSpec -- ^ source specification for the input values
|
||||
-> SourceSpec -- ^ source specification for the expected output
|
||||
-> SourceSpec -- ^ source specification for the output
|
||||
-> IO (MetricOutput) -- ^ metric value for the output against the expected output
|
||||
gevalCore metric mSelector preprocess mBootstrapResampling inputSource expectedSource outSource = do
|
||||
gevalCore metric mSelector preprocess mInHeader mOutHeader mBootstrapResampling inputSource expectedSource outSource = do
|
||||
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
|
||||
go metric
|
||||
(fileAsLineSource inputSource mSelector preprocess)
|
||||
(fileAsLineSource expectedSource mSelector preprocess)
|
||||
(fileAsLineSource outSource mSelector preprocess)
|
||||
(fileAsLineSource inputSource inOptions)
|
||||
(fileAsLineSource expectedSource outOptions)
|
||||
(fileAsLineSource outSource outOptions)
|
||||
where go = case mBootstrapResampling of
|
||||
Nothing -> gevalCoreOnSources
|
||||
Just bootstrapResampling -> if handleBootstrap metric
|
||||
then gevalBootstrapOnSources bootstrapResampling
|
||||
else gevalCoreOnSources
|
||||
outOptions = FileProcessingOptions {
|
||||
fileProcessingOptionsSelector = mSelector,
|
||||
fileProcessingOptionsPreprocess = preprocess,
|
||||
fileProcessingOptionsHeader = mOutHeader }
|
||||
inOptions = FileProcessingOptions {
|
||||
fileProcessingOptionsSelector = mSelector,
|
||||
fileProcessingOptionsPreprocess = preprocess,
|
||||
fileProcessingOptionsHeader = mInHeader }
|
||||
|
||||
isEmptyFileSource :: SourceSpec -> IO Bool
|
||||
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
|
||||
|
@ -31,6 +31,8 @@ createChallenge withDataFiles expectedDirectory spec = do
|
||||
createFile (expectedDirectory </> ".gitignore") $ gitignoreContents
|
||||
createFile (expectedDirectory </> "README.md") $ readmeMDContents metric testName
|
||||
createFile (expectedDirectory </> configFileName) $ configContents metrics precision testName
|
||||
createHeaderFile expectedDirectory "in-header.tsv" $ inHeaderContents metric
|
||||
createHeaderFile expectedDirectory "out-header.tsv" $ outHeaderContents metric
|
||||
if withDataFiles
|
||||
then
|
||||
do
|
||||
@ -53,6 +55,10 @@ createChallenge withDataFiles expectedDirectory spec = do
|
||||
testDirectory = expectedDirectory </> testName
|
||||
expectedFile = gesExpectedFile spec
|
||||
|
||||
createHeaderFile _ _ Nothing = return ()
|
||||
createHeaderFile expectedDirectory headerFile (Just fields) = do
|
||||
createFile (expectedDirectory </> headerFile) $ (intercalate "\t" fields) ++ "\n"
|
||||
|
||||
createTrainFiles :: Metric -> FilePath -> FilePath -> IO ()
|
||||
createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
|
||||
createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
|
||||
@ -423,9 +429,17 @@ configContents schemes precision testName = unwords (Prelude.map (\scheme -> ("-
|
||||
" --test-name " ++ testName
|
||||
else
|
||||
"") ++
|
||||
(precisionOpt precision)
|
||||
(precisionOpt precision) ++
|
||||
inHeaderOpts ++
|
||||
outHeaderOpts
|
||||
where precisionOpt Nothing = ""
|
||||
precisionOpt (Just p) = " --precision " ++ (show p)
|
||||
((EvaluationScheme mainMetric _):_) = schemes
|
||||
inHeaderOpts = getHeaderOpts "in-header" inHeaderContents
|
||||
outHeaderOpts = getHeaderOpts "out-header" outHeaderContents
|
||||
getHeaderOpts opt selector = case selector mainMetric of
|
||||
Just _ -> " --" ++ opt ++ " " ++ (opt <.> "tsv")
|
||||
Nothing -> ""
|
||||
|
||||
-- Originally train content was in one file, to avoid large changes
|
||||
-- for the time being we are using the original function.
|
||||
@ -823,6 +837,79 @@ testExpectedContents _ = [hereLit|0.11
|
||||
17.2
|
||||
|]
|
||||
|
||||
inHeaderContents :: Metric -> Maybe [String]
|
||||
inHeaderContents (Mean metric) = inHeaderContents metric
|
||||
inHeaderContents GLEU = Nothing
|
||||
inHeaderContents BLEU = Nothing
|
||||
inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
|
||||
inHeaderContents (FMeasure _) = Just ["seismic",
|
||||
"seismoacoustic",
|
||||
"shift",
|
||||
"genergy",
|
||||
"gpuls",
|
||||
"gdenergy",
|
||||
"gdpuls",
|
||||
"ghazard",
|
||||
"nbumps",
|
||||
"nbumps2",
|
||||
"nbumps3",
|
||||
"nbumps4",
|
||||
"nbumps5",
|
||||
"nbumps6",
|
||||
"nbumps7",
|
||||
"nbumps89",
|
||||
"energy",
|
||||
"maxenergy",
|
||||
"class"]
|
||||
inHeaderContents (MacroFMeasure _) = Just ["FirstName"]
|
||||
inHeaderContents (ProbabilisticSoftFMeasure b) = inHeaderContents (SoftFMeasure b)
|
||||
inHeaderContents (SoftFMeasure _) = Just ["Text"]
|
||||
inHeaderContents NMI = Just ["Utterance"]
|
||||
inHeaderContents (LikelihoodHashed b) = inHeaderContents (LogLossHashed b)
|
||||
inHeaderContents (LogLossHashed _) = Just ["LeftContext", "RightContext"]
|
||||
inHeaderContents CharMatch = Just ["Text"]
|
||||
inHeaderContents MAP = Just ["Dialect", "PolishPhrase"]
|
||||
inHeaderContents Likelihood = inHeaderContents LogLoss
|
||||
inHeaderContents LogLoss = Just ["Text"]
|
||||
inHeaderContents BIOF1Labels = inHeaderContents BIOF1
|
||||
inHeaderContents BIOF1 = Just ["Text"]
|
||||
inHeaderContents TokenAccuracy = Just ["TokenizedText"]
|
||||
inHeaderContents SegmentAccuracy = Just ["Segment"]
|
||||
inHeaderContents (ProbabilisticMultiLabelFMeasure beta) = inHeaderContents (MultiLabelFMeasure beta)
|
||||
inHeaderContents (MultiLabelFMeasure _) = Just ["Text"]
|
||||
inHeaderContents MultiLabelLikelihood = inHeaderContents MultiLabelLogLoss
|
||||
inHeaderContents MultiLabelLogLoss = Just ["Utterance"]
|
||||
inHeaderContents (Soft2DFMeasure _) = inHeaderContents ClippEU
|
||||
inHeaderContents ClippEU = Just ["DjvuFilePath"]
|
||||
inHeaderContents _ = Just ["OrbitalPeriod", "OrbitalEccentricity", "NumberOfMoons"]
|
||||
|
||||
outHeaderContents :: Metric -> Maybe [String]
|
||||
outHeaderContents (Mean metric) = outHeaderContents metric
|
||||
outHeaderContents BLEU = Nothing
|
||||
outHeaderContents GLEU = Nothing
|
||||
outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
|
||||
outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
|
||||
outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
|
||||
outHeaderContents (ProbabilisticSoftFMeasure b) = outHeaderContents (SoftFMeasure b)
|
||||
outHeaderContents (SoftFMeasure _) = Just ["NamesFound"]
|
||||
outHeaderContents NMI = Just ["LanguageCode"]
|
||||
outHeaderContents (LikelihoodHashed b) = outHeaderContents (LogLossHashed b)
|
||||
outHeaderContents (LogLossHashed _) = Just ["GuessedWord"]
|
||||
outHeaderContents CharMatch = Just ["NormalizedText"]
|
||||
outHeaderContents MAP = Nothing
|
||||
outHeaderContents Likelihood = outHeaderContents LogLoss
|
||||
outHeaderContents LogLoss = Just ["Probability"]
|
||||
outHeaderContents BIOF1Labels = outHeaderContents BIOF1
|
||||
outHeaderContents BIOF1 = Just ["BIOOutput"]
|
||||
outHeaderContents TokenAccuracy = Just ["PartsOfSpeech"]
|
||||
outHeaderContents SegmentAccuracy = Just ["PartsOfSpeech"]
|
||||
outHeaderContents (ProbabilisticMultiLabelFMeasure beta) = outHeaderContents (MultiLabelFMeasure beta)
|
||||
outHeaderContents (MultiLabelFMeasure _) = Just ["Entities"]
|
||||
outHeaderContents MultiLabelLikelihood = outHeaderContents MultiLabelLogLoss
|
||||
outHeaderContents MultiLabelLogLoss = Just ["Emotion"]
|
||||
outHeaderContents (Soft2DFMeasure _) = Just ["Rectangle"]
|
||||
outHeaderContents ClippEU = Just ["Rectangle"]
|
||||
outHeaderContents _ = Just ["Mass"]
|
||||
|
||||
gitignoreContents :: String
|
||||
gitignoreContents = [hereLit|
|
||||
|
@ -73,6 +73,8 @@ import Data.Statistics.Kendall (kendallZ)
|
||||
|
||||
import Data.Conduit.Binary (sourceFile)
|
||||
|
||||
import Data.Conduit.Header
|
||||
|
||||
import qualified Data.HashMap.Strict as H
|
||||
import qualified Data.Map.Strict as M
|
||||
import qualified Data.Set as S
|
||||
@ -463,7 +465,10 @@ runLineByLineGeneralized ordering spec consum = do
|
||||
return $ Just references
|
||||
Nothing -> return Nothing
|
||||
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
|
||||
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
|
||||
mInHeader <- readHeaderFileWrapper $ getInHeader spec
|
||||
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
|
||||
let mOutHeader = Nothing
|
||||
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
|
||||
where metric = gesMainMetric spec
|
||||
scheme = gesMainScheme spec
|
||||
mSelector = gesSelector spec
|
||||
@ -517,7 +522,9 @@ runMultiOutputGeneralized spec consum = do
|
||||
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
|
||||
let altSourceSpecs = rights altSourceSpecs'
|
||||
let sourceSpecs = (outSource:altSourceSpecs)
|
||||
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess inputSource expectedSource) sourceSpecs
|
||||
mInHeader <- readHeaderFileWrapper $ getInHeader spec
|
||||
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
|
||||
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource) sourceSpecs
|
||||
runResourceT $ runConduit $
|
||||
(sequenceSources sources .| consum)
|
||||
where metric = gesMainMetric spec
|
||||
@ -545,13 +552,15 @@ runDiffGeneralized :: ResultOrdering -> FilePath -> GEvalSpecification -> (Maybe
|
||||
runDiffGeneralized ordering otherOut spec consum = do
|
||||
(inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
|
||||
ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut
|
||||
mInHeader <- readHeaderFileWrapper $ getInHeader spec
|
||||
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
|
||||
case ooss of
|
||||
Left NoSpecGiven -> throwM $ NoOutFile otherOut
|
||||
Left (NoFile fp) -> throwM $ NoOutFile fp
|
||||
Left (NoDirectory d) -> throwM $ NoOutFile otherOut
|
||||
Right otherOutSource -> do
|
||||
let sourceA = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource otherOutSource
|
||||
let sourceB = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource
|
||||
let sourceA = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource otherOutSource
|
||||
let sourceB = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource
|
||||
runResourceT $ runConduit $
|
||||
((getZipSource $ (,)
|
||||
<$> ZipSource sourceA
|
||||
@ -573,27 +582,44 @@ runDiffGeneralized ordering otherOut spec consum = do
|
||||
escapeTabs :: Text -> Text
|
||||
escapeTabs = Data.Text.replace "\t" "<tab>"
|
||||
|
||||
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
|
||||
gevalLineByLineCore metric mSelector preprocess inputSource expectedSource outSource consum =
|
||||
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> (Maybe TabularHeader) -> (Maybe TabularHeader) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
|
||||
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource consum =
|
||||
runResourceT $ runConduit $
|
||||
((gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource) .| consum)
|
||||
((gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource) .| consum)
|
||||
|
||||
gevalLineByLineSource :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
|
||||
gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource =
|
||||
gevalLineByLineSource :: Metric
|
||||
-> Maybe Selector
|
||||
-> (Text -> Text)
|
||||
-> (Maybe TabularHeader)
|
||||
-> (Maybe TabularHeader)
|
||||
-> SourceSpec
|
||||
-> SourceSpec
|
||||
-> SourceSpec
|
||||
-> ConduitT () LineRecord (ResourceT IO) ()
|
||||
gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource =
|
||||
(getZipSource $ (,)
|
||||
<$> ZipSource (CL.sourceList [1..])
|
||||
<*> (ZipSource $ threeLineSource context)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
|
||||
where context = (WithInput inputLineSource expectedLineSource outputLineSource)
|
||||
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
|
||||
inputLineSource = fileAsLineSource inputSource mSelector id
|
||||
expectedLineSource = fileAsLineSource expectedSource mSelector id
|
||||
outputLineSource = fileAsLineSource outSource mSelector id
|
||||
inputLineSource = fileAsLineSource inputSource inOptions
|
||||
expectedLineSource = fileAsLineSource expectedSource outOptions
|
||||
outputLineSource = fileAsLineSource outSource outOptions
|
||||
justLine (LineInFile _ _ l) = l
|
||||
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
|
||||
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
|
||||
(getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
|
||||
(getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
|
||||
return $ LineRecord inp exp out lineNo (extractSimpleRunValue $ getMetricValue s)
|
||||
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
|
||||
outOptions = FileProcessingOptions {
|
||||
fileProcessingOptionsSelector = mSelector,
|
||||
fileProcessingOptionsPreprocess = id,
|
||||
fileProcessingOptionsHeader = mOutHeader }
|
||||
inOptions = FileProcessingOptions {
|
||||
fileProcessingOptionsSelector = mSelector,
|
||||
fileProcessingOptionsPreprocess = id,
|
||||
fileProcessingOptionsHeader = mInHeader }
|
||||
|
||||
|
||||
justTokenize :: Maybe Tokenizer -> IO ()
|
||||
justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"
|
||||
|
@ -214,7 +214,14 @@ specParser = GEvalSpecification
|
||||
<> short 'B'
|
||||
<> metavar "NUMBER-OF-SAMPLES"
|
||||
<> help "Tests on NUMBER-OF-SAMPLES bootstrap samples rather than just on the whole test set" ))
|
||||
|
||||
<*> ( optional . strOption $
|
||||
( long "in-header"
|
||||
<> metavar "FILE"
|
||||
<> help "One-line TSV file specifying a list of field names for input files"))
|
||||
<*> ( optional . strOption $
|
||||
( long "out-header"
|
||||
<> metavar "FILE"
|
||||
<> help "One-line TSV file specifying a list of field names for output and expected files"))
|
||||
|
||||
defaultMinFrequency :: Integer
|
||||
defaultMinFrequency = 1
|
||||
|
@ -446,6 +446,9 @@ main = hspec $ do
|
||||
describe "automatic decompression" $ do
|
||||
it "more complex test" $ do
|
||||
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
|
||||
describe "headers" $ do
|
||||
it "simple" $ do
|
||||
runGEvalTest "mse-simple-headers" `shouldReturnAlmost` 0.4166666666666667
|
||||
describe "handling jsonl format" $ do
|
||||
it "simple test" $
|
||||
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428
|
||||
@ -467,7 +470,9 @@ main = hspec $ do
|
||||
gesToken = Nothing,
|
||||
gesGonitoGitAnnexRemote = Nothing,
|
||||
gesReferences = Nothing,
|
||||
gesBootstrapResampling = Nothing }
|
||||
gesBootstrapResampling = Nothing,
|
||||
gesInHeader = Nothing,
|
||||
gesOutHeader = Nothing }
|
||||
it "simple test" $ do
|
||||
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
|
||||
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",
|
||||
|
@ -0,0 +1,3 @@
|
||||
3.0
|
||||
3.0
|
||||
2.0
|
|
1
test/mse-simple-headers/mse-simple-headers/config.txt
Normal file
1
test/mse-simple-headers/mse-simple-headers/config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric MSE --out-header out-header.tsv
|
@ -0,0 +1 @@
|
||||
SomeValue
|
|
@ -0,0 +1,4 @@
|
||||
SomeValue
|
||||
2.0
|
||||
3.0
|
||||
1.5
|
|
Loading…
Reference in New Issue
Block a user