Introduce DataSource

This commit is contained in:
Filip Gralinski 2020-05-12 17:13:52 +02:00
parent 8f3550493c
commit 454f60812c
3 changed files with 73 additions and 25 deletions

View File

@ -54,6 +54,7 @@ library
, Data.Conduit.Bootstrap
, GEval.Formatting
, Data.Conduit.Header
, GEval.DataSource
, Paths_geval
build-depends: base >= 4.7 && < 5
, cond

View File

@ -106,6 +106,7 @@ import GEval.Selector
import GEval.Annotation
import GEval.BlackBoxDebugging
import Data.Conduit.Bootstrap
import GEval.DataSource
import qualified Data.HashMap.Strict as M
import qualified Data.Vector as V
@ -265,31 +266,44 @@ data LineSource m = LineSource (ConduitT () Text m ()) (Text -> ItemTarget) (Tex
geval :: GEvalSpecification -> IO [(SourceSpec, [MetricOutput])]
geval gevalSpec = do
mInHeader <- readHeaderFileWrapper $ getInHeader gevalSpec
mOutHeader <- readHeaderFileWrapper $ getOutHeader gevalSpec
(inputSource, expectedSource, outSources) <- checkAndGetFiles False gevalSpec
results <- Prelude.mapM (gevalOnSingleOut gevalSpec inputSource expectedSource) outSources
let chDataSource = ChallengeDataSource {
challengeDataSourceInput = inputSource,
challengeDataSourceExpected = expectedSource,
challengeDataSourceSelector = gesSelector gevalSpec,
challengeDataSourcePreprocess = gesPreprocess gevalSpec,
challengeDataSourceFilter = Nothing,
challengeDataSourceInHeader = mInHeader,
challengeDataSourceOutHeader = mOutHeader }
results <- Prelude.mapM (\outSource -> gevalOnSingleOut gevalSpec
DataSource {
dataSourceChallengeData = chDataSource,
dataSourceOut = outSource }) outSources
return $ sortBy (\a b -> (show $ fst a) `naturalComp` (show $ fst b)) results
noGraph :: d -> Maybe GraphSeries
noGraph = const Nothing
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
mInHeader <- readHeaderFileWrapper $ getInHeader gevalSpec
mOutHeader <- readHeaderFileWrapper $ getOutHeader gevalSpec
vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme)
mSelector
(preprocess . applyPreprocessingOperations scheme)
mInHeader
mOutHeader
(gesBootstrapResampling gevalSpec)
inputSource
expectedSource
outSource) schemes
gevalOnSingleOut :: GEvalSpecification -> DataSource -> IO (SourceSpec, [MetricOutput])
gevalOnSingleOut gevalSpec dataSource = do
vals <- Prelude.mapM (\scheme ->
gevalCore (evaluationSchemeMetric scheme)
(gesBootstrapResampling gevalSpec)
(addPreprocessing (applyPreprocessingOperations scheme) dataSource))
schemes
return (outSource, vals)
where schemes = gesMetrics gevalSpec
preprocess = gesPreprocess gevalSpec
mSelector = gesSelector gevalSpec
where outSource = dataSourceOut dataSource
schemes = gesMetrics gevalSpec
addPreprocessing :: (Text -> Text) -> DataSource -> DataSource
addPreprocessing prep dataSource =
dataSource {
dataSourceChallengeData = (dataSourceChallengeData dataSource) {
challengeDataSourcePreprocess =
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . prep }}
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
readHeaderFileWrapper Nothing = return Nothing
@ -468,16 +482,10 @@ handleBootstrap _ = True
-- Throws @GEvalException@ if something was wrong in the data (e.g.
-- inconsistent number of lines in the sources).
gevalCore :: Metric -- ^ evaluation metric
-> Maybe Selector -- ^ selector to be used
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> (Maybe TabularHeader) -- ^ header for input
-> (Maybe TabularHeader) -- ^ header for output/expected files
-> (Maybe Int) -- ^ number of bootstrap samples
-> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output
-> DataSource
-> IO (MetricOutput) -- ^ metric value for the output against the expected output
gevalCore metric mSelector preprocess mInHeader mOutHeader mBootstrapResampling inputSource expectedSource outSource = do
gevalCore metric mBootstrapResampling dataSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
go metric
(fileAsLineSource inputSource inOptions)
@ -496,6 +504,16 @@ gevalCore metric mSelector preprocess mInHeader mOutHeader mBootstrapResampling
fileProcessingOptionsSelector = mSelector,
fileProcessingOptionsPreprocess = preprocess,
fileProcessingOptionsHeader = mInHeader }
challengeDataSource = dataSourceChallengeData dataSource
mSelector = challengeDataSourceSelector challengeDataSource
preprocess = challengeDataSourcePreprocess challengeDataSource
mInHeader = challengeDataSourceInHeader challengeDataSource
mOutHeader = challengeDataSourceOutHeader challengeDataSource
inputSource = challengeDataSourceInput challengeDataSource
expectedSource = challengeDataSourceExpected challengeDataSource
outSource = dataSourceOut dataSource
isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath

29
src/GEval/DataSource.hs Normal file
View File

@ -0,0 +1,29 @@
module GEval.DataSource
(ChallengeDataSource(..),
DataSource(..))
where
import Data.Text
import Data.Conduit.SmartSource
import Data.Conduit.Header
import GEval.Selector
-- | This type specifies the way the challenge data (input and
-- expected data, but not outputs) flow into evaluation.
--
-- At some point, it is turned into conduit for reading data.
data ChallengeDataSource = ChallengeDataSource {
challengeDataSourceInput :: SourceSpec,
challengeDataSourceExpected :: SourceSpec,
challengeDataSourceSelector :: Maybe Selector,
challengeDataSourcePreprocess :: Text -> Text,
challengeDataSourceFilter :: Maybe (Text -> Bool),
challengeDataSourceInHeader :: Maybe TabularHeader,
challengeDataSourceOutHeader :: Maybe TabularHeader }
-- | This type specifies all the data flowing into evaluation,
-- including the output data to be evaluated.
data DataSource = DataSource {
dataSourceChallengeData :: ChallengeDataSource,
dataSourceOut :: SourceSpec }