Handle jsonl files

This commit is contained in:
Filip Gralinski 2019-02-13 17:53:30 +01:00 committed by Filip Graliński
parent 9bfcb3bbde
commit 26e9735d31
10 changed files with 207 additions and 55 deletions

View File

@ -40,6 +40,7 @@ library
, GEval.BlackBoxDebugging , GEval.BlackBoxDebugging
, Text.WordShape , Text.WordShape
, Data.Statistics.Kendall , Data.Statistics.Kendall
, GEval.Selector
, Paths_geval , Paths_geval
build-depends: base >= 4.7 && < 5 build-depends: base >= 4.7 && < 5
, cond , cond
@ -82,6 +83,8 @@ library
, array , array
, Munkres , Munkres
, vector-algorithms , vector-algorithms
, aeson
, aeson-pretty
default-language: Haskell2010 default-language: Haskell2010
executable geval executable geval

View File

@ -102,8 +102,10 @@ checkRefFormat ref =
isUnwantedChar '\177' = True isUnwantedChar '\177' = True
isUnwantedChar c = ord c < 32 isUnwantedChar c = ord c < 32
compressedFilesHandled = [".gz", ".xz", ".bz2"]
lookForCompressedFiles :: FilePath -> IO FilePath lookForCompressedFiles :: FilePath -> IO FilePath
lookForCompressedFiles = lookForAlternativeFiles [".gz", ".xz", ".bz2"] lookForCompressedFiles = lookForAlternativeFiles compressedFilesHandled
lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath
lookForAlternativeFiles suffixes filePath lookForAlternativeFiles suffixes filePath

View File

@ -39,7 +39,8 @@ module GEval.Core
checkMultipleOuts, checkMultipleOuts,
checkMultipleOutsCore, checkMultipleOutsCore,
gesMainMetric, gesMainMetric,
gesPreprocess gesPreprocess,
getDataDecoder
) where ) where
import Data.Conduit import Data.Conduit
@ -58,9 +59,10 @@ import qualified System.Directory as D
import System.Posix import System.Posix
import System.FilePath import System.FilePath
import Data.Maybe import Data.Maybe
import Data.Either (rights)
import Data.Tuple import Data.Tuple
import qualified Data.List.Split as DLS import qualified Data.List.Split as DLS
import Data.List (sortBy) import Data.List (sortBy, isSuffixOf)
import Text.NaturalComp import Text.NaturalComp
import Control.Monad.IO.Class import Control.Monad.IO.Class
@ -84,6 +86,7 @@ import GEval.ProbList
import GEval.WER import GEval.WER
import Data.Conduit.AutoDecompress import Data.Conduit.AutoDecompress
import Text.Tokenizer import Text.Tokenizer
import GEval.Selector
import GEval.Annotation import GEval.Annotation
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
@ -257,6 +260,7 @@ data GEvalSpecification = GEvalSpecification
{ gesOutDirectory :: FilePath, { gesOutDirectory :: FilePath,
gesExpectedDirectory :: Maybe FilePath, gesExpectedDirectory :: Maybe FilePath,
gesTestName :: String, gesTestName :: String,
gesSelector :: Maybe Selector,
gesOutFile :: String, gesOutFile :: String,
gesExpectedFile :: String, gesExpectedFile :: String,
gesInputFile :: String, gesInputFile :: String,
@ -341,6 +345,7 @@ defaultGEvalSpecification = GEvalSpecification {
gesOutDirectory = defaultOutDirectory, gesOutDirectory = defaultOutDirectory,
gesExpectedDirectory = Nothing, gesExpectedDirectory = Nothing,
gesTestName = defaultTestName, gesTestName = defaultTestName,
gesSelector = Nothing,
gesOutFile = defaultOutFile, gesOutFile = defaultOutFile,
gesExpectedFile = defaultExpectedFile, gesExpectedFile = defaultExpectedFile,
gesInputFile = defaultInputFile, gesInputFile = defaultInputFile,
@ -356,8 +361,13 @@ isEmptyFile path = do
stat <- getFileStatus path stat <- getFileStatus path
return ((fileSize stat) == 0) return ((fileSize stat) == 0)
-- | Extensions handled (tried) by default. Files with other
-- extensions are handled only when given explicitly.
-- Compressor extensions (e.g. "gz") should not be given here.
extensionsHandled :: [String]
extensionsHandled = ["tsv", "jsonl"]
data LineSource m = LineSource (ConduitT () Text m ()) (Text -> Text) SourceSpec Word32 data LineSource m = LineSource (ConduitT () Text m ()) (Text -> ItemTarget) (Text -> Text) SourceSpec Word32
geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])] geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])]
geval gevalSpec = do geval gevalSpec = do
@ -367,10 +377,11 @@ geval gevalSpec = do
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue]) gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
vals <- Prelude.mapM (\metric -> gevalCore metric preprocess inputSource expectedSource outSource) metrics vals <- Prelude.mapM (\metric -> gevalCore metric mSelector preprocess inputSource expectedSource outSource) metrics
return (outSource, vals) return (outSource, vals)
where metrics = gesMetrics gevalSpec where metrics = gesMetrics gevalSpec
preprocess = gesPreprocess gevalSpec preprocess = gesPreprocess gevalSpec
mSelector = gesSelector gevalSpec
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec) checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
checkAndGetFilesSingleOut forceInput gevalSpec = do checkAndGetFilesSingleOut forceInput gevalSpec = do
@ -397,7 +408,7 @@ checkAndGetFiles forceInput gevalSpec = do
osss <- case mMultipleOuts of osss <- case mMultipleOuts of
Just filePaths -> return $ Prelude.map (\fp -> FilePathSpec fp) filePaths Just filePaths -> return $ Prelude.map (\fp -> FilePathSpec fp) filePaths
Nothing -> do Nothing -> do
oss <- getSmartSourceSpec outTestDirectory "out.tsv" outFile oss <- checkSingleOut outTestDirectory outFile
case oss of case oss of
Left NoSpecGiven -> throwM $ NoOutFile outFile Left NoSpecGiven -> throwM $ NoOutFile outFile
Left (NoFile fp) -> throwM $ NoOutFile fp Left (NoFile fp) -> throwM $ NoOutFile fp
@ -418,22 +429,39 @@ checkAndGetFiles forceInput gevalSpec = do
inputFile = gesInputFile gevalSpec inputFile = gesInputFile gevalSpec
metrics = gesMetrics gevalSpec metrics = gesMetrics gevalSpec
checkSingleOut :: FilePath -> FilePath -> IO (Either SmartSourceError SourceSpec)
checkSingleOut outTestDirectory outFile
| outFile == defaultOutFile = do
-- if the default output file name is used try alternative formats (e.g. jsonl)
specs <- Prelude.mapM (\ext -> getSmartSourceSpec outTestDirectory defaultOutFile (outFile -<.> ext)) extensionsHandled
return $ case rights specs of
[] -> Prelude.head specs
rspecs@_ -> Right $ Prelude.head rspecs
| otherwise = getSmartSourceSpec outTestDirectory defaultOutFile outFile
checkMultipleOuts :: GEvalSpecification -> IO (Maybe [FilePath]) checkMultipleOuts :: GEvalSpecification -> IO (Maybe [FilePath])
checkMultipleOuts gevalSpec = checkMultipleOutsCore outDirectory testName outFile checkMultipleOuts gevalSpec = checkMultipleOutsCore outDirectory testName outFile
where outFile = gesOutFile gevalSpec where outFile = gesOutFile gevalSpec
outDirectory = gesOutDirectory gevalSpec outDirectory = gesOutDirectory gevalSpec
testName = gesTestName gevalSpec testName = gesTestName gevalSpec
-- | Looks for multiple output files.
checkMultipleOutsCore :: FilePath -> FilePath -> FilePath -> IO (Maybe [FilePath]) checkMultipleOutsCore :: FilePath -> FilePath -> FilePath -> IO (Maybe [FilePath])
checkMultipleOutsCore outDirectory testName outFile = do checkMultipleOutsCore outDirectory testName outFile = do
-- if the out.tsv is there, just use it -- if the out.tsv is there (possibly with an alternative extension,
outFilePath <- lookForCompressedFiles (outTestDirectory </> outFile) -- e.g. jsonl and compressed), just use it - but here we just check
isSimpleOutThere <- D.doesFileExist outFilePath -- this (`Nothing` will be returned in such a case, anyway)
outFilePaths <- Prelude.mapM (\ext -> lookForCompressedFiles (outTestDirectory </> outFile -<.> ext))
extensionsHandled
isSimpleOutTheres <- Prelude.mapM D.doesFileExist outFilePaths
let isSimpleOutThere = Prelude.and isSimpleOutTheres
let patterns = Prelude.map (\ext -> compile ("out-*.tsv" ++ ext)) ["", ".gz", ".bz2", ".xz"] let patterns = [compile ("out-*" <.> dataExt ++ compressorExt) |
dataExt <- extensionsHandled,
compressorExt <- ("":compressedFilesHandled)]
multipleOuts <- Prelude.concat <$> globDir patterns outTestDirectory multipleOuts <- Prelude.concat <$> globDir patterns outTestDirectory
if outFile == "out.tsv" && not isSimpleOutThere && multipleOuts /= [] if outFile == defaultOutFile && not isSimpleOutThere && multipleOuts /= []
then then
return $ Just multipleOuts return $ Just multipleOuts
else else
@ -457,39 +485,62 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
Right sourceSpec -> return sourceSpec Right sourceSpec -> return sourceSpec
| otherwise = return NoSource | otherwise = return NoSource
fileAsLineSource :: SourceSpec -> (Text -> Text) -> LineSource (ResourceT IO) fileAsLineSource :: SourceSpec -> Maybe Selector -> (Text -> Text) -> LineSource (ResourceT IO)
fileAsLineSource spec preprocess = fileAsLineSource spec mSelector preprocess =
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines .| CC.map (dropAround (== '\r'))) preprocess spec 1 LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines) (select (getDataFormat spec) mSelector) preprocess spec 1
gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue) getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
gevalCoreOnSingleLines metric preprocess inpLine expLine outLine = getDataDecoder (LineSource _ dd _ _ _) = dd
gevalCoreOnSources metric (singleLineAsLineSource inpLine preprocess)
(singleLineAsLineSource expLine outputPreprocess) getDataFormat :: SourceSpec -> DataFormat
(singleLineAsLineSource outLine outputPreprocess) getDataFormat (FilePathSpec filePath) = getDataFormatFromFilePath filePath
getDataFormat Stdin = Tsv
getDataFormat NoSource = Tsv
getDataFormat (Http url) = getDataFormatFromFilePath url
getDataFormat (Https url) = getDataFormatFromFilePath url
getDataFormatFromFilePath :: FilePath -> DataFormat
getDataFormatFromFilePath path =
case takeExtensions path' of
".jsonl" -> Jsonl
_ -> Tsv
where path' = if Prelude.or $ Prelude.map (\ext -> ext `Data.List.isSuffixOf` path)
compressedFilesHandled
then dropExtension path
else path
dataDecoder fmt mSelector = CC.map (select fmt mSelector)
gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric preprocess inpDecoder inpLine expDecoder expLine outDecoder outLine =
gevalCoreOnSources metric (singleLineAsLineSource inpLine inpDecoder preprocess)
(singleLineAsLineSource expLine expDecoder outputPreprocess)
(singleLineAsLineSource outLine outDecoder outputPreprocess)
where outputPreprocess = if isPreprocessable metric where outputPreprocess = if isPreprocessable metric
then preprocess then preprocess
else id else id
singleLineAsLineSource :: LineInFile -> (Text -> Text) -> LineSource (ResourceT IO) singleLineAsLineSource :: LineInFile -> (Text -> ItemTarget) -> (Text -> Text) -> LineSource (ResourceT IO)
singleLineAsLineSource (LineInFile sourceSpec lineNo line) preprocess = singleLineAsLineSource (LineInFile sourceSpec lineNo line) itemDecoder preprocess =
LineSource (CL.sourceList [line]) preprocess sourceSpec lineNo LineSource (CL.sourceList [line]) itemDecoder preprocess sourceSpec lineNo
-- | Runs evaluation for a given metric using the sources specified -- | Runs evaluation for a given metric using the sources specified
-- for input, expected output and output. Returns the metric value. -- for input, expected output and output. Returns the metric value.
-- Throws @GEvalException@ if something was wrong in the data (e.g. -- Throws @GEvalException@ if something was wrong in the data (e.g.
-- inconsistent number of lines in the sources). -- inconsistent number of lines in the sources).
gevalCore :: Metric -- ^ evaluation metric gevalCore :: Metric -- ^ evaluation metric
-> Maybe Selector -- ^ selector to be used
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization) -> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> SourceSpec -- ^ source specification for the input values -> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output -> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output -> SourceSpec -- ^ source specification for the output
-> IO (MetricValue) -- ^ metric value for the output against the expected output -> IO (MetricValue) -- ^ metric value for the output against the expected output
gevalCore metric preprocess inputSource expectedSource outSource = do gevalCore metric mSelector preprocess inputSource expectedSource outSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
gevalCoreOnSources metric gevalCoreOnSources metric
(fileAsLineSource inputSource preprocess) (fileAsLineSource inputSource mSelector preprocess)
(fileAsLineSource expectedSource preprocess) (fileAsLineSource expectedSource mSelector preprocess)
(fileAsLineSource outSource preprocess) (fileAsLineSource outSource mSelector preprocess)
isEmptyFileSource :: SourceSpec -> IO Bool isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
@ -687,7 +738,7 @@ gevalCore' MAP _ = gevalCoreWithoutInput (Right . DLS.splitOn "\t" . unpack)
gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits
-- for LogLossHashed we "salt" each hash with the line number -- for LogLossHashed we "salt" each hash with the line number
where helper nbOfBits expectedLineSource outLineSource = where helper nbOfBits expectedLineSource outLineSource =
gevalCore''' (ParserSpecWithoutInput (Right . id) tentativeParser) (\(lineNo, (t,d)) -> calculateLogLoss nbOfBits lineNo t (parseDistributionWrapper nbOfBits lineNo d)) averageC negate (WithoutInput expectedLineSource outLineSource) gevalCore''' (ParserSpecWithoutInput (liftOp (Right . id)) (liftOp tentativeParser)) (\(lineNo, (t,d)) -> calculateLogLoss nbOfBits lineNo t (parseDistributionWrapper nbOfBits lineNo d)) averageC negate (WithoutInput expectedLineSource outLineSource)
-- Unfortunately, we're parsing the distribution twice. We need to -- Unfortunately, we're parsing the distribution twice. We need to
-- tentatively parse the distribution when the line number is unknown -- tentatively parse the distribution when the line number is unknown
-- (so we just set it to 1) -- (so we just set it to 1)
@ -699,8 +750,10 @@ gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits
gevalCore' CharMatch inputLineSource = helper inputLineSource gevalCore' CharMatch inputLineSource = helper inputLineSource
where where
helper inputLineSource expectedLineSource outputLineSource = do helper inputLineSource expectedLineSource outputLineSource = do
gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource) gevalCoreGeneralized (ParserSpecWithInput justUnpack justUnpack justUnpack) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
justUnpack = liftOp (Right . unpack)
gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
@ -784,7 +837,7 @@ gevalCoreWithoutInput :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
-> LineSource (ResourceT m) -- ^ source to read the output -> LineSource (ResourceT m) -- ^ source to read the output
-> m (MetricValue) -- ^ metric values for the output against the expected output -> m (MetricValue) -- ^ metric values for the output against the expected output
gevalCoreWithoutInput expParser outParser itemStep aggregator finalStep expectedLineStream outLineStream = gevalCoreWithoutInput expParser outParser itemStep aggregator finalStep expectedLineStream outLineStream =
gevalCoreGeneralized (ParserSpecWithoutInput expParser outParser) (trans itemStep) aggregator finalStep (WithoutInput expectedLineStream outLineStream) gevalCoreGeneralized (ParserSpecWithoutInput (liftOp expParser) (liftOp outParser)) (trans itemStep) aggregator finalStep (WithoutInput expectedLineStream outLineStream)
where where
trans :: ((a, b) -> c) -> ParsedRecord (WithoutInput m a b) -> c trans :: ((a, b) -> c) -> ParsedRecord (WithoutInput m a b) -> c
trans step (ParsedRecordWithoutInput x y) = step (x, y) trans step (ParsedRecordWithoutInput x y) = step (x, y)
@ -838,12 +891,12 @@ class EvaluationContext ctxt m where
data WithoutInput m e o = WithoutInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) data WithoutInput m e o = WithoutInput (LineSource (ResourceT m)) (LineSource (ResourceT m))
instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithoutInput m e o) m where instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithoutInput m e o) m where
data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (Text -> Either String e) (Text -> Either String o) data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (ItemTarget -> Either String e) (ItemTarget -> Either String o)
data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o) data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o)
data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o
getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ lineNo)) = lineNo getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ _ lineNo)) = lineNo
getExpectedSource (WithoutInput (LineSource _ _ expectedSource _) _) = expectedSource getExpectedSource (WithoutInput (LineSource _ _ _ expectedSource _) _) = expectedSource
getOutSource (WithoutInput _ (LineSource _ _ outSource _)) = outSource getOutSource (WithoutInput _ (LineSource _ _ _ outSource _)) = outSource
recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput
<$> ZipSource (items expectedLineSource expParser) <$> ZipSource (items expectedLineSource expParser)
<*> ZipSource (items outLineSource outParser) <*> ZipSource (items outLineSource outParser)
@ -864,15 +917,15 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou
data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m)) data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m))
getInputFilePath (WithInput (LineSource _ _ inputFilePath _) _ _) = inputFilePath getInputFilePath (WithInput (LineSource _ _ _ inputFilePath _) _ _) = inputFilePath
instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where
data ParserSpec (WithInput m i e o) = ParserSpecWithInput (Text -> Either String i) (Text -> Either String e) (Text -> Either String o) data ParserSpec (WithInput m i e o) = ParserSpecWithInput (ItemTarget -> Either String i) (ItemTarget -> Either String e) (ItemTarget -> Either String o)
data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o) data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o)
data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o
getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ lineNo)) = lineNo getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ _ lineNo)) = lineNo
getExpectedSource (WithInput _ (LineSource _ _ expectedSource _) _) = expectedSource getExpectedSource (WithInput _ (LineSource _ _ _ expectedSource _) _) = expectedSource
getOutSource (WithInput _ _ (LineSource _ _ outSource _)) = outSource getOutSource (WithInput _ _ (LineSource _ _ _ outSource _)) = outSource
recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z) recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z)
<$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,) <$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,)
<$> ZipSource (items expectedLineSource expParser) <$> ZipSource (items expectedLineSource expParser)
@ -906,11 +959,13 @@ averageC = getZipSink
<$> ZipSink CC.sum <$> ZipSink CC.sum
<*> ZipSink CC.length <*> ZipSink CC.length
items :: MonadResource m => LineSource m -> (Text -> Either String a) -> ConduitT () (SourceItem a) m () items :: MonadResource m => LineSource m -> (ItemTarget -> Either String a) -> ConduitT () (SourceItem a) m ()
items (LineSource lineSource preprocess _ _) parser = items (LineSource lineSource itemDecoder preprocess _ _) parser =
(lineSource .| CL.map (toItem . parser . preprocess)) >> yield Done (lineSource .| CL.map (toItem . parser . preprocess' . itemDecoder)) >> yield Done
where toItem (Right x) = Got x where toItem (Right x) = Got x
toItem (Left m) = Wrong m toItem (Left m) = Wrong m
preprocess' (RawItemTarget t) = RawItemTarget $ preprocess t
preprocess' (PartiallyParsedItemTarget ts) = PartiallyParsedItemTarget $ Prelude.map preprocess ts
itemAbsoluteError :: (Double, Double) -> Double itemAbsoluteError :: (Double, Double) -> Double
itemAbsoluteError (exp, out) = abs (exp-out) itemAbsoluteError (exp, out) = abs (exp-out)

View File

@ -47,6 +47,7 @@ import Data.Monoid ((<>))
import GEval.FeatureExtractor import GEval.FeatureExtractor
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
import GEval.Selector
import Data.Word import Data.Word
@ -329,8 +330,9 @@ lessByMetric reversed metric = lessByMetric' reversed (getMetricOrdering metric)
runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
runLineByLineGeneralized ordering spec consum = do runLineByLineGeneralized ordering spec consum = do
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec (inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum) gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
where metric = gesMainMetric spec where metric = gesMainMetric spec
mSelector = gesSelector spec
preprocess = gesPreprocess spec preprocess = gesPreprocess spec
sorter KeepTheOriginalOrder = doNothing sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric)) sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
@ -387,14 +389,15 @@ runDiffGeneralized ordering otherOut spec consum = do
Left (NoFile fp) -> throwM $ NoOutFile fp Left (NoFile fp) -> throwM $ NoOutFile fp
Left (NoDirectory d) -> throwM $ NoOutFile otherOut Left (NoDirectory d) -> throwM $ NoOutFile otherOut
Right otherOutSource -> do Right otherOutSource -> do
let sourceA = gevalLineByLineSource metric preprocess inputSource expectedSource otherOutSource let sourceA = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric preprocess inputSource expectedSource outSource let sourceB = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource
runResourceT $ runConduit $ runResourceT $ runConduit $
((getZipSource $ (,) ((getZipSource $ (,)
<$> ZipSource sourceA <$> ZipSource sourceA
<*> ZipSource sourceB) .| sorter ordering .| consum) <*> ZipSource sourceB) .| sorter ordering .| consum)
where metric = gesMainMetric spec where metric = gesMainMetric spec
preprocess = gesPreprocess spec preprocess = gesPreprocess spec
mSelector = gesSelector spec
sorter KeepTheOriginalOrder = doNothing sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric)) sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
sortOrder FirstTheWorst TheHigherTheBetter = compareScores sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -408,26 +411,26 @@ runDiffGeneralized ordering otherOut spec consum = do
escapeTabs :: Text -> Text escapeTabs :: Text -> Text
escapeTabs = Data.Text.replace "\t" "<tab>" escapeTabs = Data.Text.replace "\t" "<tab>"
gevalLineByLineCore :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric preprocess inputSource expectedSource outSource consum = gevalLineByLineCore metric mSelector preprocess inputSource expectedSource outSource consum =
runResourceT $ runConduit $ runResourceT $ runConduit $
((gevalLineByLineSource metric preprocess inputSource expectedSource outSource) .| consum) ((gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource) .| consum)
gevalLineByLineSource :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) () gevalLineByLineSource :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric preprocess inputSource expectedSource outSource = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource =
(getZipSource $ (,) (getZipSource $ (,)
<$> ZipSource (CL.sourceList [1..]) <$> ZipSource (CL.sourceList [1..])
<*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes <*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id)) where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id))
context = (WithInput inputLineSource expectedLineSource outputLineSource) context = (WithInput inputLineSource expectedLineSource outputLineSource)
inputLineSource = fileAsLineSource inputSource id inputLineSource = fileAsLineSource inputSource mSelector id
expectedLineSource = fileAsLineSource expectedSource id expectedLineSource = fileAsLineSource expectedSource mSelector id
outputLineSource = fileAsLineSource outSource id outputLineSource = fileAsLineSource outSource mSelector id
justLine (LineInFile _ _ l) = l justLine (LineInFile _ _ l) = l
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (LineInFile inputSource lineNo inp) s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
(LineInFile expectedSource lineNo exp) (getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
(LineInFile outSource lineNo out) (getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo s return $ LineRecord inp exp out lineNo s
justTokenize :: Maybe Tokenizer -> IO () justTokenize :: Maybe Tokenizer -> IO ()

View File

@ -28,6 +28,7 @@ import GEval.CreateChallenge
import GEval.LineByLine import GEval.LineByLine
import GEval.Submit (submit) import GEval.Submit (submit)
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
import GEval.Selector
import Data.Conduit.SmartSource import Data.Conduit.SmartSource
@ -122,6 +123,7 @@ specParser = GEvalSpecification
<> showDefault <> showDefault
<> metavar "NAME" <> metavar "NAME"
<> help "Test name (i.e. subdirectory with results or expected results)" ) <> help "Test name (i.e. subdirectory with results or expected results)" )
<*> (optional $ selectorParser)
<*> strOption <*> strOption
( long "out-file" ( long "out-file"
<> short 'o' <> short 'o'
@ -172,6 +174,13 @@ specParser = GEvalSpecification
defaultMinFrequency :: Integer defaultMinFrequency :: Integer
defaultMinFrequency = 1 defaultMinFrequency = 1
selectorParser :: Parser Selector
selectorParser = parseSelector <$> (strOption $
( long "selector"
<> metavar "JSON_PATH"
<> help "Selector to an item to be considered"
))
blackBoxDebuggingOptionsParser :: Parser BlackBoxDebuggingOptions blackBoxDebuggingOptionsParser :: Parser BlackBoxDebuggingOptions
blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
<$> option auto <$> option auto

69
src/GEval/Selector.hs Normal file
View File

@ -0,0 +1,69 @@
{-# LANGUAGE OverloadedStrings #-}
module GEval.Selector
( Selector(..),
DataFormat(..),
ItemTarget(..),
liftOp,
select,
parseSelector ) where
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Text.Encoding as DTE
import Data.Aeson
import qualified Data.HashMap.Strict as H
import Data.Text.Encoding (encodeUtf8Builder)
import Data.ByteString.Builder(toLazyByteString)
import Data.Aeson.Encode.Pretty
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
data Selector = Selector [T.Text]
deriving (Eq, Show)
data DataFormat = Tsv | Jsonl
deriving (Eq, Show)
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
deriving (Eq, Show)
parseSelector :: String -> Selector
parseSelector = Selector . T.splitOn "/" . T.pack
liftOp :: (T.Text -> a) -> (ItemTarget -> a)
liftOp fun (RawItemTarget t) = fun t
liftOp fun (PartiallyParsedItemTarget t) = fun (T.intercalate " " t)
select :: DataFormat -> Maybe Selector -> T.Text -> ItemTarget
select _ Nothing t = RawItemTarget t
select Tsv (Just _) _ = error "selectors not handled for TSVs"
select Jsonl (Just selector) t = case selectInJson selector $ decode'' $ t of
Just v -> finalSelect v
Nothing -> error "selector failed"
finalSelect :: Value -> ItemTarget
finalSelect (Array array) = PartiallyParsedItemTarget $ V.toList $ V.map (\e -> DTE.decodeUtf8 $ toStrict $ encodePretty' encConfig e) array
finalSelect val = RawItemTarget $ DTE.decodeUtf8 $ toStrict $ encodePretty' encConfig val
encConfig = Config {
confIndent = Spaces 0,
confCompare = compare,
confNumFormat = Generic,
confTrailingNewline = False }
toStrict :: BL.ByteString -> B.ByteString
toStrict = B.concat . BL.toChunks
-- TODO get rid of this
decode'' :: FromJSON a => T.Text -> Maybe a
decode'' = decode . toLazyByteString . encodeUtf8Builder
selectInJson :: Selector -> Maybe Value -> Maybe Value
selectInJson _ Nothing = Nothing
selectInJson (Selector []) value = value
selectInJson (Selector (h:r)) (Just (Object object)) =
selectInJson (Selector r) (H.lookup h object)
selectInJson _ _ = Nothing

View File

@ -369,12 +369,16 @@ main = hspec $ do
describe "automatic decompression" $ do describe "automatic decompression" $ do
it "more complex test" $ do it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923 runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
describe "handling jsonl format" $ do
it "simple test" $
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.5
describe "line by line mode" $ do describe "line by line mode" $ do
let sampleChallenge = let sampleChallenge =
GEvalSpecification GEvalSpecification
{ gesOutDirectory = "test/likelihood-simple/likelihood-simple-solution", { gesOutDirectory = "test/likelihood-simple/likelihood-simple-solution",
gesExpectedDirectory = Just "test/likelihood-simple/likelihood-simple", gesExpectedDirectory = Just "test/likelihood-simple/likelihood-simple",
gesTestName = "test-A", gesTestName = "test-A",
gesSelector = Nothing,
gesOutFile = "out.tsv", gesOutFile = "out.tsv",
gesExpectedFile = "expected.tsv", gesExpectedFile = "expected.tsv",
gesInputFile = "in.tsv", gesInputFile = "in.tsv",

View File

@ -0,0 +1,3 @@
{"id": 0, "root":{"foo":"bar", "items":[{"aaa":12, "bbb":"x"}, {"aaa":14, "bbb":"a"}]}}
{"id": 1, "root":{"foo":"baz", "items":[{"aaa": 13, "bbb":"y"}]}}
{"id": 2, "root":{"foo":"baz", "items":[{"aaa":3, "bbb":"abc"}]}}

View File

@ -0,0 +1 @@
--metric MultiLabel-F1 --selector root/items

View File

@ -0,0 +1,3 @@
{"id": 0, "root":{"foo":"bar", "items":[{"aaa":12, "bbb":"xyz"}, {"aaa":14, "bbb":"a"}]}}
{"id": 1, "root":{"foo":"baz", "items":[]}}
{"id": 2, "root":{"foo":"baz", "items":[{"aaa":3, "bbb":"abc"}]}}