Handle jsonl files

This commit is contained in:
Filip Gralinski 2019-02-13 17:53:30 +01:00 committed by Filip Graliński
parent 9bfcb3bbde
commit 26e9735d31
10 changed files with 207 additions and 55 deletions

View File

@ -40,6 +40,7 @@ library
, GEval.BlackBoxDebugging
, Text.WordShape
, Data.Statistics.Kendall
, GEval.Selector
, Paths_geval
build-depends: base >= 4.7 && < 5
, cond
@ -82,6 +83,8 @@ library
, array
, Munkres
, vector-algorithms
, aeson
, aeson-pretty
default-language: Haskell2010
executable geval

View File

@ -102,8 +102,10 @@ checkRefFormat ref =
isUnwantedChar '\177' = True
isUnwantedChar c = ord c < 32
compressedFilesHandled = [".gz", ".xz", ".bz2"]
lookForCompressedFiles :: FilePath -> IO FilePath
lookForCompressedFiles = lookForAlternativeFiles [".gz", ".xz", ".bz2"]
lookForCompressedFiles = lookForAlternativeFiles compressedFilesHandled
lookForAlternativeFiles :: [String] -> FilePath -> IO FilePath
lookForAlternativeFiles suffixes filePath

View File

@ -39,7 +39,8 @@ module GEval.Core
checkMultipleOuts,
checkMultipleOutsCore,
gesMainMetric,
gesPreprocess
gesPreprocess,
getDataDecoder
) where
import Data.Conduit
@ -58,9 +59,10 @@ import qualified System.Directory as D
import System.Posix
import System.FilePath
import Data.Maybe
import Data.Either (rights)
import Data.Tuple
import qualified Data.List.Split as DLS
import Data.List (sortBy)
import Data.List (sortBy, isSuffixOf)
import Text.NaturalComp
import Control.Monad.IO.Class
@ -84,6 +86,7 @@ import GEval.ProbList
import GEval.WER
import Data.Conduit.AutoDecompress
import Text.Tokenizer
import GEval.Selector
import GEval.Annotation
import GEval.BlackBoxDebugging
@ -257,6 +260,7 @@ data GEvalSpecification = GEvalSpecification
{ gesOutDirectory :: FilePath,
gesExpectedDirectory :: Maybe FilePath,
gesTestName :: String,
gesSelector :: Maybe Selector,
gesOutFile :: String,
gesExpectedFile :: String,
gesInputFile :: String,
@ -341,6 +345,7 @@ defaultGEvalSpecification = GEvalSpecification {
gesOutDirectory = defaultOutDirectory,
gesExpectedDirectory = Nothing,
gesTestName = defaultTestName,
gesSelector = Nothing,
gesOutFile = defaultOutFile,
gesExpectedFile = defaultExpectedFile,
gesInputFile = defaultInputFile,
@ -356,8 +361,13 @@ isEmptyFile path = do
stat <- getFileStatus path
return ((fileSize stat) == 0)
-- | Extensions handled (tried) by default. Files with other
-- extensions are handled only when given explicitly.
-- Compressor extensions (e.g. "gz") should not be given here.
extensionsHandled :: [String]
extensionsHandled = ["tsv", "jsonl"]
data LineSource m = LineSource (ConduitT () Text m ()) (Text -> Text) SourceSpec Word32
data LineSource m = LineSource (ConduitT () Text m ()) (Text -> ItemTarget) (Text -> Text) SourceSpec Word32
geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])]
geval gevalSpec = do
@ -367,10 +377,11 @@ geval gevalSpec = do
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
vals <- Prelude.mapM (\metric -> gevalCore metric preprocess inputSource expectedSource outSource) metrics
vals <- Prelude.mapM (\metric -> gevalCore metric mSelector preprocess inputSource expectedSource outSource) metrics
return (outSource, vals)
where metrics = gesMetrics gevalSpec
preprocess = gesPreprocess gevalSpec
mSelector = gesSelector gevalSpec
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
checkAndGetFilesSingleOut forceInput gevalSpec = do
@ -397,7 +408,7 @@ checkAndGetFiles forceInput gevalSpec = do
osss <- case mMultipleOuts of
Just filePaths -> return $ Prelude.map (\fp -> FilePathSpec fp) filePaths
Nothing -> do
oss <- getSmartSourceSpec outTestDirectory "out.tsv" outFile
oss <- checkSingleOut outTestDirectory outFile
case oss of
Left NoSpecGiven -> throwM $ NoOutFile outFile
Left (NoFile fp) -> throwM $ NoOutFile fp
@ -418,22 +429,39 @@ checkAndGetFiles forceInput gevalSpec = do
inputFile = gesInputFile gevalSpec
metrics = gesMetrics gevalSpec
checkSingleOut :: FilePath -> FilePath -> IO (Either SmartSourceError SourceSpec)
checkSingleOut outTestDirectory outFile
| outFile == defaultOutFile = do
-- if the default output file name is used try alternative formats (e.g. jsonl)
specs <- Prelude.mapM (\ext -> getSmartSourceSpec outTestDirectory defaultOutFile (outFile -<.> ext)) extensionsHandled
return $ case rights specs of
[] -> Prelude.head specs
rspecs@_ -> Right $ Prelude.head rspecs
| otherwise = getSmartSourceSpec outTestDirectory defaultOutFile outFile
checkMultipleOuts :: GEvalSpecification -> IO (Maybe [FilePath])
checkMultipleOuts gevalSpec = checkMultipleOutsCore outDirectory testName outFile
where outFile = gesOutFile gevalSpec
outDirectory = gesOutDirectory gevalSpec
testName = gesTestName gevalSpec
-- | Looks for multiple output files.
checkMultipleOutsCore :: FilePath -> FilePath -> FilePath -> IO (Maybe [FilePath])
checkMultipleOutsCore outDirectory testName outFile = do
-- if the out.tsv is there, just use it
outFilePath <- lookForCompressedFiles (outTestDirectory </> outFile)
isSimpleOutThere <- D.doesFileExist outFilePath
-- if the out.tsv is there (possibly with an alternative extension,
-- e.g. jsonl and compressed), just use it - but here we just check
-- this (`Nothing` will be returned in such a case, anyway)
outFilePaths <- Prelude.mapM (\ext -> lookForCompressedFiles (outTestDirectory </> outFile -<.> ext))
extensionsHandled
isSimpleOutTheres <- Prelude.mapM D.doesFileExist outFilePaths
let isSimpleOutThere = Prelude.and isSimpleOutTheres
let patterns = Prelude.map (\ext -> compile ("out-*.tsv" ++ ext)) ["", ".gz", ".bz2", ".xz"]
let patterns = [compile ("out-*" <.> dataExt ++ compressorExt) |
dataExt <- extensionsHandled,
compressorExt <- ("":compressedFilesHandled)]
multipleOuts <- Prelude.concat <$> globDir patterns outTestDirectory
if outFile == "out.tsv" && not isSimpleOutThere && multipleOuts /= []
if outFile == defaultOutFile && not isSimpleOutThere && multipleOuts /= []
then
return $ Just multipleOuts
else
@ -457,39 +485,62 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
Right sourceSpec -> return sourceSpec
| otherwise = return NoSource
fileAsLineSource :: SourceSpec -> (Text -> Text) -> LineSource (ResourceT IO)
fileAsLineSource spec preprocess =
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines .| CC.map (dropAround (== '\r'))) preprocess spec 1
fileAsLineSource :: SourceSpec -> Maybe Selector -> (Text -> Text) -> LineSource (ResourceT IO)
fileAsLineSource spec mSelector preprocess =
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines) (select (getDataFormat spec) mSelector) preprocess spec 1
gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric preprocess inpLine expLine outLine =
gevalCoreOnSources metric (singleLineAsLineSource inpLine preprocess)
(singleLineAsLineSource expLine outputPreprocess)
(singleLineAsLineSource outLine outputPreprocess)
getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
getDataDecoder (LineSource _ dd _ _ _) = dd
getDataFormat :: SourceSpec -> DataFormat
getDataFormat (FilePathSpec filePath) = getDataFormatFromFilePath filePath
getDataFormat Stdin = Tsv
getDataFormat NoSource = Tsv
getDataFormat (Http url) = getDataFormatFromFilePath url
getDataFormat (Https url) = getDataFormatFromFilePath url
getDataFormatFromFilePath :: FilePath -> DataFormat
getDataFormatFromFilePath path =
case takeExtensions path' of
".jsonl" -> Jsonl
_ -> Tsv
where path' = if Prelude.or $ Prelude.map (\ext -> ext `Data.List.isSuffixOf` path)
compressedFilesHandled
then dropExtension path
else path
dataDecoder fmt mSelector = CC.map (select fmt mSelector)
gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric preprocess inpDecoder inpLine expDecoder expLine outDecoder outLine =
gevalCoreOnSources metric (singleLineAsLineSource inpLine inpDecoder preprocess)
(singleLineAsLineSource expLine expDecoder outputPreprocess)
(singleLineAsLineSource outLine outDecoder outputPreprocess)
where outputPreprocess = if isPreprocessable metric
then preprocess
else id
singleLineAsLineSource :: LineInFile -> (Text -> Text) -> LineSource (ResourceT IO)
singleLineAsLineSource (LineInFile sourceSpec lineNo line) preprocess =
LineSource (CL.sourceList [line]) preprocess sourceSpec lineNo
singleLineAsLineSource :: LineInFile -> (Text -> ItemTarget) -> (Text -> Text) -> LineSource (ResourceT IO)
singleLineAsLineSource (LineInFile sourceSpec lineNo line) itemDecoder preprocess =
LineSource (CL.sourceList [line]) itemDecoder preprocess sourceSpec lineNo
-- | Runs evaluation for a given metric using the sources specified
-- for input, expected output and output. Returns the metric value.
-- Throws @GEvalException@ if something was wrong in the data (e.g.
-- inconsistent number of lines in the sources).
gevalCore :: Metric -- ^ evaluation metric
-> Maybe Selector -- ^ selector to be used
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output
-> IO (MetricValue) -- ^ metric value for the output against the expected output
gevalCore metric preprocess inputSource expectedSource outSource = do
gevalCore metric mSelector preprocess inputSource expectedSource outSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
gevalCoreOnSources metric
(fileAsLineSource inputSource preprocess)
(fileAsLineSource expectedSource preprocess)
(fileAsLineSource outSource preprocess)
(fileAsLineSource inputSource mSelector preprocess)
(fileAsLineSource expectedSource mSelector preprocess)
(fileAsLineSource outSource mSelector preprocess)
isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
@ -687,7 +738,7 @@ gevalCore' MAP _ = gevalCoreWithoutInput (Right . DLS.splitOn "\t" . unpack)
gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits
-- for LogLossHashed we "salt" each hash with the line number
where helper nbOfBits expectedLineSource outLineSource =
gevalCore''' (ParserSpecWithoutInput (Right . id) tentativeParser) (\(lineNo, (t,d)) -> calculateLogLoss nbOfBits lineNo t (parseDistributionWrapper nbOfBits lineNo d)) averageC negate (WithoutInput expectedLineSource outLineSource)
gevalCore''' (ParserSpecWithoutInput (liftOp (Right . id)) (liftOp tentativeParser)) (\(lineNo, (t,d)) -> calculateLogLoss nbOfBits lineNo t (parseDistributionWrapper nbOfBits lineNo d)) averageC negate (WithoutInput expectedLineSource outLineSource)
-- Unfortunately, we're parsing the distribution twice. We need to
-- tentatively parse the distribution when the line number is unknown
-- (so we just set it to 1)
@ -699,8 +750,10 @@ gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits
gevalCore' CharMatch inputLineSource = helper inputLineSource
where
helper inputLineSource expectedLineSource outputLineSource = do
gevalCoreGeneralized (ParserSpecWithInput (Right . unpack) (Right . unpack) (Right . unpack)) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
gevalCoreGeneralized (ParserSpecWithInput justUnpack justUnpack justUnpack) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
justUnpack = liftOp (Right . unpack)
gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
@ -784,7 +837,7 @@ gevalCoreWithoutInput :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
-> LineSource (ResourceT m) -- ^ source to read the output
-> m (MetricValue) -- ^ metric values for the output against the expected output
gevalCoreWithoutInput expParser outParser itemStep aggregator finalStep expectedLineStream outLineStream =
gevalCoreGeneralized (ParserSpecWithoutInput expParser outParser) (trans itemStep) aggregator finalStep (WithoutInput expectedLineStream outLineStream)
gevalCoreGeneralized (ParserSpecWithoutInput (liftOp expParser) (liftOp outParser)) (trans itemStep) aggregator finalStep (WithoutInput expectedLineStream outLineStream)
where
trans :: ((a, b) -> c) -> ParsedRecord (WithoutInput m a b) -> c
trans step (ParsedRecordWithoutInput x y) = step (x, y)
@ -838,12 +891,12 @@ class EvaluationContext ctxt m where
data WithoutInput m e o = WithoutInput (LineSource (ResourceT m)) (LineSource (ResourceT m))
instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithoutInput m e o) m where
data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (Text -> Either String e) (Text -> Either String o)
data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (ItemTarget -> Either String e) (ItemTarget -> Either String o)
data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o)
data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o
getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ lineNo)) = lineNo
getExpectedSource (WithoutInput (LineSource _ _ expectedSource _) _) = expectedSource
getOutSource (WithoutInput _ (LineSource _ _ outSource _)) = outSource
getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ _ lineNo)) = lineNo
getExpectedSource (WithoutInput (LineSource _ _ _ expectedSource _) _) = expectedSource
getOutSource (WithoutInput _ (LineSource _ _ _ outSource _)) = outSource
recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput
<$> ZipSource (items expectedLineSource expParser)
<*> ZipSource (items outLineSource outParser)
@ -864,15 +917,15 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou
data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m))
getInputFilePath (WithInput (LineSource _ _ inputFilePath _) _ _) = inputFilePath
getInputFilePath (WithInput (LineSource _ _ _ inputFilePath _) _ _) = inputFilePath
instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where
data ParserSpec (WithInput m i e o) = ParserSpecWithInput (Text -> Either String i) (Text -> Either String e) (Text -> Either String o)
data ParserSpec (WithInput m i e o) = ParserSpecWithInput (ItemTarget -> Either String i) (ItemTarget -> Either String e) (ItemTarget -> Either String o)
data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o)
data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o
getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ lineNo)) = lineNo
getExpectedSource (WithInput _ (LineSource _ _ expectedSource _) _) = expectedSource
getOutSource (WithInput _ _ (LineSource _ _ outSource _)) = outSource
getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ _ lineNo)) = lineNo
getExpectedSource (WithInput _ (LineSource _ _ _ expectedSource _) _) = expectedSource
getOutSource (WithInput _ _ (LineSource _ _ _ outSource _)) = outSource
recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z)
<$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,)
<$> ZipSource (items expectedLineSource expParser)
@ -906,11 +959,13 @@ averageC = getZipSink
<$> ZipSink CC.sum
<*> ZipSink CC.length
items :: MonadResource m => LineSource m -> (Text -> Either String a) -> ConduitT () (SourceItem a) m ()
items (LineSource lineSource preprocess _ _) parser =
(lineSource .| CL.map (toItem . parser . preprocess)) >> yield Done
items :: MonadResource m => LineSource m -> (ItemTarget -> Either String a) -> ConduitT () (SourceItem a) m ()
items (LineSource lineSource itemDecoder preprocess _ _) parser =
(lineSource .| CL.map (toItem . parser . preprocess' . itemDecoder)) >> yield Done
where toItem (Right x) = Got x
toItem (Left m) = Wrong m
preprocess' (RawItemTarget t) = RawItemTarget $ preprocess t
preprocess' (PartiallyParsedItemTarget ts) = PartiallyParsedItemTarget $ Prelude.map preprocess ts
itemAbsoluteError :: (Double, Double) -> Double
itemAbsoluteError (exp, out) = abs (exp-out)

View File

@ -47,6 +47,7 @@ import Data.Monoid ((<>))
import GEval.FeatureExtractor
import GEval.BlackBoxDebugging
import GEval.Selector
import Data.Word
@ -329,8 +330,9 @@ lessByMetric reversed metric = lessByMetric' reversed (getMetricOrdering metric)
runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
runLineByLineGeneralized ordering spec consum = do
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
where metric = gesMainMetric spec
mSelector = gesSelector spec
preprocess = gesPreprocess spec
sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
@ -387,14 +389,15 @@ runDiffGeneralized ordering otherOut spec consum = do
Left (NoFile fp) -> throwM $ NoOutFile fp
Left (NoDirectory d) -> throwM $ NoOutFile otherOut
Right otherOutSource -> do
let sourceA = gevalLineByLineSource metric preprocess inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric preprocess inputSource expectedSource outSource
let sourceA = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource
runResourceT $ runConduit $
((getZipSource $ (,)
<$> ZipSource sourceA
<*> ZipSource sourceB) .| sorter ordering .| consum)
where metric = gesMainMetric spec
preprocess = gesPreprocess spec
mSelector = gesSelector spec
sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -408,26 +411,26 @@ runDiffGeneralized ordering otherOut spec consum = do
escapeTabs :: Text -> Text
escapeTabs = Data.Text.replace "\t" "<tab>"
gevalLineByLineCore :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric preprocess inputSource expectedSource outSource consum =
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric mSelector preprocess inputSource expectedSource outSource consum =
runResourceT $ runConduit $
((gevalLineByLineSource metric preprocess inputSource expectedSource outSource) .| consum)
((gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource) .| consum)
gevalLineByLineSource :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
gevalLineByLineSource :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource =
(getZipSource $ (,)
<$> ZipSource (CL.sourceList [1..])
<*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id))
context = (WithInput inputLineSource expectedLineSource outputLineSource)
inputLineSource = fileAsLineSource inputSource id
expectedLineSource = fileAsLineSource expectedSource id
outputLineSource = fileAsLineSource outSource id
inputLineSource = fileAsLineSource inputSource mSelector id
expectedLineSource = fileAsLineSource expectedSource mSelector id
outputLineSource = fileAsLineSource outSource mSelector id
justLine (LineInFile _ _ l) = l
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (LineInFile inputSource lineNo inp)
(LineInFile expectedSource lineNo exp)
(LineInFile outSource lineNo out)
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
(getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
(getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo s
justTokenize :: Maybe Tokenizer -> IO ()

View File

@ -28,6 +28,7 @@ import GEval.CreateChallenge
import GEval.LineByLine
import GEval.Submit (submit)
import GEval.BlackBoxDebugging
import GEval.Selector
import Data.Conduit.SmartSource
@ -122,6 +123,7 @@ specParser = GEvalSpecification
<> showDefault
<> metavar "NAME"
<> help "Test name (i.e. subdirectory with results or expected results)" )
<*> (optional $ selectorParser)
<*> strOption
( long "out-file"
<> short 'o'
@ -172,6 +174,13 @@ specParser = GEvalSpecification
defaultMinFrequency :: Integer
defaultMinFrequency = 1
selectorParser :: Parser Selector
selectorParser = parseSelector <$> (strOption $
( long "selector"
<> metavar "JSON_PATH"
<> help "Selector to an item to be considered"
))
blackBoxDebuggingOptionsParser :: Parser BlackBoxDebuggingOptions
blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
<$> option auto

69
src/GEval/Selector.hs Normal file
View File

@ -0,0 +1,69 @@
{-# LANGUAGE OverloadedStrings #-}
module GEval.Selector
( Selector(..),
DataFormat(..),
ItemTarget(..),
liftOp,
select,
parseSelector ) where
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Text.Encoding as DTE
import Data.Aeson
import qualified Data.HashMap.Strict as H
import Data.Text.Encoding (encodeUtf8Builder)
import Data.ByteString.Builder(toLazyByteString)
import Data.Aeson.Encode.Pretty
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
data Selector = Selector [T.Text]
deriving (Eq, Show)
data DataFormat = Tsv | Jsonl
deriving (Eq, Show)
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
deriving (Eq, Show)
parseSelector :: String -> Selector
parseSelector = Selector . T.splitOn "/" . T.pack
liftOp :: (T.Text -> a) -> (ItemTarget -> a)
liftOp fun (RawItemTarget t) = fun t
liftOp fun (PartiallyParsedItemTarget t) = fun (T.intercalate " " t)
select :: DataFormat -> Maybe Selector -> T.Text -> ItemTarget
select _ Nothing t = RawItemTarget t
select Tsv (Just _) _ = error "selectors not handled for TSVs"
select Jsonl (Just selector) t = case selectInJson selector $ decode'' $ t of
Just v -> finalSelect v
Nothing -> error "selector failed"
finalSelect :: Value -> ItemTarget
finalSelect (Array array) = PartiallyParsedItemTarget $ V.toList $ V.map (\e -> DTE.decodeUtf8 $ toStrict $ encodePretty' encConfig e) array
finalSelect val = RawItemTarget $ DTE.decodeUtf8 $ toStrict $ encodePretty' encConfig val
encConfig = Config {
confIndent = Spaces 0,
confCompare = compare,
confNumFormat = Generic,
confTrailingNewline = False }
toStrict :: BL.ByteString -> B.ByteString
toStrict = B.concat . BL.toChunks
-- TODO get rid of this
decode'' :: FromJSON a => T.Text -> Maybe a
decode'' = decode . toLazyByteString . encodeUtf8Builder
selectInJson :: Selector -> Maybe Value -> Maybe Value
selectInJson _ Nothing = Nothing
selectInJson (Selector []) value = value
selectInJson (Selector (h:r)) (Just (Object object)) =
selectInJson (Selector r) (H.lookup h object)
selectInJson _ _ = Nothing

View File

@ -369,12 +369,16 @@ main = hspec $ do
describe "automatic decompression" $ do
it "more complex test" $ do
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
describe "handling jsonl format" $ do
it "simple test" $
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.5
describe "line by line mode" $ do
let sampleChallenge =
GEvalSpecification
{ gesOutDirectory = "test/likelihood-simple/likelihood-simple-solution",
gesExpectedDirectory = Just "test/likelihood-simple/likelihood-simple",
gesTestName = "test-A",
gesSelector = Nothing,
gesOutFile = "out.tsv",
gesExpectedFile = "expected.tsv",
gesInputFile = "in.tsv",

View File

@ -0,0 +1,3 @@
{"id": 0, "root":{"foo":"bar", "items":[{"aaa":12, "bbb":"x"}, {"aaa":14, "bbb":"a"}]}}
{"id": 1, "root":{"foo":"baz", "items":[{"aaa": 13, "bbb":"y"}]}}
{"id": 2, "root":{"foo":"baz", "items":[{"aaa":3, "bbb":"abc"}]}}

View File

@ -0,0 +1 @@
--metric MultiLabel-F1 --selector root/items

View File

@ -0,0 +1,3 @@
{"id": 0, "root":{"foo":"bar", "items":[{"aaa":12, "bbb":"xyz"}, {"aaa":14, "bbb":"a"}]}}
{"id": 1, "root":{"foo":"baz", "items":[]}}
{"id": 2, "root":{"foo":"baz", "items":[{"aaa":3, "bbb":"abc"}]}}