Handle headers
This commit is contained in:
parent
ccd6d919da
commit
6d586c7238
@ -53,6 +53,7 @@ library
|
|||||||
, Data.SplitIntoCrossTabs
|
, Data.SplitIntoCrossTabs
|
||||||
, Data.Conduit.Bootstrap
|
, Data.Conduit.Bootstrap
|
||||||
, GEval.Formatting
|
, GEval.Formatting
|
||||||
|
, Data.Conduit.Header
|
||||||
, Paths_geval
|
, Paths_geval
|
||||||
build-depends: base >= 4.7 && < 5
|
build-depends: base >= 4.7 && < 5
|
||||||
, cond
|
, cond
|
||||||
|
41
src/Data/Conduit/Header.hs
Normal file
41
src/Data/Conduit/Header.hs
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
module Data.Conduit.Header
|
||||||
|
(processHeader, TabularHeader, readHeaderFile)
|
||||||
|
where
|
||||||
|
|
||||||
|
import Data.Text
|
||||||
|
import Data.Conduit
|
||||||
|
import Data.Conduit.AutoDecompress
|
||||||
|
|
||||||
|
import qualified System.Directory as D
|
||||||
|
|
||||||
|
data TabularHeader = TabularHeader [Text]
|
||||||
|
|
||||||
|
processHeader :: Monad m => Maybe TabularHeader -> ConduitT Text Text m ()
|
||||||
|
processHeader Nothing = doNothing
|
||||||
|
processHeader (Just (TabularHeader (firstField:_))) = do
|
||||||
|
mLine <- await
|
||||||
|
case mLine of
|
||||||
|
Just line -> case splitIntoFields line of
|
||||||
|
(firstField':_) -> do
|
||||||
|
if firstField' == firstField
|
||||||
|
then return ()
|
||||||
|
else yield line
|
||||||
|
doNothing
|
||||||
|
Nothing -> return ()
|
||||||
|
|
||||||
|
splitIntoFields :: Text -> [Text]
|
||||||
|
splitIntoFields = splitOn "\t"
|
||||||
|
|
||||||
|
readHeaderFile :: FilePath -> IO (Maybe TabularHeader)
|
||||||
|
readHeaderFile headerFilePath = do
|
||||||
|
fileExists <- (D.doesFileExist headerFilePath)
|
||||||
|
if fileExists
|
||||||
|
then
|
||||||
|
do
|
||||||
|
content <- readFile headerFilePath
|
||||||
|
let (firstLine:_) = Prelude.lines content
|
||||||
|
return $ Just $ TabularHeader $ splitIntoFields $ pack firstLine
|
||||||
|
else
|
||||||
|
return Nothing
|
@ -126,6 +126,7 @@ data GEvalException = NoExpectedFile FilePath
|
|||||||
| UnexpectedData Word32 String
|
| UnexpectedData Word32 String
|
||||||
| UnexpectedMultipleOutputs
|
| UnexpectedMultipleOutputs
|
||||||
| OtherException String
|
| OtherException String
|
||||||
|
| NoHeaderFile FilePath
|
||||||
deriving (Eq)
|
deriving (Eq)
|
||||||
|
|
||||||
instance Exception GEvalException
|
instance Exception GEvalException
|
||||||
@ -147,6 +148,7 @@ instance Show GEvalException where
|
|||||||
show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
|
show (UnexpectedData lineNo message) = "Line " ++ (show lineNo) ++ ": Unexpected data [" ++ message ++ "]"
|
||||||
show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
|
show UnexpectedMultipleOutputs = "Multiple outputs are not possible in this mode, use -o option to select an output file"
|
||||||
show (OtherException message) = message
|
show (OtherException message) = message
|
||||||
|
show (NoHeaderFile filePath) = somethingWrongWithFilesMessage "No file with header specification" filePath
|
||||||
|
|
||||||
somethingWrongWithFilesMessage :: String -> FilePath -> String
|
somethingWrongWithFilesMessage :: String -> FilePath -> String
|
||||||
somethingWrongWithFilesMessage msg filePath = Prelude.concat
|
somethingWrongWithFilesMessage msg filePath = Prelude.concat
|
||||||
|
@ -44,7 +44,11 @@ module GEval.Core
|
|||||||
getDataDecoder,
|
getDataDecoder,
|
||||||
threeLineSource,
|
threeLineSource,
|
||||||
extensionsHandled,
|
extensionsHandled,
|
||||||
isEmptyFile
|
isEmptyFile,
|
||||||
|
FileProcessingOptions(..),
|
||||||
|
readHeaderFileWrapper,
|
||||||
|
getInHeader,
|
||||||
|
getOutHeader
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Debug.Trace
|
import Debug.Trace
|
||||||
@ -82,6 +86,7 @@ import Control.Monad ((<=<), filterM)
|
|||||||
import Data.Attoparsec.Text (parseOnly)
|
import Data.Attoparsec.Text (parseOnly)
|
||||||
|
|
||||||
import Data.Conduit.SmartSource
|
import Data.Conduit.SmartSource
|
||||||
|
import Data.Conduit.Header
|
||||||
|
|
||||||
import qualified Data.IntSet as IS
|
import qualified Data.IntSet as IS
|
||||||
|
|
||||||
@ -172,7 +177,9 @@ data GEvalSpecification = GEvalSpecification
|
|||||||
gesToken :: Maybe String,
|
gesToken :: Maybe String,
|
||||||
gesGonitoGitAnnexRemote :: Maybe String,
|
gesGonitoGitAnnexRemote :: Maybe String,
|
||||||
gesReferences :: Maybe String,
|
gesReferences :: Maybe String,
|
||||||
gesBootstrapResampling :: Maybe Int }
|
gesBootstrapResampling :: Maybe Int,
|
||||||
|
gesInHeader :: Maybe String,
|
||||||
|
gesOutHeader :: Maybe String }
|
||||||
|
|
||||||
|
|
||||||
gesMainMetric :: GEvalSpecification -> Metric
|
gesMainMetric :: GEvalSpecification -> Metric
|
||||||
@ -192,6 +199,16 @@ getExpectedDirectory :: GEvalSpecification -> FilePath
|
|||||||
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
|
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
|
||||||
where outDirectory = gesOutDirectory spec
|
where outDirectory = gesOutDirectory spec
|
||||||
|
|
||||||
|
getInHeader :: GEvalSpecification -> Maybe FilePath
|
||||||
|
getInHeader spec = getHeader spec gesInHeader
|
||||||
|
|
||||||
|
getOutHeader :: GEvalSpecification -> Maybe FilePath
|
||||||
|
getOutHeader spec = getHeader spec gesOutHeader
|
||||||
|
|
||||||
|
getHeader spec selector = case selector spec of
|
||||||
|
Just headerFile -> Just $ getExpectedDirectory spec </> headerFile
|
||||||
|
Nothing -> Nothing
|
||||||
|
|
||||||
-- | Special command, not just running the regular evaluation.
|
-- | Special command, not just running the regular evaluation.
|
||||||
-- See OptionsParser.hs for more information.
|
-- See OptionsParser.hs for more information.
|
||||||
data GEvalSpecialCommand = Init
|
data GEvalSpecialCommand = Init
|
||||||
@ -229,7 +246,9 @@ defaultGEvalSpecification = GEvalSpecification {
|
|||||||
gesToken = Nothing,
|
gesToken = Nothing,
|
||||||
gesGonitoGitAnnexRemote = Nothing,
|
gesGonitoGitAnnexRemote = Nothing,
|
||||||
gesReferences = Nothing,
|
gesReferences = Nothing,
|
||||||
gesBootstrapResampling = Nothing }
|
gesBootstrapResampling = Nothing,
|
||||||
|
gesInHeader = Nothing,
|
||||||
|
gesOutHeader = Nothing }
|
||||||
|
|
||||||
isEmptyFile :: FilePath -> IO (Bool)
|
isEmptyFile :: FilePath -> IO (Bool)
|
||||||
isEmptyFile path = do
|
isEmptyFile path = do
|
||||||
@ -255,9 +274,13 @@ noGraph = const Nothing
|
|||||||
|
|
||||||
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
|
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
|
||||||
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
|
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
|
||||||
|
mInHeader <- readHeaderFileWrapper $ getInHeader gevalSpec
|
||||||
|
mOutHeader <- readHeaderFileWrapper $ getOutHeader gevalSpec
|
||||||
vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme)
|
vals <- Prelude.mapM (\scheme -> gevalCore (evaluationSchemeMetric scheme)
|
||||||
mSelector
|
mSelector
|
||||||
(preprocess . applyPreprocessingOperations scheme)
|
(preprocess . applyPreprocessingOperations scheme)
|
||||||
|
mInHeader
|
||||||
|
mOutHeader
|
||||||
(gesBootstrapResampling gevalSpec)
|
(gesBootstrapResampling gevalSpec)
|
||||||
inputSource
|
inputSource
|
||||||
expectedSource
|
expectedSource
|
||||||
@ -267,6 +290,15 @@ gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
|
|||||||
preprocess = gesPreprocess gevalSpec
|
preprocess = gesPreprocess gevalSpec
|
||||||
mSelector = gesSelector gevalSpec
|
mSelector = gesSelector gevalSpec
|
||||||
|
|
||||||
|
|
||||||
|
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
|
||||||
|
readHeaderFileWrapper Nothing = return Nothing
|
||||||
|
readHeaderFileWrapper (Just headerFilePath) = do
|
||||||
|
mHeader <- readHeaderFile headerFilePath
|
||||||
|
case mHeader of
|
||||||
|
Just header -> return $ Just header
|
||||||
|
Nothing -> throwM $ NoHeaderFile headerFilePath
|
||||||
|
|
||||||
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
|
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
|
||||||
checkAndGetFilesSingleOut forceInput gevalSpec = do
|
checkAndGetFilesSingleOut forceInput gevalSpec = do
|
||||||
res <- checkAndGetFiles forceInput gevalSpec
|
res <- checkAndGetFiles forceInput gevalSpec
|
||||||
@ -369,9 +401,18 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
|
|||||||
Right sourceSpec -> return sourceSpec
|
Right sourceSpec -> return sourceSpec
|
||||||
| otherwise = return NoSource
|
| otherwise = return NoSource
|
||||||
|
|
||||||
fileAsLineSource :: SourceSpec -> Maybe Selector -> (Text -> Text) -> LineSource (ResourceT IO)
|
data FileProcessingOptions = FileProcessingOptions {
|
||||||
fileAsLineSource spec mSelector preprocess =
|
fileProcessingOptionsSelector :: Maybe Selector,
|
||||||
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines) (select (getDataFormat spec) mSelector) preprocess spec 1
|
fileProcessingOptionsPreprocess :: (Text -> Text),
|
||||||
|
fileProcessingOptionsHeader :: Maybe TabularHeader }
|
||||||
|
|
||||||
|
|
||||||
|
fileAsLineSource :: SourceSpec -> FileProcessingOptions -> LineSource (ResourceT IO)
|
||||||
|
fileAsLineSource spec options =
|
||||||
|
LineSource ((smartSource spec) .| autoDecompress .| CT.decodeUtf8Lenient .| CT.lines .| processHeader mHeader) (select (getDataFormat spec) mSelector) preprocess spec 1
|
||||||
|
where mSelector = fileProcessingOptionsSelector options
|
||||||
|
preprocess = fileProcessingOptionsPreprocess options
|
||||||
|
mHeader = fileProcessingOptionsHeader options
|
||||||
|
|
||||||
getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
|
getDataDecoder :: LineSource (ResourceT IO) -> (Text -> ItemTarget)
|
||||||
getDataDecoder (LineSource _ dd _ _ _) = dd
|
getDataDecoder (LineSource _ dd _ _ _) = dd
|
||||||
@ -429,22 +470,32 @@ handleBootstrap _ = True
|
|||||||
gevalCore :: Metric -- ^ evaluation metric
|
gevalCore :: Metric -- ^ evaluation metric
|
||||||
-> Maybe Selector -- ^ selector to be used
|
-> Maybe Selector -- ^ selector to be used
|
||||||
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
|
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
|
||||||
|
-> (Maybe TabularHeader) -- ^ header for input
|
||||||
|
-> (Maybe TabularHeader) -- ^ header for output/expected files
|
||||||
-> (Maybe Int) -- ^ number of bootstrap samples
|
-> (Maybe Int) -- ^ number of bootstrap samples
|
||||||
-> SourceSpec -- ^ source specification for the input values
|
-> SourceSpec -- ^ source specification for the input values
|
||||||
-> SourceSpec -- ^ source specification for the expected output
|
-> SourceSpec -- ^ source specification for the expected output
|
||||||
-> SourceSpec -- ^ source specification for the output
|
-> SourceSpec -- ^ source specification for the output
|
||||||
-> IO (MetricOutput) -- ^ metric value for the output against the expected output
|
-> IO (MetricOutput) -- ^ metric value for the output against the expected output
|
||||||
gevalCore metric mSelector preprocess mBootstrapResampling inputSource expectedSource outSource = do
|
gevalCore metric mSelector preprocess mInHeader mOutHeader mBootstrapResampling inputSource expectedSource outSource = do
|
||||||
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
|
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
|
||||||
go metric
|
go metric
|
||||||
(fileAsLineSource inputSource mSelector preprocess)
|
(fileAsLineSource inputSource inOptions)
|
||||||
(fileAsLineSource expectedSource mSelector preprocess)
|
(fileAsLineSource expectedSource outOptions)
|
||||||
(fileAsLineSource outSource mSelector preprocess)
|
(fileAsLineSource outSource outOptions)
|
||||||
where go = case mBootstrapResampling of
|
where go = case mBootstrapResampling of
|
||||||
Nothing -> gevalCoreOnSources
|
Nothing -> gevalCoreOnSources
|
||||||
Just bootstrapResampling -> if handleBootstrap metric
|
Just bootstrapResampling -> if handleBootstrap metric
|
||||||
then gevalBootstrapOnSources bootstrapResampling
|
then gevalBootstrapOnSources bootstrapResampling
|
||||||
else gevalCoreOnSources
|
else gevalCoreOnSources
|
||||||
|
outOptions = FileProcessingOptions {
|
||||||
|
fileProcessingOptionsSelector = mSelector,
|
||||||
|
fileProcessingOptionsPreprocess = preprocess,
|
||||||
|
fileProcessingOptionsHeader = mOutHeader }
|
||||||
|
inOptions = FileProcessingOptions {
|
||||||
|
fileProcessingOptionsSelector = mSelector,
|
||||||
|
fileProcessingOptionsPreprocess = preprocess,
|
||||||
|
fileProcessingOptionsHeader = mInHeader }
|
||||||
|
|
||||||
isEmptyFileSource :: SourceSpec -> IO Bool
|
isEmptyFileSource :: SourceSpec -> IO Bool
|
||||||
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
|
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
|
||||||
|
@ -31,6 +31,8 @@ createChallenge withDataFiles expectedDirectory spec = do
|
|||||||
createFile (expectedDirectory </> ".gitignore") $ gitignoreContents
|
createFile (expectedDirectory </> ".gitignore") $ gitignoreContents
|
||||||
createFile (expectedDirectory </> "README.md") $ readmeMDContents metric testName
|
createFile (expectedDirectory </> "README.md") $ readmeMDContents metric testName
|
||||||
createFile (expectedDirectory </> configFileName) $ configContents metrics precision testName
|
createFile (expectedDirectory </> configFileName) $ configContents metrics precision testName
|
||||||
|
createHeaderFile expectedDirectory "in-header.tsv" $ inHeaderContents metric
|
||||||
|
createHeaderFile expectedDirectory "out-header.tsv" $ outHeaderContents metric
|
||||||
if withDataFiles
|
if withDataFiles
|
||||||
then
|
then
|
||||||
do
|
do
|
||||||
@ -53,6 +55,10 @@ createChallenge withDataFiles expectedDirectory spec = do
|
|||||||
testDirectory = expectedDirectory </> testName
|
testDirectory = expectedDirectory </> testName
|
||||||
expectedFile = gesExpectedFile spec
|
expectedFile = gesExpectedFile spec
|
||||||
|
|
||||||
|
createHeaderFile _ _ Nothing = return ()
|
||||||
|
createHeaderFile expectedDirectory headerFile (Just fields) = do
|
||||||
|
createFile (expectedDirectory </> headerFile) $ (intercalate "\t" fields) ++ "\n"
|
||||||
|
|
||||||
createTrainFiles :: Metric -> FilePath -> FilePath -> IO ()
|
createTrainFiles :: Metric -> FilePath -> FilePath -> IO ()
|
||||||
createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
|
createTrainFiles metric@(LogLossHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
|
||||||
createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
|
createTrainFiles metric@(LikelihoodHashed _) trainDirectory _ = createSingleTrainFile metric trainDirectory
|
||||||
@ -423,9 +429,17 @@ configContents schemes precision testName = unwords (Prelude.map (\scheme -> ("-
|
|||||||
" --test-name " ++ testName
|
" --test-name " ++ testName
|
||||||
else
|
else
|
||||||
"") ++
|
"") ++
|
||||||
(precisionOpt precision)
|
(precisionOpt precision) ++
|
||||||
|
inHeaderOpts ++
|
||||||
|
outHeaderOpts
|
||||||
where precisionOpt Nothing = ""
|
where precisionOpt Nothing = ""
|
||||||
precisionOpt (Just p) = " --precision " ++ (show p)
|
precisionOpt (Just p) = " --precision " ++ (show p)
|
||||||
|
((EvaluationScheme mainMetric _):_) = schemes
|
||||||
|
inHeaderOpts = getHeaderOpts "in-header" inHeaderContents
|
||||||
|
outHeaderOpts = getHeaderOpts "out-header" outHeaderContents
|
||||||
|
getHeaderOpts opt selector = case selector mainMetric of
|
||||||
|
Just _ -> " --" ++ opt ++ " " ++ (opt <.> "tsv")
|
||||||
|
Nothing -> ""
|
||||||
|
|
||||||
-- Originally train content was in one file, to avoid large changes
|
-- Originally train content was in one file, to avoid large changes
|
||||||
-- for the time being we are using the original function.
|
-- for the time being we are using the original function.
|
||||||
@ -823,6 +837,79 @@ testExpectedContents _ = [hereLit|0.11
|
|||||||
17.2
|
17.2
|
||||||
|]
|
|]
|
||||||
|
|
||||||
|
inHeaderContents :: Metric -> Maybe [String]
|
||||||
|
inHeaderContents (Mean metric) = inHeaderContents metric
|
||||||
|
inHeaderContents GLEU = Nothing
|
||||||
|
inHeaderContents BLEU = Nothing
|
||||||
|
inHeaderContents Accuracy = Just ["Temperature", "Wind", "Rain"]
|
||||||
|
inHeaderContents (FMeasure _) = Just ["seismic",
|
||||||
|
"seismoacoustic",
|
||||||
|
"shift",
|
||||||
|
"genergy",
|
||||||
|
"gpuls",
|
||||||
|
"gdenergy",
|
||||||
|
"gdpuls",
|
||||||
|
"ghazard",
|
||||||
|
"nbumps",
|
||||||
|
"nbumps2",
|
||||||
|
"nbumps3",
|
||||||
|
"nbumps4",
|
||||||
|
"nbumps5",
|
||||||
|
"nbumps6",
|
||||||
|
"nbumps7",
|
||||||
|
"nbumps89",
|
||||||
|
"energy",
|
||||||
|
"maxenergy",
|
||||||
|
"class"]
|
||||||
|
inHeaderContents (MacroFMeasure _) = Just ["FirstName"]
|
||||||
|
inHeaderContents (ProbabilisticSoftFMeasure b) = inHeaderContents (SoftFMeasure b)
|
||||||
|
inHeaderContents (SoftFMeasure _) = Just ["Text"]
|
||||||
|
inHeaderContents NMI = Just ["Utterance"]
|
||||||
|
inHeaderContents (LikelihoodHashed b) = inHeaderContents (LogLossHashed b)
|
||||||
|
inHeaderContents (LogLossHashed _) = Just ["LeftContext", "RightContext"]
|
||||||
|
inHeaderContents CharMatch = Just ["Text"]
|
||||||
|
inHeaderContents MAP = Just ["Dialect", "PolishPhrase"]
|
||||||
|
inHeaderContents Likelihood = inHeaderContents LogLoss
|
||||||
|
inHeaderContents LogLoss = Just ["Text"]
|
||||||
|
inHeaderContents BIOF1Labels = inHeaderContents BIOF1
|
||||||
|
inHeaderContents BIOF1 = Just ["Text"]
|
||||||
|
inHeaderContents TokenAccuracy = Just ["TokenizedText"]
|
||||||
|
inHeaderContents SegmentAccuracy = Just ["Segment"]
|
||||||
|
inHeaderContents (ProbabilisticMultiLabelFMeasure beta) = inHeaderContents (MultiLabelFMeasure beta)
|
||||||
|
inHeaderContents (MultiLabelFMeasure _) = Just ["Text"]
|
||||||
|
inHeaderContents MultiLabelLikelihood = inHeaderContents MultiLabelLogLoss
|
||||||
|
inHeaderContents MultiLabelLogLoss = Just ["Utterance"]
|
||||||
|
inHeaderContents (Soft2DFMeasure _) = inHeaderContents ClippEU
|
||||||
|
inHeaderContents ClippEU = Just ["DjvuFilePath"]
|
||||||
|
inHeaderContents _ = Just ["OrbitalPeriod", "OrbitalEccentricity", "NumberOfMoons"]
|
||||||
|
|
||||||
|
outHeaderContents :: Metric -> Maybe [String]
|
||||||
|
outHeaderContents (Mean metric) = outHeaderContents metric
|
||||||
|
outHeaderContents BLEU = Nothing
|
||||||
|
outHeaderContents GLEU = Nothing
|
||||||
|
outHeaderContents Accuracy = Just ["ShouldYouKidForWalk"]
|
||||||
|
outHeaderContents (FMeasure _) = Just ["IsSeismicBump"]
|
||||||
|
outHeaderContents (MacroFMeasure _) = Just ["LanguageCode"]
|
||||||
|
outHeaderContents (ProbabilisticSoftFMeasure b) = outHeaderContents (SoftFMeasure b)
|
||||||
|
outHeaderContents (SoftFMeasure _) = Just ["NamesFound"]
|
||||||
|
outHeaderContents NMI = Just ["LanguageCode"]
|
||||||
|
outHeaderContents (LikelihoodHashed b) = outHeaderContents (LogLossHashed b)
|
||||||
|
outHeaderContents (LogLossHashed _) = Just ["GuessedWord"]
|
||||||
|
outHeaderContents CharMatch = Just ["NormalizedText"]
|
||||||
|
outHeaderContents MAP = Nothing
|
||||||
|
outHeaderContents Likelihood = outHeaderContents LogLoss
|
||||||
|
outHeaderContents LogLoss = Just ["Probability"]
|
||||||
|
outHeaderContents BIOF1Labels = outHeaderContents BIOF1
|
||||||
|
outHeaderContents BIOF1 = Just ["BIOOutput"]
|
||||||
|
outHeaderContents TokenAccuracy = Just ["PartsOfSpeech"]
|
||||||
|
outHeaderContents SegmentAccuracy = Just ["PartsOfSpeech"]
|
||||||
|
outHeaderContents (ProbabilisticMultiLabelFMeasure beta) = outHeaderContents (MultiLabelFMeasure beta)
|
||||||
|
outHeaderContents (MultiLabelFMeasure _) = Just ["Entities"]
|
||||||
|
outHeaderContents MultiLabelLikelihood = outHeaderContents MultiLabelLogLoss
|
||||||
|
outHeaderContents MultiLabelLogLoss = Just ["Emotion"]
|
||||||
|
outHeaderContents (Soft2DFMeasure _) = Just ["Rectangle"]
|
||||||
|
outHeaderContents ClippEU = Just ["Rectangle"]
|
||||||
|
outHeaderContents _ = Just ["Mass"]
|
||||||
|
|
||||||
gitignoreContents :: String
|
gitignoreContents :: String
|
||||||
gitignoreContents = [hereLit|
|
gitignoreContents = [hereLit|
|
||||||
|
@ -73,6 +73,8 @@ import Data.Statistics.Kendall (kendallZ)
|
|||||||
|
|
||||||
import Data.Conduit.Binary (sourceFile)
|
import Data.Conduit.Binary (sourceFile)
|
||||||
|
|
||||||
|
import Data.Conduit.Header
|
||||||
|
|
||||||
import qualified Data.HashMap.Strict as H
|
import qualified Data.HashMap.Strict as H
|
||||||
import qualified Data.Map.Strict as M
|
import qualified Data.Map.Strict as M
|
||||||
import qualified Data.Set as S
|
import qualified Data.Set as S
|
||||||
@ -463,7 +465,10 @@ runLineByLineGeneralized ordering spec consum = do
|
|||||||
return $ Just references
|
return $ Just references
|
||||||
Nothing -> return Nothing
|
Nothing -> return Nothing
|
||||||
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
|
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
|
||||||
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
|
mInHeader <- readHeaderFileWrapper $ getInHeader spec
|
||||||
|
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
|
||||||
|
let mOutHeader = Nothing
|
||||||
|
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
|
||||||
where metric = gesMainMetric spec
|
where metric = gesMainMetric spec
|
||||||
scheme = gesMainScheme spec
|
scheme = gesMainScheme spec
|
||||||
mSelector = gesSelector spec
|
mSelector = gesSelector spec
|
||||||
@ -517,7 +522,9 @@ runMultiOutputGeneralized spec consum = do
|
|||||||
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
|
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
|
||||||
let altSourceSpecs = rights altSourceSpecs'
|
let altSourceSpecs = rights altSourceSpecs'
|
||||||
let sourceSpecs = (outSource:altSourceSpecs)
|
let sourceSpecs = (outSource:altSourceSpecs)
|
||||||
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess inputSource expectedSource) sourceSpecs
|
mInHeader <- readHeaderFileWrapper $ getInHeader spec
|
||||||
|
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
|
||||||
|
let sources = Prelude.map (gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource) sourceSpecs
|
||||||
runResourceT $ runConduit $
|
runResourceT $ runConduit $
|
||||||
(sequenceSources sources .| consum)
|
(sequenceSources sources .| consum)
|
||||||
where metric = gesMainMetric spec
|
where metric = gesMainMetric spec
|
||||||
@ -545,13 +552,15 @@ runDiffGeneralized :: ResultOrdering -> FilePath -> GEvalSpecification -> (Maybe
|
|||||||
runDiffGeneralized ordering otherOut spec consum = do
|
runDiffGeneralized ordering otherOut spec consum = do
|
||||||
(inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
|
(inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
|
||||||
ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut
|
ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut
|
||||||
|
mInHeader <- readHeaderFileWrapper $ getInHeader spec
|
||||||
|
mOutHeader <- readHeaderFileWrapper $ getOutHeader spec
|
||||||
case ooss of
|
case ooss of
|
||||||
Left NoSpecGiven -> throwM $ NoOutFile otherOut
|
Left NoSpecGiven -> throwM $ NoOutFile otherOut
|
||||||
Left (NoFile fp) -> throwM $ NoOutFile fp
|
Left (NoFile fp) -> throwM $ NoOutFile fp
|
||||||
Left (NoDirectory d) -> throwM $ NoOutFile otherOut
|
Left (NoDirectory d) -> throwM $ NoOutFile otherOut
|
||||||
Right otherOutSource -> do
|
Right otherOutSource -> do
|
||||||
let sourceA = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource otherOutSource
|
let sourceA = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource otherOutSource
|
||||||
let sourceB = gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource
|
let sourceB = gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource
|
||||||
runResourceT $ runConduit $
|
runResourceT $ runConduit $
|
||||||
((getZipSource $ (,)
|
((getZipSource $ (,)
|
||||||
<$> ZipSource sourceA
|
<$> ZipSource sourceA
|
||||||
@ -573,27 +582,44 @@ runDiffGeneralized ordering otherOut spec consum = do
|
|||||||
escapeTabs :: Text -> Text
|
escapeTabs :: Text -> Text
|
||||||
escapeTabs = Data.Text.replace "\t" "<tab>"
|
escapeTabs = Data.Text.replace "\t" "<tab>"
|
||||||
|
|
||||||
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
|
gevalLineByLineCore :: Metric -> Maybe Selector -> (Text -> Text) -> (Maybe TabularHeader) -> (Maybe TabularHeader) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
|
||||||
gevalLineByLineCore metric mSelector preprocess inputSource expectedSource outSource consum =
|
gevalLineByLineCore metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource consum =
|
||||||
runResourceT $ runConduit $
|
runResourceT $ runConduit $
|
||||||
((gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource) .| consum)
|
((gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource) .| consum)
|
||||||
|
|
||||||
gevalLineByLineSource :: Metric -> Maybe Selector -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
|
gevalLineByLineSource :: Metric
|
||||||
gevalLineByLineSource metric mSelector preprocess inputSource expectedSource outSource =
|
-> Maybe Selector
|
||||||
|
-> (Text -> Text)
|
||||||
|
-> (Maybe TabularHeader)
|
||||||
|
-> (Maybe TabularHeader)
|
||||||
|
-> SourceSpec
|
||||||
|
-> SourceSpec
|
||||||
|
-> SourceSpec
|
||||||
|
-> ConduitT () LineRecord (ResourceT IO) ()
|
||||||
|
gevalLineByLineSource metric mSelector preprocess mInHeader mOutHeader inputSource expectedSource outSource =
|
||||||
(getZipSource $ (,)
|
(getZipSource $ (,)
|
||||||
<$> ZipSource (CL.sourceList [1..])
|
<$> ZipSource (CL.sourceList [1..])
|
||||||
<*> (ZipSource $ threeLineSource context)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
|
<*> (ZipSource $ threeLineSource context)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
|
||||||
where context = (WithInput inputLineSource expectedLineSource outputLineSource)
|
where context = (WithInput inputLineSource expectedLineSource outputLineSource)
|
||||||
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
|
inputLineSource = fileAsLineSource inputSource inOptions
|
||||||
inputLineSource = fileAsLineSource inputSource mSelector id
|
expectedLineSource = fileAsLineSource expectedSource outOptions
|
||||||
expectedLineSource = fileAsLineSource expectedSource mSelector id
|
outputLineSource = fileAsLineSource outSource outOptions
|
||||||
outputLineSource = fileAsLineSource outSource mSelector id
|
|
||||||
justLine (LineInFile _ _ l) = l
|
justLine (LineInFile _ _ l) = l
|
||||||
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
|
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
|
||||||
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
|
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
|
||||||
(getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
|
(getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
|
||||||
(getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
|
(getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
|
||||||
return $ LineRecord inp exp out lineNo (extractSimpleRunValue $ getMetricValue s)
|
return $ LineRecord inp exp out lineNo (extractSimpleRunValue $ getMetricValue s)
|
||||||
|
-- preparing sources, `id` means that no preprocessing is done (to avoid double preprocessing)
|
||||||
|
outOptions = FileProcessingOptions {
|
||||||
|
fileProcessingOptionsSelector = mSelector,
|
||||||
|
fileProcessingOptionsPreprocess = id,
|
||||||
|
fileProcessingOptionsHeader = mOutHeader }
|
||||||
|
inOptions = FileProcessingOptions {
|
||||||
|
fileProcessingOptionsSelector = mSelector,
|
||||||
|
fileProcessingOptionsPreprocess = id,
|
||||||
|
fileProcessingOptionsHeader = mInHeader }
|
||||||
|
|
||||||
|
|
||||||
justTokenize :: Maybe Tokenizer -> IO ()
|
justTokenize :: Maybe Tokenizer -> IO ()
|
||||||
justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"
|
justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"
|
||||||
|
@ -214,7 +214,14 @@ specParser = GEvalSpecification
|
|||||||
<> short 'B'
|
<> short 'B'
|
||||||
<> metavar "NUMBER-OF-SAMPLES"
|
<> metavar "NUMBER-OF-SAMPLES"
|
||||||
<> help "Tests on NUMBER-OF-SAMPLES bootstrap samples rather than just on the whole test set" ))
|
<> help "Tests on NUMBER-OF-SAMPLES bootstrap samples rather than just on the whole test set" ))
|
||||||
|
<*> ( optional . strOption $
|
||||||
|
( long "in-header"
|
||||||
|
<> metavar "FILE"
|
||||||
|
<> help "One-line TSV file specifying a list of field names for input files"))
|
||||||
|
<*> ( optional . strOption $
|
||||||
|
( long "out-header"
|
||||||
|
<> metavar "FILE"
|
||||||
|
<> help "One-line TSV file specifying a list of field names for output and expected files"))
|
||||||
|
|
||||||
defaultMinFrequency :: Integer
|
defaultMinFrequency :: Integer
|
||||||
defaultMinFrequency = 1
|
defaultMinFrequency = 1
|
||||||
|
@ -446,6 +446,9 @@ main = hspec $ do
|
|||||||
describe "automatic decompression" $ do
|
describe "automatic decompression" $ do
|
||||||
it "more complex test" $ do
|
it "more complex test" $ do
|
||||||
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
|
runGEvalTest "charmatch-complex-compressed" `shouldReturnAlmost` 0.1923076923076923
|
||||||
|
describe "headers" $ do
|
||||||
|
it "simple" $ do
|
||||||
|
runGEvalTest "mse-simple-headers" `shouldReturnAlmost` 0.4166666666666667
|
||||||
describe "handling jsonl format" $ do
|
describe "handling jsonl format" $ do
|
||||||
it "simple test" $
|
it "simple test" $
|
||||||
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428
|
runGEvalTestExtraOptions ["-e", "expected.jsonl" ] "jsonl-simple" `shouldReturnAlmost` 0.571428571428
|
||||||
@ -467,7 +470,9 @@ main = hspec $ do
|
|||||||
gesToken = Nothing,
|
gesToken = Nothing,
|
||||||
gesGonitoGitAnnexRemote = Nothing,
|
gesGonitoGitAnnexRemote = Nothing,
|
||||||
gesReferences = Nothing,
|
gesReferences = Nothing,
|
||||||
gesBootstrapResampling = Nothing }
|
gesBootstrapResampling = Nothing,
|
||||||
|
gesInHeader = Nothing,
|
||||||
|
gesOutHeader = Nothing }
|
||||||
it "simple test" $ do
|
it "simple test" $ do
|
||||||
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
|
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
|
||||||
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",
|
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
3.0
|
||||||
|
3.0
|
||||||
|
2.0
|
|
1
test/mse-simple-headers/mse-simple-headers/config.txt
Normal file
1
test/mse-simple-headers/mse-simple-headers/config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric MSE --out-header out-header.tsv
|
@ -0,0 +1 @@
|
|||||||
|
SomeValue
|
|
@ -0,0 +1,4 @@
|
|||||||
|
SomeValue
|
||||||
|
2.0
|
||||||
|
3.0
|
||||||
|
1.5
|
|
Loading…
Reference in New Issue
Block a user