first tokenizer

This commit is contained in:
Filip Gralinski 2018-08-13 10:09:55 +02:00
parent d3da3a0ca5
commit 83550688ce
4 changed files with 61 additions and 45 deletions

View File

@ -39,7 +39,8 @@ module GEval.Core
checkAndGetFilesSingleOut,
checkMultipleOuts,
checkMultipleOutsCore,
gesMainMetric
gesMainMetric,
gesPreprocess
) where
import Data.Conduit
@ -222,6 +223,9 @@ gesMainMetric spec = case gesMetrics spec of
(metric:_) -> metric
otherwise -> error "no metric given"
gesPreprocess :: GEvalSpecification -> (Text -> Text)
gesPreprocess spec = tokenizeTabSeparatedWithSpaces (gesTokenizer spec)
getExpectedDirectory :: GEvalSpecification -> FilePath
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
where outDirectory = gesOutDirectory spec
@ -295,7 +299,7 @@ isEmptyFile path = do
return ((fileSize stat) == 0)
data LineSource m = LineSource (ConduitT () Text m ()) SourceSpec Word32
data LineSource m = LineSource (ConduitT () Text m ()) (Text -> Text) SourceSpec Word32
geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])]
geval gevalSpec = do
@ -305,9 +309,10 @@ geval gevalSpec = do
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
vals <- Prelude.mapM (\metric -> gevalCore metric inputSource expectedSource outSource) metrics
vals <- Prelude.mapM (\metric -> gevalCore metric preprocess inputSource expectedSource outSource) metrics
return (outSource, vals)
where metrics = gesMetrics gevalSpec
preprocess = gesPreprocess gevalSpec
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
checkAndGetFilesSingleOut forceInput gevalSpec = do
@ -394,35 +399,36 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
Right sourceSpec -> return sourceSpec
| otherwise = return NoSource
fileAsLineSource :: SourceSpec -> LineSource (ResourceT IO)
fileAsLineSource spec =
LineSource ((smartSource spec) $= autoDecompress $= CT.decodeUtf8Lenient =$= CT.lines) spec 1
fileAsLineSource :: SourceSpec -> (Text -> Text) -> LineSource (ResourceT IO)
fileAsLineSource spec preprocess =
LineSource ((smartSource spec) $= autoDecompress $= CT.decodeUtf8Lenient =$= CT.lines) preprocess spec 1
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric inpLine expLine outLine =
gevalCoreOnSources metric (singleLineAsLineSource inpLine)
(singleLineAsLineSource expLine)
(singleLineAsLineSource outLine)
gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric preprocess inpLine expLine outLine =
gevalCoreOnSources metric (singleLineAsLineSource inpLine preprocess)
(singleLineAsLineSource expLine preprocess)
(singleLineAsLineSource outLine preprocess)
singleLineAsLineSource :: LineInFile -> LineSource (ResourceT IO)
singleLineAsLineSource (LineInFile sourceSpec lineNo line) =
LineSource (CL.sourceList [line]) sourceSpec lineNo
singleLineAsLineSource :: LineInFile -> (Text -> Text) -> LineSource (ResourceT IO)
singleLineAsLineSource (LineInFile sourceSpec lineNo line) preprocess =
LineSource (CL.sourceList [line]) preprocess sourceSpec lineNo
-- | Runs evaluation for a given metric using the sources specified
-- for input, expected output and output. Returns the metric value.
-- Throws @GEvalException@ if something was wrong in the data (e.g.
-- inconsistent number of lines in the sources).
gevalCore :: Metric -- ^ evaluation metric
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output
-> IO (MetricValue) -- ^ metric value for the output against the expected output
gevalCore metric inputSource expectedSource outSource = do
gevalCore metric preprocess inputSource expectedSource outSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
gevalCoreOnSources metric
(fileAsLineSource inputSource)
(fileAsLineSource expectedSource)
(fileAsLineSource outSource)
(fileAsLineSource inputSource preprocess)
(fileAsLineSource expectedSource preprocess)
(fileAsLineSource outSource preprocess)
isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
@ -690,9 +696,9 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou
data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (Text -> Either String e) (Text -> Either String o)
data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o)
data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o
getFirstLineNo _ (WithoutInput _ (LineSource _ _ lineNo)) = lineNo
getExpectedSource (WithoutInput (LineSource _ expectedSource _) _) = expectedSource
getOutSource (WithoutInput _ (LineSource _ outSource _)) = outSource
getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ lineNo)) = lineNo
getExpectedSource (WithoutInput (LineSource _ _ expectedSource _) _) = expectedSource
getOutSource (WithoutInput _ (LineSource _ _ outSource _)) = outSource
recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput
<$> ZipSource (items expectedLineSource expParser)
<*> ZipSource (items outLineSource outParser)
@ -713,15 +719,15 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou
data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m))
getInputFilePath (WithInput (LineSource _ inputFilePath _) _ _) = inputFilePath
getInputFilePath (WithInput (LineSource _ _ inputFilePath _) _ _) = inputFilePath
instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where
data ParserSpec (WithInput m i e o) = ParserSpecWithInput (Text -> Either String i) (Text -> Either String e) (Text -> Either String o)
data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o)
data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o
getFirstLineNo _ (WithInput _ _ (LineSource _ _ lineNo)) = lineNo
getExpectedSource (WithInput _ (LineSource _ expectedSource _) _) = expectedSource
getOutSource (WithInput _ _ (LineSource _ outSource _)) = outSource
getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ lineNo)) = lineNo
getExpectedSource (WithInput _ (LineSource _ _ expectedSource _) _) = expectedSource
getOutSource (WithInput _ _ (LineSource _ _ outSource _)) = outSource
recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z)
<$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,)
<$> ZipSource (items expectedLineSource expParser)
@ -756,8 +762,8 @@ averageC = getZipSink
<*> ZipSink CC.length
items :: MonadResource m => LineSource m -> (Text -> Either String a) -> ConduitT () (SourceItem a) m ()
items (LineSource lineSource _ _) parser =
(lineSource =$= CL.map (toItem . parser)) >> yield Done
items (LineSource lineSource preprocess _ _) parser =
(lineSource =$= CL.map (toItem . parser . preprocess)) >> yield Done
where toItem (Right x) = Got x
toItem (Left m) = Wrong m

View File

@ -192,8 +192,9 @@ lessByMetric reversed metric = lessByMetric' reversed (getMetricOrdering metric)
runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
runLineByLineGeneralized ordering spec consum = do
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
gevalLineByLineCore metric preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
where metric = gesMainMetric spec
preprocess = gesPreprocess spec
sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -245,13 +246,14 @@ runDiffGeneralized ordering otherOut spec consum = do
Left (NoFile fp) -> throwM $ NoOutFile fp
Left (NoDirectory d) -> throwM $ NoOutFile otherOut
Right otherOutSource -> do
let sourceA = gevalLineByLineSource metric inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric inputSource expectedSource outSource
let sourceA = gevalLineByLineSource metric preprocess inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric preprocess inputSource expectedSource outSource
runResourceT $ runConduit $
((getZipSource $ (,)
<$> ZipSource sourceA
<*> ZipSource sourceB) .| sorter ordering .| consum)
where metric = gesMainMetric spec
preprocess = gesPreprocess spec
sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -265,24 +267,24 @@ runDiffGeneralized ordering otherOut spec consum = do
escapeTabs :: Text -> Text
escapeTabs = Data.Text.replace "\t" "<tab>"
gevalLineByLineCore :: Metric -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric inputSource expectedSource outSource consum =
gevalLineByLineCore :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric preprocess inputSource expectedSource outSource consum =
runResourceT $ runConduit $
((gevalLineByLineSource metric inputSource expectedSource outSource) .| consum)
((gevalLineByLineSource metric preprocess inputSource expectedSource outSource) .| consum)
gevalLineByLineSource :: Metric -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric inputSource expectedSource outSource =
gevalLineByLineSource :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
(getZipSource $ (,)
<$> ZipSource (CL.sourceList [1..])
<*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id))
context = (WithInput inputLineSource expectedLineSource outputLineSource)
inputLineSource = fileAsLineSource inputSource
expectedLineSource = fileAsLineSource expectedSource
outputLineSource = fileAsLineSource outSource
inputLineSource = fileAsLineSource inputSource id
expectedLineSource = fileAsLineSource expectedSource id
outputLineSource = fileAsLineSource outSource id
justLine (LineInFile _ _ l) = l
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
s <- liftIO $ gevalCoreOnSingleLines metric (LineInFile inputSource lineNo inp)
s <- liftIO $ gevalCoreOnSingleLines metric preprocess (LineInFile inputSource lineNo inp)
(LineInFile expectedSource lineNo exp)
(LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo s

View File

@ -21,6 +21,13 @@ instance Read Tokenizer where
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
tokenizeTabSeparatedWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeTabSeparatedWithSpaces Nothing t = t -- special case for efficiency
tokenizeTabSeparatedWithSpaces tokenizer@(Just _) t =
T.intercalate "\t"
$ map (tokenizeWithSpaces tokenizer)
$ T.splitOn "\t" t
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeWithSpaces Nothing t = t
tokenizeWithSpaces (Just V13a) t = T.strip tTokenized

View File

@ -232,7 +232,7 @@ main = hspec $ do
runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827
describe "evaluating single lines" $ do
it "RMSE" $ do
gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla")
gevalCoreOnSingleLines RMSE id (LineInFile (FilePathSpec "stub1") 1 "blabla")
(LineInFile (FilePathSpec "stub2") 1 "3.4")
(LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8
describe "BIO format" $ do
@ -311,7 +311,8 @@ main = hspec $ do
gesExpectedFile = "expected.tsv",
gesInputFile = "in.tsv",
gesMetrics = [Likelihood],
gesPrecision = Nothing }
gesPrecision = Nothing,
gesTokenizer = Nothing }
it "simple test" $ do
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge Data.Conduit.List.consume
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",