first tokenizer

This commit is contained in:
Filip Gralinski 2018-08-13 10:09:55 +02:00
parent d3da3a0ca5
commit 83550688ce
4 changed files with 61 additions and 45 deletions

View File

@ -39,7 +39,8 @@ module GEval.Core
checkAndGetFilesSingleOut, checkAndGetFilesSingleOut,
checkMultipleOuts, checkMultipleOuts,
checkMultipleOutsCore, checkMultipleOutsCore,
gesMainMetric gesMainMetric,
gesPreprocess
) where ) where
import Data.Conduit import Data.Conduit
@ -222,6 +223,9 @@ gesMainMetric spec = case gesMetrics spec of
(metric:_) -> metric (metric:_) -> metric
otherwise -> error "no metric given" otherwise -> error "no metric given"
gesPreprocess :: GEvalSpecification -> (Text -> Text)
gesPreprocess spec = tokenizeTabSeparatedWithSpaces (gesTokenizer spec)
getExpectedDirectory :: GEvalSpecification -> FilePath getExpectedDirectory :: GEvalSpecification -> FilePath
getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec
where outDirectory = gesOutDirectory spec where outDirectory = gesOutDirectory spec
@ -295,7 +299,7 @@ isEmptyFile path = do
return ((fileSize stat) == 0) return ((fileSize stat) == 0)
data LineSource m = LineSource (ConduitT () Text m ()) SourceSpec Word32 data LineSource m = LineSource (ConduitT () Text m ()) (Text -> Text) SourceSpec Word32
geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])] geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])]
geval gevalSpec = do geval gevalSpec = do
@ -305,9 +309,10 @@ geval gevalSpec = do
gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue]) gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue])
gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
vals <- Prelude.mapM (\metric -> gevalCore metric inputSource expectedSource outSource) metrics vals <- Prelude.mapM (\metric -> gevalCore metric preprocess inputSource expectedSource outSource) metrics
return (outSource, vals) return (outSource, vals)
where metrics = gesMetrics gevalSpec where metrics = gesMetrics gevalSpec
preprocess = gesPreprocess gevalSpec
checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec) checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec)
checkAndGetFilesSingleOut forceInput gevalSpec = do checkAndGetFilesSingleOut forceInput gevalSpec = do
@ -394,35 +399,36 @@ getInputSourceIfNeeded forced metrics directory inputFilePath
Right sourceSpec -> return sourceSpec Right sourceSpec -> return sourceSpec
| otherwise = return NoSource | otherwise = return NoSource
fileAsLineSource :: SourceSpec -> LineSource (ResourceT IO) fileAsLineSource :: SourceSpec -> (Text -> Text) -> LineSource (ResourceT IO)
fileAsLineSource spec = fileAsLineSource spec preprocess =
LineSource ((smartSource spec) $= autoDecompress $= CT.decodeUtf8Lenient =$= CT.lines) spec 1 LineSource ((smartSource spec) $= autoDecompress $= CT.decodeUtf8Lenient =$= CT.lines) preprocess spec 1
gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue) gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue)
gevalCoreOnSingleLines metric inpLine expLine outLine = gevalCoreOnSingleLines metric preprocess inpLine expLine outLine =
gevalCoreOnSources metric (singleLineAsLineSource inpLine) gevalCoreOnSources metric (singleLineAsLineSource inpLine preprocess)
(singleLineAsLineSource expLine) (singleLineAsLineSource expLine preprocess)
(singleLineAsLineSource outLine) (singleLineAsLineSource outLine preprocess)
singleLineAsLineSource :: LineInFile -> LineSource (ResourceT IO) singleLineAsLineSource :: LineInFile -> (Text -> Text) -> LineSource (ResourceT IO)
singleLineAsLineSource (LineInFile sourceSpec lineNo line) = singleLineAsLineSource (LineInFile sourceSpec lineNo line) preprocess =
LineSource (CL.sourceList [line]) sourceSpec lineNo LineSource (CL.sourceList [line]) preprocess sourceSpec lineNo
-- | Runs evaluation for a given metric using the sources specified -- | Runs evaluation for a given metric using the sources specified
-- for input, expected output and output. Returns the metric value. -- for input, expected output and output. Returns the metric value.
-- Throws @GEvalException@ if something was wrong in the data (e.g. -- Throws @GEvalException@ if something was wrong in the data (e.g.
-- inconsistent number of lines in the sources). -- inconsistent number of lines in the sources).
gevalCore :: Metric -- ^ evaluation metric gevalCore :: Metric -- ^ evaluation metric
-> (Text -> Text) -- ^ preprocessing function (e.g. tokenization)
-> SourceSpec -- ^ source specification for the input values -> SourceSpec -- ^ source specification for the input values
-> SourceSpec -- ^ source specification for the expected output -> SourceSpec -- ^ source specification for the expected output
-> SourceSpec -- ^ source specification for the output -> SourceSpec -- ^ source specification for the output
-> IO (MetricValue) -- ^ metric value for the output against the expected output -> IO (MetricValue) -- ^ metric value for the output against the expected output
gevalCore metric inputSource expectedSource outSource = do gevalCore metric preprocess inputSource expectedSource outSource = do
whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
gevalCoreOnSources metric gevalCoreOnSources metric
(fileAsLineSource inputSource) (fileAsLineSource inputSource preprocess)
(fileAsLineSource expectedSource) (fileAsLineSource expectedSource preprocess)
(fileAsLineSource outSource) (fileAsLineSource outSource preprocess)
isEmptyFileSource :: SourceSpec -> IO Bool isEmptyFileSource :: SourceSpec -> IO Bool
isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath
@ -690,9 +696,9 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou
data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (Text -> Either String e) (Text -> Either String o) data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (Text -> Either String e) (Text -> Either String o)
data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o) data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o)
data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o
getFirstLineNo _ (WithoutInput _ (LineSource _ _ lineNo)) = lineNo getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ lineNo)) = lineNo
getExpectedSource (WithoutInput (LineSource _ expectedSource _) _) = expectedSource getExpectedSource (WithoutInput (LineSource _ _ expectedSource _) _) = expectedSource
getOutSource (WithoutInput _ (LineSource _ outSource _)) = outSource getOutSource (WithoutInput _ (LineSource _ _ outSource _)) = outSource
recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput
<$> ZipSource (items expectedLineSource expParser) <$> ZipSource (items expectedLineSource expParser)
<*> ZipSource (items outLineSource outParser) <*> ZipSource (items outLineSource outParser)
@ -713,15 +719,15 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou
data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m)) data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m))
getInputFilePath (WithInput (LineSource _ inputFilePath _) _ _) = inputFilePath getInputFilePath (WithInput (LineSource _ _ inputFilePath _) _ _) = inputFilePath
instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where
data ParserSpec (WithInput m i e o) = ParserSpecWithInput (Text -> Either String i) (Text -> Either String e) (Text -> Either String o) data ParserSpec (WithInput m i e o) = ParserSpecWithInput (Text -> Either String i) (Text -> Either String e) (Text -> Either String o)
data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o) data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o)
data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o
getFirstLineNo _ (WithInput _ _ (LineSource _ _ lineNo)) = lineNo getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ lineNo)) = lineNo
getExpectedSource (WithInput _ (LineSource _ expectedSource _) _) = expectedSource getExpectedSource (WithInput _ (LineSource _ _ expectedSource _) _) = expectedSource
getOutSource (WithInput _ _ (LineSource _ outSource _)) = outSource getOutSource (WithInput _ _ (LineSource _ _ outSource _)) = outSource
recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z) recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z)
<$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,) <$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,)
<$> ZipSource (items expectedLineSource expParser) <$> ZipSource (items expectedLineSource expParser)
@ -756,8 +762,8 @@ averageC = getZipSink
<*> ZipSink CC.length <*> ZipSink CC.length
items :: MonadResource m => LineSource m -> (Text -> Either String a) -> ConduitT () (SourceItem a) m () items :: MonadResource m => LineSource m -> (Text -> Either String a) -> ConduitT () (SourceItem a) m ()
items (LineSource lineSource _ _) parser = items (LineSource lineSource preprocess _ _) parser =
(lineSource =$= CL.map (toItem . parser)) >> yield Done (lineSource =$= CL.map (toItem . parser . preprocess)) >> yield Done
where toItem (Right x) = Got x where toItem (Right x) = Got x
toItem (Left m) = Wrong m toItem (Left m) = Wrong m

View File

@ -192,8 +192,9 @@ lessByMetric reversed metric = lessByMetric' reversed (getMetricOrdering metric)
runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
runLineByLineGeneralized ordering spec consum = do runLineByLineGeneralized ordering spec consum = do
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec (inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric inputFilePath expectedFilePath outFilePath (sorter ordering .| consum) gevalLineByLineCore metric preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
where metric = gesMainMetric spec where metric = gesMainMetric spec
preprocess = gesPreprocess spec
sorter KeepTheOriginalOrder = doNothing sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric)) sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
sortOrder FirstTheWorst TheHigherTheBetter = compareScores sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -245,13 +246,14 @@ runDiffGeneralized ordering otherOut spec consum = do
Left (NoFile fp) -> throwM $ NoOutFile fp Left (NoFile fp) -> throwM $ NoOutFile fp
Left (NoDirectory d) -> throwM $ NoOutFile otherOut Left (NoDirectory d) -> throwM $ NoOutFile otherOut
Right otherOutSource -> do Right otherOutSource -> do
let sourceA = gevalLineByLineSource metric inputSource expectedSource otherOutSource let sourceA = gevalLineByLineSource metric preprocess inputSource expectedSource otherOutSource
let sourceB = gevalLineByLineSource metric inputSource expectedSource outSource let sourceB = gevalLineByLineSource metric preprocess inputSource expectedSource outSource
runResourceT $ runConduit $ runResourceT $ runConduit $
((getZipSource $ (,) ((getZipSource $ (,)
<$> ZipSource sourceA <$> ZipSource sourceA
<*> ZipSource sourceB) .| sorter ordering .| consum) <*> ZipSource sourceB) .| sorter ordering .| consum)
where metric = gesMainMetric spec where metric = gesMainMetric spec
preprocess = gesPreprocess spec
sorter KeepTheOriginalOrder = doNothing sorter KeepTheOriginalOrder = doNothing
sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric)) sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric))
sortOrder FirstTheWorst TheHigherTheBetter = compareScores sortOrder FirstTheWorst TheHigherTheBetter = compareScores
@ -265,24 +267,24 @@ runDiffGeneralized ordering otherOut spec consum = do
escapeTabs :: Text -> Text escapeTabs :: Text -> Text
escapeTabs = Data.Text.replace "\t" "<tab>" escapeTabs = Data.Text.replace "\t" "<tab>"
gevalLineByLineCore :: Metric -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a gevalLineByLineCore :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
gevalLineByLineCore metric inputSource expectedSource outSource consum = gevalLineByLineCore metric preprocess inputSource expectedSource outSource consum =
runResourceT $ runConduit $ runResourceT $ runConduit $
((gevalLineByLineSource metric inputSource expectedSource outSource) .| consum) ((gevalLineByLineSource metric preprocess inputSource expectedSource outSource) .| consum)
gevalLineByLineSource :: Metric -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) () gevalLineByLineSource :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) ()
gevalLineByLineSource metric inputSource expectedSource outSource = gevalLineByLineSource metric preprocess inputSource expectedSource outSource =
(getZipSource $ (,) (getZipSource $ (,)
<$> ZipSource (CL.sourceList [1..]) <$> ZipSource (CL.sourceList [1..])
<*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes <*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes
where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id)) where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id))
context = (WithInput inputLineSource expectedLineSource outputLineSource) context = (WithInput inputLineSource expectedLineSource outputLineSource)
inputLineSource = fileAsLineSource inputSource inputLineSource = fileAsLineSource inputSource id
expectedLineSource = fileAsLineSource expectedSource expectedLineSource = fileAsLineSource expectedSource id
outputLineSource = fileAsLineSource outSource outputLineSource = fileAsLineSource outSource id
justLine (LineInFile _ _ l) = l justLine (LineInFile _ _ l) = l
evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do
s <- liftIO $ gevalCoreOnSingleLines metric (LineInFile inputSource lineNo inp) s <- liftIO $ gevalCoreOnSingleLines metric preprocess (LineInFile inputSource lineNo inp)
(LineInFile expectedSource lineNo exp) (LineInFile expectedSource lineNo exp)
(LineInFile outSource lineNo out) (LineInFile outSource lineNo out)
return $ LineRecord inp exp out lineNo s return $ LineRecord inp exp out lineNo s

View File

@ -21,6 +21,13 @@ instance Read Tokenizer where
tokenize :: Maybe Tokenizer -> T.Text -> [T.Text] tokenize :: Maybe Tokenizer -> T.Text -> [T.Text]
tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer) tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer)
tokenizeTabSeparatedWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeTabSeparatedWithSpaces Nothing t = t -- special case for efficiency
tokenizeTabSeparatedWithSpaces tokenizer@(Just _) t =
T.intercalate "\t"
$ map (tokenizeWithSpaces tokenizer)
$ T.splitOn "\t" t
tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text
tokenizeWithSpaces Nothing t = t tokenizeWithSpaces Nothing t = t
tokenizeWithSpaces (Just V13a) t = T.strip tTokenized tokenizeWithSpaces (Just V13a) t = T.strip tTokenized

View File

@ -232,9 +232,9 @@ main = hspec $ do
runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827 runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827
describe "evaluating single lines" $ do describe "evaluating single lines" $ do
it "RMSE" $ do it "RMSE" $ do
gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla") gevalCoreOnSingleLines RMSE id (LineInFile (FilePathSpec "stub1") 1 "blabla")
(LineInFile (FilePathSpec "stub2") 1 "3.4") (LineInFile (FilePathSpec "stub2") 1 "3.4")
(LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8 (LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8
describe "BIO format" $ do describe "BIO format" $ do
it "just parse" $ do it "just parse" $ do
let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name" let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name"
@ -311,7 +311,8 @@ main = hspec $ do
gesExpectedFile = "expected.tsv", gesExpectedFile = "expected.tsv",
gesInputFile = "in.tsv", gesInputFile = "in.tsv",
gesMetrics = [Likelihood], gesMetrics = [Likelihood],
gesPrecision = Nothing } gesPrecision = Nothing,
gesTokenizer = Nothing }
it "simple test" $ do it "simple test" $ do
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge Data.Conduit.List.consume results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge Data.Conduit.List.consume
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo", Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",