diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index c6bd134..c95a15d 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -39,7 +39,8 @@ module GEval.Core checkAndGetFilesSingleOut, checkMultipleOuts, checkMultipleOutsCore, - gesMainMetric + gesMainMetric, + gesPreprocess ) where import Data.Conduit @@ -222,6 +223,9 @@ gesMainMetric spec = case gesMetrics spec of (metric:_) -> metric otherwise -> error "no metric given" +gesPreprocess :: GEvalSpecification -> (Text -> Text) +gesPreprocess spec = tokenizeTabSeparatedWithSpaces (gesTokenizer spec) + getExpectedDirectory :: GEvalSpecification -> FilePath getExpectedDirectory spec = fromMaybe outDirectory $ gesExpectedDirectory spec where outDirectory = gesOutDirectory spec @@ -295,7 +299,7 @@ isEmptyFile path = do return ((fileSize stat) == 0) -data LineSource m = LineSource (ConduitT () Text m ()) SourceSpec Word32 +data LineSource m = LineSource (ConduitT () Text m ()) (Text -> Text) SourceSpec Word32 geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])] geval gevalSpec = do @@ -305,9 +309,10 @@ geval gevalSpec = do gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue]) gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do - vals <- Prelude.mapM (\metric -> gevalCore metric inputSource expectedSource outSource) metrics + vals <- Prelude.mapM (\metric -> gevalCore metric preprocess inputSource expectedSource outSource) metrics return (outSource, vals) where metrics = gesMetrics gevalSpec + preprocess = gesPreprocess gevalSpec checkAndGetFilesSingleOut :: Bool -> GEvalSpecification -> IO (SourceSpec, SourceSpec, SourceSpec) checkAndGetFilesSingleOut forceInput gevalSpec = do @@ -394,35 +399,36 @@ getInputSourceIfNeeded forced metrics directory inputFilePath Right sourceSpec -> return sourceSpec | otherwise = return NoSource -fileAsLineSource :: SourceSpec -> LineSource (ResourceT IO) -fileAsLineSource spec = - LineSource ((smartSource spec) $= autoDecompress $= CT.decodeUtf8Lenient =$= CT.lines) spec 1 +fileAsLineSource :: SourceSpec -> (Text -> Text) -> LineSource (ResourceT IO) +fileAsLineSource spec preprocess = + LineSource ((smartSource spec) $= autoDecompress $= CT.decodeUtf8Lenient =$= CT.lines) preprocess spec 1 -gevalCoreOnSingleLines :: Metric -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue) -gevalCoreOnSingleLines metric inpLine expLine outLine = - gevalCoreOnSources metric (singleLineAsLineSource inpLine) - (singleLineAsLineSource expLine) - (singleLineAsLineSource outLine) +gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> LineInFile -> LineInFile -> LineInFile -> IO (MetricValue) +gevalCoreOnSingleLines metric preprocess inpLine expLine outLine = + gevalCoreOnSources metric (singleLineAsLineSource inpLine preprocess) + (singleLineAsLineSource expLine preprocess) + (singleLineAsLineSource outLine preprocess) -singleLineAsLineSource :: LineInFile -> LineSource (ResourceT IO) -singleLineAsLineSource (LineInFile sourceSpec lineNo line) = - LineSource (CL.sourceList [line]) sourceSpec lineNo +singleLineAsLineSource :: LineInFile -> (Text -> Text) -> LineSource (ResourceT IO) +singleLineAsLineSource (LineInFile sourceSpec lineNo line) preprocess = + LineSource (CL.sourceList [line]) preprocess sourceSpec lineNo -- | Runs evaluation for a given metric using the sources specified -- for input, expected output and output. Returns the metric value. -- Throws @GEvalException@ if something was wrong in the data (e.g. -- inconsistent number of lines in the sources). gevalCore :: Metric -- ^ evaluation metric + -> (Text -> Text) -- ^ preprocessing function (e.g. tokenization) -> SourceSpec -- ^ source specification for the input values -> SourceSpec -- ^ source specification for the expected output -> SourceSpec -- ^ source specification for the output -> IO (MetricValue) -- ^ metric value for the output against the expected output -gevalCore metric inputSource expectedSource outSource = do +gevalCore metric preprocess inputSource expectedSource outSource = do whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput gevalCoreOnSources metric - (fileAsLineSource inputSource) - (fileAsLineSource expectedSource) - (fileAsLineSource outSource) + (fileAsLineSource inputSource preprocess) + (fileAsLineSource expectedSource preprocess) + (fileAsLineSource outSource preprocess) isEmptyFileSource :: SourceSpec -> IO Bool isEmptyFileSource (FilePathSpec filePath) = isEmptyFile filePath @@ -690,9 +696,9 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou data ParserSpec (WithoutInput m e o) = ParserSpecWithoutInput (Text -> Either String e) (Text -> Either String o) data WrappedParsedRecord (WithoutInput m e o) = WrappedParsedRecordWithoutInput (SourceItem e) (SourceItem o) data ParsedRecord (WithoutInput m e o) = ParsedRecordWithoutInput e o - getFirstLineNo _ (WithoutInput _ (LineSource _ _ lineNo)) = lineNo - getExpectedSource (WithoutInput (LineSource _ expectedSource _) _) = expectedSource - getOutSource (WithoutInput _ (LineSource _ outSource _)) = outSource + getFirstLineNo _ (WithoutInput _ (LineSource _ _ _ lineNo)) = lineNo + getExpectedSource (WithoutInput (LineSource _ _ expectedSource _) _) = expectedSource + getOutSource (WithoutInput _ (LineSource _ _ outSource _)) = outSource recordSource (WithoutInput expectedLineSource outLineSource) (ParserSpecWithoutInput expParser outParser) = getZipSource $ WrappedParsedRecordWithoutInput <$> ZipSource (items expectedLineSource expParser) <*> ZipSource (items outLineSource outParser) @@ -713,15 +719,15 @@ instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (Withou data WithInput m i e o = WithInput (LineSource (ResourceT m)) (LineSource (ResourceT m)) (LineSource (ResourceT m)) -getInputFilePath (WithInput (LineSource _ inputFilePath _) _ _) = inputFilePath +getInputFilePath (WithInput (LineSource _ _ inputFilePath _) _ _) = inputFilePath instance (MonadUnliftIO m, MonadIO m, MonadThrow m) => EvaluationContext (WithInput m i e o) m where data ParserSpec (WithInput m i e o) = ParserSpecWithInput (Text -> Either String i) (Text -> Either String e) (Text -> Either String o) data WrappedParsedRecord (WithInput m i e o) = WrappedParsedRecordWithInput (SourceItem i) (SourceItem e) (SourceItem o) data ParsedRecord (WithInput m i e o) = ParsedRecordWithInput i e o - getFirstLineNo _ (WithInput _ _ (LineSource _ _ lineNo)) = lineNo - getExpectedSource (WithInput _ (LineSource _ expectedSource _) _) = expectedSource - getOutSource (WithInput _ _ (LineSource _ outSource _)) = outSource + getFirstLineNo _ (WithInput _ _ (LineSource _ _ _ lineNo)) = lineNo + getExpectedSource (WithInput _ (LineSource _ _ expectedSource _) _) = expectedSource + getOutSource (WithInput _ _ (LineSource _ _ outSource _)) = outSource recordSource (WithInput inputLineSource expectedLineSource outLineSource) (ParserSpecWithInput inpParser expParser outParser) = getZipSource $ (\x (y,z) -> WrappedParsedRecordWithInput x y z) <$> ZipSource (items inputLineSource inpParser) <*> (ZipSource $ getZipSource $ (,) <$> ZipSource (items expectedLineSource expParser) @@ -756,8 +762,8 @@ averageC = getZipSink <*> ZipSink CC.length items :: MonadResource m => LineSource m -> (Text -> Either String a) -> ConduitT () (SourceItem a) m () -items (LineSource lineSource _ _) parser = - (lineSource =$= CL.map (toItem . parser)) >> yield Done +items (LineSource lineSource preprocess _ _) parser = + (lineSource =$= CL.map (toItem . parser . preprocess)) >> yield Done where toItem (Right x) = Got x toItem (Left m) = Wrong m diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index f8565a4..05d0599 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -192,8 +192,9 @@ lessByMetric reversed metric = lessByMetric' reversed (getMetricOrdering metric) runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a runLineByLineGeneralized ordering spec consum = do (inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec - gevalLineByLineCore metric inputFilePath expectedFilePath outFilePath (sorter ordering .| consum) + gevalLineByLineCore metric preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum) where metric = gesMainMetric spec + preprocess = gesPreprocess spec sorter KeepTheOriginalOrder = doNothing sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric)) sortOrder FirstTheWorst TheHigherTheBetter = compareScores @@ -245,13 +246,14 @@ runDiffGeneralized ordering otherOut spec consum = do Left (NoFile fp) -> throwM $ NoOutFile fp Left (NoDirectory d) -> throwM $ NoOutFile otherOut Right otherOutSource -> do - let sourceA = gevalLineByLineSource metric inputSource expectedSource otherOutSource - let sourceB = gevalLineByLineSource metric inputSource expectedSource outSource + let sourceA = gevalLineByLineSource metric preprocess inputSource expectedSource otherOutSource + let sourceB = gevalLineByLineSource metric preprocess inputSource expectedSource outSource runResourceT $ runConduit $ ((getZipSource $ (,) <$> ZipSource sourceA <*> ZipSource sourceB) .| sorter ordering .| consum) where metric = gesMainMetric spec + preprocess = gesPreprocess spec sorter KeepTheOriginalOrder = doNothing sorter ordering = gobbleAndDo $ sortBy (sortOrder ordering (getMetricOrdering metric)) sortOrder FirstTheWorst TheHigherTheBetter = compareScores @@ -265,24 +267,24 @@ runDiffGeneralized ordering otherOut spec consum = do escapeTabs :: Text -> Text escapeTabs = Data.Text.replace "\t" "" -gevalLineByLineCore :: Metric -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a -gevalLineByLineCore metric inputSource expectedSource outSource consum = +gevalLineByLineCore :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT LineRecord Void (ResourceT IO) a -> IO a +gevalLineByLineCore metric preprocess inputSource expectedSource outSource consum = runResourceT $ runConduit $ - ((gevalLineByLineSource metric inputSource expectedSource outSource) .| consum) + ((gevalLineByLineSource metric preprocess inputSource expectedSource outSource) .| consum) -gevalLineByLineSource :: Metric -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) () -gevalLineByLineSource metric inputSource expectedSource outSource = +gevalLineByLineSource :: Metric -> (Text -> Text) -> SourceSpec -> SourceSpec -> SourceSpec -> ConduitT () LineRecord (ResourceT IO) () +gevalLineByLineSource metric preprocess inputSource expectedSource outSource = (getZipSource $ (,) <$> ZipSource (CL.sourceList [1..]) <*> (ZipSource $ recordSource context parserSpec)) .| CL.mapM (checkStepM evaluateLine) .| CL.catMaybes where parserSpec = (ParserSpecWithInput (Right . id) (Right . id) (Right . id)) context = (WithInput inputLineSource expectedLineSource outputLineSource) - inputLineSource = fileAsLineSource inputSource - expectedLineSource = fileAsLineSource expectedSource - outputLineSource = fileAsLineSource outSource + inputLineSource = fileAsLineSource inputSource id + expectedLineSource = fileAsLineSource expectedSource id + outputLineSource = fileAsLineSource outSource id justLine (LineInFile _ _ l) = l evaluateLine (lineNo, ParsedRecordWithInput inp exp out) = do - s <- liftIO $ gevalCoreOnSingleLines metric (LineInFile inputSource lineNo inp) - (LineInFile expectedSource lineNo exp) - (LineInFile outSource lineNo out) + s <- liftIO $ gevalCoreOnSingleLines metric preprocess (LineInFile inputSource lineNo inp) + (LineInFile expectedSource lineNo exp) + (LineInFile outSource lineNo out) return $ LineRecord inp exp out lineNo s diff --git a/src/Text/Tokenizer.hs b/src/Text/Tokenizer.hs index ff8ebea..02a4f4c 100644 --- a/src/Text/Tokenizer.hs +++ b/src/Text/Tokenizer.hs @@ -21,6 +21,13 @@ instance Read Tokenizer where tokenize :: Maybe Tokenizer -> T.Text -> [T.Text] tokenize mTokenizer = T.words . (tokenizeWithSpaces mTokenizer) +tokenizeTabSeparatedWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text +tokenizeTabSeparatedWithSpaces Nothing t = t -- special case for efficiency +tokenizeTabSeparatedWithSpaces tokenizer@(Just _) t = + T.intercalate "\t" + $ map (tokenizeWithSpaces tokenizer) + $ T.splitOn "\t" t + tokenizeWithSpaces :: Maybe Tokenizer -> T.Text -> T.Text tokenizeWithSpaces Nothing t = t tokenizeWithSpaces (Just V13a) t = T.strip tTokenized diff --git a/test/Spec.hs b/test/Spec.hs index 53f129a..5728b2d 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -232,9 +232,9 @@ main = hspec $ do runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827 describe "evaluating single lines" $ do it "RMSE" $ do - gevalCoreOnSingleLines RMSE (LineInFile (FilePathSpec "stub1") 1 "blabla") - (LineInFile (FilePathSpec "stub2") 1 "3.4") - (LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8 + gevalCoreOnSingleLines RMSE id (LineInFile (FilePathSpec "stub1") 1 "blabla") + (LineInFile (FilePathSpec "stub2") 1 "3.4") + (LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8 describe "BIO format" $ do it "just parse" $ do let (Right r) = parseOnly (bioSequenceParser <* endOfInput) "O B-city/NEW_YORK I-city B-city/KALISZ I-city O B-name" @@ -311,7 +311,8 @@ main = hspec $ do gesExpectedFile = "expected.tsv", gesInputFile = "in.tsv", gesMetrics = [Likelihood], - gesPrecision = Nothing } + gesPrecision = Nothing, + gesTokenizer = Nothing } it "simple test" $ do results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge Data.Conduit.List.consume Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",