diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs
index e0f5f5b..7b968d4 100644
--- a/src/GEval/FeatureExtractor.hs
+++ b/src/GEval/FeatureExtractor.hs
@@ -9,17 +9,19 @@ module GEval.FeatureExtractor
 import Data.Text
 import Data.List
 import Data.Monoid ((<>))
+import Text.Tokenizer
 
-extractUnigramFeatures :: Text -> Text -> [Text]
-extractUnigramFeatures namespace record = Prelude.map (prefix <>) $ nub $ tokenize record
+extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
+extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
   where prefix = namespace <> ":"
 
-tokenize :: Text -> [Text]
-tokenize t = Data.List.filter (not . Data.Text.null) $ split splitPred t
+tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
+tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
   where splitPred c = c == ' ' || c == '\t' || c == ':'
+tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
 
-extractUnigramFeaturesFromTabbed :: Text -> Text -> [Text]
-extractUnigramFeaturesFromTabbed namespace record =
+extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
+extractUnigramFeaturesFromTabbed mTokenizer namespace record =
   Data.List.concat
-  $ Prelude.map (\(n, t) -> extractUnigramFeatures (namespace <> "<" <> (pack $ show n) <> ">") t)
+  $ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
   $ Prelude.zip [1..] (splitOn "\t" record)
diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs
index 3942fe9..e68b5a9 100644
--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@@ -78,7 +78,7 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering' spec (worstF
 
 worstFeaturesPipeline :: Bool -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) ()
 worstFeaturesPipeline reversed spec = rank (lessByMetric reversed $ gesMainMetric spec)
-                                      .| evalStateC 0 extractFeaturesAndPValues
+                                      .| evalStateC 0 (extractFeaturesAndPValues spec)
                                       .| gobbleAndDo (sortBy featureOrder)
                                       .| CL.map (encodeUtf8 . formatFeatureWithPValue)
                                       .| CC.unlinesAscii
@@ -99,10 +99,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
 forceSomeOrdering FirstTheBest = FirstTheBest
 forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst
 
-extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
-extractFeaturesAndPValues =
+extractFeaturesAndPValues :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
+extractFeaturesAndPValues spec =
   totalCounter
-  .| featureExtractor
+  .| featureExtractor spec
   .| uScoresCounter
 
@@ -122,15 +122,15 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
              (pack $ printf "%0.8f" avg),
             (pack $ printf "%0.20f" p)]
 
-featureExtractor :: Monad m => ConduitT (Double, LineRecord) RankedFeature m ()
-featureExtractor = CC.map extract .| CC.concat
+featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m ()
+featureExtractor spec = CC.map extract .| CC.concat
   where extract (rank, LineRecord inLine expLine outLine _ score) =
           Prelude.map (\f -> RankedFeature f rank score)
           $ Data.List.concat [
-            extractUnigramFeatures "exp" expLine,
-            extractUnigramFeaturesFromTabbed "in" inLine,
-            extractUnigramFeatures "out" outLine]
-
+            extractUnigramFeatures mTokenizer "exp" expLine,
+            extractUnigramFeaturesFromTabbed mTokenizer "in" inLine,
+            extractUnigramFeatures mTokenizer "out" outLine]
+        mTokenizer = gesTokenizer spec
 
 uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
 uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1))) .| gobbleAndDo countUScores
diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs
index 3847b3b..c15e650 100644
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@@ -135,7 +135,7 @@ specParser = GEvalSpecification
       ( long "tokenizer"
         <> short 'T'
         <> metavar "TOKENIZER"
-        <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
+        <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such as BLEU); only the 13a tokenizer is implemented so far. It will also be used for tokenizing text into features in the --worst-features and --most-worsening-features modes." ))
 
 singletonMaybe :: Maybe a -> Maybe [a]
 singletonMaybe (Just x) = Just [x]
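
A minimal sketch of how the tokenizer-aware feature extraction above could be exercised. It assumes GEval.FeatureExtractor exports extractUnigramFeatures and extractUnigramFeaturesFromTabbed, that Text.Tokenizer exports the Tokenizer constructors, and that the 13a tokenizer constructor is named V13a; none of these details are shown in this diff.

{-# LANGUAGE OverloadedStrings #-}

import GEval.FeatureExtractor (extractUnigramFeatures, extractUnigramFeaturesFromTabbed)
import Text.Tokenizer (Tokenizer (..))  -- constructor name V13a is an assumption

main :: IO ()
main = do
  -- No tokenizer: tokenizeForFeatures Nothing splits on spaces, tabs and colons,
  -- so this yields ["exp:Ala", "exp:ma", "exp:kota."]
  print $ extractUnigramFeatures Nothing "exp" "Ala ma kota."

  -- With a tokenizer (e.g. the 13a-style one selected by --tokenizer),
  -- tokenization is delegated to Text.Tokenizer.tokenize
  print $ extractUnigramFeatures (Just V13a) "exp" "Ala ma kota."

  -- Tab-separated input gets per-column namespaces: in<1>:..., in<2>:...
  print $ extractUnigramFeaturesFromTabbed Nothing "in" "foo bar\tbaz"

In the --worst-features pipeline itself, the Maybe Tokenizer value is taken from gesTokenizer spec, i.e. whatever was selected via --tokenizer / -T.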