use tokenization when looking for worst features

This commit is contained in:
Filip Gralinski 2018-08-17 17:27:25 +02:00
parent 0871b57bbc
commit 5e5a58210e
3 changed files with 20 additions and 18 deletions

View File

@ -9,17 +9,19 @@ module GEval.FeatureExtractor
import Data.Text import Data.Text
import Data.List import Data.List
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Text.Tokenizer
extractUnigramFeatures :: Text -> Text -> [Text] extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
extractUnigramFeatures namespace record = Prelude.map (prefix <>) $ nub $ tokenize record extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
where prefix = namespace <> ":" where prefix = namespace <> ":"
tokenize :: Text -> [Text] tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
tokenize t = Data.List.filter (not . Data.Text.null) $ split splitPred t tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
where splitPred c = c == ' ' || c == '\t' || c == ':' where splitPred c = c == ' ' || c == '\t' || c == ':'
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
extractUnigramFeaturesFromTabbed :: Text -> Text -> [Text] extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
extractUnigramFeaturesFromTabbed namespace record = extractUnigramFeaturesFromTabbed mTokenizer namespace record =
Data.List.concat Data.List.concat
$ Prelude.map (\(n, t) -> extractUnigramFeatures (namespace <> "<" <> (pack $ show n) <> ">") t) $ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
$ Prelude.zip [1..] (splitOn "\t" record) $ Prelude.zip [1..] (splitOn "\t" record)

View File

@ -78,7 +78,7 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering' spec (worstF
worstFeaturesPipeline :: Bool -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) () worstFeaturesPipeline :: Bool -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) ()
worstFeaturesPipeline reversed spec = rank (lessByMetric reversed $ gesMainMetric spec) worstFeaturesPipeline reversed spec = rank (lessByMetric reversed $ gesMainMetric spec)
.| evalStateC 0 extractFeaturesAndPValues .| evalStateC 0 (extractFeaturesAndPValues spec)
.| gobbleAndDo (sortBy featureOrder) .| gobbleAndDo (sortBy featureOrder)
.| CL.map (encodeUtf8 . formatFeatureWithPValue) .| CL.map (encodeUtf8 . formatFeatureWithPValue)
.| CC.unlinesAscii .| CC.unlinesAscii
@ -99,10 +99,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
forceSomeOrdering FirstTheBest = FirstTheBest forceSomeOrdering FirstTheBest = FirstTheBest
forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst
extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) () extractFeaturesAndPValues :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues = extractFeaturesAndPValues spec =
totalCounter totalCounter
.| featureExtractor .| featureExtractor spec
.| uScoresCounter .| uScoresCounter
@ -122,15 +122,15 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
(pack $ printf "%0.8f" avg), (pack $ printf "%0.8f" avg),
(pack $ printf "%0.20f" p)] (pack $ printf "%0.20f" p)]
featureExtractor :: Monad m => ConduitT (Double, LineRecord) RankedFeature m () featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m ()
featureExtractor = CC.map extract .| CC.concat featureExtractor spec = CC.map extract .| CC.concat
where extract (rank, LineRecord inLine expLine outLine _ score) = where extract (rank, LineRecord inLine expLine outLine _ score) =
Prelude.map (\f -> RankedFeature f rank score) Prelude.map (\f -> RankedFeature f rank score)
$ Data.List.concat [ $ Data.List.concat [
extractUnigramFeatures "exp" expLine, extractUnigramFeatures mTokenizer "exp" expLine,
extractUnigramFeaturesFromTabbed "in" inLine, extractUnigramFeaturesFromTabbed mTokenizer "in" inLine,
extractUnigramFeatures "out" outLine] extractUnigramFeatures mTokenizer "out" outLine]
mTokenizer = gesTokenizer spec
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1))) uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
.| gobbleAndDo countUScores .| gobbleAndDo countUScores

View File

@ -135,7 +135,7 @@ specParser = GEvalSpecification
( long "tokenizer" ( long "tokenizer"
<> short 'T' <> short 'T'
<> metavar "TOKENIZER" <> metavar "TOKENIZER"
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" )) <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe :: Maybe a -> Maybe [a]
singletonMaybe (Just x) = Just [x] singletonMaybe (Just x) = Just [x]