use tokenization when looking for worst features

This commit is contained in:
Filip Gralinski 2018-08-17 17:27:25 +02:00
parent 0871b57bbc
commit 5e5a58210e
3 changed files with 20 additions and 18 deletions

View File

@@ -9,17 +9,19 @@ module GEval.FeatureExtractor
import Data.Text
import Data.List
import Data.Monoid ((<>))
import Text.Tokenizer
extractUnigramFeatures :: Text -> Text -> [Text]
extractUnigramFeatures namespace record = Prelude.map (prefix <>) $ nub $ tokenize record
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
where prefix = namespace <> ":"
tokenize :: Text -> [Text]
tokenize t = Data.List.filter (not . Data.Text.null) $ split splitPred t
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
where splitPred c = c == ' ' || c == '\t' || c == ':'
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
extractUnigramFeaturesFromTabbed :: Text -> Text -> [Text]
extractUnigramFeaturesFromTabbed namespace record =
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
extractUnigramFeaturesFromTabbed mTokenizer namespace record =
Data.List.concat
$ Prelude.map (\(n, t) -> extractUnigramFeatures (namespace <> "<" <> (pack $ show n) <> ">") t)
$ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
$ Prelude.zip [1..] (splitOn "\t" record)

View File

@@ -78,7 +78,7 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering' spec (worstF
worstFeaturesPipeline :: Bool -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) ()
worstFeaturesPipeline reversed spec = rank (lessByMetric reversed $ gesMainMetric spec)
.| evalStateC 0 extractFeaturesAndPValues
.| evalStateC 0 (extractFeaturesAndPValues spec)
.| gobbleAndDo (sortBy featureOrder)
.| CL.map (encodeUtf8 . formatFeatureWithPValue)
.| CC.unlinesAscii
@@ -99,10 +99,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
forceSomeOrdering FirstTheBest = FirstTheBest
forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst
extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues =
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues spec =
totalCounter
.| featureExtractor
.| featureExtractor spec
.| uScoresCounter
@@ -122,15 +122,15 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
(pack $ printf "%0.8f" avg),
(pack $ printf "%0.20f" p)]
featureExtractor :: Monad m => ConduitT (Double, LineRecord) RankedFeature m ()
featureExtractor = CC.map extract .| CC.concat
featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m ()
featureExtractor spec = CC.map extract .| CC.concat
where extract (rank, LineRecord inLine expLine outLine _ score) =
Prelude.map (\f -> RankedFeature f rank score)
$ Data.List.concat [
extractUnigramFeatures "exp" expLine,
extractUnigramFeaturesFromTabbed "in" inLine,
extractUnigramFeatures "out" outLine]
extractUnigramFeatures mTokenizer "exp" expLine,
extractUnigramFeaturesFromTabbed mTokenizer "in" inLine,
extractUnigramFeatures mTokenizer "out" outLine]
mTokenizer = gesTokenizer spec
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
.| gobbleAndDo countUScores

View File

@@ -135,7 +135,7 @@ specParser = GEvalSpecification
( long "tokenizer"
<> short 'T'
<> metavar "TOKENIZER"
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
singletonMaybe :: Maybe a -> Maybe [a]
singletonMaybe (Just x) = Just [x]