use tokenization when looking for worst features
This commit is contained in:
parent
0871b57bbc
commit
5e5a58210e
@ -9,17 +9,19 @@ module GEval.FeatureExtractor
|
||||
import Data.Text
|
||||
import Data.List
|
||||
import Data.Monoid ((<>))
|
||||
import Text.Tokenizer
|
||||
|
||||
extractUnigramFeatures :: Text -> Text -> [Text]
|
||||
extractUnigramFeatures namespace record = Prelude.map (prefix <>) $ nub $ tokenize record
|
||||
-- | Build namespaced unigram features for a single record.
--
-- The record is tokenized with 'tokenizeForFeatures' (driven by the optional
-- tokenizer), repeated tokens are removed with 'nub', and every remaining
-- token is prefixed with the namespace followed by a colon.
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
|
||||
extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
|
||||
where prefix = namespace <> ":"
|
||||
|
||||
tokenize :: Text -> [Text]
|
||||
tokenize t = Data.List.filter (not . Data.Text.null) $ split splitPred t
|
||||
-- | Split a text into the tokens that are turned into features.
--
-- When no tokenizer is given ('Nothing'), the text is split on spaces, tabs
-- and colons, and empty pieces are discarded. Otherwise the optional
-- tokenizer and the text are handed over to 'tokenize' (presumably the one
-- imported from "Text.Tokenizer" -- confirm against that module).
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
|
||||
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
|
||||
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
||||
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
|
||||
|
||||
extractUnigramFeaturesFromTabbed :: Text -> Text -> [Text]
|
||||
extractUnigramFeaturesFromTabbed namespace record =
|
||||
-- | Extract unigram features from a tab-separated record.
--
-- The record is cut into fields with @splitOn "\t"@ and the fields are
-- numbered starting from 1. Each field is processed by
-- 'extractUnigramFeatures' under the namespace @namespace<n>@ (n being the
-- field number), and the per-field feature lists are concatenated.
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
|
||||
extractUnigramFeaturesFromTabbed mTokenizer namespace record =
|
||||
Data.List.concat
|
||||
$ Prelude.map (\(n, t) -> extractUnigramFeatures (namespace <> "<" <> (pack $ show n) <> ">") t)
|
||||
$ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
|
||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||
|
@ -78,7 +78,7 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering' spec (worstF
|
||||
|
||||
worstFeaturesPipeline :: Bool -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) ()
|
||||
worstFeaturesPipeline reversed spec = rank (lessByMetric reversed $ gesMainMetric spec)
|
||||
.| evalStateC 0 extractFeaturesAndPValues
|
||||
.| evalStateC 0 (extractFeaturesAndPValues spec)
|
||||
.| gobbleAndDo (sortBy featureOrder)
|
||||
.| CL.map (encodeUtf8 . formatFeatureWithPValue)
|
||||
.| CC.unlinesAscii
|
||||
@ -99,10 +99,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
|
||||
forceSomeOrdering FirstTheBest = FirstTheBest
|
||||
forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst
|
||||
|
||||
extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
|
||||
extractFeaturesAndPValues =
|
||||
-- | Conduit stage that chains 'totalCounter', 'featureExtractor' and
-- 'uScoresCounter', consuming ranked line records and producing
-- 'FeatureWithPValue' values.
--
-- The evaluation specification is only passed on to 'featureExtractor'.
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
|
||||
extractFeaturesAndPValues spec =
|
||||
totalCounter
|
||||
.| featureExtractor
|
||||
.| featureExtractor spec
|
||||
.| uScoresCounter
|
||||
|
||||
|
||||
@ -122,15 +122,15 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
||||
(pack $ printf "%0.8f" avg),
|
||||
(pack $ printf "%0.20f" p)]
|
||||
|
||||
featureExtractor :: Monad m => ConduitT (Double, LineRecord) RankedFeature m ()
|
||||
featureExtractor = CC.map extract .| CC.concat
|
||||
-- | Conduit stage that expands every ranked line record into its features.
--
-- For each record, unigram features are extracted from the expected line
-- (namespace "exp"), from the tab-separated input line (namespace "in") and
-- from the output line (namespace "out"). Every feature is wrapped in a
-- 'RankedFeature' together with the record's rank and score, and the
-- resulting lists are flattened into the stream with 'CC.concat'.
-- The optional tokenizer is read from the specification via 'gesTokenizer'.
featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m ()
|
||||
featureExtractor spec = CC.map extract .| CC.concat
|
||||
where extract (rank, LineRecord inLine expLine outLine _ score) =
|
||||
Prelude.map (\f -> RankedFeature f rank score)
|
||||
$ Data.List.concat [
|
||||
extractUnigramFeatures "exp" expLine,
|
||||
extractUnigramFeaturesFromTabbed "in" inLine,
|
||||
extractUnigramFeatures "out" outLine]
|
||||
|
||||
extractUnigramFeatures mTokenizer "exp" expLine,
|
||||
extractUnigramFeaturesFromTabbed mTokenizer "in" inLine,
|
||||
extractUnigramFeatures mTokenizer "out" outLine]
|
||||
mTokenizer = gesTokenizer spec
|
||||
uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||
uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
||||
.| gobbleAndDo countUScores
|
||||
|
@ -135,7 +135,7 @@ specParser = GEvalSpecification
|
||||
( long "tokenizer"
|
||||
<> short 'T'
|
||||
<> metavar "TOKENIZER"
|
||||
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
|
||||
<> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
|
||||
|
||||
singletonMaybe :: Maybe a -> Maybe [a]
|
||||
singletonMaybe (Just x) = Just [x]
|
||||
|
Loading…
Reference in New Issue
Block a user