use tokenization when looking for worst features
parent 0871b57bbc
commit 5e5a58210e
@@ -9,17 +9,19 @@ module GEval.FeatureExtractor
 import Data.Text
 import Data.List
 import Data.Monoid ((<>))
+import Text.Tokenizer
 
-extractUnigramFeatures :: Text -> Text -> [Text]
-extractUnigramFeatures namespace record = Prelude.map (prefix <>) $ nub $ tokenize record
+extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
+extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
   where prefix = namespace <> ":"
 
-tokenize :: Text -> [Text]
-tokenize t = Data.List.filter (not . Data.Text.null) $ split splitPred t
+tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
+tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
   where splitPred c = c == ' ' || c == '\t' || c == ':'
+tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
 
-extractUnigramFeaturesFromTabbed :: Text -> Text -> [Text]
-extractUnigramFeaturesFromTabbed namespace record =
+extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
+extractUnigramFeaturesFromTabbed mTokenizer namespace record =
   Data.List.concat
-  $ Prelude.map (\(n, t) -> extractUnigramFeatures (namespace <> "<" <> (pack $ show n) <> ">") t)
+  $ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
   $ Prelude.zip [1..] (splitOn "\t" record)
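Note on the hunk above: the old whitespace/colon split is kept as the Nothing case of tokenizeForFeatures, and a real tokenizer is only used when one was requested. A minimal, self-contained sketch of that fallback pattern (ToyTokenizer and its Toy13a constructor are stand-ins, not the real Text.Tokenizer API, and T.words stands in for the real tokenize):

    {-# LANGUAGE OverloadedStrings #-}
    module Main where

    import Data.Text (Text)
    import qualified Data.Text as T

    -- Stand-in for GEval's Tokenizer type; the real one lives in Text.Tokenizer.
    data ToyTokenizer = Toy13a

    -- Nothing: keep the old behaviour (split on spaces, tabs and colons).
    -- Just tok: delegate to a proper tokenizer (approximated here by T.words).
    tokenizeForFeatures :: Maybe ToyTokenizer -> Text -> [Text]
    tokenizeForFeatures Nothing t = filter (not . T.null) $ T.split splitPred t
      where splitPred c = c == ' ' || c == '\t' || c == ':'
    tokenizeForFeatures (Just Toy13a) t = T.words t

    main :: IO ()
    main = do
      print $ tokenizeForFeatures Nothing       "it's gonna:rain\tsoon"
      -- ["it's","gonna","rain","soon"]
      print $ tokenizeForFeatures (Just Toy13a) "it's gonna:rain\tsoon"
      -- ["it's","gonna:rain","soon"]

Since the Nothing equation is literally the old tokenize body, worst-features output should only change when a tokenizer is actually given.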
@@ -78,7 +78,7 @@ runWorstFeatures ordering spec = runLineByLineGeneralized ordering' spec (worstF
 
 worstFeaturesPipeline :: Bool -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) ()
 worstFeaturesPipeline reversed spec = rank (lessByMetric reversed $ gesMainMetric spec)
-  .| evalStateC 0 extractFeaturesAndPValues
+  .| evalStateC 0 (extractFeaturesAndPValues spec)
   .| gobbleAndDo (sortBy featureOrder)
   .| CL.map (encodeUtf8 . formatFeatureWithPValue)
   .| CC.unlinesAscii
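worstFeaturesPipeline now hands the whole GEvalSpecification to extractFeaturesAndPValues instead of composing a zero-argument stage. A toy conduit pipeline showing the same pattern, a stage that is a function of its configuration and gets applied before being composed with .| (Config and keepAbove are hypothetical names, not part of GEval):

    import Conduit

    newtype Config = Config { threshold :: Int }

    -- A stage parameterised by configuration, like (extractFeaturesAndPValues spec).
    keepAbove :: Monad m => Config -> ConduitT Int Int m ()
    keepAbove cfg = filterC (> threshold cfg)

    main :: IO ()
    main = do
      kept <- runConduit $ yieldMany [3, 9, 1, 7 :: Int]
                        .| keepAbove (Config 2)
                        .| sinkList
      print kept   -- [3,9,7]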
@@ -99,10 +99,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
 forceSomeOrdering FirstTheBest = FirstTheBest
 forceSomeOrdering KeepTheOriginalOrder = FirstTheWorst
 
-extractFeaturesAndPValues :: Monad m => ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
-extractFeaturesAndPValues =
+extractFeaturesAndPValues :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
+extractFeaturesAndPValues spec =
   totalCounter
-  .| featureExtractor
+  .| featureExtractor spec
   .| uScoresCounter
 
 
@@ -122,15 +122,15 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
       (pack $ printf "%0.8f" avg),
       (pack $ printf "%0.20f" p)]
 
-featureExtractor :: Monad m => ConduitT (Double, LineRecord) RankedFeature m ()
-featureExtractor = CC.map extract .| CC.concat
+featureExtractor :: Monad m => GEvalSpecification -> ConduitT (Double, LineRecord) RankedFeature m ()
+featureExtractor spec = CC.map extract .| CC.concat
   where extract (rank, LineRecord inLine expLine outLine _ score) =
           Prelude.map (\f -> RankedFeature f rank score)
           $ Data.List.concat [
-            extractUnigramFeatures "exp" expLine,
-            extractUnigramFeaturesFromTabbed "in" inLine,
-            extractUnigramFeatures "out" outLine]
-
+            extractUnigramFeatures mTokenizer "exp" expLine,
+            extractUnigramFeaturesFromTabbed mTokenizer "in" inLine,
+            extractUnigramFeatures mTokenizer "out" outLine]
+        mTokenizer = gesTokenizer spec
 uScoresCounter :: Monad m => ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
 uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
   .| gobbleAndDo countUScores
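The featureExtractor body above follows the usual conduit idiom for one-to-many expansion: CC.map turns each record into a list of features, and CC.concat flattens those lists back into a stream of individual features. A toy version of that idiom (toFeatures is a made-up helper, loosely mimicking the namespace prefix that extractUnigramFeatures adds):

    {-# LANGUAGE OverloadedStrings #-}
    import Conduit
    import Data.Text (Text)
    import qualified Data.Text as T

    -- Made-up helper: prefix every token of a line with a namespace.
    toFeatures :: Text -> [Text]
    toFeatures line = map ("out:" <>) (T.words line)

    main :: IO ()
    main = do
      feats <- runConduit $ yieldMany ["a b", "c" :: Text]
                         .| mapC toFeatures   -- one list of features per record
                         .| concatC           -- flatten to a stream of features
                         .| sinkList
      print feats   -- ["out:a","out:b","out:c"]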
@@ -135,7 +135,7 @@ specParser = GEvalSpecification
               ( long "tokenizer"
                 <> short 'T'
                 <> metavar "TOKENIZER"
-                <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far" ))
+                <> help "Tokenizer on expected and actual output before running evaluation (makes sense mostly for metrics such BLEU), only 13a tokenizer is implemented so far. Will be also used for tokenizing text into features when in --worst-features and --most-worsening-features modes." ))
 
 singletonMaybe :: Maybe a -> Maybe [a]
 singletonMaybe (Just x) = Just [x]
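Only the help text changes here: the option stays Maybe-valued, so the worst-features code paths receive Nothing unless -T/--tokenizer was given on the command line. A toy optparse-applicative parser with the same shape (Opts and optsParser are illustrative names, not GEval's actual specParser):

    import Options.Applicative
    import Control.Applicative (optional)

    newtype Opts = Opts { optTokenizer :: Maybe String }

    -- Same combinators as in the hunk above: an optional --tokenizer/-T value.
    optsParser :: Parser Opts
    optsParser = Opts
      <$> optional (strOption
            ( long "tokenizer"
              <> short 'T'
              <> metavar "TOKENIZER"
              <> help "Tokenizer to apply before evaluation and feature extraction" ))

    main :: IO ()
    main = do
      opts <- execParser (info (optsParser <**> helper) fullDesc)
      print (optTokenizer opts)   -- Nothing unless --tokenizer/-T was given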