Add bigram features in black-box debugging

This commit is contained in:
Filip Graliński 2019-01-10 10:41:55 +01:00
parent 13f9629cbc
commit 99e3a10791
4 changed files with 32 additions and 14 deletions

View File

@ -4,5 +4,6 @@ module GEval.BlackBoxDebugging
data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions { data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
bbdoMinFrequency :: Integer, bbdoMinFrequency :: Integer,
bbdoWordShapes :: Bool bbdoWordShapes :: Bool,
bbdoBigrams :: Bool
} }

View File

@ -2,8 +2,8 @@
module GEval.FeatureExtractor module GEval.FeatureExtractor
(extractUnigramFeatures, (extractFeatures,
extractUnigramFeaturesFromTabbed, extractFeaturesFromTabbed,
Feature(..)) Feature(..))
where where
@ -13,10 +13,18 @@ import Data.Monoid ((<>))
import Text.Tokenizer import Text.Tokenizer
import Text.WordShape import Text.WordShape
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
import GEval.Common
data Feature = SimpleFeature FeatureNamespace AtomicFeature data Feature = SimpleFeature FeatureNamespace SimpleFeature
deriving (Eq, Ord) deriving (Eq, Ord)
-- | Either a single atomic feature or a pair of atomic features (a bigram).
data SimpleFeature
  = SimpleAtomicFeature AtomicFeature
  | BigramFeature AtomicFeature AtomicFeature
  deriving (Eq, Ord)
-- | An atomic feature is rendered with its own 'show'; a bigram is rendered
-- as the 'show' of its two parts joined by the literal @++@.
instance Show SimpleFeature where
  show simple = case simple of
    SimpleAtomicFeature atom -> show atom
    BigramFeature left right -> show left ++ "++" ++ show right
instance Show Feature where instance Show Feature where
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
@ -46,15 +54,21 @@ extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++
else []) else [])
where tokens = nub $ (tokenizeForFeatures mTokenizer) t where tokens = nub $ (tokenizeForFeatures mTokenizer) t
-- | Collect the simple features of a text: every atomic feature wrapped in
-- 'SimpleAtomicFeature', followed (only when 'bbdoBigrams' is set) by a
-- 'BigramFeature' for each pair that 'bigrams' yields from an atom list.
--
-- Bigrams are built separately for each list returned by
-- 'extractAtomicFeatures', so atoms from different lists are never paired.
--
-- NOTE(review): 'extractAtomicFeatures' appears to pass tokens through 'nub'
-- before they reach this function, so the pairs may not correspond to tokens
-- that were adjacent in the original text — confirm this is intended.
extractSimpleFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFeature]
extractSimpleFeatures mTokenizer bbdo t = unigramFeatures ++ pairFeatures
  where
    atomGroups = extractAtomicFeatures mTokenizer bbdo t
    -- All atoms, group by group, in their original order.
    unigramFeatures = [SimpleAtomicFeature atom | atoms <- atomGroups, atom <- atoms]
    -- Bigrams come after all unigrams and are empty unless requested.
    pairFeatures =
      if bbdoBigrams bbdo
        then [BigramFeature a b | atoms <- atomGroups, (a, b) <- bigrams atoms]
        else []
extractUnigramFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
extractUnigramFeatures mTokenizer bbdo namespace record = extractFeatures mTokenizer bbdo namespace record =
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af) Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
$ Data.List.concat $ extractSimpleFeatures mTokenizer bbdo record
$ extractAtomicFeatures mTokenizer bbdo record
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
extractUnigramFeaturesFromTabbed mTokenizer bbdo namespace record = extractFeaturesFromTabbed mTokenizer bbdo namespace record =
Data.List.concat Data.List.concat
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ Data.List.concat $ extractAtomicFeatures mTokenizer bbdo t) $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
$ Prelude.zip [1..] (splitOn "\t" record) $ Prelude.zip [1..] (splitOn "\t" record)

View File

@ -129,9 +129,9 @@ featureExtractor spec bbdo = CC.map extract .| CC.concat
where extract (rank, LineRecord inLine expLine outLine _ score) = where extract (rank, LineRecord inLine expLine outLine _ score) =
Prelude.map (\f -> RankedFeature f rank score) Prelude.map (\f -> RankedFeature f rank score)
$ Data.List.concat [ $ Data.List.concat [
extractUnigramFeatures mTokenizer bbdo "exp" expLine, extractFeatures mTokenizer bbdo "exp" expLine,
extractUnigramFeaturesFromTabbed mTokenizer bbdo "in" inLine, extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
extractUnigramFeatures mTokenizer bbdo "out" outLine] extractFeatures mTokenizer bbdo "out" outLine]
mTokenizer = gesTokenizer spec mTokenizer = gesTokenizer spec
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()

View File

@ -175,6 +175,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
<*> switch <*> switch
( long "word-shapes" ( long "word-shapes"
<> help "Consider word shapes") <> help "Consider word shapes")
<*> switch
( long "bigrams"
<> help "Consider feature bigrams")
singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe :: Maybe a -> Maybe [a]
singletonMaybe (Just x) = Just [x] singletonMaybe (Just x) = Just [x]