Add bigram features in black-box debugging
This commit is contained in:
parent
13f9629cbc
commit
99e3a10791
@ -4,5 +4,6 @@ module GEval.BlackBoxDebugging
|
|||||||
|
|
||||||
data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
||||||
bbdoMinFrequency :: Integer,
|
bbdoMinFrequency :: Integer,
|
||||||
bbdoWordShapes :: Bool
|
bbdoWordShapes :: Bool,
|
||||||
|
bbdoBigrams :: Bool
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
|
|
||||||
module GEval.FeatureExtractor
|
module GEval.FeatureExtractor
|
||||||
(extractUnigramFeatures,
|
(extractFeatures,
|
||||||
extractUnigramFeaturesFromTabbed,
|
extractFeaturesFromTabbed,
|
||||||
Feature(..))
|
Feature(..))
|
||||||
where
|
where
|
||||||
|
|
||||||
@ -13,10 +13,18 @@ import Data.Monoid ((<>))
|
|||||||
import Text.Tokenizer
|
import Text.Tokenizer
|
||||||
import Text.WordShape
|
import Text.WordShape
|
||||||
import GEval.BlackBoxDebugging
|
import GEval.BlackBoxDebugging
|
||||||
|
import GEval.Common
|
||||||
|
|
||||||
data Feature = SimpleFeature FeatureNamespace AtomicFeature
|
data Feature = SimpleFeature FeatureNamespace SimpleFeature
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
data SimpleFeature = SimpleAtomicFeature AtomicFeature | BigramFeature AtomicFeature AtomicFeature
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show SimpleFeature where
|
||||||
|
show (SimpleAtomicFeature feature) = show feature
|
||||||
|
show (BigramFeature featureA featureB) = (show featureA) ++ "++" ++ (show featureB)
|
||||||
|
|
||||||
instance Show Feature where
|
instance Show Feature where
|
||||||
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
||||||
|
|
||||||
@ -46,15 +54,21 @@ extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++
|
|||||||
else [])
|
else [])
|
||||||
where tokens = nub $ (tokenizeForFeatures mTokenizer) t
|
where tokens = nub $ (tokenizeForFeatures mTokenizer) t
|
||||||
|
|
||||||
|
extractSimpleFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFeature]
|
||||||
|
extractSimpleFeatures mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFeature) atomss) ++
|
||||||
|
if bbdoBigrams bbdo
|
||||||
|
then Prelude.map bigramFeatures atomss
|
||||||
|
else []
|
||||||
|
where atomss = extractAtomicFeatures mTokenizer bbdo t
|
||||||
|
bigramFeatures atoms = Prelude.map (\(a, b) -> BigramFeature a b) $ bigrams atoms
|
||||||
|
|
||||||
extractUnigramFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
||||||
extractUnigramFeatures mTokenizer bbdo namespace record =
|
extractFeatures mTokenizer bbdo namespace record =
|
||||||
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
|
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
|
||||||
$ Data.List.concat
|
$ extractSimpleFeatures mTokenizer bbdo record
|
||||||
$ extractAtomicFeatures mTokenizer bbdo record
|
|
||||||
|
|
||||||
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
||||||
extractUnigramFeaturesFromTabbed mTokenizer bbdo namespace record =
|
extractFeaturesFromTabbed mTokenizer bbdo namespace record =
|
||||||
Data.List.concat
|
Data.List.concat
|
||||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ Data.List.concat $ extractAtomicFeatures mTokenizer bbdo t)
|
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
||||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||||
|
@ -129,9 +129,9 @@ featureExtractor spec bbdo = CC.map extract .| CC.concat
|
|||||||
where extract (rank, LineRecord inLine expLine outLine _ score) =
|
where extract (rank, LineRecord inLine expLine outLine _ score) =
|
||||||
Prelude.map (\f -> RankedFeature f rank score)
|
Prelude.map (\f -> RankedFeature f rank score)
|
||||||
$ Data.List.concat [
|
$ Data.List.concat [
|
||||||
extractUnigramFeatures mTokenizer bbdo "exp" expLine,
|
extractFeatures mTokenizer bbdo "exp" expLine,
|
||||||
extractUnigramFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||||
extractUnigramFeatures mTokenizer bbdo "out" outLine]
|
extractFeatures mTokenizer bbdo "out" outLine]
|
||||||
mTokenizer = gesTokenizer spec
|
mTokenizer = gesTokenizer spec
|
||||||
|
|
||||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||||
|
@ -175,6 +175,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
|
|||||||
<*> switch
|
<*> switch
|
||||||
( long "word-shapes"
|
( long "word-shapes"
|
||||||
<> help "Consider word shapes")
|
<> help "Consider word shapes")
|
||||||
|
<*> switch
|
||||||
|
( long "bigrams"
|
||||||
|
<> help "Consider feature bigrams")
|
||||||
|
|
||||||
singletonMaybe :: Maybe a -> Maybe [a]
|
singletonMaybe :: Maybe a -> Maybe [a]
|
||||||
singletonMaybe (Just x) = Just [x]
|
singletonMaybe (Just x) = Just [x]
|
||||||
|
Loading…
Reference in New Issue
Block a user