diff --git a/src/GEval/BlackBoxDebugging.hs b/src/GEval/BlackBoxDebugging.hs index 7102ca8..f03da5a 100644 --- a/src/GEval/BlackBoxDebugging.hs +++ b/src/GEval/BlackBoxDebugging.hs @@ -4,5 +4,6 @@ module GEval.BlackBoxDebugging data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions { bbdoMinFrequency :: Integer, - bbdoWordShapes :: Bool + bbdoWordShapes :: Bool, + bbdoBigrams :: Bool } diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index 6e85476..4cbb645 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -2,8 +2,8 @@ module GEval.FeatureExtractor - (extractUnigramFeatures, - extractUnigramFeaturesFromTabbed, + (extractFeatures, + extractFeaturesFromTabbed, Feature(..)) where @@ -13,10 +13,18 @@ import Data.Monoid ((<>)) import Text.Tokenizer import Text.WordShape import GEval.BlackBoxDebugging +import GEval.Common -data Feature = SimpleFeature FeatureNamespace AtomicFeature +data Feature = SimpleFeature FeatureNamespace SimpleFeature deriving (Eq, Ord) +data SimpleFeature = SimpleAtomicFeature AtomicFeature | BigramFeature AtomicFeature AtomicFeature + deriving (Eq, Ord) + +instance Show SimpleFeature where + show (SimpleAtomicFeature feature) = show feature + show (BigramFeature featureA featureB) = (show featureA) ++ "++" ++ (show featureB) + instance Show Feature where show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) @@ -46,15 +54,21 @@ extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++ else []) where tokens = nub $ (tokenizeForFeatures mTokenizer) t +extractSimpleFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFeature] +extractSimpleFeatures mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFeature) atomss) ++ + if bbdoBigrams bbdo + then Prelude.map bigramFeatures atomss + else [] + where atomss = extractAtomicFeatures mTokenizer bbdo t + bigramFeatures atoms = Prelude.map (\(a, b) -> BigramFeature a b) $ bigrams atoms -extractUnigramFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] -extractUnigramFeatures mTokenizer bbdo namespace record = +extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] +extractFeatures mTokenizer bbdo namespace record = Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af) - $ Data.List.concat - $ extractAtomicFeatures mTokenizer bbdo record + $ extractSimpleFeatures mTokenizer bbdo record -extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] -extractUnigramFeaturesFromTabbed mTokenizer bbdo namespace record = +extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] +extractFeaturesFromTabbed mTokenizer bbdo namespace record = Data.List.concat - $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ Data.List.concat $ extractAtomicFeatures mTokenizer bbdo t) + $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t) $ Prelude.zip [1..] (splitOn "\t" record) diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index 00c2cba..0e26a3c 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -129,9 +129,9 @@ featureExtractor spec bbdo = CC.map extract .| CC.concat where extract (rank, LineRecord inLine expLine outLine _ score) = Prelude.map (\f -> RankedFeature f rank score) $ Data.List.concat [ - extractUnigramFeatures mTokenizer bbdo "exp" expLine, - extractUnigramFeaturesFromTabbed mTokenizer bbdo "in" inLine, - extractUnigramFeatures mTokenizer bbdo "out" outLine] + extractFeatures mTokenizer bbdo "exp" expLine, + extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, + extractFeatures mTokenizer bbdo "out" outLine] mTokenizer = gesTokenizer spec uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index cc2380d..91332ec 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -175,6 +175,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions <*> switch ( long "word-shapes" <> help "Consider word shapes") + <*> switch + ( long "bigrams" + <> help "Consider feature bigrams") singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe (Just x) = Just [x]