From 23aad86e72461f815bdb37aa16a3458ce9b41005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20Grali=C5=84ski?= Date: Thu, 10 Jan 2019 14:01:29 +0100 Subject: [PATCH] Add cartesian features to black-box debugging But it's very slow now, needs to be sped up --- src/GEval/BlackBoxDebugging.hs | 3 ++- src/GEval/FeatureExtractor.hs | 27 +++++++++++++++++++-------- src/GEval/LineByLine.hs | 18 +++++++++++++----- src/GEval/OptionsParser.hs | 3 +++ 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/GEval/BlackBoxDebugging.hs b/src/GEval/BlackBoxDebugging.hs index f03da5a..a49a76d 100644 --- a/src/GEval/BlackBoxDebugging.hs +++ b/src/GEval/BlackBoxDebugging.hs @@ -5,5 +5,6 @@ module GEval.BlackBoxDebugging data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions { bbdoMinFrequency :: Integer, bbdoWordShapes :: Bool, - bbdoBigrams :: Bool + bbdoBigrams :: Bool, + bbdoCartesian :: Bool } diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index 4cbb645..103c371 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -4,6 +4,7 @@ module GEval.FeatureExtractor (extractFeatures, extractFeaturesFromTabbed, + cartesianFeatures, Feature(..)) where @@ -15,9 +16,19 @@ import Text.WordShape import GEval.BlackBoxDebugging import GEval.Common -data Feature = SimpleFeature FeatureNamespace SimpleFeature +data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature deriving (Eq, Ord) +instance Show Feature where + show (UnaryFeature feature) = show feature + show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB) + +data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature + deriving (Eq, Ord) + +instance Show PeggedFeature where + show (PeggedFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) + data SimpleFeature = SimpleAtomicFeature AtomicFeature | BigramFeature AtomicFeature AtomicFeature deriving (Eq, Ord) @@ -25,9 +36,6 @@ instance Show SimpleFeature where show (SimpleAtomicFeature feature) = show feature show (BigramFeature featureA featureB) = (show featureA) ++ "++" ++ (show featureB) -instance Show Feature where - show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) - data AtomicFeature = TextFeature Text | ShapeFeature WordShape deriving (Eq, Ord) @@ -62,13 +70,16 @@ extractSimpleFeatures mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelu where atomss = extractAtomicFeatures mTokenizer bbdo t bigramFeatures atoms = Prelude.map (\(a, b) -> BigramFeature a b) $ bigrams atoms -extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] +extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature] extractFeatures mTokenizer bbdo namespace record = - Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af) + Prelude.map (\af -> PeggedFeature (FeatureNamespace namespace) af) $ extractSimpleFeatures mTokenizer bbdo record -extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature] +extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature] extractFeaturesFromTabbed mTokenizer bbdo namespace record = Data.List.concat - $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t) + $ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t) $ Prelude.zip [1..] (splitOn "\t" record) + +cartesianFeatures :: [PeggedFeature] -> [Feature] +cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b] diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index 0e26a3c..70aaa55 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -126,14 +126,22 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) = featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m () featureExtractor spec bbdo = CC.map extract .| CC.concat - where extract (rank, LineRecord inLine expLine outLine _ score) = + where extract (rank, line@(LineRecord _ _ _ _ score)) = Prelude.map (\f -> RankedFeature f rank score) - $ Data.List.concat [ - extractFeatures mTokenizer bbdo "exp" expLine, - extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, - extractFeatures mTokenizer bbdo "out" outLine] + $ getFeatures mTokenizer bbdo line mTokenizer = gesTokenizer spec +getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [Feature] +getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = Prelude.map UnaryFeature unaryFeatures ++ + if bbdoCartesian bbdo + then cartesianFeatures unaryFeatures + else [] + where unaryFeatures = + Data.List.concat [ + extractFeatures mTokenizer bbdo "exp" expLine, + extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, + extractFeatures mTokenizer bbdo "out" outLine] + uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1))) .| gobbleAndDo countUScores diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index 91332ec..756bb07 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -178,6 +178,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions <*> switch ( long "bigrams" <> help "Consider feature bigrams") + <*> switch + ( long "cartesian" + <> help "Consider Cartesian combination of all features (computationally expensive!)") singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe (Just x) = Just [x]