Add cartesian features to black-box debugging
But it's very slow now, needs to be sped up
This commit is contained in:
parent
99e3a10791
commit
23aad86e72
@ -5,5 +5,6 @@ module GEval.BlackBoxDebugging
|
||||
data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
||||
bbdoMinFrequency :: Integer,
|
||||
bbdoWordShapes :: Bool,
|
||||
bbdoBigrams :: Bool
|
||||
bbdoBigrams :: Bool,
|
||||
bbdoCartesian :: Bool
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
module GEval.FeatureExtractor
|
||||
(extractFeatures,
|
||||
extractFeaturesFromTabbed,
|
||||
cartesianFeatures,
|
||||
Feature(..))
|
||||
where
|
||||
|
||||
@ -15,9 +16,19 @@ import Text.WordShape
|
||||
import GEval.BlackBoxDebugging
|
||||
import GEval.Common
|
||||
|
||||
data Feature = SimpleFeature FeatureNamespace SimpleFeature
|
||||
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show Feature where
|
||||
show (UnaryFeature feature) = show feature
|
||||
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB)
|
||||
|
||||
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature
|
||||
deriving (Eq, Ord)
|
||||
|
||||
instance Show PeggedFeature where
|
||||
show (PeggedFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
||||
|
||||
data SimpleFeature = SimpleAtomicFeature AtomicFeature | BigramFeature AtomicFeature AtomicFeature
|
||||
deriving (Eq, Ord)
|
||||
|
||||
@ -25,9 +36,6 @@ instance Show SimpleFeature where
|
||||
show (SimpleAtomicFeature feature) = show feature
|
||||
show (BigramFeature featureA featureB) = (show featureA) ++ "++" ++ (show featureB)
|
||||
|
||||
instance Show Feature where
|
||||
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
||||
|
||||
data AtomicFeature = TextFeature Text | ShapeFeature WordShape
|
||||
deriving (Eq, Ord)
|
||||
|
||||
@ -62,13 +70,16 @@ extractSimpleFeatures mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelu
|
||||
where atomss = extractAtomicFeatures mTokenizer bbdo t
|
||||
bigramFeatures atoms = Prelude.map (\(a, b) -> BigramFeature a b) $ bigrams atoms
|
||||
|
||||
extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
||||
extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature]
|
||||
extractFeatures mTokenizer bbdo namespace record =
|
||||
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
|
||||
Prelude.map (\af -> PeggedFeature (FeatureNamespace namespace) af)
|
||||
$ extractSimpleFeatures mTokenizer bbdo record
|
||||
|
||||
extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [Feature]
|
||||
extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature]
|
||||
extractFeaturesFromTabbed mTokenizer bbdo namespace record =
|
||||
Data.List.concat
|
||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||
|
||||
cartesianFeatures :: [PeggedFeature] -> [Feature]
|
||||
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b]
|
||||
|
@ -126,14 +126,22 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
||||
|
||||
featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
|
||||
featureExtractor spec bbdo = CC.map extract .| CC.concat
|
||||
where extract (rank, LineRecord inLine expLine outLine _ score) =
|
||||
where extract (rank, line@(LineRecord _ _ _ _ score)) =
|
||||
Prelude.map (\f -> RankedFeature f rank score)
|
||||
$ Data.List.concat [
|
||||
extractFeatures mTokenizer bbdo "exp" expLine,
|
||||
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||
extractFeatures mTokenizer bbdo "out" outLine]
|
||||
$ getFeatures mTokenizer bbdo line
|
||||
mTokenizer = gesTokenizer spec
|
||||
|
||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [Feature]
|
||||
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = Prelude.map UnaryFeature unaryFeatures ++
|
||||
if bbdoCartesian bbdo
|
||||
then cartesianFeatures unaryFeatures
|
||||
else []
|
||||
where unaryFeatures =
|
||||
Data.List.concat [
|
||||
extractFeatures mTokenizer bbdo "exp" expLine,
|
||||
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||
extractFeatures mTokenizer bbdo "out" outLine]
|
||||
|
||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
||||
.| gobbleAndDo countUScores
|
||||
|
@ -178,6 +178,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
|
||||
<*> switch
|
||||
( long "bigrams"
|
||||
<> help "Consider feature bigrams")
|
||||
<*> switch
|
||||
( long "cartesian"
|
||||
<> help "Consider Cartesian combination of all features (computationally expensive!)")
|
||||
|
||||
singletonMaybe :: Maybe a -> Maybe [a]
|
||||
singletonMaybe (Just x) = Just [x]
|
||||
|
Loading…
Reference in New Issue
Block a user