diff --git a/src/GEval/BlackBoxDebugging.hs b/src/GEval/BlackBoxDebugging.hs index fa1620d..6d8b185 100644 --- a/src/GEval/BlackBoxDebugging.hs +++ b/src/GEval/BlackBoxDebugging.hs @@ -7,5 +7,6 @@ data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions { bbdoWordShapes :: Bool, bbdoBigrams :: Bool, bbdoCartesian :: Bool, - bbdoMinCartesianFrequency :: Maybe Integer + bbdoMinCartesianFrequency :: Maybe Integer, + bbdoConsiderNumericalFeatures :: Bool } diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index 7996e55..8f31cb0 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -7,7 +7,10 @@ module GEval.FeatureExtractor LineWithFeatures(..), LineWithPeggedFactors(..), PeggedFactor(..), - Feature(..)) + Feature(..), + SimpleFactor(..), + AtomicFactor(..), + FeatureNamespace(..)) where import Data.Text @@ -17,6 +20,7 @@ import Text.Tokenizer import Text.WordShape import GEval.BlackBoxDebugging import GEval.Common +import Text.Read (readMaybe) data LineWithFeatures = LineWithFeatures Double MetricValue [Feature] deriving (Eq, Ord) @@ -37,12 +41,14 @@ data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor instance Show PeggedFactor where show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor) -data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor +data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor | NumericalFactor (Maybe Double) Int deriving (Eq, Ord) instance Show SimpleFactor where show (SimpleAtomicFactor factor) = show factor show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB) + show (NumericalFactor (Just v) _) = ("=" ++ (show v)) + show (NumericalFactor (Nothing) l) = ("=#" ++ (show l)) data AtomicFactor = TextFactor Text | ShapeFactor WordShape deriving (Eq, Ord) @@ -72,12 +78,16 @@ extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++ extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor] extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++ - if bbdoBigrams bbdo - then Prelude.map bigramFactors atomss - else [] + (if bbdoBigrams bbdo + then Prelude.map bigramFactors atomss + else []) + ++ + (if bbdoConsiderNumericalFeatures bbdo + then [numericalFactor t] + else []) where atomss = extractAtomicFactors mTokenizer bbdo t bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms - + numericalFactor t = [NumericalFactor (readMaybe $ unpack t) (Data.Text.length t)] extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor] extractFactors mTokenizer bbdo namespace record = Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af) diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index cd1d4cc..056401a 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -193,6 +193,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions ( long "min-cartesian-frequency" <> metavar "N" <> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given")) + <*> switch + ( long "numerical-features" + <> help "Consider numerical features or field lengths") singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe (Just x) = Just [x] diff --git a/test/Spec.hs b/test/Spec.hs index 5aa5a68..022d64d 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -21,6 +21,8 @@ import Options.Applicative import Data.Text import Text.EditDistance import GEval.Annotation +import GEval.BlackBoxDebugging +import GEval.FeatureExtractor import Data.Map.Strict @@ -33,6 +35,8 @@ import System.IO import System.IO.Temp import System.IO.Silently +import Data.List (sort) + import qualified Test.HUnit as HU import qualified Data.IntSet as IS @@ -507,6 +511,36 @@ main = hspec $ do token <- readToken return $ token == (Just "BBBB") ) `shouldReturn` True + describe "extracting features" $ do + it "extract factors" $ do + let bbdo = BlackBoxDebuggingOptions { + bbdoMinFrequency = 1, + bbdoWordShapes = False, + bbdoBigrams = True, + bbdoCartesian = False, + bbdoMinCartesianFrequency = Nothing, + bbdoConsiderNumericalFeatures = True } + (sort $ extractFactorsFromTabbed Nothing bbdo "in" "I like this\t34.3\ttests") `shouldBe` [ + PeggedFactor (FeatureTabbedNamespace "in" 1) + (SimpleAtomicFactor (TextFactor "I")), + PeggedFactor (FeatureTabbedNamespace "in" 1) + (SimpleAtomicFactor (TextFactor "like")), + PeggedFactor (FeatureTabbedNamespace "in" 1) + (SimpleAtomicFactor (TextFactor "this")), + PeggedFactor (FeatureTabbedNamespace "in" 1) + (BigramFactor (TextFactor "I") (TextFactor "like")), + PeggedFactor (FeatureTabbedNamespace "in" 1) + (BigramFactor (TextFactor "like") (TextFactor "this")), + PeggedFactor (FeatureTabbedNamespace "in" 1) + (NumericalFactor Nothing 11), + PeggedFactor (FeatureTabbedNamespace "in" 2) + (SimpleAtomicFactor (TextFactor "34.3")), + PeggedFactor (FeatureTabbedNamespace "in" 2) + (NumericalFactor (Just 34.3) 4), + PeggedFactor (FeatureTabbedNamespace "in" 3) + (SimpleAtomicFactor (TextFactor "tests")), + PeggedFactor (FeatureTabbedNamespace "in" 3) + (NumericalFactor Nothing 5) ] checkConduitPure conduit inList expList = do let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList