Start numerical factors

This commit is contained in:
Filip Gralinski 2019-01-23 13:00:37 +01:00
parent e0aceb9ca2
commit dbf5c961af
4 changed files with 55 additions and 7 deletions

View File

@ -7,5 +7,6 @@ data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
bbdoWordShapes :: Bool, bbdoWordShapes :: Bool,
bbdoBigrams :: Bool, bbdoBigrams :: Bool,
bbdoCartesian :: Bool, bbdoCartesian :: Bool,
bbdoMinCartesianFrequency :: Maybe Integer bbdoMinCartesianFrequency :: Maybe Integer,
bbdoConsiderNumericalFeatures :: Bool
} }

View File

@ -7,7 +7,10 @@ module GEval.FeatureExtractor
LineWithFeatures(..), LineWithFeatures(..),
LineWithPeggedFactors(..), LineWithPeggedFactors(..),
PeggedFactor(..), PeggedFactor(..),
Feature(..)) Feature(..),
SimpleFactor(..),
AtomicFactor(..),
FeatureNamespace(..))
where where
import Data.Text import Data.Text
@ -17,6 +20,7 @@ import Text.Tokenizer
import Text.WordShape import Text.WordShape
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
import GEval.Common import GEval.Common
import Text.Read (readMaybe)
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature] data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
deriving (Eq, Ord) deriving (Eq, Ord)
@ -37,12 +41,14 @@ data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor
instance Show PeggedFactor where instance Show PeggedFactor where
show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor) show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor)
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor | NumericalFactor (Maybe Double) Int
deriving (Eq, Ord) deriving (Eq, Ord)
instance Show SimpleFactor where instance Show SimpleFactor where
show (SimpleAtomicFactor factor) = show factor show (SimpleAtomicFactor factor) = show factor
show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB) show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB)
show (NumericalFactor (Just v) _) = ("=" ++ (show v))
show (NumericalFactor (Nothing) l) = ("=#" ++ (show l))
data AtomicFactor = TextFactor Text | ShapeFactor WordShape data AtomicFactor = TextFactor Text | ShapeFactor WordShape
deriving (Eq, Ord) deriving (Eq, Ord)
@ -72,12 +78,16 @@ extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++
extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor] extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++ extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
if bbdoBigrams bbdo (if bbdoBigrams bbdo
then Prelude.map bigramFactors atomss then Prelude.map bigramFactors atomss
else [] else [])
++
(if bbdoConsiderNumericalFeatures bbdo
then [numericalFactor t]
else [])
where atomss = extractAtomicFactors mTokenizer bbdo t where atomss = extractAtomicFactors mTokenizer bbdo t
bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
numericalFactor t = [NumericalFactor (readMaybe $ unpack t) (Data.Text.length t)]
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor] extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
extractFactors mTokenizer bbdo namespace record = extractFactors mTokenizer bbdo namespace record =
Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af) Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)

View File

@ -193,6 +193,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
( long "min-cartesian-frequency" ( long "min-cartesian-frequency"
<> metavar "N" <> metavar "N"
<> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given")) <> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given"))
<*> switch
( long "numerical-features"
<> help "Consider numerical features or field lengths")
singletonMaybe :: Maybe a -> Maybe [a] singletonMaybe :: Maybe a -> Maybe [a]
singletonMaybe (Just x) = Just [x] singletonMaybe (Just x) = Just [x]

View File

@ -21,6 +21,8 @@ import Options.Applicative
import Data.Text import Data.Text
import Text.EditDistance import Text.EditDistance
import GEval.Annotation import GEval.Annotation
import GEval.BlackBoxDebugging
import GEval.FeatureExtractor
import Data.Map.Strict import Data.Map.Strict
@ -33,6 +35,8 @@ import System.IO
import System.IO.Temp import System.IO.Temp
import System.IO.Silently import System.IO.Silently
import Data.List (sort)
import qualified Test.HUnit as HU import qualified Test.HUnit as HU
import qualified Data.IntSet as IS import qualified Data.IntSet as IS
@ -507,6 +511,36 @@ main = hspec $ do
token <- readToken token <- readToken
return $ token == (Just "BBBB") return $ token == (Just "BBBB")
) `shouldReturn` True ) `shouldReturn` True
describe "extracting features" $ do
it "extract factors" $ do
let bbdo = BlackBoxDebuggingOptions {
bbdoMinFrequency = 1,
bbdoWordShapes = False,
bbdoBigrams = True,
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = True }
(sort $ extractFactorsFromTabbed Nothing bbdo "in" "I like this\t34.3\ttests") `shouldBe` [
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleAtomicFactor (TextFactor "I")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleAtomicFactor (TextFactor "like")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleAtomicFactor (TextFactor "this")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(BigramFactor (TextFactor "I") (TextFactor "like")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(BigramFactor (TextFactor "like") (TextFactor "this")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(NumericalFactor Nothing 11),
PeggedFactor (FeatureTabbedNamespace "in" 2)
(SimpleAtomicFactor (TextFactor "34.3")),
PeggedFactor (FeatureTabbedNamespace "in" 2)
(NumericalFactor (Just 34.3) 4),
PeggedFactor (FeatureTabbedNamespace "in" 3)
(SimpleAtomicFactor (TextFactor "tests")),
PeggedFactor (FeatureTabbedNamespace "in" 3)
(NumericalFactor Nothing 5) ]
checkConduitPure conduit inList expList = do checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList