Start numerical factors

This commit is contained in:
Filip Gralinski 2019-01-23 13:00:37 +01:00
parent e0aceb9ca2
commit dbf5c961af
4 changed files with 55 additions and 7 deletions

View File

@ -7,5 +7,6 @@ data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
bbdoWordShapes :: Bool,
bbdoBigrams :: Bool,
bbdoCartesian :: Bool,
bbdoMinCartesianFrequency :: Maybe Integer
bbdoMinCartesianFrequency :: Maybe Integer,
bbdoConsiderNumericalFeatures :: Bool
}

View File

@ -7,7 +7,10 @@ module GEval.FeatureExtractor
LineWithFeatures(..),
LineWithPeggedFactors(..),
PeggedFactor(..),
Feature(..))
Feature(..),
SimpleFactor(..),
AtomicFactor(..),
FeatureNamespace(..))
where
import Data.Text
@ -17,6 +20,7 @@ import Text.Tokenizer
import Text.WordShape
import GEval.BlackBoxDebugging
import GEval.Common
import Text.Read (readMaybe)
-- | A value bundling a 'Double', a 'MetricValue' and a list of 'Feature's.
-- NOTE(review): the meaning of the 'Double' field is not visible in this
-- excerpt — confirm against the code that constructs 'LineWithFeatures'.
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
deriving (Eq, Ord)
@ -37,12 +41,14 @@ data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor
-- | Renders a pegged factor as its namespace, a colon, and the factor itself.
instance Show PeggedFactor where
  show (PeggedFactor ns simpleFactor) = renderedNamespace ++ (':' : renderedFactor)
    where
      renderedNamespace = show ns
      renderedFactor = show simpleFactor
-- | A factor built from atomic factors: a single 'AtomicFactor', a bigram of
-- two 'AtomicFactor's, or (in the second declaration below) a numerical factor
-- holding an optional 'Double' together with an 'Int'.
-- NOTE(review): two versions of this declaration appear here, without and with
-- 'NumericalFactor'; presumably only the second one belongs in the module.
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor | NumericalFactor (Maybe Double) Int
deriving (Eq, Ord)
-- | Textual rendering of a simple factor:
--
-- * an atomic factor is shown as the atomic factor itself,
-- * a bigram is shown as its two atomic factors joined by @++@,
-- * a numerical factor is shown as @=@ followed by the value when a value is
--   present, and as @=#@ followed by the 'Int' field otherwise.
instance Show SimpleFactor where
  show simpleFactor = case simpleFactor of
    SimpleAtomicFactor atom -> show atom
    BigramFactor left right -> show left ++ "++" ++ show right
    NumericalFactor (Just value) _ -> "=" ++ show value
    NumericalFactor Nothing fieldLength -> "=#" ++ show fieldLength
-- | An atomic factor: either a piece of 'Text' ('TextFactor') or a
-- 'WordShape' ('ShapeFactor').
data AtomicFactor = TextFactor Text | ShapeFactor WordShape
deriving (Eq, Ord)
@ -72,12 +78,16 @@ extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++
-- | Collects the simple factors of a text: every atomic factor returned by
-- 'extractAtomicFactors' wrapped in 'SimpleAtomicFactor', plus bigram factors
-- when 'bbdoBigrams' is set, plus one 'NumericalFactor' for the whole text
-- when 'bbdoConsiderNumericalFeatures' is set.
-- NOTE(review): this span shows both the old and the new form of the bigram
-- conditional (unparenthesised and parenthesised); only the parenthesised form
-- composes with the trailing @++@.
extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
if bbdoBigrams bbdo
(if bbdoBigrams bbdo
then Prelude.map bigramFactors atomss
else []
else [])
++
(if bbdoConsiderNumericalFeatures bbdo
then [numericalFactor t]
else [])
where atomss = extractAtomicFactors mTokenizer bbdo t
bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
-- The whole text is parsed with 'readMaybe'; the second field is the text's
-- length in characters and is recorded whether or not the parse succeeds.
-- NOTE(review): 'readMaybe' at 'Double' presumably also accepts "NaN" and
-- "Infinity"; a NaN would not compare equal to itself under the derived
-- Eq/Ord of 'SimpleFactor' — confirm whether such inputs should be rejected.
numericalFactor t = [NumericalFactor (readMaybe $ unpack t) (Data.Text.length t)]
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
extractFactors mTokenizer bbdo namespace record =
Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)

View File

@ -193,6 +193,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
( long "min-cartesian-frequency"
<> metavar "N"
<> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given"))
<*> switch
( long "numerical-features"
<> help "Consider numerical features or field lengths")
singletonMaybe :: Maybe a -> Maybe [a]
singletonMaybe (Just x) = Just [x]

View File

@ -21,6 +21,8 @@ import Options.Applicative
import Data.Text
import Text.EditDistance
import GEval.Annotation
import GEval.BlackBoxDebugging
import GEval.FeatureExtractor
import Data.Map.Strict
@ -33,6 +35,8 @@ import System.IO
import System.IO.Temp
import System.IO.Silently
import Data.List (sort)
import qualified Test.HUnit as HU
import qualified Data.IntSet as IS
@ -507,6 +511,36 @@ main = hspec $ do
token <- readToken
return $ token == (Just "BBBB")
) `shouldReturn` True
-- Checks factor extraction on a three-field, tab-separated line with bigrams
-- and numerical features switched on (word shapes and Cartesian features off).
describe "extracting features" $ do
it "extract factors" $ do
let bbdo = BlackBoxDebuggingOptions {
bbdoMinFrequency = 1,
bbdoWordShapes = False,
bbdoBigrams = True,
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = True }
-- The result is sorted before comparison, so the expected list below is
-- written in sorted order rather than extraction order.
(sort $ extractFactorsFromTabbed Nothing bbdo "in" "I like this\t34.3\ttests") `shouldBe` [
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleAtomicFactor (TextFactor "I")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleAtomicFactor (TextFactor "like")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleAtomicFactor (TextFactor "this")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(BigramFactor (TextFactor "I") (TextFactor "like")),
PeggedFactor (FeatureTabbedNamespace "in" 1)
(BigramFactor (TextFactor "like") (TextFactor "this")),
-- First field "I like this": expected to carry no numeric value, only its
-- length of 11 characters.
PeggedFactor (FeatureTabbedNamespace "in" 1)
(NumericalFactor Nothing 11),
PeggedFactor (FeatureTabbedNamespace "in" 2)
(SimpleAtomicFactor (TextFactor "34.3")),
-- Second field "34.3": expected to carry the parsed value 34.3 and length 4.
PeggedFactor (FeatureTabbedNamespace "in" 2)
(NumericalFactor (Just 34.3) 4),
PeggedFactor (FeatureTabbedNamespace "in" 3)
(SimpleAtomicFactor (TextFactor "tests")),
-- Third field "tests": expected to carry no numeric value, only length 5.
PeggedFactor (FeatureTabbedNamespace "in" 3)
(NumericalFactor Nothing 5) ]
checkConduitPure conduit inList expList = do
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList