Start numerical factors
This commit is contained in:
parent
e0aceb9ca2
commit
dbf5c961af
@ -7,5 +7,6 @@ data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
|||||||
bbdoWordShapes :: Bool,
|
bbdoWordShapes :: Bool,
|
||||||
bbdoBigrams :: Bool,
|
bbdoBigrams :: Bool,
|
||||||
bbdoCartesian :: Bool,
|
bbdoCartesian :: Bool,
|
||||||
bbdoMinCartesianFrequency :: Maybe Integer
|
bbdoMinCartesianFrequency :: Maybe Integer,
|
||||||
|
bbdoConsiderNumericalFeatures :: Bool
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,10 @@ module GEval.FeatureExtractor
|
|||||||
LineWithFeatures(..),
|
LineWithFeatures(..),
|
||||||
LineWithPeggedFactors(..),
|
LineWithPeggedFactors(..),
|
||||||
PeggedFactor(..),
|
PeggedFactor(..),
|
||||||
Feature(..))
|
Feature(..),
|
||||||
|
SimpleFactor(..),
|
||||||
|
AtomicFactor(..),
|
||||||
|
FeatureNamespace(..))
|
||||||
where
|
where
|
||||||
|
|
||||||
import Data.Text
|
import Data.Text
|
||||||
@ -17,6 +20,7 @@ import Text.Tokenizer
|
|||||||
import Text.WordShape
|
import Text.WordShape
|
||||||
import GEval.BlackBoxDebugging
|
import GEval.BlackBoxDebugging
|
||||||
import GEval.Common
|
import GEval.Common
|
||||||
|
import Text.Read (readMaybe)
|
||||||
|
|
||||||
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
@ -37,12 +41,14 @@ data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor
|
|||||||
instance Show PeggedFactor where
|
instance Show PeggedFactor where
|
||||||
show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor)
|
show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor)
|
||||||
|
|
||||||
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor
|
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor | NumericalFactor (Maybe Double) Int
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show SimpleFactor where
|
instance Show SimpleFactor where
|
||||||
show (SimpleAtomicFactor factor) = show factor
|
show (SimpleAtomicFactor factor) = show factor
|
||||||
show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB)
|
show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB)
|
||||||
|
show (NumericalFactor (Just v) _) = ("=" ++ (show v))
|
||||||
|
show (NumericalFactor (Nothing) l) = ("=#" ++ (show l))
|
||||||
|
|
||||||
data AtomicFactor = TextFactor Text | ShapeFactor WordShape
|
data AtomicFactor = TextFactor Text | ShapeFactor WordShape
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
@ -72,12 +78,16 @@ extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++
|
|||||||
|
|
||||||
extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
|
extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
|
||||||
extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
|
extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
|
||||||
if bbdoBigrams bbdo
|
(if bbdoBigrams bbdo
|
||||||
then Prelude.map bigramFactors atomss
|
then Prelude.map bigramFactors atomss
|
||||||
else []
|
else [])
|
||||||
|
++
|
||||||
|
(if bbdoConsiderNumericalFeatures bbdo
|
||||||
|
then [numericalFactor t]
|
||||||
|
else [])
|
||||||
where atomss = extractAtomicFactors mTokenizer bbdo t
|
where atomss = extractAtomicFactors mTokenizer bbdo t
|
||||||
bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
|
bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
|
||||||
|
numericalFactor t = [NumericalFactor (readMaybe $ unpack t) (Data.Text.length t)]
|
||||||
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
|
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
|
||||||
extractFactors mTokenizer bbdo namespace record =
|
extractFactors mTokenizer bbdo namespace record =
|
||||||
Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)
|
Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)
|
||||||
|
@ -193,6 +193,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
|
|||||||
( long "min-cartesian-frequency"
|
( long "min-cartesian-frequency"
|
||||||
<> metavar "N"
|
<> metavar "N"
|
||||||
<> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given"))
|
<> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given"))
|
||||||
|
<*> switch
|
||||||
|
( long "numerical-features"
|
||||||
|
<> help "Consider numerical features or field lengths")
|
||||||
|
|
||||||
singletonMaybe :: Maybe a -> Maybe [a]
|
singletonMaybe :: Maybe a -> Maybe [a]
|
||||||
singletonMaybe (Just x) = Just [x]
|
singletonMaybe (Just x) = Just [x]
|
||||||
|
34
test/Spec.hs
34
test/Spec.hs
@ -21,6 +21,8 @@ import Options.Applicative
|
|||||||
import Data.Text
|
import Data.Text
|
||||||
import Text.EditDistance
|
import Text.EditDistance
|
||||||
import GEval.Annotation
|
import GEval.Annotation
|
||||||
|
import GEval.BlackBoxDebugging
|
||||||
|
import GEval.FeatureExtractor
|
||||||
|
|
||||||
import Data.Map.Strict
|
import Data.Map.Strict
|
||||||
|
|
||||||
@ -33,6 +35,8 @@ import System.IO
|
|||||||
import System.IO.Temp
|
import System.IO.Temp
|
||||||
import System.IO.Silently
|
import System.IO.Silently
|
||||||
|
|
||||||
|
import Data.List (sort)
|
||||||
|
|
||||||
import qualified Test.HUnit as HU
|
import qualified Test.HUnit as HU
|
||||||
|
|
||||||
import qualified Data.IntSet as IS
|
import qualified Data.IntSet as IS
|
||||||
@ -507,6 +511,36 @@ main = hspec $ do
|
|||||||
token <- readToken
|
token <- readToken
|
||||||
return $ token == (Just "BBBB")
|
return $ token == (Just "BBBB")
|
||||||
) `shouldReturn` True
|
) `shouldReturn` True
|
||||||
|
describe "extracting features" $ do
|
||||||
|
it "extract factors" $ do
|
||||||
|
let bbdo = BlackBoxDebuggingOptions {
|
||||||
|
bbdoMinFrequency = 1,
|
||||||
|
bbdoWordShapes = False,
|
||||||
|
bbdoBigrams = True,
|
||||||
|
bbdoCartesian = False,
|
||||||
|
bbdoMinCartesianFrequency = Nothing,
|
||||||
|
bbdoConsiderNumericalFeatures = True }
|
||||||
|
(sort $ extractFactorsFromTabbed Nothing bbdo "in" "I like this\t34.3\ttests") `shouldBe` [
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||||
|
(SimpleAtomicFactor (TextFactor "I")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||||
|
(SimpleAtomicFactor (TextFactor "like")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||||
|
(SimpleAtomicFactor (TextFactor "this")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||||
|
(BigramFactor (TextFactor "I") (TextFactor "like")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||||
|
(BigramFactor (TextFactor "like") (TextFactor "this")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||||
|
(NumericalFactor Nothing 11),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 2)
|
||||||
|
(SimpleAtomicFactor (TextFactor "34.3")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 2)
|
||||||
|
(NumericalFactor (Just 34.3) 4),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
||||||
|
(SimpleAtomicFactor (TextFactor "tests")),
|
||||||
|
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
||||||
|
(NumericalFactor Nothing 5) ]
|
||||||
|
|
||||||
checkConduitPure conduit inList expList = do
|
checkConduitPure conduit inList expList = do
|
||||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||||
|
Loading…
Reference in New Issue
Block a user