Start numerical factors
This commit is contained in:
parent
e0aceb9ca2
commit
dbf5c961af
@ -7,5 +7,6 @@ data BlackBoxDebuggingOptions = BlackBoxDebuggingOptions {
|
||||
bbdoWordShapes :: Bool,
|
||||
bbdoBigrams :: Bool,
|
||||
bbdoCartesian :: Bool,
|
||||
bbdoMinCartesianFrequency :: Maybe Integer
|
||||
bbdoMinCartesianFrequency :: Maybe Integer,
|
||||
bbdoConsiderNumericalFeatures :: Bool
|
||||
}
|
||||
|
@ -7,7 +7,10 @@ module GEval.FeatureExtractor
|
||||
LineWithFeatures(..),
|
||||
LineWithPeggedFactors(..),
|
||||
PeggedFactor(..),
|
||||
Feature(..))
|
||||
Feature(..),
|
||||
SimpleFactor(..),
|
||||
AtomicFactor(..),
|
||||
FeatureNamespace(..))
|
||||
where
|
||||
|
||||
import Data.Text
|
||||
@ -17,6 +20,7 @@ import Text.Tokenizer
|
||||
import Text.WordShape
|
||||
import GEval.BlackBoxDebugging
|
||||
import GEval.Common
|
||||
import Text.Read (readMaybe)
|
||||
|
||||
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
||||
deriving (Eq, Ord)
|
||||
@ -37,12 +41,14 @@ data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor
|
||||
-- | Renders a pegged factor as @namespace:factor@: the 'show' of the
-- namespace, a colon, then the 'show' of the wrapped simple factor.
instance Show PeggedFactor where
|
||||
show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor)
|
||||
|
||||
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor
|
||||
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor | NumericalFactor (Maybe Double) Int
|
||||
deriving (Eq, Ord)
|
||||
|
||||
-- | Textual rendering of a simple factor; one equation per constructor.
instance Show SimpleFactor where
|
||||
-- An atomic factor is shown exactly as its underlying atom.
show (SimpleAtomicFactor factor) = show factor
|
||||
-- A bigram is shown as its two atoms joined by "++".
show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB)
|
||||
-- When a numeric value is present it is shown after "="; the Int component is ignored.
show (NumericalFactor (Just v) _) = ("=" ++ (show v))
|
||||
-- With no numeric value, only the Int component is shown, prefixed with "=#".
show (NumericalFactor (Nothing) l) = ("=#" ++ (show l))
|
||||
|
||||
-- | The smallest factor unit: either a piece of text ('TextFactor') or a
-- word shape ('ShapeFactor').
data AtomicFactor = TextFactor Text | ShapeFactor WordShape
|
||||
deriving (Eq, Ord)
|
||||
@ -72,12 +78,16 @@ extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++
|
||||
|
||||
extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
|
||||
extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
|
||||
if bbdoBigrams bbdo
|
||||
then Prelude.map bigramFactors atomss
|
||||
else []
|
||||
(if bbdoBigrams bbdo
|
||||
then Prelude.map bigramFactors atomss
|
||||
else [])
|
||||
++
|
||||
(if bbdoConsiderNumericalFeatures bbdo
|
||||
then [numericalFactor t]
|
||||
else [])
|
||||
where atomss = extractAtomicFactors mTokenizer bbdo t
|
||||
bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
|
||||
|
||||
-- Builds the single whole-field numerical factor for @t@: the field parsed as
-- a Double (Nothing when it does not parse) paired with the field's length in
-- characters. The result is a one-element list so it can be concatenated with
-- the other factor lists.
-- A parse that yields NaN (Read for Double accepts the literal word "NaN") is
-- treated as non-numeric: SimpleFactor derives Eq/Ord, and NaN is not equal to
-- itself, so such a factor would break equality checks, sorting and
-- ordered-container lookups.
numericalFactor t = [NumericalFactor (case readMaybe (unpack t) of { Just v | isNaN v -> Nothing; parsed -> parsed }) (Data.Text.length t)]
|
||||
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
|
||||
extractFactors mTokenizer bbdo namespace record =
|
||||
Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)
|
||||
|
@ -193,6 +193,9 @@ blackBoxDebuggingOptionsParser = BlackBoxDebuggingOptions
|
||||
( long "min-cartesian-frequency"
|
||||
<> metavar "N"
|
||||
<> help "When combining features into Cartesian features, consider only features whose frequency exceeds the threshold given"))
|
||||
<*> switch
|
||||
( long "numerical-features"
|
||||
<> help "Consider numerical features or field lengths")
|
||||
|
||||
singletonMaybe :: Maybe a -> Maybe [a]
|
||||
singletonMaybe (Just x) = Just [x]
|
||||
|
34
test/Spec.hs
34
test/Spec.hs
@ -21,6 +21,8 @@ import Options.Applicative
|
||||
import Data.Text
|
||||
import Text.EditDistance
|
||||
import GEval.Annotation
|
||||
import GEval.BlackBoxDebugging
|
||||
import GEval.FeatureExtractor
|
||||
|
||||
import Data.Map.Strict
|
||||
|
||||
@ -33,6 +35,8 @@ import System.IO
|
||||
import System.IO.Temp
|
||||
import System.IO.Silently
|
||||
|
||||
import Data.List (sort)
|
||||
|
||||
import qualified Test.HUnit as HU
|
||||
|
||||
import qualified Data.IntSet as IS
|
||||
@ -507,6 +511,36 @@ main = hspec $ do
|
||||
token <- readToken
|
||||
return $ token == (Just "BBBB")
|
||||
) `shouldReturn` True
|
||||
describe "extracting features" $ do
|
||||
it "extract factors" $ do
|
||||
let bbdo = BlackBoxDebuggingOptions {
|
||||
bbdoMinFrequency = 1,
|
||||
bbdoWordShapes = False,
|
||||
bbdoBigrams = True,
|
||||
bbdoCartesian = False,
|
||||
bbdoMinCartesianFrequency = Nothing,
|
||||
bbdoConsiderNumericalFeatures = True }
|
||||
(sort $ extractFactorsFromTabbed Nothing bbdo "in" "I like this\t34.3\ttests") `shouldBe` [
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||
(SimpleAtomicFactor (TextFactor "I")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||
(SimpleAtomicFactor (TextFactor "like")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||
(SimpleAtomicFactor (TextFactor "this")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||
(BigramFactor (TextFactor "I") (TextFactor "like")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||
(BigramFactor (TextFactor "like") (TextFactor "this")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 1)
|
||||
(NumericalFactor Nothing 11),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 2)
|
||||
(SimpleAtomicFactor (TextFactor "34.3")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 2)
|
||||
(NumericalFactor (Just 34.3) 4),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
||||
(SimpleAtomicFactor (TextFactor "tests")),
|
||||
PeggedFactor (FeatureTabbedNamespace "in" 3)
|
||||
(NumericalFactor Nothing 5) ]
|
||||
|
||||
checkConduitPure conduit inList expList = do
|
||||
let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList
|
||||
|
Loading…
Reference in New Issue
Block a user