Change features into "factors"
(Just the terminology was changed)
This commit is contained in:
parent
de901d4c64
commit
b0c75cac3a
@ -1,12 +1,12 @@
|
|||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
module GEval.FeatureExtractor
|
module GEval.FeatureExtractor
|
||||||
(extractFeatures,
|
(extractFactors,
|
||||||
extractFeaturesFromTabbed,
|
extractFactorsFromTabbed,
|
||||||
cartesianFeatures,
|
cartesianFeatures,
|
||||||
LineWithFeatures(..),
|
LineWithFeatures(..),
|
||||||
LineWithPeggedFeatures(..),
|
LineWithPeggedFactors(..),
|
||||||
PeggedFeature(..),
|
PeggedFactor(..),
|
||||||
Feature(..))
|
Feature(..))
|
||||||
where
|
where
|
||||||
|
|
||||||
@ -21,35 +21,35 @@ import GEval.Common
|
|||||||
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature
|
data Feature = UnaryFeature PeggedFactor | CartesianFeature PeggedFactor PeggedFactor
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show Feature where
|
instance Show Feature where
|
||||||
show (UnaryFeature feature) = show feature
|
show (UnaryFeature factor) = show factor
|
||||||
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB)
|
show (CartesianFeature factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
|
||||||
|
|
||||||
data LineWithPeggedFeatures = LineWithPeggedFeatures Double MetricValue [PeggedFeature]
|
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature
|
data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show PeggedFeature where
|
instance Show PeggedFactor where
|
||||||
show (PeggedFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor)
|
||||||
|
|
||||||
data SimpleFeature = SimpleAtomicFeature AtomicFeature | BigramFeature AtomicFeature AtomicFeature
|
data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show SimpleFeature where
|
instance Show SimpleFactor where
|
||||||
show (SimpleAtomicFeature feature) = show feature
|
show (SimpleAtomicFactor factor) = show factor
|
||||||
show (BigramFeature featureA featureB) = (show featureA) ++ "++" ++ (show featureB)
|
show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB)
|
||||||
|
|
||||||
data AtomicFeature = TextFeature Text | ShapeFeature WordShape
|
data AtomicFactor = TextFactor Text | ShapeFactor WordShape
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show AtomicFeature where
|
instance Show AtomicFactor where
|
||||||
show (TextFeature t) = unpack t
|
show (TextFactor t) = unpack t
|
||||||
show (ShapeFeature (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t)
|
show (ShapeFactor (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t)
|
||||||
|
|
||||||
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
|
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
@ -58,42 +58,42 @@ instance Show FeatureNamespace where
|
|||||||
show (FeatureNamespace namespace) = unpack namespace
|
show (FeatureNamespace namespace) = unpack namespace
|
||||||
show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">")
|
show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">")
|
||||||
|
|
||||||
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
|
tokenizeForFactors :: (Maybe Tokenizer) -> Text -> [Text]
|
||||||
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
|
tokenizeForFactors Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
|
||||||
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
||||||
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
|
tokenizeForFactors mTokenizer t = tokenize mTokenizer t
|
||||||
|
|
||||||
extractAtomicFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [[AtomicFeature]]
|
extractAtomicFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [[AtomicFactor]]
|
||||||
extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++
|
extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++
|
||||||
(if bbdoWordShapes bbdo
|
(if bbdoWordShapes bbdo
|
||||||
then [nub $ Data.List.map (ShapeFeature . shapify) tokens]
|
then [nub $ Data.List.map (ShapeFactor . shapify) tokens]
|
||||||
else [])
|
else [])
|
||||||
where tokens = nub $ (tokenizeForFeatures mTokenizer) t
|
where tokens = nub $ (tokenizeForFactors mTokenizer) t
|
||||||
|
|
||||||
extractSimpleFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFeature]
|
extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
|
||||||
extractSimpleFeatures mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFeature) atomss) ++
|
extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
|
||||||
if bbdoBigrams bbdo
|
if bbdoBigrams bbdo
|
||||||
then Prelude.map bigramFeatures atomss
|
then Prelude.map bigramFactors atomss
|
||||||
else []
|
else []
|
||||||
where atomss = extractAtomicFeatures mTokenizer bbdo t
|
where atomss = extractAtomicFactors mTokenizer bbdo t
|
||||||
bigramFeatures atoms = Prelude.map (\(a, b) -> BigramFeature a b) $ bigrams atoms
|
bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
|
||||||
|
|
||||||
extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature]
|
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
|
||||||
extractFeatures mTokenizer bbdo namespace record =
|
extractFactors mTokenizer bbdo namespace record =
|
||||||
Prelude.map (\af -> PeggedFeature (FeatureNamespace namespace) af)
|
Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)
|
||||||
$ extractSimpleFeatures mTokenizer bbdo record
|
$ extractSimpleFactors mTokenizer bbdo record
|
||||||
|
|
||||||
extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature]
|
extractFactorsFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
|
||||||
extractFeaturesFromTabbed mTokenizer bbdo namespace record =
|
extractFactorsFromTabbed mTokenizer bbdo namespace record =
|
||||||
Data.List.concat
|
Data.List.concat
|
||||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFactor (FeatureTabbedNamespace namespace n) af) $ extractSimpleFactors mTokenizer bbdo t)
|
||||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||||
|
|
||||||
addCartesianFeatures :: BlackBoxDebuggingOptions -> [LineWithPeggedFeatures] -> [LineWithFeatures]
|
addCartesianFactors :: BlackBoxDebuggingOptions -> [LineWithPeggedFactors] -> [LineWithFeatures]
|
||||||
addCartesianFeatures bbdo linesWithPeggedFeatures = addCartesianFeatures' (bbdoCartesian bbdo) linesWithPeggedFeatures
|
addCartesianFactors bbdo linesWithPeggedFactors = addCartesianFactors' (bbdoCartesian bbdo) linesWithPeggedFactors
|
||||||
where addCartesianFeatures' _ linesWithPeggedFeatures
|
where addCartesianFactors' _ linesWithPeggedFactors
|
||||||
= Prelude.map (\(LineWithPeggedFeatures rank score fs) ->
|
= Prelude.map (\(LineWithPeggedFactors rank score fs) ->
|
||||||
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFeatures
|
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFactors
|
||||||
|
|
||||||
cartesianFeatures :: [PeggedFeature] -> [Feature]
|
cartesianFeatures :: [PeggedFactor] -> [Feature]
|
||||||
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b]
|
cartesianFeatures factors = nub $ [CartesianFeature a b | a <- factors, b <- factors, a < b]
|
||||||
|
@ -134,7 +134,7 @@ featureExtractor spec bbdo = CC.map extract
|
|||||||
.| CC.map unwrapFeatures
|
.| CC.map unwrapFeatures
|
||||||
.| CC.concat
|
.| CC.concat
|
||||||
where extract (rank, line@(LineRecord _ _ _ _ score)) =
|
where extract (rank, line@(LineRecord _ _ _ _ score)) =
|
||||||
LineWithPeggedFeatures rank score $ getFeatures mTokenizer bbdo line
|
LineWithPeggedFactors rank score $ getFeatures mTokenizer bbdo line
|
||||||
mTokenizer = gesTokenizer spec
|
mTokenizer = gesTokenizer spec
|
||||||
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
|
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
|
||||||
|
|
||||||
@ -147,10 +147,10 @@ finalFeatures True minFreq = do
|
|||||||
$ M.toList
|
$ M.toList
|
||||||
$ M.fromListWith (+)
|
$ M.fromListWith (+)
|
||||||
$ Data.List.concat
|
$ Data.List.concat
|
||||||
$ Prelude.map (\(LineWithPeggedFeatures _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls
|
$ Prelude.map (\(LineWithPeggedFactors _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls
|
||||||
|
|
||||||
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
|
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
|
||||||
where addCartesian wanted (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score
|
where addCartesian wanted (LineWithPeggedFactors rank score fs) = LineWithFeatures rank score
|
||||||
$ ((Prelude.map UnaryFeature fs) ++
|
$ ((Prelude.map UnaryFeature fs) ++
|
||||||
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs))
|
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs))
|
||||||
|
|
||||||
@ -161,15 +161,15 @@ filtreCartesian True = CC.concatMapAccum step S.empty
|
|||||||
then []
|
then []
|
||||||
else [f])
|
else [f])
|
||||||
|
|
||||||
peggedToUnaryLine :: LineWithPeggedFeatures -> LineWithFeatures
|
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFeatures
|
||||||
peggedToUnaryLine (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
|
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
|
||||||
|
|
||||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFeature]
|
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFactor]
|
||||||
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
||||||
Data.List.concat [
|
Data.List.concat [
|
||||||
extractFeatures mTokenizer bbdo "exp" expLine,
|
extractFactors mTokenizer bbdo "exp" expLine,
|
||||||
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
|
||||||
extractFeatures mTokenizer bbdo "out" outLine]
|
extractFactors mTokenizer bbdo "out" outLine]
|
||||||
|
|
||||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||||
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
||||||
|
Loading…
Reference in New Issue
Block a user