Change features into "factors"

(Just the terminology was changed)
This commit is contained in:
Filip Graliński 2019-01-11 16:08:56 +01:00
parent de901d4c64
commit b0c75cac3a
2 changed files with 54 additions and 54 deletions

View File

@ -1,12 +1,12 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module GEval.FeatureExtractor module GEval.FeatureExtractor
(extractFeatures, (extractFactors,
extractFeaturesFromTabbed, extractFactorsFromTabbed,
cartesianFeatures, cartesianFeatures,
LineWithFeatures(..), LineWithFeatures(..),
LineWithPeggedFeatures(..), LineWithPeggedFactors(..),
PeggedFeature(..), PeggedFactor(..),
Feature(..)) Feature(..))
where where
@ -21,35 +21,35 @@ import GEval.Common
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature] data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
deriving (Eq, Ord) deriving (Eq, Ord)
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature data Feature = UnaryFeature PeggedFactor | CartesianFeature PeggedFactor PeggedFactor
deriving (Eq, Ord) deriving (Eq, Ord)
instance Show Feature where instance Show Feature where
show (UnaryFeature feature) = show feature show (UnaryFeature factor) = show factor
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB) show (CartesianFeature factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
data LineWithPeggedFeatures = LineWithPeggedFeatures Double MetricValue [PeggedFeature] data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
deriving (Eq, Ord) deriving (Eq, Ord)
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature data PeggedFactor = PeggedFactor FeatureNamespace SimpleFactor
deriving (Eq, Ord) deriving (Eq, Ord)
instance Show PeggedFeature where instance Show PeggedFactor where
show (PeggedFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) show (PeggedFactor namespace factor) = (show namespace) ++ ":" ++ (show factor)
data SimpleFeature = SimpleAtomicFeature AtomicFeature | BigramFeature AtomicFeature AtomicFeature data SimpleFactor = SimpleAtomicFactor AtomicFactor | BigramFactor AtomicFactor AtomicFactor
deriving (Eq, Ord) deriving (Eq, Ord)
instance Show SimpleFeature where instance Show SimpleFactor where
show (SimpleAtomicFeature feature) = show feature show (SimpleAtomicFactor factor) = show factor
show (BigramFeature featureA featureB) = (show featureA) ++ "++" ++ (show featureB) show (BigramFactor factorA factorB) = (show factorA) ++ "++" ++ (show factorB)
data AtomicFeature = TextFeature Text | ShapeFeature WordShape data AtomicFactor = TextFactor Text | ShapeFactor WordShape
deriving (Eq, Ord) deriving (Eq, Ord)
instance Show AtomicFeature where instance Show AtomicFactor where
show (TextFeature t) = unpack t show (TextFactor t) = unpack t
show (ShapeFeature (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t) show (ShapeFactor (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t)
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
deriving (Eq, Ord) deriving (Eq, Ord)
@ -58,42 +58,42 @@ instance Show FeatureNamespace where
show (FeatureNamespace namespace) = unpack namespace show (FeatureNamespace namespace) = unpack namespace
show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">") show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">")
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text] tokenizeForFactors :: (Maybe Tokenizer) -> Text -> [Text]
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t tokenizeForFactors Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
where splitPred c = c == ' ' || c == '\t' || c == ':' where splitPred c = c == ' ' || c == '\t' || c == ':'
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t tokenizeForFactors mTokenizer t = tokenize mTokenizer t
extractAtomicFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [[AtomicFeature]] extractAtomicFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [[AtomicFactor]]
extractAtomicFeatures mTokenizer bbdo t = [Data.List.map TextFeature tokens] ++ extractAtomicFactors mTokenizer bbdo t = [Data.List.map TextFactor tokens] ++
(if bbdoWordShapes bbdo (if bbdoWordShapes bbdo
then [nub $ Data.List.map (ShapeFeature . shapify) tokens] then [nub $ Data.List.map (ShapeFactor . shapify) tokens]
else []) else [])
where tokens = nub $ (tokenizeForFeatures mTokenizer) t where tokens = nub $ (tokenizeForFactors mTokenizer) t
extractSimpleFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFeature] extractSimpleFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> [SimpleFactor]
extractSimpleFeatures mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFeature) atomss) ++ extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelude.map SimpleAtomicFactor) atomss) ++
if bbdoBigrams bbdo if bbdoBigrams bbdo
then Prelude.map bigramFeatures atomss then Prelude.map bigramFactors atomss
else [] else []
where atomss = extractAtomicFeatures mTokenizer bbdo t where atomss = extractAtomicFactors mTokenizer bbdo t
bigramFeatures atoms = Prelude.map (\(a, b) -> BigramFeature a b) $ bigrams atoms bigramFactors atoms = Prelude.map (\(a, b) -> BigramFactor a b) $ bigrams atoms
extractFeatures :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature] extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
extractFeatures mTokenizer bbdo namespace record = extractFactors mTokenizer bbdo namespace record =
Prelude.map (\af -> PeggedFeature (FeatureNamespace namespace) af) Prelude.map (\af -> PeggedFactor (FeatureNamespace namespace) af)
$ extractSimpleFeatures mTokenizer bbdo record $ extractSimpleFactors mTokenizer bbdo record
extractFeaturesFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFeature] extractFactorsFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
extractFeaturesFromTabbed mTokenizer bbdo namespace record = extractFactorsFromTabbed mTokenizer bbdo namespace record =
Data.List.concat Data.List.concat
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t) $ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFactor (FeatureTabbedNamespace namespace n) af) $ extractSimpleFactors mTokenizer bbdo t)
$ Prelude.zip [1..] (splitOn "\t" record) $ Prelude.zip [1..] (splitOn "\t" record)
addCartesianFeatures :: BlackBoxDebuggingOptions -> [LineWithPeggedFeatures] -> [LineWithFeatures] addCartesianFactors :: BlackBoxDebuggingOptions -> [LineWithPeggedFactors] -> [LineWithFeatures]
addCartesianFeatures bbdo linesWithPeggedFeatures = addCartesianFeatures' (bbdoCartesian bbdo) linesWithPeggedFeatures addCartesianFactors bbdo linesWithPeggedFactors = addCartesianFactors' (bbdoCartesian bbdo) linesWithPeggedFactors
where addCartesianFeatures' _ linesWithPeggedFeatures where addCartesianFactors' _ linesWithPeggedFactors
= Prelude.map (\(LineWithPeggedFeatures rank score fs) -> = Prelude.map (\(LineWithPeggedFactors rank score fs) ->
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFeatures LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFactors
cartesianFeatures :: [PeggedFeature] -> [Feature] cartesianFeatures :: [PeggedFactor] -> [Feature]
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b] cartesianFeatures factors = nub $ [CartesianFeature a b | a <- factors, b <- factors, a < b]

View File

@ -134,7 +134,7 @@ featureExtractor spec bbdo = CC.map extract
.| CC.map unwrapFeatures .| CC.map unwrapFeatures
.| CC.concat .| CC.concat
where extract (rank, line@(LineRecord _ _ _ _ score)) = where extract (rank, line@(LineRecord _ _ _ _ score)) =
LineWithPeggedFeatures rank score $ getFeatures mTokenizer bbdo line LineWithPeggedFactors rank score $ getFeatures mTokenizer bbdo line
mTokenizer = gesTokenizer spec mTokenizer = gesTokenizer spec
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
@ -147,10 +147,10 @@ finalFeatures True minFreq = do
$ M.toList $ M.toList
$ M.fromListWith (+) $ M.fromListWith (+)
$ Data.List.concat $ Data.List.concat
$ Prelude.map (\(LineWithPeggedFeatures _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls $ Prelude.map (\(LineWithPeggedFactors _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough) (CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
where addCartesian wanted (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score where addCartesian wanted (LineWithPeggedFactors rank score fs) = LineWithFeatures rank score
$ ((Prelude.map UnaryFeature fs) ++ $ ((Prelude.map UnaryFeature fs) ++
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs)) (cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs))
@ -161,15 +161,15 @@ filtreCartesian True = CC.concatMapAccum step S.empty
then [] then []
else [f]) else [f])
peggedToUnaryLine :: LineWithPeggedFeatures -> LineWithFeatures peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFeatures
peggedToUnaryLine (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs) peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFeature] getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFactor]
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
Data.List.concat [ Data.List.concat [
extractFeatures mTokenizer bbdo "exp" expLine, extractFactors mTokenizer bbdo "exp" expLine,
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
extractFeatures mTokenizer bbdo "out" outLine] extractFactors mTokenizer bbdo "out" outLine]
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1))) uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))