Refactor Features into Factors
This commit is contained in:
parent
1c3908b273
commit
d5a8908599
src/GEval
@ -4,11 +4,11 @@ module GEval.FeatureExtractor
|
|||||||
(extractFactors,
|
(extractFactors,
|
||||||
extractFactorsFromTabbed,
|
extractFactorsFromTabbed,
|
||||||
cartesianFeatures,
|
cartesianFeatures,
|
||||||
LineWithFeatures(..),
|
LineWithFactors(..),
|
||||||
LineWithPeggedFactors(..),
|
LineWithPeggedFactors(..),
|
||||||
PeggedFactor(..),
|
PeggedFactor(..),
|
||||||
PeggedExistentialFactor(..),
|
PeggedExistentialFactor(..),
|
||||||
Feature(..),
|
Factor(..),
|
||||||
SimpleFactor(..),
|
SimpleFactor(..),
|
||||||
ExistentialFactor(..),
|
ExistentialFactor(..),
|
||||||
AtomicFactor(..),
|
AtomicFactor(..),
|
||||||
@ -26,15 +26,15 @@ import GEval.BlackBoxDebugging
|
|||||||
import GEval.Common
|
import GEval.Common
|
||||||
import Text.Read (readMaybe)
|
import Text.Read (readMaybe)
|
||||||
|
|
||||||
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
data LineWithFactors = LineWithFactors Double MetricValue [Factor]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
data Feature = UnaryFeature PeggedFactor | CartesianFeature PeggedExistentialFactor PeggedExistentialFactor
|
data Factor = UnaryFactor PeggedFactor | CartesianFactor PeggedExistentialFactor PeggedExistentialFactor
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
instance Show Feature where
|
instance Show Factor where
|
||||||
show (UnaryFeature factor) = show factor
|
show (UnaryFactor factor) = show factor
|
||||||
show (CartesianFeature factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
|
show (CartesianFactor factorA factorB) = (show factorA) ++ "~~" ++ (show factorB)
|
||||||
|
|
||||||
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
|
data LineWithPeggedFactors = LineWithPeggedFactors Double MetricValue [PeggedFactor]
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
@ -115,14 +115,14 @@ extractFactorsFromTabbed mTokenizer bbdo namespace record =
|
|||||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFactor (FeatureTabbedNamespace namespace n) af) $ extractSimpleFactors mTokenizer bbdo t)
|
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFactor (FeatureTabbedNamespace namespace n) af) $ extractSimpleFactors mTokenizer bbdo t)
|
||||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||||
|
|
||||||
addCartesianFactors :: BlackBoxDebuggingOptions -> [LineWithPeggedFactors] -> [LineWithFeatures]
|
addCartesianFactors :: BlackBoxDebuggingOptions -> [LineWithPeggedFactors] -> [LineWithFactors]
|
||||||
addCartesianFactors bbdo linesWithPeggedFactors = addCartesianFactors' (bbdoCartesian bbdo) linesWithPeggedFactors
|
addCartesianFactors bbdo linesWithPeggedFactors = addCartesianFactors' (bbdoCartesian bbdo) linesWithPeggedFactors
|
||||||
where addCartesianFactors' _ linesWithPeggedFactors
|
where addCartesianFactors' _ linesWithPeggedFactors
|
||||||
= Prelude.map (\(LineWithPeggedFactors rank score fs) ->
|
= Prelude.map (\(LineWithPeggedFactors rank score fs) ->
|
||||||
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFactors
|
LineWithFactors rank score (Prelude.map UnaryFactor fs)) linesWithPeggedFactors
|
||||||
|
|
||||||
cartesianFeatures :: [PeggedExistentialFactor] -> [Feature]
|
cartesianFeatures :: [PeggedExistentialFactor] -> [Factor]
|
||||||
cartesianFeatures factors = nub $ [CartesianFeature a b | a <- factors, b <- factors, a < b]
|
cartesianFeatures factors = nub $ [CartesianFactor a b | a <- factors, b <- factors, a < b]
|
||||||
|
|
||||||
filterExistentialFactors :: [PeggedFactor] -> [PeggedExistentialFactor]
|
filterExistentialFactors :: [PeggedFactor] -> [PeggedExistentialFactor]
|
||||||
filterExistentialFactors factors = catMaybes $ Prelude.map toExistential factors
|
filterExistentialFactors factors = catMaybes $ Prelude.map toExistential factors
|
||||||
|
@ -83,7 +83,7 @@ runFeatureFilter (Just feature) spec bbdo = CC.map (\l -> (fakeRank, l))
|
|||||||
.| CC.map fst
|
.| CC.map fst
|
||||||
where mTokenizer = gesTokenizer spec
|
where mTokenizer = gesTokenizer spec
|
||||||
fakeRank = 0.0
|
fakeRank = 0.0
|
||||||
checkFeature feature (_, LineWithFeatures _ _ fs) = feature `elem` (Prelude.map show fs)
|
checkFeature feature (_, LineWithFactors _ _ fs) = feature `elem` (Prelude.map show fs)
|
||||||
|
|
||||||
runWorstFeatures :: ResultOrdering -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
|
runWorstFeatures :: ResultOrdering -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
|
||||||
runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (worstFeaturesPipeline False spec bbdo)
|
runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (worstFeaturesPipeline False spec bbdo)
|
||||||
@ -122,10 +122,10 @@ extractFeaturesAndPValues spec bbdo =
|
|||||||
.| uScoresCounter (bbdoMinFrequency bbdo)
|
.| uScoresCounter (bbdoMinFrequency bbdo)
|
||||||
|
|
||||||
|
|
||||||
data RankedFeature = RankedFeature Feature Double MetricValue
|
data RankedFactor = RankedFactor Factor Double MetricValue
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
|
||||||
data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
|
data FeatureWithPValue = FeatureWithPValue Factor -- ^ feature itself
|
||||||
Double -- ^ p-value
|
Double -- ^ p-value
|
||||||
MetricValue -- ^ average metric value
|
MetricValue -- ^ average metric value
|
||||||
Integer -- ^ count
|
Integer -- ^ count
|
||||||
@ -138,13 +138,13 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
|||||||
(pack $ printf "%0.8f" avg),
|
(pack $ printf "%0.8f" avg),
|
||||||
(pack $ printf "%0.20f" p)]
|
(pack $ printf "%0.20f" p)]
|
||||||
|
|
||||||
rankedFeatureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
|
rankedFeatureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFactor m ()
|
||||||
rankedFeatureExtractor spec bbdo = featureExtractor mTokenizer bbdo
|
rankedFeatureExtractor spec bbdo = featureExtractor mTokenizer bbdo
|
||||||
.| CC.map snd
|
.| CC.map snd
|
||||||
.| CC.map unwrapFeatures
|
.| CC.map unwrapFeatures
|
||||||
.| CC.concat
|
.| CC.concat
|
||||||
where mTokenizer = gesTokenizer spec
|
where mTokenizer = gesTokenizer spec
|
||||||
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
|
unwrapFeatures (LineWithFactors rank score fs) = Prelude.map (\f -> RankedFactor f rank score) fs
|
||||||
|
|
||||||
class FeatureSource a where
|
class FeatureSource a where
|
||||||
getScore :: a -> MetricValue
|
getScore :: a -> MetricValue
|
||||||
@ -158,13 +158,13 @@ instance FeatureSource (LineRecord, LineRecord) where
|
|||||||
getScore (LineRecord _ _ _ _ scoreA, LineRecord _ _ _ _ scoreB) = scoreB - scoreA
|
getScore (LineRecord _ _ _ _ scoreA, LineRecord _ _ _ _ scoreB) = scoreB - scoreA
|
||||||
mainLineRecord (_, l) = l
|
mainLineRecord (_, l) = l
|
||||||
|
|
||||||
featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDebuggingOptions -> ConduitT (Double, s) (s, LineWithFeatures) m ()
|
featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDebuggingOptions -> ConduitT (Double, s) (s, LineWithFactors) m ()
|
||||||
featureExtractor mTokenizer bbdo = CC.map extract
|
featureExtractor mTokenizer bbdo = CC.map extract
|
||||||
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
|
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
|
||||||
where extract (rank, line) =
|
where extract (rank, line) =
|
||||||
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo (mainLineRecord line))
|
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo (mainLineRecord line))
|
||||||
|
|
||||||
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFeatures) m ()
|
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
|
||||||
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
|
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
|
||||||
finalFeatures True minFreq = do
|
finalFeatures True minFreq = do
|
||||||
ls <- CC.sinkList
|
ls <- CC.sinkList
|
||||||
@ -178,20 +178,20 @@ finalFeatures True minFreq = do
|
|||||||
$ Prelude.map snd ls
|
$ Prelude.map snd ls
|
||||||
|
|
||||||
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
|
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
|
||||||
where addCartesian wanted (l, LineWithPeggedFactors rank score fs) = (l, LineWithFeatures rank score
|
where addCartesian wanted (l, LineWithPeggedFactors rank score fs) = (l, LineWithFactors rank score
|
||||||
$ ((Prelude.map UnaryFeature fs) ++
|
$ ((Prelude.map UnaryFactor fs) ++
|
||||||
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) $ filterExistentialFactors fs)))
|
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) $ filterExistentialFactors fs)))
|
||||||
|
|
||||||
filtreCartesian False = CC.map id
|
filtreCartesian False = CC.map id
|
||||||
filtreCartesian True = CC.concatMapAccum step S.empty
|
filtreCartesian True = CC.concatMapAccum step S.empty
|
||||||
where step f@(FeatureWithPValue (UnaryFeature (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f])
|
where step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (SimpleExistentialFactor p))) _ _ _) mp = (S.insert (PeggedExistentialFactor namespace p) mp, [f])
|
||||||
step f@(FeatureWithPValue (UnaryFeature (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f])
|
step f@(FeatureWithPValue (UnaryFactor (PeggedFactor namespace (NumericalFactor _ _))) _ _ _) mp = (mp, [f])
|
||||||
step f@(FeatureWithPValue (CartesianFeature pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
|
step f@(FeatureWithPValue (CartesianFactor pA pB) _ _ _) mp = (mp, if pA `S.member` mp || pB `S.member` mp
|
||||||
then []
|
then []
|
||||||
else [f])
|
else [f])
|
||||||
|
|
||||||
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFeatures
|
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
||||||
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
|
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
||||||
|
|
||||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFactor]
|
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFactor]
|
||||||
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
||||||
@ -200,8 +200,8 @@ getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
|||||||
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
|
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
|
||||||
extractFactors mTokenizer bbdo "out" outLine]
|
extractFactors mTokenizer bbdo "out" outLine]
|
||||||
|
|
||||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
uScoresCounter :: Monad m => Integer -> ConduitT RankedFactor FeatureWithPValue (StateT Integer m) ()
|
||||||
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
uScoresCounter minFreq = CC.map (\(RankedFactor feature r score) -> (feature, (r, score, 1)))
|
||||||
.| gobbleAndDo countUScores
|
.| gobbleAndDo countUScores
|
||||||
.| lowerFreqFiltre
|
.| lowerFreqFiltre
|
||||||
.| pValueCalculator minFreq
|
.| pValueCalculator minFreq
|
||||||
@ -210,7 +210,7 @@ uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (
|
|||||||
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
||||||
lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq)
|
lowerFreqFiltre = CC.filter (\(_, (_, _, c)) -> c >= minFreq)
|
||||||
|
|
||||||
pValueCalculator :: Monad m => Integer -> ConduitT (Feature, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
pValueCalculator :: Monad m => Integer -> ConduitT (Factor, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
||||||
pValueCalculator minFreq = do
|
pValueCalculator minFreq = do
|
||||||
firstVal <- await
|
firstVal <- await
|
||||||
case firstVal of
|
case firstVal of
|
||||||
@ -222,7 +222,7 @@ pValueCalculator minFreq = do
|
|||||||
CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total)
|
CC.filter (\(_, (_, _, c)) -> total - c >= minFreq) .| CC.map (calculatePValue total)
|
||||||
Nothing -> return ()
|
Nothing -> return ()
|
||||||
|
|
||||||
calculatePValue :: Integer -> (Feature, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
calculatePValue :: Integer -> (Factor, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
||||||
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
||||||
(pvalue (r - minusR c) c (total - c))
|
(pvalue (r - minusR c) c (total - c))
|
||||||
(s / (fromIntegral c))
|
(s / (fromIntegral c))
|
||||||
|
Loading…
Reference in New Issue
Block a user