Refactor features

This commit is contained in:
Filip Graliński 2019-01-09 17:45:06 +01:00
parent affa45b71f
commit 1832a23b75
2 changed files with 35 additions and 11 deletions

View File

@ -3,7 +3,8 @@
module GEval.FeatureExtractor module GEval.FeatureExtractor
(extractUnigramFeatures, (extractUnigramFeatures,
extractUnigramFeaturesFromTabbed) extractUnigramFeaturesFromTabbed,
Feature(..))
where where
import Data.Text import Data.Text
@ -11,17 +12,40 @@ import Data.List
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import Text.Tokenizer import Text.Tokenizer
-- NOTE(review): these three lines come from a side-by-side diff that was
-- flattened into one column, so two unrelated definitions are interleaved.
-- Left part: the removed, Text-based 'extractUnigramFeatures', which
-- tokenized the record, dropped repeated tokens and prefixed each one with
-- @namespace:@.
-- Right part: the added 'Feature' type, a namespace paired with an atomic
-- feature, deriving Eq and Ord.
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text] data Feature = SimpleFeature FeatureNamespace AtomicFeature
extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record deriving (Eq, Ord)
where prefix = namespace <> ":"
-- | Textual form of a feature: the rendered namespace, a colon, then the
-- rendered atomic feature.
instance Show Feature where
  showsPrec _ (SimpleFeature ns atom) = shows ns . showString ":" . shows atom
-- | The indivisible part of a feature, i.e. what is left once the namespace
-- is taken away.
data AtomicFeature
  = TextFeature Text  -- ^ a piece of text (one token, as produced by 'extractAtomicFeatures')
  deriving (Eq, Ord)
-- | An atomic feature is shown as its bare text, without quotes or escaping.
instance Show AtomicFeature where
  show atom = case atom of
    TextFeature content -> unpack content
-- | Where a feature was found.
--
-- The constructor order is significant because 'Ord' is derived.
data FeatureNamespace
  = FeatureNamespace Text            -- ^ a plain, named namespace
  | FeatureTabbedNamespace Text Int  -- ^ a named namespace plus a column number
  deriving (Eq, Ord)
-- | A plain namespace is shown as its name; a tabbed namespace is shown as
-- the name followed by the column number in angle brackets, e.g. @in<2>@.
instance Show FeatureNamespace where
  show ns = case ns of
    FeatureNamespace name -> unpack name
    FeatureTabbedNamespace name col -> unpack name ++ "<" ++ show col ++ ">"
-- Splits a text into the tokens used for feature extraction.
-- Without a tokenizer ('Nothing') the text is cut at spaces, tabs and colons
-- and empty pieces are discarded; with any other value the work is delegated
-- to 'tokenize'.
-- NOTE(review): every line below contains the same code twice because the
-- old and new columns of a side-by-side diff were flattened together; the
-- function itself was not changed by the commit.
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text] tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
where splitPred c = c == ' ' || c == '\t' || c == ':' where splitPred c = c == ' ' || c == '\t' || c == ':'
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text] extractAtomicFeatures :: (Maybe Tokenizer) -> Text -> [AtomicFeature]
-- Tokenize the input, wrap every token as a 'TextFeature', then drop
-- repeated features (the first occurrence of each one is kept).
extractAtomicFeatures mTokenizer input =
  nub $ Data.List.map TextFeature $ tokenizeForFeatures mTokenizer input
-- | Extracts the distinct unigram features of a record and files each of
-- them under the plain namespace with the given name.
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
extractUnigramFeatures mTokenizer namespace record =
  [ SimpleFeature plainNamespace atom
  | atom <- extractAtomicFeatures mTokenizer record
  ]
  where
    plainNamespace = FeatureNamespace namespace
-- Extracts unigram features from a tab-separated record. The record is split
-- on tab characters, the columns are numbered starting from 1, features are
-- extracted from each column separately and the per-column lists are
-- concatenated.
-- NOTE(review): the lines below are a flattened side-by-side diff. In the
-- removed (left) version the column number was baked into the namespace text
-- as @namespace<n>@ before calling 'extractUnigramFeatures'; in the added
-- (right) version each atomic feature is tagged with
-- @FeatureTabbedNamespace namespace n@ instead.
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
extractUnigramFeaturesFromTabbed mTokenizer namespace record = extractUnigramFeaturesFromTabbed mTokenizer namespace record =
Data.List.concat Data.List.concat
$ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t) $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractAtomicFeatures mTokenizer t)
$ Prelude.zip [1..] (splitOn "\t" record) $ Prelude.zip [1..] (splitOn "\t" record)

View File

@ -106,10 +106,10 @@ extractFeaturesAndPValues spec =
.| uScoresCounter .| uScoresCounter
-- A feature together with a Double and a MetricValue; 'uScoresCounter' binds
-- these two as @r@ and @score@ (presumably a rank and the item's metric
-- score -- TODO confirm against the producer of these values).
-- NOTE(review): flattened side-by-side diff; the removed (left) version
-- stored the feature as Text, the added (right) version stores a Feature.
data RankedFeature = RankedFeature Text Double MetricValue data RankedFeature = RankedFeature Feature Double MetricValue
deriving (Show) deriving (Show)
data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
Double -- ^ p-value Double -- ^ p-value
MetricValue -- ^ average metric value MetricValue -- ^ average metric value
Integer -- ^ count Integer -- ^ count
@ -117,7 +117,7 @@ data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself
-- Renders one result as a tab-separated line with four fields, in this
-- order: the feature, its count, the average metric value printed with 8
-- decimal places, and the p-value printed with 20 decimal places.
-- NOTE(review): flattened side-by-side diff; the removed (left) version put
-- the Text feature into the list directly, the added (right) version first
-- converts the Feature with @pack $ show f@.
formatFeatureWithPValue :: FeatureWithPValue -> Text formatFeatureWithPValue :: FeatureWithPValue -> Text
formatFeatureWithPValue (FeatureWithPValue f p avg c) = formatFeatureWithPValue (FeatureWithPValue f p avg c) =
Data.Text.intercalate "\t" [f, Data.Text.intercalate "\t" [pack $ show f,
(pack $ show c), (pack $ show c),
(pack $ printf "%0.8f" avg), (pack $ printf "%0.8f" avg),
(pack $ printf "%0.20f" p)] (pack $ printf "%0.20f" p)]
@ -139,7 +139,7 @@ uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score
M.toList M.toList
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l $ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
pValueCalculator :: Monad m => ConduitT (Text, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) () pValueCalculator :: Monad m => ConduitT (Feature, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
pValueCalculator = do pValueCalculator = do
firstVal <- await firstVal <- await
case firstVal of case firstVal of
@ -149,7 +149,7 @@ pValueCalculator = do
CC.map $ calculatePValue total CC.map $ calculatePValue total
Nothing -> return () Nothing -> return ()
calculatePValue :: Integer -> (Text, (Double, MetricValue, Integer)) -> FeatureWithPValue calculatePValue :: Integer -> (Feature, (Double, MetricValue, Integer)) -> FeatureWithPValue
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
(pvalue (r - minusR c) c (total - c)) (pvalue (r - minusR c) c (total - c))
(s / (fromIntegral c)) (s / (fromIntegral c))