From 1832a23b75ee28386fddc9b7342aca80b6085d76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filip=20Grali=C5=84ski?= Date: Wed, 9 Jan 2019 17:45:06 +0100 Subject: [PATCH] Refactor features --- src/GEval/FeatureExtractor.hs | 36 +++++++++++++++++++++++++++++------ src/GEval/LineByLine.hs | 10 +++++----- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index 7b968d4..d7a6299 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -3,7 +3,8 @@ module GEval.FeatureExtractor (extractUnigramFeatures, - extractUnigramFeaturesFromTabbed) + extractUnigramFeaturesFromTabbed, + Feature(..)) where import Data.Text @@ -11,17 +12,40 @@ import Data.List import Data.Monoid ((<>)) import Text.Tokenizer -extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text] -extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record - where prefix = namespace <> ":" +data Feature = SimpleFeature FeatureNamespace AtomicFeature + deriving (Eq, Ord) + +instance Show Feature where + show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature) + +data AtomicFeature = TextFeature Text + deriving (Eq, Ord) + +instance Show AtomicFeature where + show (TextFeature t) = unpack t + +data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int + deriving (Eq, Ord) + +instance Show FeatureNamespace where + show (FeatureNamespace namespace) = unpack namespace + show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">") tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text] tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t where splitPred c = c == ' ' || c == '\t' || c == ':' tokenizeForFeatures mTokenizer t = tokenize mTokenizer t -extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text] +extractAtomicFeatures :: (Maybe Tokenizer) -> Text -> [AtomicFeature] +extractAtomicFeatures mTokenizer = nub . (Data.List.map TextFeature) . (tokenizeForFeatures mTokenizer) + +extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Feature] +extractUnigramFeatures mTokenizer namespace record = + Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af) + $ extractAtomicFeatures mTokenizer record + +extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Feature] extractUnigramFeaturesFromTabbed mTokenizer namespace record = Data.List.concat - $ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t) + $ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractAtomicFeatures mTokenizer t) $ Prelude.zip [1..] (splitOn "\t" record) diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index e68b5a9..fe68b17 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -106,10 +106,10 @@ extractFeaturesAndPValues spec = .| uScoresCounter -data RankedFeature = RankedFeature Text Double MetricValue +data RankedFeature = RankedFeature Feature Double MetricValue deriving (Show) -data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself +data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself Double -- ^ p-value MetricValue -- ^ average metric value Integer -- ^ count @@ -117,7 +117,7 @@ data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself formatFeatureWithPValue :: FeatureWithPValue -> Text formatFeatureWithPValue (FeatureWithPValue f p avg c) = - Data.Text.intercalate "\t" [f, + Data.Text.intercalate "\t" [pack $ show f, (pack $ show c), (pack $ printf "%0.8f" avg), (pack $ printf "%0.20f" p)] @@ -139,7 +139,7 @@ uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score M.toList $ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l -pValueCalculator :: Monad m => ConduitT (Text, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) () +pValueCalculator :: Monad m => ConduitT (Feature, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) () pValueCalculator = do firstVal <- await case firstVal of @@ -149,7 +149,7 @@ pValueCalculator = do CC.map $ calculatePValue total Nothing -> return () -calculatePValue :: Integer -> (Text, (Double, MetricValue, Integer)) -> FeatureWithPValue +calculatePValue :: Integer -> (Feature, (Double, MetricValue, Integer)) -> FeatureWithPValue calculatePValue total (f, (r, s, c)) = FeatureWithPValue f (pvalue (r - minusR c) c (total - c)) (s / (fromIntegral c))