Refactor features
This commit is contained in:
parent
affa45b71f
commit
1832a23b75
@ -3,7 +3,8 @@
|
||||
|
||||
module GEval.FeatureExtractor
|
||||
(extractUnigramFeatures,
|
||||
extractUnigramFeaturesFromTabbed)
|
||||
extractUnigramFeaturesFromTabbed,
|
||||
Feature(..))
|
||||
where
|
||||
|
||||
import Data.Text
|
||||
@ -11,17 +12,40 @@ import Data.List
|
||||
import Data.Monoid ((<>))
|
||||
import Text.Tokenizer
|
||||
|
||||
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
|
||||
extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
|
||||
where prefix = namespace <> ":"
|
||||
-- | A feature: an atomic feature qualified by the namespace it belongs to.
data Feature
  = SimpleFeature FeatureNamespace AtomicFeature
  deriving (Eq, Ord)
|
||||
|
||||
-- | Renders a feature as the shown namespace, a colon, then the shown
-- atomic feature.
instance Show Feature where
  show feature = case feature of
    SimpleFeature ns atom -> mconcat [show ns, ":", show atom]
|
||||
|
||||
-- | The namespace-independent part of a feature; currently always a
-- piece of text.
data AtomicFeature
  = TextFeature Text
  deriving (Eq, Ord)
|
||||
|
||||
-- | Shows the wrapped text verbatim (no quoting or escaping).
instance Show AtomicFeature where
  show atom = case atom of
    TextFeature content -> unpack content
|
||||
|
||||
-- | Namespace a feature lives in: either a plain named namespace, or a
-- named namespace paired with an 'Int' (shown as a column number).
data FeatureNamespace
  = FeatureNamespace Text
  | FeatureTabbedNamespace Text Int
  deriving (Eq, Ord)
|
||||
|
||||
-- | A plain namespace is shown as its bare name; a tabbed namespace is
-- shown as the name followed by the column number in angle brackets.
instance Show FeatureNamespace where
  show ns = case ns of
    FeatureNamespace name -> unpack name
    FeatureTabbedNamespace name column ->
      mconcat [unpack name, "<", show column, ">"]
|
||||
|
||||
-- | Split a piece of text into tokens for feature extraction.
--
-- When no tokenizer is given, the text is split on spaces, tabs and
-- colons, and empty pieces are discarded. Otherwise the work is
-- delegated to 'tokenize' with the supplied tokenizer.
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
tokenizeForFeatures mTokenizer input =
  case mTokenizer of
    Nothing -> [piece | piece <- split isSeparator input, not (Data.Text.null piece)]
    Just _ -> tokenize mTokenizer input
  where
    -- Characters treated as token boundaries in the fallback mode.
    isSeparator ' ' = True
    isSeparator '\t' = True
    isSeparator ':' = True
    isSeparator _ = False
|
||||
|
||||
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
|
||||
-- | Tokenize the text and wrap every token as a 'TextFeature', keeping
-- only the first occurrence of each distinct feature.
extractAtomicFeatures :: (Maybe Tokenizer) -> Text -> [AtomicFeature]
extractAtomicFeatures mTokenizer record =
  nub [TextFeature token | token <- tokenizeForFeatures mTokenizer record]
|
||||
|
||||
-- | Extract the atomic features of a record and place each of them in
-- the plain namespace with the given name.
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
extractUnigramFeatures mTokenizer namespace record =
  [SimpleFeature plainNamespace atom | atom <- extractAtomicFeatures mTokenizer record]
  where
    plainNamespace = FeatureNamespace namespace
|
||||
|
||||
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
|
||||
extractUnigramFeaturesFromTabbed mTokenizer namespace record =
|
||||
Data.List.concat
|
||||
$ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
|
||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractAtomicFeatures mTokenizer t)
|
||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||
|
@ -106,10 +106,10 @@ extractFeaturesAndPValues spec =
|
||||
.| uScoresCounter
|
||||
|
||||
|
||||
data RankedFeature = RankedFeature Text Double MetricValue
|
||||
data RankedFeature = RankedFeature Feature Double MetricValue
|
||||
deriving (Show)
|
||||
|
||||
data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself
|
||||
data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
|
||||
Double -- ^ p-value
|
||||
MetricValue -- ^ average metric value
|
||||
Integer -- ^ count
|
||||
@ -117,7 +117,7 @@ data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself
|
||||
|
||||
formatFeatureWithPValue :: FeatureWithPValue -> Text
|
||||
formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
||||
Data.Text.intercalate "\t" [f,
|
||||
Data.Text.intercalate "\t" [pack $ show f,
|
||||
(pack $ show c),
|
||||
(pack $ printf "%0.8f" avg),
|
||||
(pack $ printf "%0.20f" p)]
|
||||
@ -139,7 +139,7 @@ uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score
|
||||
M.toList
|
||||
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
||||
|
||||
pValueCalculator :: Monad m => ConduitT (Text, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
||||
pValueCalculator :: Monad m => ConduitT (Feature, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
||||
pValueCalculator = do
|
||||
firstVal <- await
|
||||
case firstVal of
|
||||
@ -149,7 +149,7 @@ pValueCalculator = do
|
||||
CC.map $ calculatePValue total
|
||||
Nothing -> return ()
|
||||
|
||||
calculatePValue :: Integer -> (Text, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
||||
calculatePValue :: Integer -> (Feature, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
||||
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
||||
(pvalue (r - minusR c) c (total - c))
|
||||
(s / (fromIntegral c))
|
||||
|
Loading…
Reference in New Issue
Block a user