Refactor features
This commit is contained in:
parent
affa45b71f
commit
1832a23b75
src/GEval
@ -3,7 +3,8 @@
|
|||||||
|
|
||||||
module GEval.FeatureExtractor
|
module GEval.FeatureExtractor
|
||||||
(extractUnigramFeatures,
|
(extractUnigramFeatures,
|
||||||
extractUnigramFeaturesFromTabbed)
|
extractUnigramFeaturesFromTabbed,
|
||||||
|
Feature(..))
|
||||||
where
|
where
|
||||||
|
|
||||||
import Data.Text
|
import Data.Text
|
||||||
@ -11,17 +12,40 @@ import Data.List
|
|||||||
import Data.Monoid ((<>))
|
import Data.Monoid ((<>))
|
||||||
import Text.Tokenizer
|
import Text.Tokenizer
|
||||||
|
|
||||||
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Text]
|
data Feature = SimpleFeature FeatureNamespace AtomicFeature
|
||||||
extractUnigramFeatures mTokenizer namespace record = Prelude.map (prefix <>) $ nub $ (tokenizeForFeatures mTokenizer) record
|
deriving (Eq, Ord)
|
||||||
where prefix = namespace <> ":"
|
|
||||||
|
instance Show Feature where
|
||||||
|
show (SimpleFeature namespace feature) = (show namespace) ++ ":" ++ (show feature)
|
||||||
|
|
||||||
|
data AtomicFeature = TextFeature Text
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show AtomicFeature where
|
||||||
|
show (TextFeature t) = unpack t
|
||||||
|
|
||||||
|
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
|
instance Show FeatureNamespace where
|
||||||
|
show (FeatureNamespace namespace) = unpack namespace
|
||||||
|
show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">")
|
||||||
|
|
||||||
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
|
tokenizeForFeatures :: (Maybe Tokenizer) -> Text -> [Text]
|
||||||
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
|
tokenizeForFeatures Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
|
||||||
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
where splitPred c = c == ' ' || c == '\t' || c == ':'
|
||||||
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
|
tokenizeForFeatures mTokenizer t = tokenize mTokenizer t
|
||||||
|
|
||||||
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Text]
|
extractAtomicFeatures :: (Maybe Tokenizer) -> Text -> [AtomicFeature]
|
||||||
|
extractAtomicFeatures mTokenizer = nub . (Data.List.map TextFeature) . (tokenizeForFeatures mTokenizer)
|
||||||
|
|
||||||
|
extractUnigramFeatures :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
|
||||||
|
extractUnigramFeatures mTokenizer namespace record =
|
||||||
|
Prelude.map (\af -> SimpleFeature (FeatureNamespace namespace) af)
|
||||||
|
$ extractAtomicFeatures mTokenizer record
|
||||||
|
|
||||||
|
extractUnigramFeaturesFromTabbed :: (Maybe Tokenizer) -> Text -> Text -> [Feature]
|
||||||
extractUnigramFeaturesFromTabbed mTokenizer namespace record =
|
extractUnigramFeaturesFromTabbed mTokenizer namespace record =
|
||||||
Data.List.concat
|
Data.List.concat
|
||||||
$ Prelude.map (\(n, t) -> extractUnigramFeatures mTokenizer (namespace <> "<" <> (pack $ show n) <> ">") t)
|
$ Prelude.map (\(n, t) -> Prelude.map (\af -> SimpleFeature (FeatureTabbedNamespace namespace n) af) $ extractAtomicFeatures mTokenizer t)
|
||||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||||
|
@ -106,10 +106,10 @@ extractFeaturesAndPValues spec =
|
|||||||
.| uScoresCounter
|
.| uScoresCounter
|
||||||
|
|
||||||
|
|
||||||
data RankedFeature = RankedFeature Text Double MetricValue
|
data RankedFeature = RankedFeature Feature Double MetricValue
|
||||||
deriving (Show)
|
deriving (Show)
|
||||||
|
|
||||||
data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself
|
data FeatureWithPValue = FeatureWithPValue Feature -- ^ feature itself
|
||||||
Double -- ^ p-value
|
Double -- ^ p-value
|
||||||
MetricValue -- ^ average metric value
|
MetricValue -- ^ average metric value
|
||||||
Integer -- ^ count
|
Integer -- ^ count
|
||||||
@ -117,7 +117,7 @@ data FeatureWithPValue = FeatureWithPValue Text -- ^ feature itself
|
|||||||
|
|
||||||
formatFeatureWithPValue :: FeatureWithPValue -> Text
|
formatFeatureWithPValue :: FeatureWithPValue -> Text
|
||||||
formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
||||||
Data.Text.intercalate "\t" [f,
|
Data.Text.intercalate "\t" [pack $ show f,
|
||||||
(pack $ show c),
|
(pack $ show c),
|
||||||
(pack $ printf "%0.8f" avg),
|
(pack $ printf "%0.8f" avg),
|
||||||
(pack $ printf "%0.20f" p)]
|
(pack $ printf "%0.20f" p)]
|
||||||
@ -139,7 +139,7 @@ uScoresCounter = CC.map (\(RankedFeature feature r score) -> (feature, (r, score
|
|||||||
M.toList
|
M.toList
|
||||||
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
$ M.fromListWith (\(r1, s1, c1) (r2, s2, c2) -> ((r1 + r2), (s1 + s2), (c1 + c2))) l
|
||||||
|
|
||||||
pValueCalculator :: Monad m => ConduitT (Text, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
pValueCalculator :: Monad m => ConduitT (Feature, (Double, MetricValue, Integer)) FeatureWithPValue (StateT Integer m) ()
|
||||||
pValueCalculator = do
|
pValueCalculator = do
|
||||||
firstVal <- await
|
firstVal <- await
|
||||||
case firstVal of
|
case firstVal of
|
||||||
@ -149,7 +149,7 @@ pValueCalculator = do
|
|||||||
CC.map $ calculatePValue total
|
CC.map $ calculatePValue total
|
||||||
Nothing -> return ()
|
Nothing -> return ()
|
||||||
|
|
||||||
calculatePValue :: Integer -> (Text, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
calculatePValue :: Integer -> (Feature, (Double, MetricValue, Integer)) -> FeatureWithPValue
|
||||||
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
calculatePValue total (f, (r, s, c)) = FeatureWithPValue f
|
||||||
(pvalue (r - minusR c) c (total - c))
|
(pvalue (r - minusR c) c (total - c))
|
||||||
(s / (fromIntegral c))
|
(s / (fromIntegral c))
|
||||||
|
Loading…
Reference in New Issue
Block a user