Speed up cartesian features

This commit is contained in:
Filip Gralinski 2019-01-10 22:53:43 +01:00
parent 23aad86e72
commit 39bc3964b3
7 changed files with 53 additions and 17 deletions

View File

@ -1,6 +1,7 @@
module Main where module Main where
import GEval.Core import GEval.Core
import GEval.Common
import GEval.OptionsParser import GEval.OptionsParser
import GEval.ParseParams import GEval.ParseParams

View File

@ -6,6 +6,8 @@ import Data.Text.Read as TR
import Data.Attoparsec.Text import Data.Attoparsec.Text
type MetricValue = Double
-- some operations can be "hard" (on ints) or "soft" (on doubles), -- some operations can be "hard" (on ints) or "soft" (on doubles),
-- introduce a typeclass so that we could generalise easily -- introduce a typeclass so that we could generalise easily
class ConvertibleToDouble n where class ConvertibleToDouble n where

View File

@ -14,7 +14,6 @@ module GEval.Core
Metric(..), Metric(..),
MetricOrdering(..), MetricOrdering(..),
getMetricOrdering, getMetricOrdering,
MetricValue,
GEvalSpecialCommand(..), GEvalSpecialCommand(..),
GEvalSpecification(..), GEvalSpecification(..),
ResultOrdering(..), ResultOrdering(..),
@ -100,8 +99,6 @@ import Data.Word
import System.FilePath.Glob import System.FilePath.Glob
type MetricValue = Double
defaultLogLossHashedSize :: Word32 defaultLogLossHashedSize :: Word32
defaultLogLossHashedSize = 10 defaultLogLossHashedSize = 10

View File

@ -1,10 +1,12 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module GEval.FeatureExtractor module GEval.FeatureExtractor
(extractFeatures, (extractFeatures,
extractFeaturesFromTabbed, extractFeaturesFromTabbed,
cartesianFeatures, cartesianFeatures,
LineWithFeatures(..),
LineWithPeggedFeatures(..),
PeggedFeature(..),
Feature(..)) Feature(..))
where where
@ -16,6 +18,9 @@ import Text.WordShape
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
import GEval.Common import GEval.Common
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
deriving (Eq, Ord)
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature
deriving (Eq, Ord) deriving (Eq, Ord)
@ -23,6 +28,9 @@ instance Show Feature where
show (UnaryFeature feature) = show feature show (UnaryFeature feature) = show feature
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB) show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB)
data LineWithPeggedFeatures = LineWithPeggedFeatures Double MetricValue [PeggedFeature]
deriving (Eq, Ord)
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature
deriving (Eq, Ord) deriving (Eq, Ord)
@ -81,5 +89,11 @@ extractFeaturesFromTabbed mTokenizer bbdo namespace record =
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t) $ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
$ Prelude.zip [1..] (splitOn "\t" record) $ Prelude.zip [1..] (splitOn "\t" record)
addCartesianFeatures :: BlackBoxDebuggingOptions -> [LineWithPeggedFeatures] -> [LineWithFeatures]
addCartesianFeatures bbdo linesWithPeggedFeatures = addCartesianFeatures' (bbdoCartesian bbdo) linesWithPeggedFeatures
where addCartesianFeatures' _ linesWithPeggedFeatures
= Prelude.map (\(LineWithPeggedFeatures rank score fs) ->
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFeatures
cartesianFeatures :: [PeggedFeature] -> [Feature] cartesianFeatures :: [PeggedFeature] -> [Feature]
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b] cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b]

View File

@ -20,6 +20,7 @@ module GEval.LineByLine
) where ) where
import GEval.Core import GEval.Core
import GEval.Common
import Text.Tokenizer import Text.Tokenizer
import Data.Conduit.AutoDecompress (doNothing) import Data.Conduit.AutoDecompress (doNothing)
@ -56,6 +57,7 @@ import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr) import Statistics.Distribution.Normal (normalDistr)
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
import qualified Data.Set as S
data LineRecord = LineRecord Text Text Text Word32 MetricValue data LineRecord = LineRecord Text Text Text Word32 MetricValue
deriving (Eq, Show) deriving (Eq, Show)
@ -125,22 +127,40 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
(pack $ printf "%0.20f" p)] (pack $ printf "%0.20f" p)]
featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m () featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
featureExtractor spec bbdo = CC.map extract .| CC.concat featureExtractor spec bbdo = CC.map extract
.| finalFeatures (bbdoCartesian bbdo) (bbdoMinFrequency bbdo)
.| CC.map unwrapFeatures
.| CC.concat
where extract (rank, line@(LineRecord _ _ _ _ score)) = where extract (rank, line@(LineRecord _ _ _ _ score)) =
Prelude.map (\f -> RankedFeature f rank score) LineWithPeggedFeatures rank score $ getFeatures mTokenizer bbdo line
$ getFeatures mTokenizer bbdo line
mTokenizer = gesTokenizer spec mTokenizer = gesTokenizer spec
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [Feature] finalFeatures False _ = CC.map peggedToUnaryLine
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = Prelude.map UnaryFeature unaryFeatures ++ finalFeatures True minFreq = do
if bbdoCartesian bbdo ls <- CC.sinkList
then cartesianFeatures unaryFeatures let unaryFeaturesFrequentEnough = S.fromList
else [] $ Prelude.map (\(f, c) -> f)
where unaryFeatures = $ Prelude.filter (\(f, c) -> c >= minFreq)
Data.List.concat [ $ M.toList
extractFeatures mTokenizer bbdo "exp" expLine, $ M.fromListWith (+)
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, $ Data.List.concat
extractFeatures mTokenizer bbdo "out" outLine] $ Prelude.map (\(LineWithPeggedFeatures _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
where addCartesian wanted (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score
$ ((Prelude.map UnaryFeature fs) ++
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs))
peggedToUnaryLine :: LineWithPeggedFeatures -> LineWithFeatures
peggedToUnaryLine (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFeature]
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
Data.List.concat [
extractFeatures mTokenizer bbdo "exp" expLine,
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
extractFeatures mTokenizer bbdo "out" outLine]
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1))) uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))

View File

@ -23,6 +23,7 @@ import Data.String.Here
import Data.Monoid ((<>)) import Data.Monoid ((<>))
import GEval.Core import GEval.Core
import GEval.Common
import GEval.CreateChallenge import GEval.CreateChallenge
import GEval.LineByLine import GEval.LineByLine
import GEval.Submit (submit) import GEval.Submit (submit)

View File

@ -4,6 +4,7 @@
import Test.Hspec import Test.Hspec
import GEval.Core import GEval.Core
import GEval.Common
import GEval.OptionsParser import GEval.OptionsParser
import GEval.BLEU import GEval.BLEU
import GEval.ClippEU import GEval.ClippEU