Speed up cartesian features
This commit is contained in:
parent
23aad86e72
commit
39bc3964b3
@ -1,6 +1,7 @@
|
|||||||
module Main where
|
module Main where
|
||||||
|
|
||||||
import GEval.Core
|
import GEval.Core
|
||||||
|
import GEval.Common
|
||||||
import GEval.OptionsParser
|
import GEval.OptionsParser
|
||||||
import GEval.ParseParams
|
import GEval.ParseParams
|
||||||
|
|
||||||
|
@ -6,6 +6,8 @@ import Data.Text.Read as TR
|
|||||||
|
|
||||||
import Data.Attoparsec.Text
|
import Data.Attoparsec.Text
|
||||||
|
|
||||||
|
type MetricValue = Double
|
||||||
|
|
||||||
-- some operations can be "hard" (on ints) or "soft" (on doubles),
|
-- some operations can be "hard" (on ints) or "soft" (on doubles),
|
||||||
-- introduce a typeclass so that we could generalise easily
|
-- introduce a typeclass so that we could generalise easily
|
||||||
class ConvertibleToDouble n where
|
class ConvertibleToDouble n where
|
||||||
|
@ -14,7 +14,6 @@ module GEval.Core
|
|||||||
Metric(..),
|
Metric(..),
|
||||||
MetricOrdering(..),
|
MetricOrdering(..),
|
||||||
getMetricOrdering,
|
getMetricOrdering,
|
||||||
MetricValue,
|
|
||||||
GEvalSpecialCommand(..),
|
GEvalSpecialCommand(..),
|
||||||
GEvalSpecification(..),
|
GEvalSpecification(..),
|
||||||
ResultOrdering(..),
|
ResultOrdering(..),
|
||||||
@ -100,8 +99,6 @@ import Data.Word
|
|||||||
|
|
||||||
import System.FilePath.Glob
|
import System.FilePath.Glob
|
||||||
|
|
||||||
type MetricValue = Double
|
|
||||||
|
|
||||||
defaultLogLossHashedSize :: Word32
|
defaultLogLossHashedSize :: Word32
|
||||||
defaultLogLossHashedSize = 10
|
defaultLogLossHashedSize = 10
|
||||||
|
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
|
||||||
module GEval.FeatureExtractor
|
module GEval.FeatureExtractor
|
||||||
(extractFeatures,
|
(extractFeatures,
|
||||||
extractFeaturesFromTabbed,
|
extractFeaturesFromTabbed,
|
||||||
cartesianFeatures,
|
cartesianFeatures,
|
||||||
|
LineWithFeatures(..),
|
||||||
|
LineWithPeggedFeatures(..),
|
||||||
|
PeggedFeature(..),
|
||||||
Feature(..))
|
Feature(..))
|
||||||
where
|
where
|
||||||
|
|
||||||
@ -16,6 +18,9 @@ import Text.WordShape
|
|||||||
import GEval.BlackBoxDebugging
|
import GEval.BlackBoxDebugging
|
||||||
import GEval.Common
|
import GEval.Common
|
||||||
|
|
||||||
|
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature
|
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
@ -23,6 +28,9 @@ instance Show Feature where
|
|||||||
show (UnaryFeature feature) = show feature
|
show (UnaryFeature feature) = show feature
|
||||||
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB)
|
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB)
|
||||||
|
|
||||||
|
data LineWithPeggedFeatures = LineWithPeggedFeatures Double MetricValue [PeggedFeature]
|
||||||
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature
|
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature
|
||||||
deriving (Eq, Ord)
|
deriving (Eq, Ord)
|
||||||
|
|
||||||
@ -81,5 +89,11 @@ extractFeaturesFromTabbed mTokenizer bbdo namespace record =
|
|||||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
||||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||||
|
|
||||||
|
addCartesianFeatures :: BlackBoxDebuggingOptions -> [LineWithPeggedFeatures] -> [LineWithFeatures]
|
||||||
|
addCartesianFeatures bbdo linesWithPeggedFeatures = addCartesianFeatures' (bbdoCartesian bbdo) linesWithPeggedFeatures
|
||||||
|
where addCartesianFeatures' _ linesWithPeggedFeatures
|
||||||
|
= Prelude.map (\(LineWithPeggedFeatures rank score fs) ->
|
||||||
|
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFeatures
|
||||||
|
|
||||||
cartesianFeatures :: [PeggedFeature] -> [Feature]
|
cartesianFeatures :: [PeggedFeature] -> [Feature]
|
||||||
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b]
|
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b]
|
||||||
|
@ -20,6 +20,7 @@ module GEval.LineByLine
|
|||||||
) where
|
) where
|
||||||
|
|
||||||
import GEval.Core
|
import GEval.Core
|
||||||
|
import GEval.Common
|
||||||
import Text.Tokenizer
|
import Text.Tokenizer
|
||||||
|
|
||||||
import Data.Conduit.AutoDecompress (doNothing)
|
import Data.Conduit.AutoDecompress (doNothing)
|
||||||
@ -56,6 +57,7 @@ import Statistics.Distribution (cumulative)
|
|||||||
import Statistics.Distribution.Normal (normalDistr)
|
import Statistics.Distribution.Normal (normalDistr)
|
||||||
|
|
||||||
import qualified Data.Map.Strict as M
|
import qualified Data.Map.Strict as M
|
||||||
|
import qualified Data.Set as S
|
||||||
|
|
||||||
data LineRecord = LineRecord Text Text Text Word32 MetricValue
|
data LineRecord = LineRecord Text Text Text Word32 MetricValue
|
||||||
deriving (Eq, Show)
|
deriving (Eq, Show)
|
||||||
@ -125,22 +127,40 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
|||||||
(pack $ printf "%0.20f" p)]
|
(pack $ printf "%0.20f" p)]
|
||||||
|
|
||||||
featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
|
featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
|
||||||
featureExtractor spec bbdo = CC.map extract .| CC.concat
|
featureExtractor spec bbdo = CC.map extract
|
||||||
|
.| finalFeatures (bbdoCartesian bbdo) (bbdoMinFrequency bbdo)
|
||||||
|
.| CC.map unwrapFeatures
|
||||||
|
.| CC.concat
|
||||||
where extract (rank, line@(LineRecord _ _ _ _ score)) =
|
where extract (rank, line@(LineRecord _ _ _ _ score)) =
|
||||||
Prelude.map (\f -> RankedFeature f rank score)
|
LineWithPeggedFeatures rank score $ getFeatures mTokenizer bbdo line
|
||||||
$ getFeatures mTokenizer bbdo line
|
|
||||||
mTokenizer = gesTokenizer spec
|
mTokenizer = gesTokenizer spec
|
||||||
|
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
|
||||||
|
|
||||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [Feature]
|
finalFeatures False _ = CC.map peggedToUnaryLine
|
||||||
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = Prelude.map UnaryFeature unaryFeatures ++
|
finalFeatures True minFreq = do
|
||||||
if bbdoCartesian bbdo
|
ls <- CC.sinkList
|
||||||
then cartesianFeatures unaryFeatures
|
let unaryFeaturesFrequentEnough = S.fromList
|
||||||
else []
|
$ Prelude.map (\(f, c) -> f)
|
||||||
where unaryFeatures =
|
$ Prelude.filter (\(f, c) -> c >= minFreq)
|
||||||
Data.List.concat [
|
$ M.toList
|
||||||
extractFeatures mTokenizer bbdo "exp" expLine,
|
$ M.fromListWith (+)
|
||||||
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
$ Data.List.concat
|
||||||
extractFeatures mTokenizer bbdo "out" outLine]
|
$ Prelude.map (\(LineWithPeggedFeatures _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls
|
||||||
|
|
||||||
|
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
|
||||||
|
where addCartesian wanted (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score
|
||||||
|
$ ((Prelude.map UnaryFeature fs) ++
|
||||||
|
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs))
|
||||||
|
|
||||||
|
peggedToUnaryLine :: LineWithPeggedFeatures -> LineWithFeatures
|
||||||
|
peggedToUnaryLine (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
|
||||||
|
|
||||||
|
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFeature]
|
||||||
|
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
||||||
|
Data.List.concat [
|
||||||
|
extractFeatures mTokenizer bbdo "exp" expLine,
|
||||||
|
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||||
|
extractFeatures mTokenizer bbdo "out" outLine]
|
||||||
|
|
||||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||||
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
||||||
|
@ -23,6 +23,7 @@ import Data.String.Here
|
|||||||
import Data.Monoid ((<>))
|
import Data.Monoid ((<>))
|
||||||
|
|
||||||
import GEval.Core
|
import GEval.Core
|
||||||
|
import GEval.Common
|
||||||
import GEval.CreateChallenge
|
import GEval.CreateChallenge
|
||||||
import GEval.LineByLine
|
import GEval.LineByLine
|
||||||
import GEval.Submit (submit)
|
import GEval.Submit (submit)
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
import Test.Hspec
|
import Test.Hspec
|
||||||
|
|
||||||
import GEval.Core
|
import GEval.Core
|
||||||
|
import GEval.Common
|
||||||
import GEval.OptionsParser
|
import GEval.OptionsParser
|
||||||
import GEval.BLEU
|
import GEval.BLEU
|
||||||
import GEval.ClippEU
|
import GEval.ClippEU
|
||||||
|
Loading…
Reference in New Issue
Block a user