Speed up cartesian features
This commit is contained in:
parent
23aad86e72
commit
39bc3964b3
@ -1,6 +1,7 @@
|
||||
module Main where
|
||||
|
||||
import GEval.Core
|
||||
import GEval.Common
|
||||
import GEval.OptionsParser
|
||||
import GEval.ParseParams
|
||||
|
||||
|
@ -6,6 +6,8 @@ import Data.Text.Read as TR
|
||||
|
||||
import Data.Attoparsec.Text
|
||||
|
||||
type MetricValue = Double
|
||||
|
||||
-- some operations can be "hard" (on ints) or "soft" (on doubles),
|
||||
-- introduce a typeclass so that we could generalise easily
|
||||
class ConvertibleToDouble n where
|
||||
|
@ -14,7 +14,6 @@ module GEval.Core
|
||||
Metric(..),
|
||||
MetricOrdering(..),
|
||||
getMetricOrdering,
|
||||
MetricValue,
|
||||
GEvalSpecialCommand(..),
|
||||
GEvalSpecification(..),
|
||||
ResultOrdering(..),
|
||||
@ -100,8 +99,6 @@ import Data.Word
|
||||
|
||||
import System.FilePath.Glob
|
||||
|
||||
type MetricValue = Double
|
||||
|
||||
defaultLogLossHashedSize :: Word32
|
||||
defaultLogLossHashedSize = 10
|
||||
|
||||
|
@ -1,10 +1,12 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
|
||||
module GEval.FeatureExtractor
|
||||
(extractFeatures,
|
||||
extractFeaturesFromTabbed,
|
||||
cartesianFeatures,
|
||||
LineWithFeatures(..),
|
||||
LineWithPeggedFeatures(..),
|
||||
PeggedFeature(..),
|
||||
Feature(..))
|
||||
where
|
||||
|
||||
@ -16,6 +18,9 @@ import Text.WordShape
|
||||
import GEval.BlackBoxDebugging
|
||||
import GEval.Common
|
||||
|
||||
data LineWithFeatures = LineWithFeatures Double MetricValue [Feature]
|
||||
deriving (Eq, Ord)
|
||||
|
||||
data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature
|
||||
deriving (Eq, Ord)
|
||||
|
||||
@ -23,6 +28,9 @@ instance Show Feature where
|
||||
show (UnaryFeature feature) = show feature
|
||||
show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB)
|
||||
|
||||
data LineWithPeggedFeatures = LineWithPeggedFeatures Double MetricValue [PeggedFeature]
|
||||
deriving (Eq, Ord)
|
||||
|
||||
data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature
|
||||
deriving (Eq, Ord)
|
||||
|
||||
@ -81,5 +89,11 @@ extractFeaturesFromTabbed mTokenizer bbdo namespace record =
|
||||
$ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t)
|
||||
$ Prelude.zip [1..] (splitOn "\t" record)
|
||||
|
||||
addCartesianFeatures :: BlackBoxDebuggingOptions -> [LineWithPeggedFeatures] -> [LineWithFeatures]
|
||||
addCartesianFeatures bbdo linesWithPeggedFeatures = addCartesianFeatures' (bbdoCartesian bbdo) linesWithPeggedFeatures
|
||||
where addCartesianFeatures' _ linesWithPeggedFeatures
|
||||
= Prelude.map (\(LineWithPeggedFeatures rank score fs) ->
|
||||
LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFeatures
|
||||
|
||||
cartesianFeatures :: [PeggedFeature] -> [Feature]
|
||||
cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b]
|
||||
|
@ -20,6 +20,7 @@ module GEval.LineByLine
|
||||
) where
|
||||
|
||||
import GEval.Core
|
||||
import GEval.Common
|
||||
import Text.Tokenizer
|
||||
|
||||
import Data.Conduit.AutoDecompress (doNothing)
|
||||
@ -56,6 +57,7 @@ import Statistics.Distribution (cumulative)
|
||||
import Statistics.Distribution.Normal (normalDistr)
|
||||
|
||||
import qualified Data.Map.Strict as M
|
||||
import qualified Data.Set as S
|
||||
|
||||
data LineRecord = LineRecord Text Text Text Word32 MetricValue
|
||||
deriving (Eq, Show)
|
||||
@ -125,22 +127,40 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
|
||||
(pack $ printf "%0.20f" p)]
|
||||
|
||||
featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m ()
|
||||
featureExtractor spec bbdo = CC.map extract .| CC.concat
|
||||
featureExtractor spec bbdo = CC.map extract
|
||||
.| finalFeatures (bbdoCartesian bbdo) (bbdoMinFrequency bbdo)
|
||||
.| CC.map unwrapFeatures
|
||||
.| CC.concat
|
||||
where extract (rank, line@(LineRecord _ _ _ _ score)) =
|
||||
Prelude.map (\f -> RankedFeature f rank score)
|
||||
$ getFeatures mTokenizer bbdo line
|
||||
LineWithPeggedFeatures rank score $ getFeatures mTokenizer bbdo line
|
||||
mTokenizer = gesTokenizer spec
|
||||
unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs
|
||||
|
||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [Feature]
|
||||
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = Prelude.map UnaryFeature unaryFeatures ++
|
||||
if bbdoCartesian bbdo
|
||||
then cartesianFeatures unaryFeatures
|
||||
else []
|
||||
where unaryFeatures =
|
||||
Data.List.concat [
|
||||
extractFeatures mTokenizer bbdo "exp" expLine,
|
||||
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||
extractFeatures mTokenizer bbdo "out" outLine]
|
||||
finalFeatures False _ = CC.map peggedToUnaryLine
|
||||
finalFeatures True minFreq = do
|
||||
ls <- CC.sinkList
|
||||
let unaryFeaturesFrequentEnough = S.fromList
|
||||
$ Prelude.map (\(f, c) -> f)
|
||||
$ Prelude.filter (\(f, c) -> c >= minFreq)
|
||||
$ M.toList
|
||||
$ M.fromListWith (+)
|
||||
$ Data.List.concat
|
||||
$ Prelude.map (\(LineWithPeggedFeatures _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls
|
||||
|
||||
(CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough)
|
||||
where addCartesian wanted (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score
|
||||
$ ((Prelude.map UnaryFeature fs) ++
|
||||
(cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs))
|
||||
|
||||
peggedToUnaryLine :: LineWithPeggedFeatures -> LineWithFeatures
|
||||
peggedToUnaryLine (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs)
|
||||
|
||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFeature]
|
||||
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
|
||||
Data.List.concat [
|
||||
extractFeatures mTokenizer bbdo "exp" expLine,
|
||||
extractFeaturesFromTabbed mTokenizer bbdo "in" inLine,
|
||||
extractFeatures mTokenizer bbdo "out" outLine]
|
||||
|
||||
uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) ()
|
||||
uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1)))
|
||||
|
@ -23,6 +23,7 @@ import Data.String.Here
|
||||
import Data.Monoid ((<>))
|
||||
|
||||
import GEval.Core
|
||||
import GEval.Common
|
||||
import GEval.CreateChallenge
|
||||
import GEval.LineByLine
|
||||
import GEval.Submit (submit)
|
||||
|
@ -4,6 +4,7 @@
|
||||
import Test.Hspec
|
||||
|
||||
import GEval.Core
|
||||
import GEval.Common
|
||||
import GEval.OptionsParser
|
||||
import GEval.BLEU
|
||||
import GEval.ClippEU
|
||||
|
Loading…
Reference in New Issue
Block a user