diff --git a/app/Main.hs b/app/Main.hs index f7a8db6..6f3d412 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,6 +1,7 @@ module Main where import GEval.Core +import GEval.Common import GEval.OptionsParser import GEval.ParseParams diff --git a/src/GEval/Common.hs b/src/GEval/Common.hs index 8439408..ec921db 100644 --- a/src/GEval/Common.hs +++ b/src/GEval/Common.hs @@ -6,6 +6,8 @@ import Data.Text.Read as TR import Data.Attoparsec.Text +type MetricValue = Double + -- some operations can be "hard" (on ints) or "soft" (on doubles), -- introduce a typeclass so that we could generalise easily class ConvertibleToDouble n where diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs index 14cc748..0f97c31 100644 --- a/src/GEval/Core.hs +++ b/src/GEval/Core.hs @@ -14,7 +14,6 @@ module GEval.Core Metric(..), MetricOrdering(..), getMetricOrdering, - MetricValue, GEvalSpecialCommand(..), GEvalSpecification(..), ResultOrdering(..), @@ -100,8 +99,6 @@ import Data.Word import System.FilePath.Glob -type MetricValue = Double - defaultLogLossHashedSize :: Word32 defaultLogLossHashedSize = 10 diff --git a/src/GEval/FeatureExtractor.hs b/src/GEval/FeatureExtractor.hs index 103c371..d9af354 100644 --- a/src/GEval/FeatureExtractor.hs +++ b/src/GEval/FeatureExtractor.hs @@ -1,10 +1,12 @@ {-# LANGUAGE OverloadedStrings #-} - module GEval.FeatureExtractor (extractFeatures, extractFeaturesFromTabbed, cartesianFeatures, + LineWithFeatures(..), + LineWithPeggedFeatures(..), + PeggedFeature(..), Feature(..)) where @@ -16,6 +18,9 @@ import Text.WordShape import GEval.BlackBoxDebugging import GEval.Common +data LineWithFeatures = LineWithFeatures Double MetricValue [Feature] + deriving (Eq, Ord) + data Feature = UnaryFeature PeggedFeature | CartesianFeature PeggedFeature PeggedFeature deriving (Eq, Ord) @@ -23,6 +28,9 @@ instance Show Feature where show (UnaryFeature feature) = show feature show (CartesianFeature featureA featureB) = (show featureA) ++ "~~" ++ (show featureB) +data LineWithPeggedFeatures = LineWithPeggedFeatures Double MetricValue [PeggedFeature] + deriving (Eq, Ord) + data PeggedFeature = PeggedFeature FeatureNamespace SimpleFeature deriving (Eq, Ord) @@ -81,5 +89,11 @@ extractFeaturesFromTabbed mTokenizer bbdo namespace record = $ Prelude.map (\(n, t) -> Prelude.map (\af -> PeggedFeature (FeatureTabbedNamespace namespace n) af) $ extractSimpleFeatures mTokenizer bbdo t) $ Prelude.zip [1..] (splitOn "\t" record) +addCartesianFeatures :: BlackBoxDebuggingOptions -> [LineWithPeggedFeatures] -> [LineWithFeatures] +addCartesianFeatures bbdo linesWithPeggedFeatures = addCartesianFeatures' (bbdoCartesian bbdo) linesWithPeggedFeatures + where addCartesianFeatures' _ linesWithPeggedFeatures + = Prelude.map (\(LineWithPeggedFeatures rank score fs) -> + LineWithFeatures rank score (Prelude.map UnaryFeature fs)) linesWithPeggedFeatures + cartesianFeatures :: [PeggedFeature] -> [Feature] cartesianFeatures features = nub $ [CartesianFeature a b | a <- features, b <- features, a < b] diff --git a/src/GEval/LineByLine.hs b/src/GEval/LineByLine.hs index 70aaa55..609b386 100644 --- a/src/GEval/LineByLine.hs +++ b/src/GEval/LineByLine.hs @@ -20,6 +20,7 @@ module GEval.LineByLine ) where import GEval.Core +import GEval.Common import Text.Tokenizer import Data.Conduit.AutoDecompress (doNothing) @@ -56,6 +57,7 @@ import Statistics.Distribution (cumulative) import Statistics.Distribution.Normal (normalDistr) import qualified Data.Map.Strict as M +import qualified Data.Set as S data LineRecord = LineRecord Text Text Text Word32 MetricValue deriving (Eq, Show) @@ -125,22 +127,40 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) = (pack $ printf "%0.20f" p)] featureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFeature m () -featureExtractor spec bbdo = CC.map extract .| CC.concat +featureExtractor spec bbdo = CC.map extract + .| finalFeatures (bbdoCartesian bbdo) (bbdoMinFrequency bbdo) + .| CC.map unwrapFeatures + .| CC.concat where extract (rank, line@(LineRecord _ _ _ _ score)) = - Prelude.map (\f -> RankedFeature f rank score) - $ getFeatures mTokenizer bbdo line + LineWithPeggedFeatures rank score $ getFeatures mTokenizer bbdo line mTokenizer = gesTokenizer spec + unwrapFeatures (LineWithFeatures rank score fs) = Prelude.map (\f -> RankedFeature f rank score) fs -getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [Feature] -getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = Prelude.map UnaryFeature unaryFeatures ++ - if bbdoCartesian bbdo - then cartesianFeatures unaryFeatures - else [] - where unaryFeatures = - Data.List.concat [ - extractFeatures mTokenizer bbdo "exp" expLine, - extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, - extractFeatures mTokenizer bbdo "out" outLine] +finalFeatures False _ = CC.map peggedToUnaryLine +finalFeatures True minFreq = do + ls <- CC.sinkList + let unaryFeaturesFrequentEnough = S.fromList + $ Prelude.map (\(f, c) -> f) + $ Prelude.filter (\(f, c) -> c >= minFreq) + $ M.toList + $ M.fromListWith (+) + $ Data.List.concat + $ Prelude.map (\(LineWithPeggedFeatures _ _ fs) -> Prelude.map (\f -> (f, 1)) fs) ls + + (CC.yieldMany $ ls) .| CC.map (addCartesian unaryFeaturesFrequentEnough) + where addCartesian wanted (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score + $ ((Prelude.map UnaryFeature fs) ++ + (cartesianFeatures $ Prelude.filter ((flip S.member) wanted) fs)) + +peggedToUnaryLine :: LineWithPeggedFeatures -> LineWithFeatures +peggedToUnaryLine (LineWithPeggedFeatures rank score fs) = LineWithFeatures rank score (Prelude.map UnaryFeature fs) + +getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFeature] +getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) = + Data.List.concat [ + extractFeatures mTokenizer bbdo "exp" expLine, + extractFeaturesFromTabbed mTokenizer bbdo "in" inLine, + extractFeatures mTokenizer bbdo "out" outLine] uScoresCounter :: Monad m => Integer -> ConduitT RankedFeature FeatureWithPValue (StateT Integer m) () uScoresCounter minFreq = CC.map (\(RankedFeature feature r score) -> (feature, (r, score, 1))) diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs index 756bb07..9c53e54 100644 --- a/src/GEval/OptionsParser.hs +++ b/src/GEval/OptionsParser.hs @@ -23,6 +23,7 @@ import Data.String.Here import Data.Monoid ((<>)) import GEval.Core +import GEval.Common import GEval.CreateChallenge import GEval.LineByLine import GEval.Submit (submit) diff --git a/test/Spec.hs b/test/Spec.hs index 5829f73..5aa5a68 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -4,6 +4,7 @@ import Test.Hspec import GEval.Core +import GEval.Common import GEval.OptionsParser import GEval.BLEU import GEval.ClippEU