Finish filtering

This commit is contained in:
Filip Gralinski 2020-05-13 15:34:16 +02:00
parent 78d2cd6501
commit 634dd21b12
7 changed files with 97 additions and 40 deletions

View File

@ -49,7 +49,7 @@ module GEval.Core
readHeaderFileWrapper, readHeaderFileWrapper,
getInHeader, getInHeader,
getOutHeader, getOutHeader,
addPreprocessing, addSchemeSpecifics,
LineSourcesSpecification(..), LineSourcesSpecification(..),
dataSourceToLineSourcesSpecification, dataSourceToLineSourcesSpecification,
fromSpecificationToWithoutInput, fromSpecificationToWithoutInput,
@ -140,9 +140,13 @@ isBetter metric valA valB = isBetter' metricOrdering valA valB
isBetter' TheLowerTheBetter valA valB = valA < valB isBetter' TheLowerTheBetter valA valB = valA < valB
metricOrdering = getMetricOrdering metric metricOrdering = getMetricOrdering metric
isInputNeeded :: Metric -> Bool isInputNeeded :: EvaluationScheme -> Bool
isInputNeeded CharMatch = True isInputNeeded (EvaluationScheme CharMatch _) = True
isInputNeeded _ = False isInputNeeded (EvaluationScheme _ ops) = hasFiltering ops
hasFiltering [] = False
hasFiltering ((FeatureFilter _):_) = True
hasFiltering (_:ops) = hasFiltering ops
-- | Could output be preprocessable -- | Could output be preprocessable
isPreprocessable :: Metric -> Bool isPreprocessable :: Metric -> Bool
@ -312,18 +316,19 @@ gevalOnSingleOut gevalSpec dataSource = do
vals <- Prelude.mapM (\scheme -> vals <- Prelude.mapM (\scheme ->
gevalCore (evaluationSchemeMetric scheme) gevalCore (evaluationSchemeMetric scheme)
(gesBootstrapResampling gevalSpec) (gesBootstrapResampling gevalSpec)
(addPreprocessing (applyPreprocessingOperations scheme) dataSource)) (addSchemeSpecifics scheme dataSource))
schemes schemes
return (outSource, vals) return (outSource, vals)
where outSource = dataSourceOut dataSource where outSource = dataSourceOut dataSource
schemes = gesMetrics gevalSpec schemes = gesMetrics gevalSpec
addPreprocessing :: (Text -> Text) -> DataSource -> DataSource addSchemeSpecifics :: EvaluationScheme -> DataSource -> DataSource
addPreprocessing prep dataSource = addSchemeSpecifics scheme dataSource =
dataSource { dataSource {
dataSourceChallengeData = (dataSourceChallengeData dataSource) { dataSourceChallengeData = (dataSourceChallengeData dataSource) {
challengeDataSourceFilter = getFilterForScheme (challengeDataSourceInHeader $ dataSourceChallengeData dataSource) scheme,
challengeDataSourcePreprocess = challengeDataSourcePreprocess =
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . prep }} (challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . (applyPreprocessingOperations scheme) }}
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader) readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
readHeaderFileWrapper Nothing = return Nothing readHeaderFileWrapper Nothing = return Nothing
@ -352,7 +357,7 @@ checkAndGetDataSources forceInput gevalSpec = do
throwM $ NoExpectedDirectory d throwM $ NoExpectedDirectory d
Right expectedSource -> do Right expectedSource -> do
-- in most cases inputSource is NoSource (unless needed by a metric or in the line-by-line mode) -- in most cases inputSource is NoSource (unless needed by a metric or in the line-by-line mode)
inputSource <- getInputSourceIfNeeded forceInput (Prelude.map evaluationSchemeMetric schemes) expectedTestDirectory inputFile inputSource <- getInputSourceIfNeeded forceInput schemes expectedTestDirectory inputFile
mMultipleOuts <- checkMultipleOuts gevalSpec mMultipleOuts <- checkMultipleOuts gevalSpec
osss <- case mMultipleOuts of osss <- case mMultipleOuts of
@ -442,9 +447,9 @@ getOutFile gevalSpec out = outDirectory </> testName </> out
where outDirectory = gesOutDirectory gevalSpec where outDirectory = gesOutDirectory gevalSpec
testName = gesTestName gevalSpec testName = gesTestName gevalSpec
getInputSourceIfNeeded :: Bool -> [Metric] -> FilePath -> FilePath -> IO SourceSpec getInputSourceIfNeeded :: Bool -> [EvaluationScheme] -> FilePath -> FilePath -> IO SourceSpec
getInputSourceIfNeeded forced metrics directory inputFilePath getInputSourceIfNeeded forced schemes directory inputFilePath
| forced || (Prelude.any isInputNeeded metrics) = do | forced || (Prelude.any isInputNeeded schemes) = do
iss <- getSmartSourceSpec directory defaultInputFile inputFilePath iss <- getSmartSourceSpec directory defaultInputFile inputFilePath
case iss of case iss of
Left NoSpecGiven -> throwM $ NoInputFile inputFilePath Left NoSpecGiven -> throwM $ NoInputFile inputFilePath
@ -950,9 +955,12 @@ defineContinuation aggregator finalStep generateGraph = do
v <- aggregator v <- aggregator
return $ MetricOutput (SimpleRun $ finalStep v) (generateGraph v) return $ MetricOutput (SimpleRun $ finalStep v) (generateGraph v)
fromSpecificationToWithoutInput lsSpec = WithoutInput expectedSource outSource fromSpecificationToWithoutInput lsSpec = case lineSourcesFilter lsSpec of
NoFilter -> WithoutInput expectedSource outSource
theFilter -> WithoutInputButFiltered theFilter inputSource expectedSource outSource
where expectedSource = lineSourcesExpectedSource lsSpec where expectedSource = lineSourcesExpectedSource lsSpec
outSource = lineSourcesOutputSource lsSpec outSource = lineSourcesOutputSource lsSpec
inputSource = lineSourcesInputSource lsSpec
fromSpecificationToWithInput lsSpec = WithInput theFilter inpSource expectedSource outSource fromSpecificationToWithInput lsSpec = WithInput theFilter inpSource expectedSource outSource
where inpSource = lineSourcesInputSource lsSpec where inpSource = lineSourcesInputSource lsSpec

View File

@ -4,13 +4,17 @@ module GEval.DataSource
(ChallengeDataSource(..), (ChallengeDataSource(..),
DataSource(..), DataSource(..),
TargetRecord(..), TargetRecord(..),
Filter, Filter(..),
noFilter, noFilter,
getFilterForScheme,
applyFilter) applyFilter)
where where
import GEval.Common (SourceItem(..)) import GEval.Common (SourceItem(..))
import GEval.Selector (ItemTarget(..)) import GEval.Selector (ItemTarget(..), TargetRecord(..))
import GEval.FeatureExtractor (getFeatures)
import GEval.BlackBoxDebugging
import GEval.EvaluationScheme
import Data.Text import Data.Text
@ -18,17 +22,42 @@ import Data.Conduit.SmartSource
import Data.Conduit.Header import Data.Conduit.Header
import GEval.Selector import GEval.Selector
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget) data Filter = NoFilter | FilterByFeatures (Maybe TabularHeader) String
data Filter = NoFilter | InputFilter (Text -> Bool)
noFilter :: Filter noFilter :: Filter
noFilter = NoFilter noFilter = NoFilter
applyFilter :: Filter -> TargetRecord -> Bool applyFilter :: Filter -> TargetRecord -> Bool
applyFilter NoFilter _ = True applyFilter NoFilter _ = True
applyFilter (InputFilter fun) (TargetRecord (Got (RawItemTarget t)) _ _) = fun t applyFilter (FilterByFeatures mInHeader featureSpec) tR = applyFeatureFilter mInHeader featureSpec tR
applyFilter (InputFilter fun) (TargetRecord (Got (PartiallyParsedItemTarget ts)) _ _) = fun (intercalate "\t" ts)
getFilterForScheme :: Maybe TabularHeader -> EvaluationScheme -> Filter
getFilterForScheme mTabHeader (EvaluationScheme _ ops) = case findFilter ops of
Nothing -> NoFilter
Just f -> FilterByFeatures mTabHeader (unpack $ fixIndex f)
fixIndex = replace "[" "<" . replace "]" ">"
findFilter :: [PreprocessingOperation] -> Maybe Text
findFilter [] = Nothing
findFilter ((FeatureFilter f):_) = Just f
findFilter (_:ops) = findFilter ops
applyFeatureFilter :: Maybe TabularHeader -> String -> TargetRecord -> Bool
applyFeatureFilter mInHeader featureSpec tR = featureSpec `elem` (Prelude.map show fs)
where fs = getFeatures Nothing
BlackBoxDebuggingOptions {
bbdoMinFrequency = 0,
bbdoWordShapes = False,
bbdoBigrams = False,
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = False}
Nothing
tR
mInHeader
-- | This type specifies the way the challenge data (input and -- | This type specifies the way the challenge data (input and
-- expected data, but not outputs) flow into evaluation. -- expected data, but not outputs) flow into evaluation.

View File

@ -21,7 +21,8 @@ module GEval.FeatureExtractor
ReferencesData(..), ReferencesData(..),
FeatureIndex(..), FeatureIndex(..),
toTextualContent, toTextualContent,
filterExistentialFactors) filterExistentialFactors,
getFeatures)
where where
import Data.Text import Data.Text
@ -32,6 +33,7 @@ import Text.Tokenizer
import Text.WordShape import Text.WordShape
import GEval.BlackBoxDebugging import GEval.BlackBoxDebugging
import GEval.Common import GEval.Common
import GEval.Selector (TargetRecord(..), ItemTarget(..))
import Text.Read (readMaybe) import Text.Read (readMaybe)
import Control.Error.Util (hush) import Control.Error.Util (hush)
@ -256,3 +258,24 @@ instance WithTextualContent ExistentialFactor where
instance WithTextualContent AtomicFactor where instance WithTextualContent AtomicFactor where
toTextualContent (TextFactor t) = Just t toTextualContent (TextFactor t) = Just t
toTextualContent (ShapeFactor _) = Nothing toTextualContent (ShapeFactor _) = Nothing
getFeatures :: Maybe Tokenizer
-> BlackBoxDebuggingOptions
-> Maybe References
-> TargetRecord
-> Maybe TabularHeader
-> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (TargetRecord inLine expLine outLine) mInHeader =
Data.List.concat [
extractFactors mTokenizer bbdo mReferencesData "exp" (asText expLine),
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" (asText inLine) mInHeader,
extractFactors mTokenizer bbdo mReferencesData "out" (asText outLine)]
where mReferencesData = case mReferences of
Just references -> Just $ ReferencesData {
referencesDataReferences = references,
referencesDataCurrentId = Nothing }
Nothing -> Nothing
asText (Got (RawItemTarget t)) = t
asText (Got (PartiallyParsedItemTarget ts)) = Data.Text.intercalate "\t" ts
asText (Wrong _) = ""
asText GEval.Common.Done = ""

View File

@ -299,7 +299,11 @@ featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDeb
featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo)) .| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
where extract (rank, line) = where extract (rank, line) =
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line) mInHeader) (line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (lineToTargetRecord $ mainLineRecord line) mInHeader)
lineToTargetRecord (LineRecord inp exp out _ _) = TargetRecord (Got (RawItemTarget inp))
(Got (RawItemTarget exp))
(Got (RawItemTarget out))
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m () finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p)) finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
@ -331,18 +335,6 @@ filtreCartesian True = CC.concatMapAccum step S.empty
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs) peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> Maybe TabularHeader -> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) mInHeader =
Data.List.concat [
extractFactors mTokenizer bbdo mReferencesData "exp" expLine,
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine mInHeader,
extractFactors mTokenizer bbdo mReferencesData "out" outLine]
where mReferencesData = case mReferences of
Just references -> Just $ ReferencesData {
referencesDataReferences = references,
referencesDataCurrentId = Nothing }
Nothing -> Nothing
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue] | NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
| LengthAggregate [Double] [MetricValue] [Int] | LengthAggregate [Double] [MetricValue] [Int]
@ -475,7 +467,7 @@ runLineByLineGeneralized ordering spec consum = do
return $ Just references return $ Just references
Nothing -> return Nothing Nothing -> return Nothing
dataSource' <- checkAndGetDataSource True spec dataSource' <- checkAndGetDataSource True spec
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource' let dataSource = addSchemeSpecifics scheme dataSource'
gevalLineByLineCore metric dataSource (sorter ordering .| consum mReferences) gevalLineByLineCore metric dataSource (sorter ordering .| consum mReferences)
where metric = gesMainMetric spec where metric = gesMainMetric spec
scheme = gesMainScheme spec scheme = gesMainScheme spec
@ -526,7 +518,7 @@ runOracleItemBased spec = runMultiOutputGeneralized spec consum
runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO () runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO ()
runMultiOutputGeneralized spec consum = do runMultiOutputGeneralized spec consum = do
dataSource' <- checkAndGetDataSource True spec dataSource' <- checkAndGetDataSource True spec
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource' let dataSource = addSchemeSpecifics scheme dataSource'
let (Just altOuts) = gesAltOutFiles spec let (Just altOuts) = gesAltOutFiles spec
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
let altSourceSpecs = rights altSourceSpecs' let altSourceSpecs = rights altSourceSpecs'

View File

@ -4,10 +4,13 @@ module GEval.Selector
( Selector(..), ( Selector(..),
DataFormat(..), DataFormat(..),
ItemTarget(..), ItemTarget(..),
TargetRecord(..),
liftOp, liftOp,
select, select,
parseSelector ) where parseSelector ) where
import GEval.Common (SourceItem(..))
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Vector as V import qualified Data.Vector as V
import qualified Data.Text.Encoding as DTE import qualified Data.Text.Encoding as DTE
@ -30,6 +33,8 @@ data DataFormat = Tsv | Jsonl
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text] data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
deriving (Eq, Show) deriving (Eq, Show)
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
parseSelector :: String -> Selector parseSelector :: String -> Selector
parseSelector = Selector . T.splitOn "/" . T.pack parseSelector = Selector . T.splitOn "/" . T.pack

View File

@ -136,7 +136,7 @@ main = hspec $ do
it "sorted" $ it "sorted" $
runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75 runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75
it "with filtering" $ it "with filtering" $
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 1.0 runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 0.6666
describe "F-measure" $ do describe "F-measure" $ do
it "simple example" $ it "simple example" $
runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857 runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857

View File

@ -1,5 +1,5 @@
foo foo
bar bar
bar bar
bar baz
bar bar

1 foo
2 bar
3 bar
4 bar baz
5 bar