Finish filtering

This commit is contained in:
Filip Gralinski 2020-05-13 15:34:16 +02:00
parent 78d2cd6501
commit 634dd21b12
7 changed files with 97 additions and 40 deletions

View File

@ -49,7 +49,7 @@ module GEval.Core
readHeaderFileWrapper,
getInHeader,
getOutHeader,
addPreprocessing,
addSchemeSpecifics,
LineSourcesSpecification(..),
dataSourceToLineSourcesSpecification,
fromSpecificationToWithoutInput,
@ -140,9 +140,13 @@ isBetter metric valA valB = isBetter' metricOrdering valA valB
isBetter' TheLowerTheBetter valA valB = valA < valB
metricOrdering = getMetricOrdering metric
isInputNeeded :: Metric -> Bool
isInputNeeded CharMatch = True
isInputNeeded _ = False
isInputNeeded :: EvaluationScheme -> Bool
isInputNeeded (EvaluationScheme CharMatch _) = True
isInputNeeded (EvaluationScheme _ ops) = hasFiltering ops
hasFiltering [] = False
hasFiltering ((FeatureFilter _):_) = True
hasFiltering (_:ops) = hasFiltering ops
-- | Could output be preprocessable
isPreprocessable :: Metric -> Bool
@ -312,18 +316,19 @@ gevalOnSingleOut gevalSpec dataSource = do
vals <- Prelude.mapM (\scheme ->
gevalCore (evaluationSchemeMetric scheme)
(gesBootstrapResampling gevalSpec)
(addPreprocessing (applyPreprocessingOperations scheme) dataSource))
(addSchemeSpecifics scheme dataSource))
schemes
return (outSource, vals)
where outSource = dataSourceOut dataSource
schemes = gesMetrics gevalSpec
addPreprocessing :: (Text -> Text) -> DataSource -> DataSource
addPreprocessing prep dataSource =
addSchemeSpecifics :: EvaluationScheme -> DataSource -> DataSource
addSchemeSpecifics scheme dataSource =
dataSource {
dataSourceChallengeData = (dataSourceChallengeData dataSource) {
challengeDataSourceFilter = getFilterForScheme (challengeDataSourceInHeader $ dataSourceChallengeData dataSource) scheme,
challengeDataSourcePreprocess =
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . prep }}
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . (applyPreprocessingOperations scheme) }}
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
readHeaderFileWrapper Nothing = return Nothing
@ -352,7 +357,7 @@ checkAndGetDataSources forceInput gevalSpec = do
throwM $ NoExpectedDirectory d
Right expectedSource -> do
-- in most cases inputSource is NoSource (unless needed by a metric or in the line-by-line mode)
inputSource <- getInputSourceIfNeeded forceInput (Prelude.map evaluationSchemeMetric schemes) expectedTestDirectory inputFile
inputSource <- getInputSourceIfNeeded forceInput schemes expectedTestDirectory inputFile
mMultipleOuts <- checkMultipleOuts gevalSpec
osss <- case mMultipleOuts of
@ -442,9 +447,9 @@ getOutFile gevalSpec out = outDirectory </> testName </> out
where outDirectory = gesOutDirectory gevalSpec
testName = gesTestName gevalSpec
getInputSourceIfNeeded :: Bool -> [Metric] -> FilePath -> FilePath -> IO SourceSpec
getInputSourceIfNeeded forced metrics directory inputFilePath
| forced || (Prelude.any isInputNeeded metrics) = do
getInputSourceIfNeeded :: Bool -> [EvaluationScheme] -> FilePath -> FilePath -> IO SourceSpec
getInputSourceIfNeeded forced schemes directory inputFilePath
| forced || (Prelude.any isInputNeeded schemes) = do
iss <- getSmartSourceSpec directory defaultInputFile inputFilePath
case iss of
Left NoSpecGiven -> throwM $ NoInputFile inputFilePath
@ -950,9 +955,12 @@ defineContinuation aggregator finalStep generateGraph = do
v <- aggregator
return $ MetricOutput (SimpleRun $ finalStep v) (generateGraph v)
fromSpecificationToWithoutInput lsSpec = WithoutInput expectedSource outSource
where expectedSource = lineSourcesExpectedSource lsSpec
outSource = lineSourcesOutputSource lsSpec
fromSpecificationToWithoutInput lsSpec = case lineSourcesFilter lsSpec of
NoFilter -> WithoutInput expectedSource outSource
theFilter -> WithoutInputButFiltered theFilter inputSource expectedSource outSource
where expectedSource = lineSourcesExpectedSource lsSpec
outSource = lineSourcesOutputSource lsSpec
inputSource = lineSourcesInputSource lsSpec
fromSpecificationToWithInput lsSpec = WithInput theFilter inpSource expectedSource outSource
where inpSource = lineSourcesInputSource lsSpec

View File

@ -4,13 +4,17 @@ module GEval.DataSource
(ChallengeDataSource(..),
DataSource(..),
TargetRecord(..),
Filter,
Filter(..),
noFilter,
getFilterForScheme,
applyFilter)
where
import GEval.Common (SourceItem(..))
import GEval.Selector (ItemTarget(..))
import GEval.Selector (ItemTarget(..), TargetRecord(..))
import GEval.FeatureExtractor (getFeatures)
import GEval.BlackBoxDebugging
import GEval.EvaluationScheme
import Data.Text
@ -18,17 +22,42 @@ import Data.Conduit.SmartSource
import Data.Conduit.Header
import GEval.Selector
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
data Filter = NoFilter | InputFilter (Text -> Bool)
data Filter = NoFilter | FilterByFeatures (Maybe TabularHeader) String
noFilter :: Filter
noFilter = NoFilter
applyFilter :: Filter -> TargetRecord -> Bool
applyFilter NoFilter _ = True
applyFilter (InputFilter fun) (TargetRecord (Got (RawItemTarget t)) _ _) = fun t
applyFilter (InputFilter fun) (TargetRecord (Got (PartiallyParsedItemTarget ts)) _ _) = fun (intercalate "\t" ts)
applyFilter (FilterByFeatures mInHeader featureSpec) tR = applyFeatureFilter mInHeader featureSpec tR
getFilterForScheme :: Maybe TabularHeader -> EvaluationScheme -> Filter
getFilterForScheme mTabHeader (EvaluationScheme _ ops) = case findFilter ops of
Nothing -> NoFilter
Just f -> FilterByFeatures mTabHeader (unpack $ fixIndex f)
fixIndex = replace "[" "<" . replace "]" ">"
findFilter :: [PreprocessingOperation] -> Maybe Text
findFilter [] = Nothing
findFilter ((FeatureFilter f):_) = Just f
findFilter (_:ops) = findFilter ops
applyFeatureFilter :: Maybe TabularHeader -> String -> TargetRecord -> Bool
applyFeatureFilter mInHeader featureSpec tR = featureSpec `elem` (Prelude.map show fs)
where fs = getFeatures Nothing
BlackBoxDebuggingOptions {
bbdoMinFrequency = 0,
bbdoWordShapes = False,
bbdoBigrams = False,
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = False}
Nothing
tR
mInHeader
-- | This type specifies the way the challenge data (input and
-- expected data, but not outputs) flow into evaluation.

View File

@ -21,7 +21,8 @@ module GEval.FeatureExtractor
ReferencesData(..),
FeatureIndex(..),
toTextualContent,
filterExistentialFactors)
filterExistentialFactors,
getFeatures)
where
import Data.Text
@ -32,6 +33,7 @@ import Text.Tokenizer
import Text.WordShape
import GEval.BlackBoxDebugging
import GEval.Common
import GEval.Selector (TargetRecord(..), ItemTarget(..))
import Text.Read (readMaybe)
import Control.Error.Util (hush)
@ -256,3 +258,24 @@ instance WithTextualContent ExistentialFactor where
instance WithTextualContent AtomicFactor where
toTextualContent (TextFactor t) = Just t
toTextualContent (ShapeFactor _) = Nothing
getFeatures :: Maybe Tokenizer
-> BlackBoxDebuggingOptions
-> Maybe References
-> TargetRecord
-> Maybe TabularHeader
-> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (TargetRecord inLine expLine outLine) mInHeader =
Data.List.concat [
extractFactors mTokenizer bbdo mReferencesData "exp" (asText expLine),
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" (asText inLine) mInHeader,
extractFactors mTokenizer bbdo mReferencesData "out" (asText outLine)]
where mReferencesData = case mReferences of
Just references -> Just $ ReferencesData {
referencesDataReferences = references,
referencesDataCurrentId = Nothing }
Nothing -> Nothing
asText (Got (RawItemTarget t)) = t
asText (Got (PartiallyParsedItemTarget ts)) = Data.Text.intercalate "\t" ts
asText (Wrong _) = ""
asText GEval.Common.Done = ""

View File

@ -299,7 +299,11 @@ featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDeb
featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
where extract (rank, line) =
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line) mInHeader)
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (lineToTargetRecord $ mainLineRecord line) mInHeader)
lineToTargetRecord (LineRecord inp exp out _ _) = TargetRecord (Got (RawItemTarget inp))
(Got (RawItemTarget exp))
(Got (RawItemTarget out))
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
@ -331,18 +335,6 @@ filtreCartesian True = CC.concatMapAccum step S.empty
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> Maybe TabularHeader -> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) mInHeader =
Data.List.concat [
extractFactors mTokenizer bbdo mReferencesData "exp" expLine,
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine mInHeader,
extractFactors mTokenizer bbdo mReferencesData "out" outLine]
where mReferencesData = case mReferences of
Just references -> Just $ ReferencesData {
referencesDataReferences = references,
referencesDataCurrentId = Nothing }
Nothing -> Nothing
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
| LengthAggregate [Double] [MetricValue] [Int]
@ -475,7 +467,7 @@ runLineByLineGeneralized ordering spec consum = do
return $ Just references
Nothing -> return Nothing
dataSource' <- checkAndGetDataSource True spec
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource'
let dataSource = addSchemeSpecifics scheme dataSource'
gevalLineByLineCore metric dataSource (sorter ordering .| consum mReferences)
where metric = gesMainMetric spec
scheme = gesMainScheme spec
@ -526,7 +518,7 @@ runOracleItemBased spec = runMultiOutputGeneralized spec consum
runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO ()
runMultiOutputGeneralized spec consum = do
dataSource' <- checkAndGetDataSource True spec
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource'
let dataSource = addSchemeSpecifics scheme dataSource'
let (Just altOuts) = gesAltOutFiles spec
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
let altSourceSpecs = rights altSourceSpecs'

View File

@ -4,10 +4,13 @@ module GEval.Selector
( Selector(..),
DataFormat(..),
ItemTarget(..),
TargetRecord(..),
liftOp,
select,
parseSelector ) where
import GEval.Common (SourceItem(..))
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Text.Encoding as DTE
@ -30,6 +33,8 @@ data DataFormat = Tsv | Jsonl
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
deriving (Eq, Show)
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
parseSelector :: String -> Selector
parseSelector = Selector . T.splitOn "/" . T.pack

View File

@ -136,7 +136,7 @@ main = hspec $ do
it "sorted" $
runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75
it "with filtering" $
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 1.0
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 0.6666
describe "F-measure" $ do
it "simple example" $
runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857

View File

@ -1,5 +1,5 @@
foo
bar
bar
bar
baz
bar

1 foo
2 bar
3 bar
4 bar baz
5 bar