Finish filtering
This commit is contained in:
parent
78d2cd6501
commit
634dd21b12
@ -49,7 +49,7 @@ module GEval.Core
|
||||
readHeaderFileWrapper,
|
||||
getInHeader,
|
||||
getOutHeader,
|
||||
addPreprocessing,
|
||||
addSchemeSpecifics,
|
||||
LineSourcesSpecification(..),
|
||||
dataSourceToLineSourcesSpecification,
|
||||
fromSpecificationToWithoutInput,
|
||||
@ -140,9 +140,13 @@ isBetter metric valA valB = isBetter' metricOrdering valA valB
|
||||
isBetter' TheLowerTheBetter valA valB = valA < valB
|
||||
metricOrdering = getMetricOrdering metric
|
||||
|
||||
isInputNeeded :: Metric -> Bool
|
||||
isInputNeeded CharMatch = True
|
||||
isInputNeeded _ = False
|
||||
isInputNeeded :: EvaluationScheme -> Bool
|
||||
isInputNeeded (EvaluationScheme CharMatch _) = True
|
||||
isInputNeeded (EvaluationScheme _ ops) = hasFiltering ops
|
||||
|
||||
hasFiltering [] = False
|
||||
hasFiltering ((FeatureFilter _):_) = True
|
||||
hasFiltering (_:ops) = hasFiltering ops
|
||||
|
||||
-- | Could output be preprocessable
|
||||
isPreprocessable :: Metric -> Bool
|
||||
@ -312,18 +316,19 @@ gevalOnSingleOut gevalSpec dataSource = do
|
||||
vals <- Prelude.mapM (\scheme ->
|
||||
gevalCore (evaluationSchemeMetric scheme)
|
||||
(gesBootstrapResampling gevalSpec)
|
||||
(addPreprocessing (applyPreprocessingOperations scheme) dataSource))
|
||||
(addSchemeSpecifics scheme dataSource))
|
||||
schemes
|
||||
return (outSource, vals)
|
||||
where outSource = dataSourceOut dataSource
|
||||
schemes = gesMetrics gevalSpec
|
||||
|
||||
addPreprocessing :: (Text -> Text) -> DataSource -> DataSource
|
||||
addPreprocessing prep dataSource =
|
||||
addSchemeSpecifics :: EvaluationScheme -> DataSource -> DataSource
|
||||
addSchemeSpecifics scheme dataSource =
|
||||
dataSource {
|
||||
dataSourceChallengeData = (dataSourceChallengeData dataSource) {
|
||||
challengeDataSourceFilter = getFilterForScheme (challengeDataSourceInHeader $ dataSourceChallengeData dataSource) scheme,
|
||||
challengeDataSourcePreprocess =
|
||||
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . prep }}
|
||||
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . (applyPreprocessingOperations scheme) }}
|
||||
|
||||
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
|
||||
readHeaderFileWrapper Nothing = return Nothing
|
||||
@ -352,7 +357,7 @@ checkAndGetDataSources forceInput gevalSpec = do
|
||||
throwM $ NoExpectedDirectory d
|
||||
Right expectedSource -> do
|
||||
-- in most cases inputSource is NoSource (unless needed by a metric or in the line-by-line mode)
|
||||
inputSource <- getInputSourceIfNeeded forceInput (Prelude.map evaluationSchemeMetric schemes) expectedTestDirectory inputFile
|
||||
inputSource <- getInputSourceIfNeeded forceInput schemes expectedTestDirectory inputFile
|
||||
|
||||
mMultipleOuts <- checkMultipleOuts gevalSpec
|
||||
osss <- case mMultipleOuts of
|
||||
@ -442,9 +447,9 @@ getOutFile gevalSpec out = outDirectory </> testName </> out
|
||||
where outDirectory = gesOutDirectory gevalSpec
|
||||
testName = gesTestName gevalSpec
|
||||
|
||||
getInputSourceIfNeeded :: Bool -> [Metric] -> FilePath -> FilePath -> IO SourceSpec
|
||||
getInputSourceIfNeeded forced metrics directory inputFilePath
|
||||
| forced || (Prelude.any isInputNeeded metrics) = do
|
||||
getInputSourceIfNeeded :: Bool -> [EvaluationScheme] -> FilePath -> FilePath -> IO SourceSpec
|
||||
getInputSourceIfNeeded forced schemes directory inputFilePath
|
||||
| forced || (Prelude.any isInputNeeded schemes) = do
|
||||
iss <- getSmartSourceSpec directory defaultInputFile inputFilePath
|
||||
case iss of
|
||||
Left NoSpecGiven -> throwM $ NoInputFile inputFilePath
|
||||
@ -950,9 +955,12 @@ defineContinuation aggregator finalStep generateGraph = do
|
||||
v <- aggregator
|
||||
return $ MetricOutput (SimpleRun $ finalStep v) (generateGraph v)
|
||||
|
||||
fromSpecificationToWithoutInput lsSpec = WithoutInput expectedSource outSource
|
||||
fromSpecificationToWithoutInput lsSpec = case lineSourcesFilter lsSpec of
|
||||
NoFilter -> WithoutInput expectedSource outSource
|
||||
theFilter -> WithoutInputButFiltered theFilter inputSource expectedSource outSource
|
||||
where expectedSource = lineSourcesExpectedSource lsSpec
|
||||
outSource = lineSourcesOutputSource lsSpec
|
||||
inputSource = lineSourcesInputSource lsSpec
|
||||
|
||||
fromSpecificationToWithInput lsSpec = WithInput theFilter inpSource expectedSource outSource
|
||||
where inpSource = lineSourcesInputSource lsSpec
|
||||
|
@ -4,13 +4,17 @@ module GEval.DataSource
|
||||
(ChallengeDataSource(..),
|
||||
DataSource(..),
|
||||
TargetRecord(..),
|
||||
Filter,
|
||||
Filter(..),
|
||||
noFilter,
|
||||
getFilterForScheme,
|
||||
applyFilter)
|
||||
where
|
||||
|
||||
import GEval.Common (SourceItem(..))
|
||||
import GEval.Selector (ItemTarget(..))
|
||||
import GEval.Selector (ItemTarget(..), TargetRecord(..))
|
||||
import GEval.FeatureExtractor (getFeatures)
|
||||
import GEval.BlackBoxDebugging
|
||||
import GEval.EvaluationScheme
|
||||
|
||||
import Data.Text
|
||||
|
||||
@ -18,17 +22,42 @@ import Data.Conduit.SmartSource
|
||||
import Data.Conduit.Header
|
||||
import GEval.Selector
|
||||
|
||||
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
|
||||
|
||||
data Filter = NoFilter | InputFilter (Text -> Bool)
|
||||
data Filter = NoFilter | FilterByFeatures (Maybe TabularHeader) String
|
||||
|
||||
noFilter :: Filter
|
||||
noFilter = NoFilter
|
||||
|
||||
applyFilter :: Filter -> TargetRecord -> Bool
|
||||
applyFilter NoFilter _ = True
|
||||
applyFilter (InputFilter fun) (TargetRecord (Got (RawItemTarget t)) _ _) = fun t
|
||||
applyFilter (InputFilter fun) (TargetRecord (Got (PartiallyParsedItemTarget ts)) _ _) = fun (intercalate "\t" ts)
|
||||
applyFilter (FilterByFeatures mInHeader featureSpec) tR = applyFeatureFilter mInHeader featureSpec tR
|
||||
|
||||
getFilterForScheme :: Maybe TabularHeader -> EvaluationScheme -> Filter
|
||||
getFilterForScheme mTabHeader (EvaluationScheme _ ops) = case findFilter ops of
|
||||
Nothing -> NoFilter
|
||||
Just f -> FilterByFeatures mTabHeader (unpack $ fixIndex f)
|
||||
|
||||
fixIndex = replace "[" "<" . replace "]" ">"
|
||||
|
||||
findFilter :: [PreprocessingOperation] -> Maybe Text
|
||||
findFilter [] = Nothing
|
||||
findFilter ((FeatureFilter f):_) = Just f
|
||||
findFilter (_:ops) = findFilter ops
|
||||
|
||||
|
||||
applyFeatureFilter :: Maybe TabularHeader -> String -> TargetRecord -> Bool
|
||||
applyFeatureFilter mInHeader featureSpec tR = featureSpec `elem` (Prelude.map show fs)
|
||||
where fs = getFeatures Nothing
|
||||
BlackBoxDebuggingOptions {
|
||||
bbdoMinFrequency = 0,
|
||||
bbdoWordShapes = False,
|
||||
bbdoBigrams = False,
|
||||
bbdoCartesian = False,
|
||||
bbdoMinCartesianFrequency = Nothing,
|
||||
bbdoConsiderNumericalFeatures = False}
|
||||
Nothing
|
||||
tR
|
||||
mInHeader
|
||||
|
||||
|
||||
-- | This type specifies the way the challenge data (input and
|
||||
-- expected data, but not outputs) flow into evaluation.
|
||||
|
@ -21,7 +21,8 @@ module GEval.FeatureExtractor
|
||||
ReferencesData(..),
|
||||
FeatureIndex(..),
|
||||
toTextualContent,
|
||||
filterExistentialFactors)
|
||||
filterExistentialFactors,
|
||||
getFeatures)
|
||||
where
|
||||
|
||||
import Data.Text
|
||||
@ -32,6 +33,7 @@ import Text.Tokenizer
|
||||
import Text.WordShape
|
||||
import GEval.BlackBoxDebugging
|
||||
import GEval.Common
|
||||
import GEval.Selector (TargetRecord(..), ItemTarget(..))
|
||||
import Text.Read (readMaybe)
|
||||
|
||||
import Control.Error.Util (hush)
|
||||
@ -256,3 +258,24 @@ instance WithTextualContent ExistentialFactor where
|
||||
instance WithTextualContent AtomicFactor where
|
||||
toTextualContent (TextFactor t) = Just t
|
||||
toTextualContent (ShapeFactor _) = Nothing
|
||||
|
||||
getFeatures :: Maybe Tokenizer
|
||||
-> BlackBoxDebuggingOptions
|
||||
-> Maybe References
|
||||
-> TargetRecord
|
||||
-> Maybe TabularHeader
|
||||
-> [PeggedFactor]
|
||||
getFeatures mTokenizer bbdo mReferences (TargetRecord inLine expLine outLine) mInHeader =
|
||||
Data.List.concat [
|
||||
extractFactors mTokenizer bbdo mReferencesData "exp" (asText expLine),
|
||||
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" (asText inLine) mInHeader,
|
||||
extractFactors mTokenizer bbdo mReferencesData "out" (asText outLine)]
|
||||
where mReferencesData = case mReferences of
|
||||
Just references -> Just $ ReferencesData {
|
||||
referencesDataReferences = references,
|
||||
referencesDataCurrentId = Nothing }
|
||||
Nothing -> Nothing
|
||||
asText (Got (RawItemTarget t)) = t
|
||||
asText (Got (PartiallyParsedItemTarget ts)) = Data.Text.intercalate "\t" ts
|
||||
asText (Wrong _) = ""
|
||||
asText GEval.Common.Done = ""
|
||||
|
@ -299,7 +299,11 @@ featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDeb
|
||||
featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract
|
||||
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
|
||||
where extract (rank, line) =
|
||||
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line) mInHeader)
|
||||
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (lineToTargetRecord $ mainLineRecord line) mInHeader)
|
||||
|
||||
lineToTargetRecord (LineRecord inp exp out _ _) = TargetRecord (Got (RawItemTarget inp))
|
||||
(Got (RawItemTarget exp))
|
||||
(Got (RawItemTarget out))
|
||||
|
||||
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
|
||||
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
|
||||
@ -331,18 +335,6 @@ filtreCartesian True = CC.concatMapAccum step S.empty
|
||||
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
||||
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
||||
|
||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> Maybe TabularHeader -> [PeggedFactor]
|
||||
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) mInHeader =
|
||||
Data.List.concat [
|
||||
extractFactors mTokenizer bbdo mReferencesData "exp" expLine,
|
||||
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine mInHeader,
|
||||
extractFactors mTokenizer bbdo mReferencesData "out" outLine]
|
||||
where mReferencesData = case mReferences of
|
||||
Just references -> Just $ ReferencesData {
|
||||
referencesDataReferences = references,
|
||||
referencesDataCurrentId = Nothing }
|
||||
Nothing -> Nothing
|
||||
|
||||
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
|
||||
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
|
||||
| LengthAggregate [Double] [MetricValue] [Int]
|
||||
@ -475,7 +467,7 @@ runLineByLineGeneralized ordering spec consum = do
|
||||
return $ Just references
|
||||
Nothing -> return Nothing
|
||||
dataSource' <- checkAndGetDataSource True spec
|
||||
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource'
|
||||
let dataSource = addSchemeSpecifics scheme dataSource'
|
||||
gevalLineByLineCore metric dataSource (sorter ordering .| consum mReferences)
|
||||
where metric = gesMainMetric spec
|
||||
scheme = gesMainScheme spec
|
||||
@ -526,7 +518,7 @@ runOracleItemBased spec = runMultiOutputGeneralized spec consum
|
||||
runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO ()
|
||||
runMultiOutputGeneralized spec consum = do
|
||||
dataSource' <- checkAndGetDataSource True spec
|
||||
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource'
|
||||
let dataSource = addSchemeSpecifics scheme dataSource'
|
||||
let (Just altOuts) = gesAltOutFiles spec
|
||||
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
|
||||
let altSourceSpecs = rights altSourceSpecs'
|
||||
|
@ -4,10 +4,13 @@ module GEval.Selector
|
||||
( Selector(..),
|
||||
DataFormat(..),
|
||||
ItemTarget(..),
|
||||
TargetRecord(..),
|
||||
liftOp,
|
||||
select,
|
||||
parseSelector ) where
|
||||
|
||||
import GEval.Common (SourceItem(..))
|
||||
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Vector as V
|
||||
import qualified Data.Text.Encoding as DTE
|
||||
@ -30,6 +33,8 @@ data DataFormat = Tsv | Jsonl
|
||||
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
|
||||
deriving (Eq, Show)
|
||||
|
||||
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
|
||||
|
||||
parseSelector :: String -> Selector
|
||||
parseSelector = Selector . T.splitOn "/" . T.pack
|
||||
|
||||
|
@ -136,7 +136,7 @@ main = hspec $ do
|
||||
it "sorted" $
|
||||
runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75
|
||||
it "with filtering" $
|
||||
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 1.0
|
||||
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 0.6666
|
||||
describe "F-measure" $ do
|
||||
it "simple example" $
|
||||
runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857
|
||||
|
@ -1,5 +1,5 @@
|
||||
foo
|
||||
bar
|
||||
bar
|
||||
bar
|
||||
baz
|
||||
bar
|
||||
|
|
Loading…
Reference in New Issue
Block a user