Finish filtering
This commit is contained in:
parent
78d2cd6501
commit
634dd21b12
@ -49,7 +49,7 @@ module GEval.Core
|
|||||||
readHeaderFileWrapper,
|
readHeaderFileWrapper,
|
||||||
getInHeader,
|
getInHeader,
|
||||||
getOutHeader,
|
getOutHeader,
|
||||||
addPreprocessing,
|
addSchemeSpecifics,
|
||||||
LineSourcesSpecification(..),
|
LineSourcesSpecification(..),
|
||||||
dataSourceToLineSourcesSpecification,
|
dataSourceToLineSourcesSpecification,
|
||||||
fromSpecificationToWithoutInput,
|
fromSpecificationToWithoutInput,
|
||||||
@ -140,9 +140,13 @@ isBetter metric valA valB = isBetter' metricOrdering valA valB
|
|||||||
isBetter' TheLowerTheBetter valA valB = valA < valB
|
isBetter' TheLowerTheBetter valA valB = valA < valB
|
||||||
metricOrdering = getMetricOrdering metric
|
metricOrdering = getMetricOrdering metric
|
||||||
|
|
||||||
isInputNeeded :: Metric -> Bool
|
isInputNeeded :: EvaluationScheme -> Bool
|
||||||
isInputNeeded CharMatch = True
|
isInputNeeded (EvaluationScheme CharMatch _) = True
|
||||||
isInputNeeded _ = False
|
isInputNeeded (EvaluationScheme _ ops) = hasFiltering ops
|
||||||
|
|
||||||
|
hasFiltering [] = False
|
||||||
|
hasFiltering ((FeatureFilter _):_) = True
|
||||||
|
hasFiltering (_:ops) = hasFiltering ops
|
||||||
|
|
||||||
-- | Could output be preprocessable
|
-- | Could output be preprocessable
|
||||||
isPreprocessable :: Metric -> Bool
|
isPreprocessable :: Metric -> Bool
|
||||||
@ -312,18 +316,19 @@ gevalOnSingleOut gevalSpec dataSource = do
|
|||||||
vals <- Prelude.mapM (\scheme ->
|
vals <- Prelude.mapM (\scheme ->
|
||||||
gevalCore (evaluationSchemeMetric scheme)
|
gevalCore (evaluationSchemeMetric scheme)
|
||||||
(gesBootstrapResampling gevalSpec)
|
(gesBootstrapResampling gevalSpec)
|
||||||
(addPreprocessing (applyPreprocessingOperations scheme) dataSource))
|
(addSchemeSpecifics scheme dataSource))
|
||||||
schemes
|
schemes
|
||||||
return (outSource, vals)
|
return (outSource, vals)
|
||||||
where outSource = dataSourceOut dataSource
|
where outSource = dataSourceOut dataSource
|
||||||
schemes = gesMetrics gevalSpec
|
schemes = gesMetrics gevalSpec
|
||||||
|
|
||||||
addPreprocessing :: (Text -> Text) -> DataSource -> DataSource
|
addSchemeSpecifics :: EvaluationScheme -> DataSource -> DataSource
|
||||||
addPreprocessing prep dataSource =
|
addSchemeSpecifics scheme dataSource =
|
||||||
dataSource {
|
dataSource {
|
||||||
dataSourceChallengeData = (dataSourceChallengeData dataSource) {
|
dataSourceChallengeData = (dataSourceChallengeData dataSource) {
|
||||||
|
challengeDataSourceFilter = getFilterForScheme (challengeDataSourceInHeader $ dataSourceChallengeData dataSource) scheme,
|
||||||
challengeDataSourcePreprocess =
|
challengeDataSourcePreprocess =
|
||||||
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . prep }}
|
(challengeDataSourcePreprocess $ dataSourceChallengeData dataSource) . (applyPreprocessingOperations scheme) }}
|
||||||
|
|
||||||
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
|
readHeaderFileWrapper :: Maybe FilePath -> IO (Maybe TabularHeader)
|
||||||
readHeaderFileWrapper Nothing = return Nothing
|
readHeaderFileWrapper Nothing = return Nothing
|
||||||
@ -352,7 +357,7 @@ checkAndGetDataSources forceInput gevalSpec = do
|
|||||||
throwM $ NoExpectedDirectory d
|
throwM $ NoExpectedDirectory d
|
||||||
Right expectedSource -> do
|
Right expectedSource -> do
|
||||||
-- in most cases inputSource is NoSource (unless needed by a metric or in the line-by-line mode)
|
-- in most cases inputSource is NoSource (unless needed by a metric or in the line-by-line mode)
|
||||||
inputSource <- getInputSourceIfNeeded forceInput (Prelude.map evaluationSchemeMetric schemes) expectedTestDirectory inputFile
|
inputSource <- getInputSourceIfNeeded forceInput schemes expectedTestDirectory inputFile
|
||||||
|
|
||||||
mMultipleOuts <- checkMultipleOuts gevalSpec
|
mMultipleOuts <- checkMultipleOuts gevalSpec
|
||||||
osss <- case mMultipleOuts of
|
osss <- case mMultipleOuts of
|
||||||
@ -442,9 +447,9 @@ getOutFile gevalSpec out = outDirectory </> testName </> out
|
|||||||
where outDirectory = gesOutDirectory gevalSpec
|
where outDirectory = gesOutDirectory gevalSpec
|
||||||
testName = gesTestName gevalSpec
|
testName = gesTestName gevalSpec
|
||||||
|
|
||||||
getInputSourceIfNeeded :: Bool -> [Metric] -> FilePath -> FilePath -> IO SourceSpec
|
getInputSourceIfNeeded :: Bool -> [EvaluationScheme] -> FilePath -> FilePath -> IO SourceSpec
|
||||||
getInputSourceIfNeeded forced metrics directory inputFilePath
|
getInputSourceIfNeeded forced schemes directory inputFilePath
|
||||||
| forced || (Prelude.any isInputNeeded metrics) = do
|
| forced || (Prelude.any isInputNeeded schemes) = do
|
||||||
iss <- getSmartSourceSpec directory defaultInputFile inputFilePath
|
iss <- getSmartSourceSpec directory defaultInputFile inputFilePath
|
||||||
case iss of
|
case iss of
|
||||||
Left NoSpecGiven -> throwM $ NoInputFile inputFilePath
|
Left NoSpecGiven -> throwM $ NoInputFile inputFilePath
|
||||||
@ -950,9 +955,12 @@ defineContinuation aggregator finalStep generateGraph = do
|
|||||||
v <- aggregator
|
v <- aggregator
|
||||||
return $ MetricOutput (SimpleRun $ finalStep v) (generateGraph v)
|
return $ MetricOutput (SimpleRun $ finalStep v) (generateGraph v)
|
||||||
|
|
||||||
fromSpecificationToWithoutInput lsSpec = WithoutInput expectedSource outSource
|
fromSpecificationToWithoutInput lsSpec = case lineSourcesFilter lsSpec of
|
||||||
|
NoFilter -> WithoutInput expectedSource outSource
|
||||||
|
theFilter -> WithoutInputButFiltered theFilter inputSource expectedSource outSource
|
||||||
where expectedSource = lineSourcesExpectedSource lsSpec
|
where expectedSource = lineSourcesExpectedSource lsSpec
|
||||||
outSource = lineSourcesOutputSource lsSpec
|
outSource = lineSourcesOutputSource lsSpec
|
||||||
|
inputSource = lineSourcesInputSource lsSpec
|
||||||
|
|
||||||
fromSpecificationToWithInput lsSpec = WithInput theFilter inpSource expectedSource outSource
|
fromSpecificationToWithInput lsSpec = WithInput theFilter inpSource expectedSource outSource
|
||||||
where inpSource = lineSourcesInputSource lsSpec
|
where inpSource = lineSourcesInputSource lsSpec
|
||||||
|
@ -4,13 +4,17 @@ module GEval.DataSource
|
|||||||
(ChallengeDataSource(..),
|
(ChallengeDataSource(..),
|
||||||
DataSource(..),
|
DataSource(..),
|
||||||
TargetRecord(..),
|
TargetRecord(..),
|
||||||
Filter,
|
Filter(..),
|
||||||
noFilter,
|
noFilter,
|
||||||
|
getFilterForScheme,
|
||||||
applyFilter)
|
applyFilter)
|
||||||
where
|
where
|
||||||
|
|
||||||
import GEval.Common (SourceItem(..))
|
import GEval.Common (SourceItem(..))
|
||||||
import GEval.Selector (ItemTarget(..))
|
import GEval.Selector (ItemTarget(..), TargetRecord(..))
|
||||||
|
import GEval.FeatureExtractor (getFeatures)
|
||||||
|
import GEval.BlackBoxDebugging
|
||||||
|
import GEval.EvaluationScheme
|
||||||
|
|
||||||
import Data.Text
|
import Data.Text
|
||||||
|
|
||||||
@ -18,17 +22,42 @@ import Data.Conduit.SmartSource
|
|||||||
import Data.Conduit.Header
|
import Data.Conduit.Header
|
||||||
import GEval.Selector
|
import GEval.Selector
|
||||||
|
|
||||||
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
|
data Filter = NoFilter | FilterByFeatures (Maybe TabularHeader) String
|
||||||
|
|
||||||
data Filter = NoFilter | InputFilter (Text -> Bool)
|
|
||||||
|
|
||||||
noFilter :: Filter
|
noFilter :: Filter
|
||||||
noFilter = NoFilter
|
noFilter = NoFilter
|
||||||
|
|
||||||
applyFilter :: Filter -> TargetRecord -> Bool
|
applyFilter :: Filter -> TargetRecord -> Bool
|
||||||
applyFilter NoFilter _ = True
|
applyFilter NoFilter _ = True
|
||||||
applyFilter (InputFilter fun) (TargetRecord (Got (RawItemTarget t)) _ _) = fun t
|
applyFilter (FilterByFeatures mInHeader featureSpec) tR = applyFeatureFilter mInHeader featureSpec tR
|
||||||
applyFilter (InputFilter fun) (TargetRecord (Got (PartiallyParsedItemTarget ts)) _ _) = fun (intercalate "\t" ts)
|
|
||||||
|
getFilterForScheme :: Maybe TabularHeader -> EvaluationScheme -> Filter
|
||||||
|
getFilterForScheme mTabHeader (EvaluationScheme _ ops) = case findFilter ops of
|
||||||
|
Nothing -> NoFilter
|
||||||
|
Just f -> FilterByFeatures mTabHeader (unpack $ fixIndex f)
|
||||||
|
|
||||||
|
fixIndex = replace "[" "<" . replace "]" ">"
|
||||||
|
|
||||||
|
findFilter :: [PreprocessingOperation] -> Maybe Text
|
||||||
|
findFilter [] = Nothing
|
||||||
|
findFilter ((FeatureFilter f):_) = Just f
|
||||||
|
findFilter (_:ops) = findFilter ops
|
||||||
|
|
||||||
|
|
||||||
|
applyFeatureFilter :: Maybe TabularHeader -> String -> TargetRecord -> Bool
|
||||||
|
applyFeatureFilter mInHeader featureSpec tR = featureSpec `elem` (Prelude.map show fs)
|
||||||
|
where fs = getFeatures Nothing
|
||||||
|
BlackBoxDebuggingOptions {
|
||||||
|
bbdoMinFrequency = 0,
|
||||||
|
bbdoWordShapes = False,
|
||||||
|
bbdoBigrams = False,
|
||||||
|
bbdoCartesian = False,
|
||||||
|
bbdoMinCartesianFrequency = Nothing,
|
||||||
|
bbdoConsiderNumericalFeatures = False}
|
||||||
|
Nothing
|
||||||
|
tR
|
||||||
|
mInHeader
|
||||||
|
|
||||||
|
|
||||||
-- | This type specifies the way the challenge data (input and
|
-- | This type specifies the way the challenge data (input and
|
||||||
-- expected data, but not outputs) flow into evaluation.
|
-- expected data, but not outputs) flow into evaluation.
|
||||||
|
@ -21,7 +21,8 @@ module GEval.FeatureExtractor
|
|||||||
ReferencesData(..),
|
ReferencesData(..),
|
||||||
FeatureIndex(..),
|
FeatureIndex(..),
|
||||||
toTextualContent,
|
toTextualContent,
|
||||||
filterExistentialFactors)
|
filterExistentialFactors,
|
||||||
|
getFeatures)
|
||||||
where
|
where
|
||||||
|
|
||||||
import Data.Text
|
import Data.Text
|
||||||
@ -32,6 +33,7 @@ import Text.Tokenizer
|
|||||||
import Text.WordShape
|
import Text.WordShape
|
||||||
import GEval.BlackBoxDebugging
|
import GEval.BlackBoxDebugging
|
||||||
import GEval.Common
|
import GEval.Common
|
||||||
|
import GEval.Selector (TargetRecord(..), ItemTarget(..))
|
||||||
import Text.Read (readMaybe)
|
import Text.Read (readMaybe)
|
||||||
|
|
||||||
import Control.Error.Util (hush)
|
import Control.Error.Util (hush)
|
||||||
@ -256,3 +258,24 @@ instance WithTextualContent ExistentialFactor where
|
|||||||
instance WithTextualContent AtomicFactor where
|
instance WithTextualContent AtomicFactor where
|
||||||
toTextualContent (TextFactor t) = Just t
|
toTextualContent (TextFactor t) = Just t
|
||||||
toTextualContent (ShapeFactor _) = Nothing
|
toTextualContent (ShapeFactor _) = Nothing
|
||||||
|
|
||||||
|
getFeatures :: Maybe Tokenizer
|
||||||
|
-> BlackBoxDebuggingOptions
|
||||||
|
-> Maybe References
|
||||||
|
-> TargetRecord
|
||||||
|
-> Maybe TabularHeader
|
||||||
|
-> [PeggedFactor]
|
||||||
|
getFeatures mTokenizer bbdo mReferences (TargetRecord inLine expLine outLine) mInHeader =
|
||||||
|
Data.List.concat [
|
||||||
|
extractFactors mTokenizer bbdo mReferencesData "exp" (asText expLine),
|
||||||
|
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" (asText inLine) mInHeader,
|
||||||
|
extractFactors mTokenizer bbdo mReferencesData "out" (asText outLine)]
|
||||||
|
where mReferencesData = case mReferences of
|
||||||
|
Just references -> Just $ ReferencesData {
|
||||||
|
referencesDataReferences = references,
|
||||||
|
referencesDataCurrentId = Nothing }
|
||||||
|
Nothing -> Nothing
|
||||||
|
asText (Got (RawItemTarget t)) = t
|
||||||
|
asText (Got (PartiallyParsedItemTarget ts)) = Data.Text.intercalate "\t" ts
|
||||||
|
asText (Wrong _) = ""
|
||||||
|
asText GEval.Common.Done = ""
|
||||||
|
@ -299,7 +299,11 @@ featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDeb
|
|||||||
featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract
|
featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract
|
||||||
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
|
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
|
||||||
where extract (rank, line) =
|
where extract (rank, line) =
|
||||||
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line) mInHeader)
|
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (lineToTargetRecord $ mainLineRecord line) mInHeader)
|
||||||
|
|
||||||
|
lineToTargetRecord (LineRecord inp exp out _ _) = TargetRecord (Got (RawItemTarget inp))
|
||||||
|
(Got (RawItemTarget exp))
|
||||||
|
(Got (RawItemTarget out))
|
||||||
|
|
||||||
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
|
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
|
||||||
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
|
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
|
||||||
@ -331,18 +335,6 @@ filtreCartesian True = CC.concatMapAccum step S.empty
|
|||||||
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
|
||||||
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
|
||||||
|
|
||||||
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> Maybe TabularHeader -> [PeggedFactor]
|
|
||||||
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) mInHeader =
|
|
||||||
Data.List.concat [
|
|
||||||
extractFactors mTokenizer bbdo mReferencesData "exp" expLine,
|
|
||||||
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine mInHeader,
|
|
||||||
extractFactors mTokenizer bbdo mReferencesData "out" outLine]
|
|
||||||
where mReferencesData = case mReferences of
|
|
||||||
Just references -> Just $ ReferencesData {
|
|
||||||
referencesDataReferences = references,
|
|
||||||
referencesDataCurrentId = Nothing }
|
|
||||||
Nothing -> Nothing
|
|
||||||
|
|
||||||
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
|
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
|
||||||
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
|
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
|
||||||
| LengthAggregate [Double] [MetricValue] [Int]
|
| LengthAggregate [Double] [MetricValue] [Int]
|
||||||
@ -475,7 +467,7 @@ runLineByLineGeneralized ordering spec consum = do
|
|||||||
return $ Just references
|
return $ Just references
|
||||||
Nothing -> return Nothing
|
Nothing -> return Nothing
|
||||||
dataSource' <- checkAndGetDataSource True spec
|
dataSource' <- checkAndGetDataSource True spec
|
||||||
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource'
|
let dataSource = addSchemeSpecifics scheme dataSource'
|
||||||
gevalLineByLineCore metric dataSource (sorter ordering .| consum mReferences)
|
gevalLineByLineCore metric dataSource (sorter ordering .| consum mReferences)
|
||||||
where metric = gesMainMetric spec
|
where metric = gesMainMetric spec
|
||||||
scheme = gesMainScheme spec
|
scheme = gesMainScheme spec
|
||||||
@ -526,7 +518,7 @@ runOracleItemBased spec = runMultiOutputGeneralized spec consum
|
|||||||
runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO ()
|
runMultiOutputGeneralized :: GEvalSpecification -> ConduitT [LineRecord] Void (ResourceT IO) () -> IO ()
|
||||||
runMultiOutputGeneralized spec consum = do
|
runMultiOutputGeneralized spec consum = do
|
||||||
dataSource' <- checkAndGetDataSource True spec
|
dataSource' <- checkAndGetDataSource True spec
|
||||||
let dataSource = addPreprocessing (applyPreprocessingOperations scheme) dataSource'
|
let dataSource = addSchemeSpecifics scheme dataSource'
|
||||||
let (Just altOuts) = gesAltOutFiles spec
|
let (Just altOuts) = gesAltOutFiles spec
|
||||||
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
|
altSourceSpecs' <- mapM (getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv") altOuts
|
||||||
let altSourceSpecs = rights altSourceSpecs'
|
let altSourceSpecs = rights altSourceSpecs'
|
||||||
|
@ -4,10 +4,13 @@ module GEval.Selector
|
|||||||
( Selector(..),
|
( Selector(..),
|
||||||
DataFormat(..),
|
DataFormat(..),
|
||||||
ItemTarget(..),
|
ItemTarget(..),
|
||||||
|
TargetRecord(..),
|
||||||
liftOp,
|
liftOp,
|
||||||
select,
|
select,
|
||||||
parseSelector ) where
|
parseSelector ) where
|
||||||
|
|
||||||
|
import GEval.Common (SourceItem(..))
|
||||||
|
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
import qualified Data.Vector as V
|
import qualified Data.Vector as V
|
||||||
import qualified Data.Text.Encoding as DTE
|
import qualified Data.Text.Encoding as DTE
|
||||||
@ -30,6 +33,8 @@ data DataFormat = Tsv | Jsonl
|
|||||||
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
|
data ItemTarget = RawItemTarget T.Text | PartiallyParsedItemTarget [T.Text]
|
||||||
deriving (Eq, Show)
|
deriving (Eq, Show)
|
||||||
|
|
||||||
|
data TargetRecord = TargetRecord (SourceItem ItemTarget) (SourceItem ItemTarget) (SourceItem ItemTarget)
|
||||||
|
|
||||||
parseSelector :: String -> Selector
|
parseSelector :: String -> Selector
|
||||||
parseSelector = Selector . T.splitOn "/" . T.pack
|
parseSelector = Selector . T.splitOn "/" . T.pack
|
||||||
|
|
||||||
|
@ -136,7 +136,7 @@ main = hspec $ do
|
|||||||
it "sorted" $
|
it "sorted" $
|
||||||
runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75
|
runGEvalTest "accuracy-on-sorted" `shouldReturnAlmost` 0.75
|
||||||
it "with filtering" $
|
it "with filtering" $
|
||||||
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 1.0
|
runGEvalTest "accuracy-filtering" `shouldReturnAlmost` 0.6666
|
||||||
describe "F-measure" $ do
|
describe "F-measure" $ do
|
||||||
it "simple example" $
|
it "simple example" $
|
||||||
runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857
|
runGEvalTest "f-measure-simple" `shouldReturnAlmost` 0.57142857
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
foo
|
foo
|
||||||
bar
|
bar
|
||||||
bar
|
bar
|
||||||
bar
|
baz
|
||||||
bar
|
bar
|
||||||
|
|
Loading…
Reference in New Issue
Block a user