Show column names in extracted features

This commit is contained in:
Filip Gralinski 2020-02-22 12:13:45 +01:00
parent 6d586c7238
commit 8d429b01cb
4 changed files with 89 additions and 64 deletions

View File

@ -1,7 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
module Data.Conduit.Header
(processHeader, TabularHeader, readHeaderFile)
(processHeader, TabularHeader(..), readHeaderFile)
where
import Data.Text

View File

@ -19,6 +19,7 @@ module GEval.FeatureExtractor
FeatureNamespace(..),
References(..),
ReferencesData(..),
FeatureIndex(..),
toTextualContent,
filterExistentialFactors)
where
@ -39,6 +40,8 @@ import Data.Attoparsec.Text
import Data.Attoparsec.Combinator
import Control.Applicative
import Data.Conduit.Header
import qualified Data.HashMap.Strict as H
import GEval.Annotation
@ -131,12 +134,19 @@ instance Show AtomicFactor where
show (TextFactor t) = unpack t
show (ShapeFactor (WordShape t)) = 'S':'H':'A':'P':'E':':':(unpack t)
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text Int
data FeatureIndex = ColumnByNumber Int | ColumnByName Text
deriving (Eq, Ord)
instance Show FeatureIndex where
show (ColumnByNumber ix) = show ix
show (ColumnByName name) = unpack name
data FeatureNamespace = FeatureNamespace Text | FeatureTabbedNamespace Text FeatureIndex
deriving (Eq, Ord)
instance Show FeatureNamespace where
show (FeatureNamespace namespace) = unpack namespace
show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">")
show (FeatureTabbedNamespace namespace columnIndex) = ((unpack namespace) ++ "<" ++ (show columnIndex) ++ ">")
data References = References (H.HashMap Integer Text)
@ -205,11 +215,15 @@ extractFactorsFromField mTokenizer bbdo mReferenceData namespace record =
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Maybe ReferencesData -> Text -> Text -> [PeggedFactor]
extractFactors mTokenizer bbdo mReferencesData namespace record = extractFactorsFromField mTokenizer bbdo mReferencesData (FeatureNamespace namespace) record
extractFactorsFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Maybe ReferencesData -> Text -> Text -> [PeggedFactor]
extractFactorsFromTabbed mTokenizer bbdo mReferencesData namespace record =
extractFactorsFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Maybe ReferencesData -> Text -> Text -> Maybe TabularHeader -> [PeggedFactor]
extractFactorsFromTabbed mTokenizer bbdo mReferencesData namespace record mHeader =
Data.List.concat
$ Prelude.map (\(n, t) -> extractFactorsFromField mTokenizer bbdo mReferencesData (FeatureTabbedNamespace namespace n) t)
$ Prelude.zip [1..] (splitOn "\t" record)
$ Prelude.zip (generateColumnNames mHeader) (splitOn "\t" record)
generateColumnNames :: Maybe TabularHeader -> [FeatureIndex]
generateColumnNames Nothing = Data.List.map ColumnByNumber [1..]
generateColumnNames (Just (TabularHeader fields)) = Data.List.map ColumnByName fields
addCartesianFactors :: BlackBoxDebuggingOptions -> [LineWithPeggedFactors] -> [LineWithFactors]
addCartesianFactors bbdo linesWithPeggedFactors = addCartesianFactors' (bbdoCartesian bbdo) linesWithPeggedFactors

View File

@ -98,9 +98,11 @@ parseReferenceEntry line = (read $ unpack refId, t)
where [refId, t] = splitOn "\t" line
runLineByLine :: ResultOrdering -> Maybe String -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runLineByLine ordering featureFilter spec bbdo = runLineByLineGeneralized ordering spec consum
where consum :: Maybe References -> ConduitT LineRecord Void (ResourceT IO) ()
consum = (\mReferences -> (runFeatureFilter featureFilter spec bbdo mReferences .| CL.map (encodeUtf8 . formatOutput) .| CC.unlinesAscii .| CC.stdout))
runLineByLine ordering featureFilter spec bbdo = do
mInHeader <- readHeaderFileWrapper $ getInHeader spec
runLineByLineGeneralized ordering spec (consum mInHeader)
where consum :: Maybe TabularHeader -> Maybe References -> ConduitT LineRecord Void (ResourceT IO) ()
consum = (\mInHeader -> \mReferences -> (runFeatureFilter featureFilter spec bbdo mReferences mInHeader .| CL.map (encodeUtf8 . formatOutput) .| CC.unlinesAscii .| CC.stdout))
formatOutput (LineRecord inp exp out _ score) = Data.Text.intercalate "\t" [
formatScore score,
escapeTabs inp,
@ -155,15 +157,15 @@ markBadFeatures worstFeaturesMap mTokenizer bbdo field line =
$ Prelude.map (featureToLineSpan worstFeaturesMap)
$ extractFactors mTokenizer bbdo Nothing field line
markBadFeaturesInTabbed :: (M.Map PeggedFactor Double) -> (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [LineSpan]
markBadFeaturesInTabbed worstFeaturesMap mTokenizer bbdo field line =
markBadFeaturesInTabbed :: (M.Map PeggedFactor Double) -> (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> Maybe TabularHeader -> [LineSpan]
markBadFeaturesInTabbed worstFeaturesMap mTokenizer bbdo field line mInHeader =
catMaybes
$ Prelude.map (featureToLineSpan worstFeaturesMap)
$ extractFactorsFromTabbed mTokenizer bbdo Nothing field line
$ extractFactorsFromTabbed mTokenizer bbdo Nothing field line mInHeader
doMarking worstFeaturesMap mTokenizer bbdo (LineRecord inpLine expLine outLine _ score) =
SpanLineRecord (markBadFeaturesInTabbed worstFeaturesMap mTokenizer bbdo "in" inpLine)
doMarking worstFeaturesMap mTokenizer mInHeader bbdo (LineRecord inpLine expLine outLine _ score) =
SpanLineRecord (markBadFeaturesInTabbed worstFeaturesMap mTokenizer bbdo "in" inpLine mInHeader)
(markBadFeatures worstFeaturesMap mTokenizer bbdo "exp" expLine)
(markBadFeatures worstFeaturesMap mTokenizer bbdo "out" outLine)
score
@ -184,16 +186,17 @@ runLineByLineWithWorstFeaturesGeneralized :: ResultOrdering
-> IO r
runLineByLineWithWorstFeaturesGeneralized ordering featureFilter spec bbdo consum = do
hPutStrLn stderr "Looking for worst features..."
worstFeatures <- runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences (CL.take 100))
mInHeader <- readHeaderFileWrapper $ getInHeader spec
worstFeatures <- runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences mInHeader (CL.take 100))
let worstFeaturesMap = M.fromList
$ catMaybes
$ Prelude.map featureToFactor
$ Prelude.map (\(FeatureWithPValue feature pValue _ _) -> (feature, pValue)) worstFeatures
runLineByLineGeneralized ordering spec (consum' worstFeaturesMap)
where consum' worstFeaturesMap = (\mReferences -> (runFeatureFilter featureFilter spec bbdo mReferences
.| CL.map (doMarking worstFeaturesMap mTokenizer bbdo)
.| consum))
mInHeader <- readHeaderFileWrapper $ getInHeader spec
runLineByLineGeneralized ordering spec (consum' worstFeaturesMap mInHeader)
where consum' worstFeaturesMap mInHeader = (\mReferences -> (runFeatureFilter featureFilter spec bbdo mReferences mInHeader
.| CL.map (doMarking worstFeaturesMap mTokenizer mInHeader bbdo)
.| consum))
ordering' = forceSomeOrdering ordering
mTokenizer = gesTokenizer spec
@ -206,18 +209,21 @@ runFeatureFilter :: (Monad m, FeatureSource s) => Maybe String
-> GEvalSpecification
-> BlackBoxDebuggingOptions
-> Maybe References
-> Maybe TabularHeader
-> ConduitT s s m ()
runFeatureFilter Nothing _ _ _ = doNothing
runFeatureFilter (Just feature) spec bbdo mReferences = CC.map (\l -> (fakeRank, l))
.| featureExtractor mTokenizer bbdo mReferences
.| CC.filter (checkFeature feature)
.| CC.map fst
runFeatureFilter Nothing _ _ _ _ = doNothing
runFeatureFilter (Just feature) spec bbdo mReferences mInHeader = CC.map (\l -> (fakeRank, l))
.| featureExtractor mTokenizer bbdo mReferences mInHeader
.| CC.filter (checkFeature feature)
.| CC.map fst
where mTokenizer = gesTokenizer spec
fakeRank = 0.0
checkFeature feature (_, LineWithFactors _ _ fs) = feature `elem` (Prelude.map show fs)
runWorstFeatures :: ResultOrdering -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences consumFeatures)
runWorstFeatures ordering spec bbdo = do
mInHeader <- readHeaderFileWrapper $ getInHeader spec
runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences mInHeader consumFeatures)
where ordering' = forceSomeOrdering ordering
consumFeatures = CL.map (encodeUtf8 . formatFeatureWithPValue)
@ -228,10 +234,11 @@ worstFeaturesPipeline :: Bool
-> GEvalSpecification
-> BlackBoxDebuggingOptions
-> Maybe References
-> Maybe TabularHeader
-> ConduitT FeatureWithPValue Void (ResourceT IO) a
-> ConduitT LineRecord Void (ResourceT IO) a
worstFeaturesPipeline reversed spec bbdo mReferences consum = rank (lessByMetric reversed $ gesMainMetric spec)
.| evalStateC 0 (extractFeaturesAndPValues spec bbdo mReferences)
worstFeaturesPipeline reversed spec bbdo mReferences mInHeader consum = rank (lessByMetric reversed $ gesMainMetric spec)
.| evalStateC 0 (extractFeaturesAndPValues spec bbdo mReferences mInHeader)
.| CC.filter (\(FeatureWithPValue _ p _ _) -> not $ isNaN p) -- NaN values would poison sorting
.| gobbleAndDo (sortBy featureOrder)
.| filtreCartesian (bbdoCartesian bbdo)
@ -245,10 +252,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
forceSomeOrdering FirstTheBest = FirstTheBest
forceSomeOrdering _ = FirstTheWorst
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues spec bbdo mReferences =
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> Maybe TabularHeader -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues spec bbdo mReferences mInHeader =
totalCounter
.| rankedFeatureExtractor spec bbdo mReferences
.| rankedFeatureExtractor spec bbdo mReferences mInHeader
.| uScoresCounter (bbdoMinFrequency bbdo)
@ -268,11 +275,11 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
(pack $ printf "%0.8f" avg),
(pack $ printf "%0.20f" p)]
rankedFeatureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT (Double, LineRecord) RankedFactor m ()
rankedFeatureExtractor spec bbdo mReferences = featureExtractor mTokenizer bbdo mReferences
.| CC.map snd
.| CC.map unwrapFeatures
.| CC.concat
rankedFeatureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> Maybe TabularHeader -> ConduitT (Double, LineRecord) RankedFactor m ()
rankedFeatureExtractor spec bbdo mReferences mInHeader = featureExtractor mTokenizer bbdo mReferences mInHeader
.| CC.map snd
.| CC.map unwrapFeatures
.| CC.concat
where mTokenizer = gesTokenizer spec
unwrapFeatures (LineWithFactors rank score fs) = Prelude.map (\f -> RankedFactor f rank score) fs
@ -288,11 +295,11 @@ instance FeatureSource (LineRecord, LineRecord) where
getScore (LineRecord _ _ _ _ scoreA, LineRecord _ _ _ _ scoreB) = scoreB - scoreA
mainLineRecord (_, l) = l
featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT (Double, s) (s, LineWithFactors) m ()
featureExtractor mTokenizer bbdo mReferences = CC.map extract
featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> Maybe TabularHeader -> ConduitT (Double, s) (s, LineWithFactors) m ()
featureExtractor mTokenizer bbdo mReferences mInHeader = CC.map extract
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
where extract (rank, line) =
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line))
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line) mInHeader)
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
@ -324,11 +331,11 @@ filtreCartesian True = CC.concatMapAccum step S.empty
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) =
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> Maybe TabularHeader -> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) mInHeader =
Data.List.concat [
extractFactors mTokenizer bbdo mReferencesData "exp" expLine,
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine,
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine mInHeader,
extractFactors mTokenizer bbdo mReferencesData "out" outLine]
where mReferencesData = case mReferences of
Just references -> Just $ ReferencesData {
@ -487,13 +494,15 @@ gobbleAndDo fun = do
CC.yieldMany $ fun l
runDiff :: ResultOrdering -> Maybe String -> FilePath -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runDiff ordering featureFilter otherOut spec bbdo = runDiffGeneralized ordering otherOut spec consum
where consum :: Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = \mReferences -> CL.filter shouldBeShown
.| runFeatureFilter featureFilter spec bbdo mReferences
.| CL.map (encodeUtf8 . formatOutput)
.| CC.unlinesAscii
.| CC.stdout
runDiff ordering featureFilter otherOut spec bbdo = do
mInHeader <- readHeaderFileWrapper $ getInHeader spec
runDiffGeneralized ordering otherOut spec (consum mInHeader)
where consum :: Maybe TabularHeader -> Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = \mInHeader -> \mReferences -> CL.filter shouldBeShown
.| runFeatureFilter featureFilter spec bbdo mReferences mInHeader
.| CL.map (encodeUtf8 . formatOutput)
.| CC.unlinesAscii
.| CC.stdout
shouldBeShown (LineRecord _ _ outA _ scoreA, LineRecord _ _ outB _ scoreB) =
outA /= outB && scoreA /= scoreB
formatOutput (LineRecord inp exp outA _ scoreA, LineRecord _ _ outB _ scoreB) = Data.Text.intercalate "\t" [
@ -533,15 +542,17 @@ runMultiOutputGeneralized spec consum = do
mSelector = gesSelector spec
runMostWorseningFeatures :: ResultOrdering -> FilePath -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runMostWorseningFeatures ordering otherOut spec bbdo = runDiffGeneralized ordering' otherOut spec consum
runMostWorseningFeatures ordering otherOut spec bbdo = do
mInHeader <- readHeaderFileWrapper $ getInHeader spec
runDiffGeneralized ordering' otherOut spec (consum mInHeader)
where ordering' = forceSomeOrdering ordering
reversed = case ordering of
KeepTheOriginalOrder -> False
FirstTheWorst -> False
FirstTheBest -> True
consum :: Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = \mReferences -> CC.map prepareFakeLineRecord
.| (worstFeaturesPipeline reversed spec bbdo mReferences consumFeatures)
consum :: Maybe TabularHeader -> Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = \mInHeader -> \mReferences -> CC.map prepareFakeLineRecord
.| (worstFeaturesPipeline reversed spec bbdo mReferences mInHeader consumFeatures)
prepareFakeLineRecord :: (LineRecord, LineRecord) -> LineRecord
prepareFakeLineRecord (LineRecord _ _ _ _ scorePrev, LineRecord inp exp out c score) =
LineRecord inp exp out c (score - scorePrev)

View File

@ -654,26 +654,26 @@ main = hspec $ do
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = True }
(sort $ extractFactorsFromTabbed Nothing bbdo Nothing "in" "I like this\t34.3\ttests") `shouldBe` [
PeggedFactor (FeatureTabbedNamespace "in" 1)
(sort $ extractFactorsFromTabbed Nothing bbdo Nothing "in" "I like this\t34.3\ttests" Nothing) `shouldBe` [
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 1))
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "I"))),
PeggedFactor (FeatureTabbedNamespace "in" 1)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 1))
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "like"))),
PeggedFactor (FeatureTabbedNamespace "in" 1)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 1))
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "this"))),
PeggedFactor (FeatureTabbedNamespace "in" 1)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 1))
(SimpleExistentialFactor (BigramFactor (TextFactor "I") (TextFactor "like"))),
PeggedFactor (FeatureTabbedNamespace "in" 1)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 1))
(SimpleExistentialFactor (BigramFactor (TextFactor "like") (TextFactor "this"))),
PeggedFactor (FeatureTabbedNamespace "in" 1)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 1))
(NumericalFactor Nothing 11),
PeggedFactor (FeatureTabbedNamespace "in" 2)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 2))
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "34.3"))),
PeggedFactor (FeatureTabbedNamespace "in" 2)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 2))
(NumericalFactor (Just 34.3) 4),
PeggedFactor (FeatureTabbedNamespace "in" 3)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 3))
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "tests"))),
PeggedFactor (FeatureTabbedNamespace "in" 3)
PeggedFactor (FeatureTabbedNamespace "in" (ColumnByNumber 3))
(NumericalFactor Nothing 5) ]
describe "Kendall's tau" $ do
it "tau" $ do