Trying to get references

Filip Graliński 2019-05-23 16:16:05 +02:00
parent 780b7016c5
commit b4ad774623
7 changed files with 145 additions and 60 deletions

View File

@@ -90,6 +90,7 @@ library
, integration
, Chart
, Chart-cairo
, errors
default-language: Haskell2010
executable geval
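
The new errors dependency presumably brings in Control.Error.Util.hush, which is imported by GEval.FeatureExtractor below. A minimal sketch (not part of the commit) of what hush does with an attoparsec result:

    import Control.Error.Util (hush)
    import Data.Attoparsec.Text (decimal, parseOnly)
    import Data.Text (pack)

    -- hush :: Either a b -> Maybe b, i.e. drop the error message
    example :: Maybe Integer
    example = hush (parseOnly decimal (pack "42"))   -- Just 42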

View File

@@ -3,7 +3,7 @@
module GEval.Annotation
(parseAnnotations, Annotation(..),
parseObtainedAnnotations, ObtainedAnnotation(..),
matchScore, getProbabilisticSoftCounts)
matchScore, getProbabilisticSoftCounts, intSetParser)
where
import qualified Data.IntSet as IS

View File

@@ -291,7 +291,8 @@ data GEvalSpecification = GEvalSpecification
gesTokenizer :: Maybe Tokenizer,
gesGonitoHost :: Maybe String,
gesToken :: Maybe String,
gesGonitoGitAnnexRemote :: Maybe String}
gesGonitoGitAnnexRemote :: Maybe String,
gesReferences :: Maybe String }
gesMainMetric :: GEvalSpecification -> Metric
@@ -378,7 +379,8 @@ defaultGEvalSpecification = GEvalSpecification {
gesTokenizer = Nothing,
gesGonitoHost = Nothing,
gesToken = Nothing,
gesGonitoGitAnnexRemote = Nothing }
gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing}
isEmptyFile :: FilePath -> IO (Bool)
isEmptyFile path = do
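
With the new gesReferences field a specification can point at an external references file. A minimal sketch using record-update syntax, assuming the definitions above are in scope (the path is made up for illustration):

    -- sketch only, not part of the commit
    specWithReferences :: GEvalSpecification
    specWithReferences = defaultGEvalSpecification {
      gesReferences = Just "my-challenge/references.tsv" }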

View File

@@ -17,19 +17,32 @@ module GEval.FeatureExtractor
ExistentialFactor(..),
AtomicFactor(..),
FeatureNamespace(..),
References(..),
ReferencesData(..),
filterExistentialFactors)
where
import Data.Text
import Data.List
import Data.Monoid ((<>))
import Data.Maybe (catMaybes)
import Data.Maybe (catMaybes, fromMaybe)
import Text.Tokenizer
import Text.WordShape
import GEval.BlackBoxDebugging
import GEval.Common
import Text.Read (readMaybe)
import Control.Error.Util (hush)
import Data.Attoparsec.Text
import Data.Attoparsec.Combinator
import Control.Applicative
import qualified Data.HashMap.Strict as H
import GEval.Annotation
import qualified Data.IntSet as IS
data Feature = UnaryFeature PeggedExistentialFactor
| CartesianFeature PeggedExistentialFactor PeggedExistentialFactor
| NumericalFeature FeatureNamespace NumericalType NumericalDirection
@@ -124,6 +137,38 @@ instance Show FeatureNamespace where
show (FeatureNamespace namespace) = unpack namespace
show (FeatureTabbedNamespace namespace column) = ((unpack namespace) ++ "<" ++ (show column) ++ ">")
data References = References (H.HashMap Integer Text)
data ReferencesData = ReferencesData {
referencesDataReferences :: References,
referencesDataCurrentId :: Maybe Integer }
data ReferencePointer = ReferencePointer Integer IS.IntSet
getReferenced :: References -> ReferencePointer -> Maybe Text
getReferenced (References references) (ReferencePointer refId indexSet) = case H.lookup refId references of
Just t -> Just (getFrag t indexSet)
Nothing -> Nothing
getDirectOrReferenced :: Maybe References -> Text -> Text
getDirectOrReferenced Nothing record = record
getDirectOrReferenced (Just references) record = case parseReferencePointer record of
Just pointer -> fromMaybe record (getReferenced references pointer)
Nothing -> record
getFrag :: Text -> IS.IntSet -> Text
getFrag t indexSet = pack $ Data.List.map (\ix -> index t ix) $ IS.toAscList indexSet
parseReferencePointer :: Text -> Maybe ReferencePointer
parseReferencePointer t = hush $ parseOnly (referencePointerParser <* endOfInput) t
referencePointerParser :: Parser ReferencePointer
referencePointerParser = do
refId <- decimal
string " "
indexSet <- intSetParser
return $ ReferencePointer refId indexSet
tokenizeForFactors :: (Maybe Tokenizer) -> Text -> [Text]
tokenizeForFactors Nothing t = Data.List.filter (not . Data.Text.null) $ split splitPred t
where splitPred c = c == ' ' || c == '\t' || c == ':'
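
getDirectOrReferenced is the core of the new mechanism: a field value of the form "<reference id> <index set>" is replaced by the corresponding fragment of the referenced text (character positions, via Data.Text.index), while anything that does not parse as a pointer is passed through unchanged. A rough sketch, assuming the index-set syntax accepted by intSetParser (defined in GEval.Annotation, not shown here) covers plain comma-separated positions:

    -- hypothetical example; the exact pointer syntax depends on intSetParser
    refs :: References
    refs = References (H.fromList [(1, pack "The quick brown fox")])

    resolved, untouched :: Text
    resolved  = getDirectOrReferenced (Just refs) (pack "1 0,1,2")         -- fragment of reference 1 ("The", if "0,1,2" parses)
    untouched = getDirectOrReferenced (Just refs) (pack "no pointer here") -- returned as is
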
@@ -150,19 +195,19 @@ extractSimpleFactors mTokenizer bbdo t = Data.List.concat $ (Prelude.map (Prelud
numericalFactor t = [NumericalFactor (readMaybe $ unpack t) (Data.Text.length t)]
extractFactorsFromField :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> FeatureNamespace -> Text -> [PeggedFactor]
extractFactorsFromField mTokenizer bbdo namespace record =
extractFactorsFromField :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Maybe ReferencesData -> FeatureNamespace -> Text -> [PeggedFactor]
extractFactorsFromField mTokenizer bbdo mReferenceData namespace record =
Prelude.map (\af -> PeggedFactor namespace af)
$ extractSimpleFactors mTokenizer bbdo record
$ extractSimpleFactors mTokenizer bbdo (getDirectOrReferenced (referencesDataReferences <$> mReferenceData) record)
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
extractFactors mTokenizer bbdo namespace record =
extractFactorsFromField mTokenizer bbdo (FeatureNamespace namespace) record
extractFactorsFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Text -> Text -> [PeggedFactor]
extractFactorsFromTabbed mTokenizer bbdo namespace record =
extractFactors :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Maybe ReferencesData -> Text -> Text -> [PeggedFactor]
extractFactors mTokenizer bbdo mReferencesData namespace record = extractFactorsFromField mTokenizer bbdo mReferencesData (FeatureNamespace namespace) record
extractFactorsFromTabbed :: (Maybe Tokenizer) -> BlackBoxDebuggingOptions -> Maybe ReferencesData -> Text -> Text -> [PeggedFactor]
extractFactorsFromTabbed mTokenizer bbdo mReferencesData namespace record =
Data.List.concat
$ Prelude.map (\(n, t) -> extractFactorsFromField mTokenizer bbdo (FeatureTabbedNamespace namespace n) t)
$ Prelude.map (\(n, t) -> extractFactorsFromField mTokenizer bbdo mReferencesData (FeatureTabbedNamespace namespace n) t)
$ Prelude.zip [1..] (splitOn "\t" record)
addCartesianFactors :: BlackBoxDebuggingOptions -> [LineWithPeggedFactors] -> [LineWithFactors]

View File

@@ -61,16 +61,33 @@ import Statistics.Distribution (cumulative)
import Statistics.Distribution.Normal (normalDistr)
import Data.Statistics.Kendall (kendallZ)
import Data.Conduit.Binary (sourceFile)
import qualified Data.HashMap.Strict as H
import qualified Data.Map.Strict as M
import qualified Data.Set as S
data LineRecord = LineRecord Text Text Text Word32 MetricValue
deriving (Eq, Show)
readReferences :: FilePath -> IO References
readReferences referencesFilePath = do
h <- runResourceT $ runConduit $ (sourceFile referencesFilePath
.| CC.decodeUtf8Lenient
.| CT.lines
.| CC.map parseReferenceEntry
.| CC.foldl (\h (refId, t) -> H.insert refId t h) H.empty)
return $ References h
parseReferenceEntry :: Text -> (Integer, Text)
parseReferenceEntry line = (read $ unpack refId, t)
where [refId, t] = splitOn "\t" line
runLineByLine :: ResultOrdering -> Maybe String -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runLineByLine ordering featureFilter spec bbdo = runLineByLineGeneralized ordering spec consum
where consum :: ConduitT LineRecord Void (ResourceT IO) ()
consum = (runFeatureFilter featureFilter spec bbdo .| CL.map (encodeUtf8 . formatOutput) .| CC.unlinesAscii .| CC.stdout)
where consum :: Maybe References -> ConduitT LineRecord Void (ResourceT IO) ()
consum = (\mReferences -> (runFeatureFilter featureFilter spec bbdo mReferences .| CL.map (encodeUtf8 . formatOutput) .| CC.unlinesAscii .| CC.stdout))
formatOutput (LineRecord inp exp out _ score) = Data.Text.intercalate "\t" [
formatScore score,
escapeTabs inp,
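
readReferences streams the references file and folds it into a HashMap keyed by the numeric identifier; parseReferenceEntry expects exactly two tab-separated columns per line, the id and the referenced text. A sketch of the assumed file layout and a call (the file name is made up):

    -- references.tsv, one entry per line: <id><TAB><referenced text>
    --   1<TAB>The quick brown fox
    --   2<TAB>Lorem ipsum dolor sit amet

    loadRefs :: IO References
    loadRefs = readReferences "references.tsv"
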
@@ -79,25 +96,25 @@ runLineByLine ordering featureFilter spec bbdo = runLineByLineGeneralized orderi
formatScore :: MetricValue -> Text
formatScore = Data.Text.pack . printf "%f"
runFeatureFilter :: (Monad m, FeatureSource s) => Maybe String -> GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT s s m ()
runFeatureFilter Nothing _ _ = doNothing
runFeatureFilter (Just feature) spec bbdo = CC.map (\l -> (fakeRank, l))
.| featureExtractor mTokenizer bbdo
.| CC.filter (checkFeature feature)
.| CC.map fst
runFeatureFilter :: (Monad m, FeatureSource s) => Maybe String -> GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT s s m ()
runFeatureFilter Nothing _ _ _ = doNothing
runFeatureFilter (Just feature) spec bbdo mReferences = CC.map (\l -> (fakeRank, l))
.| featureExtractor mTokenizer bbdo mReferences
.| CC.filter (checkFeature feature)
.| CC.map fst
where mTokenizer = gesTokenizer spec
fakeRank = 0.0
checkFeature feature (_, LineWithFactors _ _ fs) = feature `elem` (Prelude.map show fs)
runWorstFeatures :: ResultOrdering -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (worstFeaturesPipeline False spec bbdo)
runWorstFeatures ordering spec bbdo = runLineByLineGeneralized ordering' spec (\mReferences -> worstFeaturesPipeline False spec bbdo mReferences)
where ordering' = forceSomeOrdering ordering
worstFeaturesPipeline :: Bool -> GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT LineRecord Void (ResourceT IO) ()
worstFeaturesPipeline reversed spec bbdo = rank (lessByMetric reversed $ gesMainMetric spec)
.| evalStateC 0 (extractFeaturesAndPValues spec bbdo)
worstFeaturesPipeline :: Bool -> GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT LineRecord Void (ResourceT IO) ()
worstFeaturesPipeline reversed spec bbdo mReferences = rank (lessByMetric reversed $ gesMainMetric spec)
.| evalStateC 0 (extractFeaturesAndPValues spec bbdo mReferences)
.| CC.filter (\(FeatureWithPValue _ p _ _) -> not $ isNaN p) -- NaN values would poison sorting
.| gobbleAndDo (sortBy featureOrder)
.| filtreCartesian (bbdoCartesian bbdo)
@@ -120,10 +137,10 @@ forceSomeOrdering :: ResultOrdering -> ResultOrdering
forceSomeOrdering FirstTheBest = FirstTheBest
forceSomeOrdering _ = FirstTheWorst
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues spec bbdo =
extractFeaturesAndPValues :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT (Double, LineRecord) FeatureWithPValue (StateT Integer m) ()
extractFeaturesAndPValues spec bbdo mReferences =
totalCounter
.| rankedFeatureExtractor spec bbdo
.| rankedFeatureExtractor spec bbdo mReferences
.| uScoresCounter (bbdoMinFrequency bbdo)
@@ -143,11 +160,11 @@ formatFeatureWithPValue (FeatureWithPValue f p avg c) =
(pack $ printf "%0.8f" avg),
(pack $ printf "%0.20f" p)]
rankedFeatureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> ConduitT (Double, LineRecord) RankedFactor m ()
rankedFeatureExtractor spec bbdo = featureExtractor mTokenizer bbdo
.| CC.map snd
.| CC.map unwrapFeatures
.| CC.concat
rankedFeatureExtractor :: Monad m => GEvalSpecification -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT (Double, LineRecord) RankedFactor m ()
rankedFeatureExtractor spec bbdo mReferences = featureExtractor mTokenizer bbdo mReferences
.| CC.map snd
.| CC.map unwrapFeatures
.| CC.concat
where mTokenizer = gesTokenizer spec
unwrapFeatures (LineWithFactors rank score fs) = Prelude.map (\f -> RankedFactor f rank score) fs
@@ -163,11 +180,11 @@ instance FeatureSource (LineRecord, LineRecord) where
getScore (LineRecord _ _ _ _ scoreA, LineRecord _ _ _ _ scoreB) = scoreB - scoreA
mainLineRecord (_, l) = l
featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDebuggingOptions -> ConduitT (Double, s) (s, LineWithFactors) m ()
featureExtractor mTokenizer bbdo = CC.map extract
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
featureExtractor :: (Monad m, FeatureSource s) => Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> ConduitT (Double, s) (s, LineWithFactors) m ()
featureExtractor mTokenizer bbdo mReferences = CC.map extract
.| finalFeatures (bbdoCartesian bbdo) (fromMaybe (bbdoMinFrequency bbdo) (bbdoMinCartesianFrequency bbdo))
where extract (rank, line) =
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo (mainLineRecord line))
(line, LineWithPeggedFactors rank (getScore line) $ getFeatures mTokenizer bbdo mReferences (mainLineRecord line))
finalFeatures :: Monad m => Bool -> Integer -> ConduitT (a, LineWithPeggedFactors) (a, LineWithFactors) m ()
finalFeatures False _ = CC.map (\(l, p) -> (l, peggedToUnaryLine p))
@@ -199,12 +216,17 @@ filtreCartesian True = CC.concatMapAccum step S.empty
peggedToUnaryLine :: LineWithPeggedFactors -> LineWithFactors
peggedToUnaryLine (LineWithPeggedFactors rank score fs) = LineWithFactors rank score (Prelude.map UnaryFactor fs)
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> LineRecord -> [PeggedFactor]
getFeatures mTokenizer bbdo (LineRecord inLine expLine outLine _ _) =
getFeatures :: Maybe Tokenizer -> BlackBoxDebuggingOptions -> Maybe References -> LineRecord -> [PeggedFactor]
getFeatures mTokenizer bbdo mReferences (LineRecord inLine expLine outLine _ _) =
Data.List.concat [
extractFactors mTokenizer bbdo "exp" expLine,
extractFactorsFromTabbed mTokenizer bbdo "in" inLine,
extractFactors mTokenizer bbdo "out" outLine]
extractFactors mTokenizer bbdo mReferencesData "exp" expLine,
extractFactorsFromTabbed mTokenizer bbdo mReferencesData "in" inLine,
extractFactors mTokenizer bbdo mReferencesData "out" outLine]
where mReferencesData = case mReferences of
Just references -> Just $ ReferencesData {
referencesDataReferences = references,
referencesDataCurrentId = Nothing }
Nothing -> Nothing
data FeatureAggregate = ExistentialFactorAggregate Double MetricValue Integer
| NumericalValueAggregate [Double] [MetricValue] [Int] [MetricValue]
@@ -327,10 +349,15 @@ lessByMetric reversed metric = lessByMetric' reversed (getMetricOrdering metric)
(\(LineRecord _ _ _ _ scoreA) (LineRecord _ _ _ _ scoreB) ->
scoreA < scoreB)
runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> ConduitT LineRecord Void (ResourceT IO) a -> IO a
runLineByLineGeneralized :: ResultOrdering -> GEvalSpecification -> (Maybe References -> ConduitT LineRecord Void (ResourceT IO) a) -> IO a
runLineByLineGeneralized ordering spec consum = do
mReferences <- case gesReferences spec of
Just referencesFp -> do
references <- readReferences referencesFp
return $ Just references
Nothing -> return Nothing
(inputFilePath, expectedFilePath, outFilePath) <- checkAndGetFilesSingleOut True spec
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum)
gevalLineByLineCore metric mSelector preprocess inputFilePath expectedFilePath outFilePath (sorter ordering .| consum mReferences)
where metric = gesMainMetric spec
mSelector = gesSelector spec
preprocess = gesPreprocess spec
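
runLineByLineGeneralized (and runDiffGeneralized below) now take a continuation of type Maybe References -> ConduitT ..., so that the references loaded from gesReferences can be handed to the consumer. A consumer that does not need them can simply ignore the argument, which is what the updated test suite does with const; a minimal sketch:

    -- sketch: a reference-agnostic consumer that just collects the records
    collectRecords :: GEvalSpecification -> IO [LineRecord]
    collectRecords spec =
      runLineByLineGeneralized KeepTheOriginalOrder spec (const CL.consume)
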
@@ -340,6 +367,7 @@ runLineByLineGeneralized ordering spec consum = do
sortOrder FirstTheBest TheLowerTheBetter = compareScores
sortOrder _ _ = flip compareScores
compareScores (LineRecord _ _ _ _ s1) (LineRecord _ _ _ _ s2) = s1 `compare` s2
mReferences = Nothing
gobbleAndDo :: Monad m => ([a] -> [b]) -> ConduitT a b m ()
gobbleAndDo fun = do
@@ -348,12 +376,12 @@ gobbleAndDo fun = do
runDiff :: ResultOrdering -> Maybe String -> FilePath -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runDiff ordering featureFilter otherOut spec bbdo = runDiffGeneralized ordering otherOut spec consum
where consum :: ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = CL.filter shouldBeShown
.| runFeatureFilter featureFilter spec bbdo
.| CL.map (encodeUtf8 . formatOutput)
.| CC.unlinesAscii
.| CC.stdout
where consum :: Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = \mReferences -> CL.filter shouldBeShown
.| runFeatureFilter featureFilter spec bbdo mReferences
.| CL.map (encodeUtf8 . formatOutput)
.| CC.unlinesAscii
.| CC.stdout
shouldBeShown (LineRecord _ _ outA _ scoreA, LineRecord _ _ outB _ scoreB) =
outA /= outB && scoreA /= scoreB
formatOutput (LineRecord inp exp outA _ scoreA, LineRecord _ _ outB _ scoreB) = Data.Text.intercalate "\t" [
@@ -366,21 +394,22 @@ runDiff ordering featureFilter otherOut spec bbdo = runDiffGeneralized ordering
formatScoreDiff = Data.Text.pack . printf "%f"
runMostWorseningFeatures :: ResultOrdering -> FilePath -> GEvalSpecification -> BlackBoxDebuggingOptions -> IO ()
runMostWorseningFeatures ordering otherOut spec bbdo = runDiffGeneralized ordering' otherOut spec consum
runMostWorseningFeatures ordering otherOut spec bbdo = runDiffGeneralized ordering' otherOut spec consum
where ordering' = forceSomeOrdering ordering
reversed = case ordering of
KeepTheOriginalOrder -> False
FirstTheWorst -> False
FirstTheBest -> True
consum :: ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = CC.map prepareFakeLineRecord
.| (worstFeaturesPipeline reversed spec bbdo)
consum :: Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) ()
consum = \mReferences -> CC.map prepareFakeLineRecord
.| (worstFeaturesPipeline reversed spec bbdo mReferences)
prepareFakeLineRecord :: (LineRecord, LineRecord) -> LineRecord
prepareFakeLineRecord (LineRecord _ _ _ _ scorePrev, LineRecord inp exp out c score) =
LineRecord inp exp out c (score - scorePrev)
mReferences = Nothing
runDiffGeneralized :: ResultOrdering -> FilePath -> GEvalSpecification -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) a -> IO a
runDiffGeneralized :: ResultOrdering -> FilePath -> GEvalSpecification -> (Maybe References -> ConduitT (LineRecord, LineRecord) Void (ResourceT IO) a) -> IO a
runDiffGeneralized ordering otherOut spec consum = do
(inputSource, expectedSource, outSource) <- checkAndGetFilesSingleOut True spec
ooss <- getSmartSourceSpec ((gesOutDirectory spec) </> (gesTestName spec)) "out.tsv" otherOut
@@ -394,7 +423,7 @@ runDiffGeneralized ordering otherOut spec consum = do
runResourceT $ runConduit $
((getZipSource $ (,)
<$> ZipSource sourceA
<*> ZipSource sourceB) .| sorter ordering .| consum)
<*> ZipSource sourceB) .| sorter ordering .| consum mReferences)
where metric = gesMainMetric spec
preprocess = gesPreprocess spec
mSelector = gesSelector spec
@@ -406,6 +435,7 @@ runDiffGeneralized ordering otherOut spec consum = do
compareScores ((LineRecord _ _ _ _ o1), (LineRecord _ _ _ _ n1))
((LineRecord _ _ _ _ o2), (LineRecord _ _ _ _ n2))
= (n1 - o1) `compare` (n2 - o2)
mReferences = Nothing
escapeTabs :: Text -> Text

View File

@@ -177,6 +177,12 @@ specParser = GEvalSpecification
<> help "Submit ONLY: Specification of a git-annex remote."
)
)
<*> ( optional . strOption $
( long "references"
<> metavar "FILE"
<> help "External text file referenced"
)
)
defaultMinFrequency :: Integer
defaultMinFrequency = 1
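
The feature is exposed on the command line as an optional --references FILE flag, parsed into the Maybe String stored in gesReferences. A standalone sketch of the same optparse-applicative pattern (the wiring around the option is made up; only the option itself mirrors the commit):

    import Options.Applicative

    referencesOpt :: Parser (Maybe FilePath)
    referencesOpt = optional . strOption $
         long "references"
      <> metavar "FILE"
      <> help "External text file referenced"

    main :: IO ()
    main = execParser (info (referencesOpt <**> helper) fullDesc) >>= print
    -- e.g. running with "--references refs.tsv" prints Just "refs.tsv"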

View File

@@ -401,15 +401,16 @@ main = hspec $ do
gesTokenizer = Nothing,
gesGonitoHost = Nothing,
gesToken = Nothing,
gesGonitoGitAnnexRemote = Nothing}
gesGonitoGitAnnexRemote = Nothing,
gesReferences = Nothing }
it "simple test" $ do
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge Data.Conduit.List.consume
results <- runLineByLineGeneralized KeepTheOriginalOrder sampleChallenge (const Data.Conduit.List.consume)
Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results `shouldBe` ["foo",
"bar",
"baz",
"baq"]
it "test sorting" $ do
results <- runLineByLineGeneralized FirstTheWorst sampleChallenge Data.Conduit.List.consume
results <- runLineByLineGeneralized FirstTheWorst sampleChallenge (const Data.Conduit.List.consume)
Prelude.head (Prelude.map (\(LineRecord inp _ _ _ _) -> inp) results) `shouldBe` "baq"
describe "handle --alt-metric option" $ do
it "accuracy instead of likelihood" $ do
@@ -546,7 +547,7 @@ main = hspec $ do
bbdoCartesian = False,
bbdoMinCartesianFrequency = Nothing,
bbdoConsiderNumericalFeatures = True }
(sort $ extractFactorsFromTabbed Nothing bbdo "in" "I like this\t34.3\ttests") `shouldBe` [
(sort $ extractFactorsFromTabbed Nothing bbdo Nothing "in" "I like this\t34.3\ttests") `shouldBe` [
PeggedFactor (FeatureTabbedNamespace "in" 1)
(SimpleExistentialFactor (SimpleAtomicFactor (TextFactor "I"))),
PeggedFactor (FeatureTabbedNamespace "in" 1)