Add plotting graphs for selected metrics

2019-03-15 14:58:16 +01:00 · 2019-03-15 14:58:16 +01:00 · eb10a4c3b4
commit eb10a4c3b4
parent bbcb11c498
6 changed files with 152 additions and 68 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -88,6 +88,8 @@ library
                     , aeson
                     , aeson-pretty
                     , integration
+                     , Chart
+                     , Chart-cairo
  default-language:    Haskell2010

 executable geval
--- a/src/GEval/Common.hs
+++ b/src/GEval/Common.hs
@ -8,6 +8,16 @@ import Data.Attoparsec.Text

 type MetricValue = Double

+-- | A sequence of (x, y) points making up one line that can be plotted.
+-- A @newtype@ is enough here: there is exactly one constructor with one field.
+newtype GraphSeries = GraphSeries [(Double, Double)]
+
+-- | Result of evaluating a single metric: the numeric score together with
+-- an optional series of points that can be plotted for that metric.
+data MetricOutput = MetricOutput MetricValue (Maybe GraphSeries)
+
+-- | Extract the numeric score from a metric output, ignoring any graph data.
+getMetricValue :: MetricOutput -> MetricValue
+getMetricValue output = case output of
+  MetricOutput score _ -> score
+
+-- | Extract the optional graph series from a metric output, ignoring the score.
+getGraphSeries :: MetricOutput -> Maybe GraphSeries
+getGraphSeries output = case output of
+  MetricOutput _ series -> series
+
 -- some operations can be "hard" (on ints) or "soft" (on doubles),
 -- introduce a typeclass so that we could generalise easily
 class ConvertibleToDouble n where
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@ -66,7 +66,7 @@ import Data.Maybe
 import Data.Either (rights)
 import Data.Tuple
 import qualified Data.List.Split as DLS
-import Data.List (sortBy, isSuffixOf)
+import Data.List (sortBy, isSuffixOf, minimum, maximum)
 import Text.NaturalComp

 import Control.Monad.IO.Class
@ -98,9 +98,12 @@ import qualified Data.HashMap.Strict as M
 import qualified Data.Vector as V
 import qualified Data.Vector.Generic as VG

+import qualified Data.Vector.Unboxed as DVU
+
 import Statistics.Correlation

 import Data.Statistics.Calibration(softCalibration)
+import Data.Statistics.Loess(loess)

 import Data.Proxy

@ -290,6 +293,7 @@ data GEvalSpecification = GEvalSpecification
                            gesToken :: Maybe String,
                            gesGonitoGitAnnexRemote :: Maybe String}

+
 gesMainMetric :: GEvalSpecification -> Metric
 gesMainMetric spec = case gesMetrics spec of
  (metric:_) -> metric
@ -316,7 +320,8 @@ data GEvalOptions = GEvalOptions
                      geoResultOrdering :: ResultOrdering,
                      geoFilter :: Maybe String,
                      geoSpec :: GEvalSpecification,
-                      geoBlackBoxDebugginsOptions :: BlackBoxDebuggingOptions }
+                      geoBlackBoxDebugginsOptions :: BlackBoxDebuggingOptions,
+                      geoGraphFile :: Maybe FilePath }

 data GEvalException = NoExpectedFile FilePath
                      | NoOutFile FilePath
@ -388,13 +393,16 @@ extensionsHandled = ["tsv", "jsonl"]

 data LineSource m = LineSource (ConduitT () Text m ()) (Text -> ItemTarget) (Text -> Text) SourceSpec Word32

-geval :: GEvalSpecification -> IO [(SourceSpec, [MetricValue])]
+geval :: GEvalSpecification -> IO [(SourceSpec, [MetricOutput])]
 geval gevalSpec = do
  (inputSource, expectedSource, outSources) <- checkAndGetFiles False gevalSpec
  results <- Prelude.mapM (gevalOnSingleOut gevalSpec inputSource expectedSource) outSources
  return $ sortBy (\a b ->  (show $ fst a) `naturalComp` (show $ fst b)) results

-gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricValue])
+-- | Graph generator for metrics that have nothing to plot: the aggregated
+-- value is ignored and the result is always 'Nothing'.
+noGraph :: d -> Maybe GraphSeries
+noGraph _ = Nothing
+
+gevalOnSingleOut :: GEvalSpecification -> SourceSpec -> SourceSpec -> SourceSpec -> IO (SourceSpec, [MetricOutput])
 gevalOnSingleOut gevalSpec inputSource expectedSource outSource = do
  vals <- Prelude.mapM (\metric -> gevalCore metric mSelector preprocess inputSource expectedSource outSource) metrics
  return (outSource, vals)
@ -530,7 +538,7 @@ getDataFormatFromFilePath path =

 dataDecoder fmt mSelector = CC.map (select fmt mSelector)

-gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> IO (MetricValue)
+gevalCoreOnSingleLines :: Metric -> (Text -> Text) -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> (Text -> ItemTarget) -> LineInFile -> IO (MetricOutput)
 gevalCoreOnSingleLines metric preprocess inpDecoder inpLine expDecoder expLine outDecoder outLine =
  gevalCoreOnSources metric (singleLineAsLineSource inpLine inpDecoder preprocess)
                            (singleLineAsLineSource expLine expDecoder outputPreprocess)
@ -553,7 +561,7 @@ gevalCore :: Metric           -- ^ evaluation metric
          -> SourceSpec       -- ^ source specification for the input values
          -> SourceSpec       -- ^ source specification for the expected output
          -> SourceSpec       -- ^ source specification for the output
-          -> IO (MetricValue) -- ^ metric value for the output against the expected output
+          -> IO (MetricOutput) -- ^ metric value for the output against the expected output
 gevalCore metric mSelector preprocess inputSource expectedSource outSource = do
  whenM (isEmptyFileSource outSource) $ throwM $ EmptyOutput
  gevalCoreOnSources metric
@ -585,22 +593,22 @@ gevalCoreOnSources :: (MonadIO m, MonadThrow m, MonadUnliftIO m) =>
                     -> LineSource (ResourceT m)  -- ^ source of the input values
                     -> LineSource (ResourceT m)  -- ^ source to read the expected output
                     -> LineSource (ResourceT m)  -- ^ source to read the output
-                     -> m (MetricValue)           -- ^ metric values for the output against the expected output
+                     -> m (MetricOutput)           -- ^ metric values for the output against the expected output
 gevalCoreOnSources RMSE inputLineSource expectedLineSource outLineSource = do
-  mse <- gevalCoreOnSources MSE inputLineSource expectedLineSource outLineSource
-  return $ mse ** 0.5
+  MetricOutput mse g <- gevalCoreOnSources MSE inputLineSource expectedLineSource outLineSource
+  return $ MetricOutput (mse ** 0.5) g

 gevalCoreOnSources Likelihood inputLineSource expectedLineSource outLineSource = do
-  logLoss <- gevalCoreOnSources LogLoss inputLineSource expectedLineSource outLineSource
-  return $ logLossToLikehood logLoss
+  MetricOutput logLoss g <- gevalCoreOnSources LogLoss inputLineSource expectedLineSource outLineSource
+  return $ MetricOutput (logLossToLikehood logLoss) g

 gevalCoreOnSources (LikelihoodHashed b) inputLineSource expectedLineSource outLineSource = do
-  logLoss <- gevalCoreOnSources (LogLossHashed b) inputLineSource expectedLineSource outLineSource
-  return $ logLossToLikehood logLoss
+  MetricOutput logLoss g <- gevalCoreOnSources (LogLossHashed b) inputLineSource expectedLineSource outLineSource
+  return $ MetricOutput (logLossToLikehood logLoss) g

 gevalCoreOnSources MultiLabelLikelihood inputLineSource expectedLineSource outLineSource = do
-  logLoss <- gevalCoreOnSources MultiLabelLogLoss inputLineSource expectedLineSource outLineSource
-  return $ logLossToLikehood logLoss
+  MetricOutput logLoss g <- gevalCoreOnSources MultiLabelLogLoss inputLineSource expectedLineSource outLineSource
+  return $ MetricOutput (logLossToLikehood logLoss) g

 gevalCoreOnSources metric inputLineSource expectedLineSource outLineSource = do
  gevalCore' metric inputLineSource expectedLineSource outLineSource
@ -620,24 +628,24 @@ gevalCore' :: (MonadIO m, MonadThrow m, MonadUnliftIO m) =>
           -> LineSource (ResourceT m)  -- ^ source of the input values
           -> LineSource (ResourceT m)  -- ^ source to read the expected output
           -> LineSource (ResourceT m)  -- ^ source to read the output
-           -> m (MetricValue)           -- ^ metric values for the output against the expected output
-gevalCore' MSE _ = gevalCoreWithoutInput outParser outParser itemSquaredError averageC id
+           -> m (MetricOutput)           -- ^ metric values for the output against the expected output
+gevalCore' MSE _ = gevalCoreWithoutInput outParser outParser itemSquaredError averageC id noGraph
  where outParser = getValue . TR.double

-gevalCore' MAE _ = gevalCoreWithoutInput outParser outParser itemAbsoluteError averageC id
+gevalCore' MAE _ = gevalCoreWithoutInput outParser outParser itemAbsoluteError averageC id noGraph
  where outParser = getValue . TR.double

-gevalCore' SMAPE _ = gevalCoreWithoutInput outParser outParser smape averageC (* 100.0)
+gevalCore' SMAPE _ = gevalCoreWithoutInput outParser outParser smape averageC (* 100.0) noGraph
  where outParser = getValue . TR.double
        smape (exp, out) = (abs (exp-out)) `safeDoubleDiv` ((abs exp) + (abs out))

 gevalCore' Pearson _ = gevalCoreByCorrelationMeasure pearson
 gevalCore' Spearman _ = gevalCoreByCorrelationMeasure spearman

-gevalCore' LogLoss _ = gevalCoreWithoutInput outParser outParser itemLogLossError averageC id
+gevalCore' LogLoss _ = gevalCoreWithoutInput outParser outParser itemLogLossError averageC id noGraph
  where outParser = getValue . TR.double

-gevalCore' BLEU _ = gevalCoreWithoutInput (Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack) (Right . Prelude.words . unpack) bleuCombine bleuAgg bleuFinal
+gevalCore' BLEU _ = gevalCoreWithoutInput (Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack) (Right . Prelude.words . unpack) bleuCombine bleuAgg bleuFinal noGraph
  where bleuFinal (p1, p2, p3, p4, rl, l1, l2, l3, l4) = ((p1 /. l1) * (p2 /. l2) * (p3 /. l3) * (p4 /. l4)) ** 0.25 * (brevityPenalty l1 rl)
        bleuCombine (refs, sen) = bleuStep refs sen
        bleuAgg = CC.foldl bleuFuse (0, 0, 0, 0, 0,  0, 0, 0, 0)
@ -647,15 +655,15 @@ gevalCore' BLEU _ = gevalCoreWithoutInput (Right . Prelude.map Prelude.words . D
          | c == 0 && r > 0 = 0.0
          | otherwise = exp (1.0 - (r /. c))

-gevalCore' GLEU _ = gevalCoreWithoutInput (Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack) (Right . Prelude.words . unpack) gleuCombine gleuAgg gleuFinal
+gevalCore' GLEU _ = gevalCoreWithoutInput (Right . Prelude.map Prelude.words . DLS.splitOn "\t" . unpack) (Right . Prelude.words . unpack) gleuCombine gleuAgg gleuFinal noGraph
  where gleuFinal (m, t) = m /. t
        gleuCombine (refs, sen) = gleuStep refs sen
        gleuAgg = CC.foldl gleuFuse (0, 0)
        gleuFuse (a1, a2) (b1, b2) = (a1+b1, a2+b2)

-gevalCore' WER _ = gevalCoreWithoutInput (Right . Prelude.words . unpack) (Right . Prelude.words . unpack) (uncurry werStep) averageC id
+gevalCore' WER _ = gevalCoreWithoutInput (Right . Prelude.words . unpack) (Right . Prelude.words . unpack) (uncurry werStep) averageC id noGraph

-gevalCore' Accuracy _ = gevalCoreWithoutInput (Right . strip) (Right . strip) hitOrMiss averageC id
+gevalCore' Accuracy _ = gevalCoreWithoutInput (Right . strip) (Right . strip) hitOrMiss averageC id noGraph
                      where hitOrMiss (exp, got) =
                              -- first try to parse what we got as a probability distribution
                              -- (like the one used for Likelikehood/LogLossHashed metric)
@ -680,7 +688,7 @@ gevalCore' Accuracy _ = gevalCoreWithoutInput (Right . strip) (Right . strip) hi
                            tryReadingAsFloat :: Text -> Maybe Float
                            tryReadingAsFloat = readMaybe . unpack

-gevalCore' (FMeasure beta) _ = gevalCoreWithoutInput outParser outParser getCount countAgg (fMeasureOnCounts beta)
+gevalCore' (FMeasure beta) _ = gevalCoreWithoutInput outParser outParser getCount countAgg (fMeasureOnCounts beta) noGraph
  where outParser = detected <=< (getValue . TR.double)
        expParser = expected <=< (getValue . TR.decimal)
        expected 1 = Right True
@ -698,7 +706,7 @@ gevalCore' (FMeasure beta) _ = gevalCoreWithoutInput outParser outParser getCoun
        getCount (False, True)  = (0, 0, 1)
        getCount (False, False) = (0, 0, 0)

-gevalCore' (MacroFMeasure beta) _ = gevalCoreWithoutInput (Right . Just . strip) (Right . predicted . strip) getClassesInvolved gatherClassC macroAverageOnCounts
+gevalCore' (MacroFMeasure beta) _ = gevalCoreWithoutInput (Right . Just . strip) (Right . predicted . strip) getClassesInvolved gatherClassC macroAverageOnCounts noGraph
                      where predicted got =
                              -- first try to parse what we got as a probability distribution
                              -- (like the one used for Likelikehood/LogLossHashed metric)
@ -732,6 +740,7 @@ gevalCore' (SoftFMeasure beta) _ = gevalCoreWithoutInput parseAnnotations
                                                         getSoftCounts
                                                         countAgg
                                                         (fMeasureOnCounts beta)
+                                                         noGraph
                      where getSoftCounts (expected, got) = (weightedMaxMatch matchScore expected got,
                                                             Prelude.length expected,
                                                             Prelude.length got)
@ -741,15 +750,22 @@ gevalCore' (ProbabilisticSoftFMeasure beta) _ = gevalCoreWithoutInput parseAnnot
                                                                      getProbabilisticSoftCounts
                                                                      probabilisticSoftAgg
                                                                      (fMeasureOnProbabilisticCounts beta)
+                                                                      loessGraph
  where probabilisticSoftAgg :: Monad m => ConduitM ([Double], [Double], Double, Int) o m ([Double], [Double], Double, Int)
        probabilisticSoftAgg = CC.foldl probabilisticSoftFolder ([], [], fromInteger 0, 0)
        probabilisticSoftFolder (r1, p1, g1, e1) (r2, p2, g2, e2) = (r1 ++ r2, p1 ++ p2, g1 + g2, e1 + e2)
+        -- Builds the curve plotted for this metric: the loess estimate is
+        -- sampled at 0.01, 0.02, ..., 0.99, keeping only the sample points
+        -- that lie within the range of the observed probabilities.
+        -- NOTE(review): the sample points are probabilities, yet results' is
+        -- passed to loess before probs' -- confirm loess's argument order.
+        loessGraph :: ([Double], [Double], Double, Int) -> Maybe GraphSeries
+        -- Without any probabilities there is no range to plot, and taking the
+        -- minimum/maximum of an empty list would fail, so produce no graph.
+        loessGraph (_, [], _, _) = Nothing
+        loessGraph (results, probs, _, _) = Just $ GraphSeries
+                                            $ Prelude.map (\x -> (x, loess results' probs' x))
+                                            $ Prelude.filter (\p -> p >= lowest && p <= highest)
+                                            $ Prelude.map (\d -> 0.01 * (fromIntegral d)) [1..99]
+           where results' = DVU.fromList results
+                 probs' = DVU.fromList probs
+                 lowest = Data.List.minimum probs
+                 highest = Data.List.maximum probs
        fMeasureOnProbabilisticCounts :: Double -> ([Double], [Double], Double, Int) -> Double
        fMeasureOnProbabilisticCounts beta (results, probs, got, nbExpected) = weightedHarmonicMean beta calibrationMeasure recall
           where calibrationMeasure = softCalibration results probs
                 recall = got /. nbExpected

-gevalCore' ClippEU _ = gevalCoreWithoutInput parseClippingSpecs parseClippings matchStep clippeuAgg finalStep
+gevalCore' ClippEU _ = gevalCoreWithoutInput parseClippingSpecs parseClippings matchStep clippeuAgg finalStep noGraph
  where
    parseClippings = controlledParse lineClippingsParser
    parseClippingSpecs = controlledParse lineClippingSpecsParser
@ -759,18 +775,19 @@ gevalCore' ClippEU _ = gevalCoreWithoutInput parseClippingSpecs parseClippings m
    clippeuAgg = CC.foldl countFolder (0, 0, 0)
    finalStep counts = f2MeasureOnCounts counts

-gevalCore' NMI _ = gevalCoreWithoutInput (Right . id) (Right . id) id (CC.foldl updateConfusionMatrix M.empty) normalizedMutualInformationFromConfusionMatrix
+gevalCore' NMI _ = gevalCoreWithoutInput (Right . id) (Right . id) id (CC.foldl updateConfusionMatrix M.empty) normalizedMutualInformationFromConfusionMatrix noGraph

 gevalCore' MAP _ = gevalCoreWithoutInput (Right . DLS.splitOn "\t" . unpack)
                                         (Right . DLS.splitOn "\t" . unpack)
                                         (\(e,g) -> calculateMAPForOneResult e g)
                                         averageC
                                         id
+                                         noGraph

 gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits
  -- for LogLossHashed we "salt" each hash with the line number
  where helper nbOfBits expectedLineSource outLineSource =
-          gevalCore''' (ParserSpecWithoutInput (liftOp (Right . id)) (liftOp tentativeParser)) (\(lineNo, (t,d)) -> calculateLogLoss nbOfBits lineNo t (parseDistributionWrapper nbOfBits lineNo d)) averageC negate (WithoutInput expectedLineSource outLineSource)
+          gevalCore''' (ParserSpecWithoutInput (liftOp (Right . id)) (liftOp tentativeParser)) (\(lineNo, (t,d)) -> calculateLogLoss nbOfBits lineNo t (parseDistributionWrapper nbOfBits lineNo d)) averageC negate noGraph (WithoutInput expectedLineSource outLineSource)
        -- Unfortunately, we're parsing the distribution twice. We need to
        -- tentatively parse the distribution when the line number is unknown
        -- (so we just set it to 1)
@ -782,14 +799,14 @@ gevalCore' (LogLossHashed nbOfBits) _ = helper nbOfBits
 gevalCore' CharMatch inputLineSource = helper inputLineSource
 where
   helper inputLineSource expectedLineSource outputLineSource = do
-     gevalCoreGeneralized (ParserSpecWithInput justUnpack justUnpack justUnpack) step countAgg (fMeasureOnCounts charMatchBeta) (WithInput inputLineSource expectedLineSource outputLineSource)
+     gevalCoreGeneralized (ParserSpecWithInput justUnpack justUnpack justUnpack) step countAgg (fMeasureOnCounts charMatchBeta) noGraph (WithInput inputLineSource expectedLineSource outputLineSource)
   step (ParsedRecordWithInput inp exp out) = getCharMatchCount inp exp out
   justUnpack = liftOp (Right . unpack)


-gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
+gevalCore' BIOF1 _ = gevalCoreWithoutInput parseBioSequenceIntoEntities parseBioSequenceIntoEntities (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts noGraph

-gevalCore' BIOF1Labels _ = gevalCoreWithoutInput parseBioSequenceIntoEntitiesWithoutNormalization parseBioSequenceIntoEntitiesWithoutNormalization (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts
+gevalCore' BIOF1Labels _ = gevalCoreWithoutInput parseBioSequenceIntoEntitiesWithoutNormalization parseBioSequenceIntoEntitiesWithoutNormalization (uncurry gatherCountsForBIO) countAgg f1MeasureOnCounts noGraph
   where parseBioSequenceIntoEntitiesWithoutNormalization s = do
           entities <- parseBioSequenceIntoEntities s
           return $ Prelude.map eraseNormalisation entities
@ -799,6 +816,7 @@ gevalCore' TokenAccuracy _ = gevalCoreWithoutInput intoTokens
                                                   countHitsAndTotals
                                                   hitsAndTotalsAgg
                                                   (\(hits, total) -> hits /. total)
+                                                   noGraph
   where intoTokens = Right . Data.Text.words
         countHitsAndTotals :: ([Text], [Text]) -> (Int, Int)
         countHitsAndTotals (es, os) =
@ -820,6 +838,7 @@ gevalCore' (MultiLabelFMeasure beta) _ = gevalCoreWithoutInputOnItemTargets (Rig
                                                                            (getCounts (==))
                                                                            countAgg
                                                                            (fMeasureOnCounts beta)
+                                                                            noGraph
    where
      getWords (RawItemTarget t) = Prelude.map unpack $ selectByStandardThreshold $ parseIntoProbList t
      getWords (PartiallyParsedItemTarget ts) = Prelude.map unpack ts
@ -831,6 +850,7 @@ gevalCore' MultiLabelLogLoss _ = gevalCoreWithoutInput intoWords
                                                       (uncurry countLogLossOnProbList)
                                                       averageC
                                                       id
+                                                       noGraph
    where
      intoWords = Right . Data.Text.words

@ -841,9 +861,9 @@ gevalCoreByCorrelationMeasure :: (MonadUnliftIO m, MonadThrow m, MonadIO m) =>
                                (V.Vector (Double, Double) -> Double) -> -- ^ correlation function
                                LineSource (ResourceT m) ->  -- ^ source to read the expected output
                                LineSource (ResourceT m) ->  -- ^ source to read the output
-                                m (MetricValue)             -- ^ metric values for the output against the expected output
+                                m (MetricOutput)             -- ^ metric values for the output against the expected output
 gevalCoreByCorrelationMeasure correlationFunction =
-  gevalCoreWithoutInput outParser outParser id correlationC finalStep
+  gevalCoreWithoutInput outParser outParser id correlationC finalStep noGraph
  where outParser = getValue . TR.double
        correlationC = CC.foldl (flip (:)) []
        finalStep pairs = correlationFunction $ V.fromList pairs
@ -868,11 +888,12 @@ gevalCoreWithoutInput :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
                      -> (ConduitT c Void (ResourceT m) d)  -- ^ a Conduit which aggregates all the combined values into
                                                   -- a "total" value
                      -> (d -> Double)             -- ^ function to transform the "total" value into the final score
+                      -> (d -> Maybe GraphSeries)
                      -> LineSource (ResourceT m)  -- ^ source to read the expected output
                      -> LineSource (ResourceT m)  -- ^ source to read the output
-                      -> m (MetricValue)           -- ^ metric values for the output against the expected output
-gevalCoreWithoutInput expParser outParser itemStep aggregator finalStep expectedLineStream outLineStream =
-  gevalCoreWithoutInputOnItemTargets (liftOp expParser) (liftOp outParser) itemStep aggregator finalStep expectedLineStream outLineStream
+                      -> m (MetricOutput)           -- ^ metric values for the output against the expected output
+gevalCoreWithoutInput expParser outParser itemStep aggregator finalStep generateGraph expectedLineStream outLineStream =
+  gevalCoreWithoutInputOnItemTargets (liftOp expParser) (liftOp outParser) itemStep aggregator finalStep generateGraph expectedLineStream outLineStream

 gevalCoreWithoutInputOnItemTargets :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
                      => (ItemTarget -> Either String a) -- ^ parser for values in the expected output
@ -883,20 +904,21 @@ gevalCoreWithoutInputOnItemTargets :: (MonadUnliftIO m, MonadThrow m, MonadIO m)
                      -> (ConduitT c Void (ResourceT m) d)  -- ^ a Conduit which aggregates all the combined values into
                                                   -- a "total" value
                      -> (d -> Double)             -- ^ function to transform the "total" value into the final score
+                      -> (d -> Maybe GraphSeries)
                      -> LineSource (ResourceT m)  -- ^ source to read the expected output
                      -> LineSource (ResourceT m)  -- ^ source to read the output
-                      -> m (MetricValue)           -- ^ metric values for the output against the expected output
-gevalCoreWithoutInputOnItemTargets expParser outParser itemStep aggregator finalStep expectedLineStream outLineStream =
-  gevalCoreGeneralized (ParserSpecWithoutInput expParser outParser) (trans itemStep) aggregator finalStep (WithoutInput expectedLineStream outLineStream)
+                      -> m (MetricOutput)           -- ^ metric values for the output against the expected output
+gevalCoreWithoutInputOnItemTargets expParser outParser itemStep aggregator finalStep generateGraph expectedLineStream outLineStream =
+  gevalCoreGeneralized (ParserSpecWithoutInput expParser outParser) (trans itemStep) aggregator finalStep generateGraph (WithoutInput expectedLineStream outLineStream)
 where
   trans :: ((a, b) -> c) -> ParsedRecord (WithoutInput m a b) -> c
   trans step (ParsedRecordWithoutInput x y) = step (x, y)



-gevalCore''' :: (MonadUnliftIO m, MonadThrow m, MonadIO m) => ParserSpec (WithoutInput m a b) -> ((Word32, (a, b)) -> c) -> (ConduitT c Void (ResourceT m) d) -> (d -> Double) -> WithoutInput m a b -> m (MetricValue)
-gevalCore''' parserSpec itemStep aggregator finalStep context =
-  gevalCoreGeneralized' parserSpec (trans itemStep) aggregator finalStep context
+gevalCore''' :: (MonadUnliftIO m, MonadThrow m, MonadIO m) => ParserSpec (WithoutInput m a b) -> ((Word32, (a, b)) -> c) -> (ConduitT c Void (ResourceT m) d) -> (d -> Double) -> (d -> Maybe GraphSeries) -> WithoutInput m a b -> m (MetricOutput)
+gevalCore''' parserSpec itemStep aggregator finalStep generateGraph context =
+  gevalCoreGeneralized' parserSpec (trans itemStep) aggregator finalStep generateGraph context
 where
   trans :: ((Word32, (a, b)) -> c) -> (Word32, ParsedRecord (WithoutInput m a b)) -> c
   trans step (n, ParsedRecordWithoutInput x y) = step (n, (x, y))
@ -911,18 +933,19 @@ gevalCoreGeneralized :: (EvaluationContext ctxt m, MonadUnliftIO m, MonadThrow m
                                                             -- some "local" score calculated for each line (item)
                        -> (ConduitT c Void (ResourceT m) d) -- ^ a Conduit to aggregate score into a "total"
                        -> (d -> Double)                     -- ^ function to transform the "total" value into the final score
+                        -> (d -> Maybe GraphSeries)
                        -> ctxt                              -- ^ "context", i.e. 2 or 3 sources needed to operate
-                        -> m (MetricValue)
-gevalCoreGeneralized parserSpec itemStep aggregator finalStep context =
-  gevalCoreGeneralized' parserSpec (skipLineNumber itemStep) aggregator finalStep context
+                        -> m MetricOutput
+gevalCoreGeneralized parserSpec itemStep aggregator finalStep generateGraph context =
+  gevalCoreGeneralized' parserSpec (skipLineNumber itemStep) aggregator finalStep generateGraph context

-gevalCoreGeneralized' :: forall m ctxt c d . (EvaluationContext ctxt m, MonadUnliftIO m, MonadThrow m, MonadIO m) => ParserSpec ctxt -> ((Word32, ParsedRecord ctxt) -> c) -> (ConduitT c Void (ResourceT m) d) -> (d -> Double) -> ctxt -> m (MetricValue)
-gevalCoreGeneralized' parserSpec itemStep aggregator finalStep context = do
+gevalCoreGeneralized' :: forall m ctxt c d . (EvaluationContext ctxt m, MonadUnliftIO m, MonadThrow m, MonadIO m) => ParserSpec ctxt -> ((Word32, ParsedRecord ctxt) -> c) -> (ConduitT c Void (ResourceT m) d) -> (d -> Double) -> (d -> Maybe GraphSeries) -> ctxt -> m (MetricOutput)
+gevalCoreGeneralized' parserSpec itemStep aggregator finalStep generateGraph context = do
   v <- runResourceT $ runConduit $
     (((getZipSource $ (,)
       <$> ZipSource (CL.sourceList [(getFirstLineNo (Proxy :: Proxy m) context)..])
       <*> (ZipSource $ recordSource context parserSpec)) .| CL.map (checkStep (Proxy :: Proxy m) itemStep)) .| CL.catMaybes .| aggregator)
-   return $ finalStep v
+   return $ MetricOutput (finalStep v) (generateGraph v)

 -- | A type family to handle all the evaluation "context".
 --
--- a/src/GEval/LineByLine.hs
+++ b/src/GEval/LineByLine.hs
@ -431,7 +431,7 @@ gevalLineByLineSource metric mSelector preprocess inputSource expectedSource out
          s <- liftIO $ gevalCoreOnSingleLines metric preprocess (getDataDecoder inputLineSource) (LineInFile inputSource lineNo inp)
                                                                (getDataDecoder expectedLineSource) (LineInFile expectedSource lineNo exp)
                                                                (getDataDecoder outputLineSource) (LineInFile outSource lineNo out)
-          return $ LineRecord inp exp out lineNo s
+          return $ LineRecord inp exp out lineNo (getMetricValue s)

 justTokenize :: Maybe Tokenizer -> IO ()
 justTokenize Nothing = error "a tokenizer must be specified with --tokenizer option"
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@ -9,9 +9,14 @@ module GEval.OptionsParser
        precisionArgParser
        ) where

+import Debug.Trace
+
 import Paths_geval (version)
 import Data.Version (showVersion)

+import Graphics.Rendering.Chart.Easy
+import Graphics.Rendering.Chart.Backend.Cairo
+
 import Options.Applicative
 import qualified System.Directory as D
 import System.FilePath
@ -96,6 +101,10 @@ optionsParser = GEvalOptions
                   <> help "When in line-by-line or diff mode, show only items with a given feature"))
   <*> specParser
   <*> blackBoxDebuggingOptionsParser
+   <*> optional (strOption
+                 ( long "plot-graph"
+                   <> metavar "FILE-PATH"
+                   <> help "Plot an extra graph, applicable only for Probabilistic-Soft-F-score (LOESS function for calibration)"))

 precisionArgParser :: Parser Int
 precisionArgParser = option auto
@ -277,41 +286,80 @@ runGEval'' opts = runGEval''' (geoSpecialCommand opts)
                              (geoFilter opts)
                              (geoSpec opts)
                              (geoBlackBoxDebugginsOptions opts)
+                              (geoGraphFile opts)

 runGEval''' :: Maybe GEvalSpecialCommand
              -> ResultOrdering
              -> Maybe String
              -> GEvalSpecification
              -> BlackBoxDebuggingOptions
+              -> Maybe FilePath
              -> IO (Maybe [(SourceSpec, [MetricValue])])
-runGEval''' Nothing _ _ spec _ = do
-  vals <- geval spec
+-- No special command: run the evaluation, plot the graphs when a graph file
+-- was requested, and return the plain metric values.
+runGEval''' Nothing _ _ spec _ mGraphFile = do
+  vals' <- geval spec
+  maybe (return ()) (plotAll vals') mGraphFile
+  return $ Just [(s, map getMetricValue outs) | (s, outs) <- vals']
+  where
+    -- One file per metric that produced graph data; the file name is derived
+    -- from the position of the metric among those that have graphs.
+    plotAll outputs graphFile =
+      mapM_ (\(ix, d) -> plotGraph (getGraphFilename ix graphFile) d)
+            (zip [0..] (groupByMetric (gesMetrics spec) outputs))
-runGEval''' (Just Init) _ _ spec _ = do
+runGEval''' (Just Init) _ _ spec _ _ = do
  initChallenge spec
  return Nothing
-runGEval''' (Just PrintVersion) _ _ _ _ = do
+runGEval''' (Just PrintVersion) _ _ _ _ _ = do
  putStrLn ("geval " ++ showVersion version)
  return Nothing
-runGEval''' (Just LineByLine) ordering featureFilter spec bbdo = do
+runGEval''' (Just LineByLine) ordering featureFilter spec bbdo _ = do
  runLineByLine ordering featureFilter spec bbdo
  return Nothing
-runGEval''' (Just WorstFeatures) ordering _ spec bbdo = do
+runGEval''' (Just WorstFeatures) ordering _ spec bbdo _ = do
  runWorstFeatures ordering spec bbdo
  return Nothing
-runGEval''' (Just (Diff otherOut)) ordering featureFilter spec bbdo = do
+runGEval''' (Just (Diff otherOut)) ordering featureFilter spec bbdo _ = do
  runDiff ordering featureFilter otherOut spec bbdo
  return Nothing
-runGEval''' (Just (MostWorseningFeatures otherOut)) ordering _ spec bbdo = do
+runGEval''' (Just (MostWorseningFeatures otherOut)) ordering _ spec bbdo _ = do
  runMostWorseningFeatures ordering otherOut spec bbdo
  return Nothing
-runGEval''' (Just JustTokenize) _ _ spec _ = do
+runGEval''' (Just JustTokenize) _ _ spec _ _ = do
  justTokenize (gesTokenizer spec)
  return Nothing
-runGEval''' (Just Submit) _ _ spec _ = do
+runGEval''' (Just Submit) _ _ spec _ _ = do
  submit (gesGonitoHost spec) (gesToken spec) (gesGonitoGitAnnexRemote spec)
  return Nothing

+-- | Derive the file name for the graph with the given index. Index 0 keeps
+-- the path as given; any other index is inserted, preceded by a dash,
+-- between the base name and the extension.
+getGraphFilename :: Int -> FilePath -> FilePath
+getGraphFilename ix fp
+  | ix == 0 = fp
+  | otherwise = stem ++ "-" ++ show ix ++ ext
+  where (stem, ext) = splitExtension fp
+
+-- | Collect, for every requested metric, the graph series produced for each
+-- evaluated output. The i-th 'MetricOutput' of a result is taken to belong
+-- to the i-th metric. Metrics for which no output yielded a graph are
+-- dropped from the result.
+groupByMetric :: [Metric]
+                -> [(SourceSpec, [MetricOutput])]
+                -> [(Metric, [(SourceSpec, GraphSeries)])]
+groupByMetric metrics results =
+  [ (metric, series)
+  | (ix, metric) <- zip [0..] metrics
+  , let series = seriesFor ix
+  , not (null series)
+  ]
+  where
+    -- Graph series available for the metric at position ix. Pattern-matching
+    -- generators are used instead of a partial lambda and (!!), so outputs
+    -- without a graph (or without an entry at that position) are skipped.
+    seriesFor :: Int -> [(SourceSpec, GraphSeries)]
+    seriesFor ix =
+      [ (source, graph)
+      | (source, outs) <- results
+      , out <- take 1 (drop ix outs)
+      , Just graph <- [getGraphSeries out]
+      ]
+
+
+-- | Render the graph for one metric into the given file. Only
+-- 'ProbabilisticSoftFMeasure' is supported: every series is drawn as a line,
+-- preceded by a reference line from (0, 0) to (1, 1) labelled "Perfect".
+-- Any other metric ends in a call to 'error'.
+plotGraph :: FilePath -> (Metric, [(SourceSpec, GraphSeries)]) -> IO ()
+plotGraph graphFile (metric@(ProbabilisticSoftFMeasure _), seriesSpecs) =
+  toFile def graphFile $ do
+    layoutlr_title .= "GEval Graph / Loess / " ++ (show metric)
+    mapM_ drawSeries (diagonal : seriesSpecs)
+  where
+    -- reference line drawn before the actual series
+    diagonal :: (SourceSpec, GraphSeries)
+    diagonal = (FilePathSpec "Perfect", GraphSeries [(0.0, 0.0), (1.0, 1.0)])
+    -- one line per series, labelled with the path of its source
+    drawSeries :: (SourceSpec, GraphSeries) -> EC (LayoutLR Double Double Double) ()
+    drawSeries (sspec, GraphSeries points) = plotLeft (line (recoverPath sspec) [points])
+plotGraph _ _ = error "No graph for this metric!"
+
+
 initChallenge :: GEvalSpecification -> IO ()
 initChallenge spec = case gesExpectedDirectory spec of
  Nothing -> showInitInstructions
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -81,7 +81,7 @@ main :: IO ()
 main = hspec $ do
  describe "root mean square error" $ do
    it "simple test" $ do
-      [(_, (val:_))] <-  geval $ defaultGEvalSpecification {gesExpectedDirectory=Just "test/rmse-simple/rmse-simple", gesOutDirectory="test/rmse-simple/rmse-simple-solution"}
+      [(_, ((MetricOutput val _):_))] <-  geval $ defaultGEvalSpecification {gesExpectedDirectory=Just "test/rmse-simple/rmse-simple", gesOutDirectory="test/rmse-simple/rmse-simple-solution"}
      val `shouldBeAlmost` 0.64549722436790
  describe "mean square error" $ do
    it "simple test with arguments" $
@ -297,12 +297,13 @@ main = hspec $ do
      runGEvalTest "multilabel-likelihood-simple" `shouldReturnAlmost` 0.115829218528827
  describe "evaluating single lines" $ do
    it "RMSE" $ do
-      gevalCoreOnSingleLines RMSE id RawItemTarget
+      (MetricOutput v _) <- gevalCoreOnSingleLines RMSE id RawItemTarget
                                                          (LineInFile (FilePathSpec "stub1") 1 "blabla")
                                                          RawItemTarget
                                                          (LineInFile (FilePathSpec "stub2") 1 "3.4")
                                                          RawItemTarget
-                                     (LineInFile (FilePathSpec "stub3") 1 "2.6") `shouldReturnAlmost` 0.8
+                                                          (LineInFile (FilePathSpec "stub3") 1 "2.6")
+      v `shouldBeAlmost` 0.8
  describe "Annotation format" $ do
    it "just parse" $ do
      parseAnnotations "foo:3,7-10 baz:4-6" `shouldBe` Right [Annotation "foo" (IS.fromList [3,7,8,9,10]),