Start working on --list-metrics options

parent 98b398d34f
commit dab2646798
geval.cabal
@@ -17,6 +17,7 @@ library
   hs-source-dirs: src
   exposed-modules: GEval.Core
                  , GEval.Metric
+                 , GEval.MetricsMeta
                  , GEval.EvaluationScheme
                  , GEval.CreateChallenge
                  , GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
                , silently
                , vector
                , statistics
+               , filepath
   ghc-options: -threaded -rtsopts -with-rtsopts=-N
   default-language: Haskell2010

src/GEval/Core.hs
@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
                          | LineByLine | WorstFeatures
                          | Diff FilePath | MostWorseningFeatures FilePath
                          | PrintVersion | JustTokenize | Submit
-                         | Validate
+                         | Validate | ListMetrics

 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest

src/GEval/CreateChallenge.hs
@@ -1,7 +1,8 @@
 {-# LANGUAGE QuasiQuotes #-}

 module GEval.CreateChallenge
-       (createChallenge)
+       (createChallenge,
+        testExpectedContents)
        where

 import GEval.Metric
src/GEval/EvaluationScheme.hs
@@ -1,5 +1,5 @@
 module GEval.EvaluationScheme
-  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
+  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
   where

 import GEval.Metric
src/GEval/Metric.hs
@@ -5,7 +5,6 @@ module GEval.Metric
          MetricOrdering(..),
          defaultLogLossHashedSize,
          getMetricOrdering,
-         listOfAvailableMetrics,
          bestPossibleValue,
          perfectOutLineFromExpectedLine,
          fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
             | SoftFMeasure Double | ProbabilisticSoftFMeasure Double
               deriving (Eq)

--- | the list of available metrics, to be shown to the user or to be
--- | used for tests
-listOfAvailableMetrics :: [Metric]
-listOfAvailableMetrics = [RMSE,
-                          MSE,
-                          MAE,
-                          SMAPE,
-                          Pearson,
-                          Spearman,
-                          Accuracy,
-                          LogLoss,
-                          Likelihood,
-                          FMeasure 1.0,
-                          FMeasure 2.0,
-                          FMeasure 0.25,
-                          MacroFMeasure 1.0,
-                          MacroFMeasure 2.0,
-                          MacroFMeasure 0.25,
-                          MultiLabelFMeasure 1.0,
-                          MultiLabelFMeasure 2.0,
-                          MultiLabelFMeasure 0.25,
-                          MultiLabelLikelihood,
-                          MAP,
-                          BLEU,
-                          GLEU,
-                          WER,
-                          NMI,
-                          ClippEU,
-                          LogLossHashed defaultLogLossHashedSize,
-                          LikelihoodHashed defaultLogLossHashedSize,
-                          BIOF1,
-                          BIOF1Labels,
-                          TokenAccuracy,
-                          SoftFMeasure 1.0,
-                          SoftFMeasure 2.0,
-                          SoftFMeasure 0.25,
-                          ProbabilisticSoftFMeasure 1.0,
-                          ProbabilisticSoftFMeasure 2.0,
-                          ProbabilisticSoftFMeasure 0.25,
-                          CharMatch]
-
 instance Show Metric where
   show RMSE = "RMSE"
   show MSE = "MSE"
src/GEval/MetricsMeta.hs (new file, 133 lines)
@@ -0,0 +1,133 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module GEval.MetricsMeta
+  (listOfAvailableMetrics,
+   listOfAvailableEvaluationSchemes,
+   extraInfo,
+   isEvaluationSchemeDescribed,
+   getEvaluationSchemeDescription,
+   outContents,
+   expectedScore,
+   allMetricsDescription)
+  where
+
+import GEval.Common
+import GEval.Metric
+import GEval.EvaluationScheme
+import GEval.CreateChallenge (testExpectedContents)
+import GEval.PrecisionRecall (weightedHarmonicMean)
+
+import Text.Regex.PCRE.Heavy
+import Data.Either (fromRight)
+import Data.String.Here
+
+import Data.List (intercalate)
+import Text.Printf
+
+-- | the list of available metrics, to be shown to the user or to be
+-- | used for tests
+listOfAvailableMetrics :: [Metric]
+listOfAvailableMetrics = [RMSE,
+                          MSE,
+                          MAE,
+                          SMAPE,
+                          Pearson,
+                          Spearman,
+                          Accuracy,
+                          LogLoss,
+                          Likelihood,
+                          FMeasure 1.0,
+                          FMeasure 2.0,
+                          FMeasure 0.25,
+                          MacroFMeasure 1.0,
+                          MacroFMeasure 2.0,
+                          MacroFMeasure 0.25,
+                          MultiLabelFMeasure 1.0,
+                          MultiLabelFMeasure 2.0,
+                          MultiLabelFMeasure 0.25,
+                          MultiLabelLikelihood,
+                          MAP,
+                          BLEU,
+                          GLEU,
+                          WER,
+                          NMI,
+                          ClippEU,
+                          LogLossHashed defaultLogLossHashedSize,
+                          LikelihoodHashed defaultLogLossHashedSize,
+                          BIOF1,
+                          BIOF1Labels,
+                          TokenAccuracy,
+                          SoftFMeasure 1.0,
+                          SoftFMeasure 2.0,
+                          SoftFMeasure 0.25,
+                          ProbabilisticSoftFMeasure 1.0,
+                          ProbabilisticSoftFMeasure 2.0,
+                          ProbabilisticSoftFMeasure 0.25,
+                          CharMatch]
+
+extraInfo :: EvaluationScheme -> Maybe String
+extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
+extraInfo (EvaluationScheme BLEU [LowerCasing,
+                                  RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
+extraInfo _ = Nothing
+
+-- As we have just started describing metrics (or, to be precise,
+-- evaluation schemes), we need to keep track of which metrics are
+-- described and which are not.
+-- When all the metrics are described, this function should be
+-- removed.
+isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
+isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
+isEvaluationSchemeDescribed _ = False
+
+isMetricDescribed :: Metric -> Bool
+isMetricDescribed (SoftFMeasure _) = True
+isMetricDescribed _ = False
+
+getEvaluationSchemeDescription :: EvaluationScheme -> String
+getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
+
+getMetricDescription :: Metric -> String
+getMetricDescription (SoftFMeasure _) =
+  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
+if a label `foo` is expected for the span 2-9 and this label is returned but with
+the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
+|]
+
+outContents :: Metric -> String
+outContents (SoftFMeasure _) = [hereLit|inwords:1-4
+inwords:1-3 indigits:5
+|]
+
+expectedScore :: EvaluationScheme -> MetricValue
+expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
+  where precision = 0.25
+        recall = 0.75
+
+listOfAvailableEvaluationSchemes :: [EvaluationScheme]
+listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
+                                   ++ [
+                                       EvaluationScheme BLEU [LowerCasing,
+                                                              RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
+                                      ]
+
+allMetricsDescription :: String
+allMetricsDescription =
+  intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes
+
+formatEvaluationSchemeDescription :: EvaluationScheme -> String
+formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
+  where description = if isEvaluationSchemeDescribed scheme
+                      then (getEvaluationSchemeDescription scheme)
+                           ++ "\nExample\n"
+                           ++ (pasteLines "Expected output" "Sample output")
+                           ++ concat (map (\(exp, out) -> pasteLines exp out) $ zip (lines $ testExpectedContents metric)
+                                                                                   (lines $ outContents metric))
+                           ++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
+                      else noDescription
+        noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
+IF YOU WANT TO HAVE IT DESCRIBED|]
+
+pasteLines :: String -> String -> String
+pasteLines a b = printf "%-35s %s\n" a b
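The description above pins down the soft-matching arithmetic: an expected span 2-9 covers 8 positions, the returned span 8-12 covers 5, and their overlap of 2 positions gives recall 2/8 = 1/4 and precision 2/5. expectedScore then folds the hard-coded precision 0.25 and recall 0.75 with weightedHarmonicMean, presumably the usual F-beta. A standalone sanity check of those numbers (fBeta here is an assumption standing in for GEval.PrecisionRecall.weightedHarmonicMean):

-- Sketch: the standard F-beta formula assumed to match weightedHarmonicMean.
fBeta :: Double -> Double -> Double -> Double
fBeta beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

main :: IO ()
main = do
  print (fBeta 1.0 0.25 0.75)  -- 0.375, the Soft-F1 sample score
  print (fBeta 2.0 0.25 0.75)  -- 0.9375 / 1.75, roughly 0.5357 for Soft-F2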
src/GEval/OptionsParser.hs
@@ -27,6 +27,7 @@ import Data.Monoid ((<>))

 import GEval.Core
 import GEval.EvaluationScheme
+import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
 import GEval.Common
 import GEval.CreateChallenge
 import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
 import GEval.Selector
 import GEval.Validation

+import Data.List (intercalate)
+
 import Data.Conduit.SmartSource

 fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
            (flag' Validate
             ( long "validate"
               <> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
+           <|>
+           (flag' ListMetrics
+            ( long "list-metrics"
+              <> help "List all metrics with their descriptions"))
           )

   <*> ((flag' FirstTheWorst
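In optparse-applicative, flag' builds a parser that succeeds only when its flag is actually given, so chaining the special commands with <|> selects whichever one appears on the command line. A stripped-down sketch of the same pattern (a hypothetical standalone parser, not GEval's real one):

import Options.Applicative

-- Hypothetical two-command mirror of the Validate/ListMetrics alternation.
data Cmd = DoValidate | DoListMetrics deriving Show

cmdParser :: Parser Cmd
cmdParser =
      flag' DoValidate    (long "validate"     <> help "Validate a challenge")
  <|> flag' DoListMetrics (long "list-metrics" <> help "List all metrics")

main :: IO ()
main = execParser (info (helper <*> cmdParser) fullDesc) >>= print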
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
 sel Nothing m = m
 sel (Just m) _ = m


 metricReader :: Parser [EvaluationScheme]
 metricReader = many $ option auto -- actually `some` should be used instead of `many`, the problem is that
               ( long "metric" -- --metric might be in the config.txt file...
                 <> short 'm'
                 <> metavar "METRIC"
-                <> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
+                <> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
+                           (\s -> (show s) ++ (case extraInfo s of
+                                                 Just eI -> " (" ++ eI ++ ")"
+                                                 Nothing -> ""))
+                           listOfAvailableEvaluationSchemes)))
+
+
+-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )

 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
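The metric catalogue in the --metric help text is now generated from listOfAvailableEvaluationSchemes instead of being maintained by hand, so newly added metrics appear in --help automatically; the old hand-written enumeration survives only as the trailing comment. The case on extraInfo could equally be written with maybe; a sketch of the same formatting logic in isolation (formatScheme and helpText are illustrative names, not GEval functions):

import Data.List (intercalate)

-- Sketch: show each scheme, appending its extra info in parentheses when present.
formatScheme :: Show s => (s -> Maybe String) -> s -> String
formatScheme extra s = show s ++ maybe "" (\e -> " (" ++ e ++ ")") (extra s)

helpText :: Show s => (s -> Maybe String) -> [s] -> String
helpText extra = ("Metric to be used, e.g.: " ++) . intercalate ", " . map (formatScheme extra)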
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
 runGEval''' (Just Validate) _ _ spec _ _ = do
   validateChallenge spec
   return Nothing
+runGEval''' (Just ListMetrics) _ _ _ _ _ = do
+  listMetrics
+  return Nothing

 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
 to validate a directory CHALLENGE representing a Gonito challenge.
 |]
   exitFailure
+
+listMetrics :: IO ()
+listMetrics = putStrLn allMetricsDescription
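End to end, `geval --list-metrics` now prints allMetricsDescription: every available evaluation scheme followed by its description, with a two-column expected/sample output example and the expected metric value for the schemes that are already described, and the NO DESCRIPTION placeholder for the rest.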
test/Spec.hs
@@ -4,6 +4,7 @@
 import Test.Hspec

 import GEval.Metric
+import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict

 import Data.Conduit.List (consume)

+import System.FilePath
+
 import System.Directory
 import System.Process
 import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
       shapify "a" `shouldBe` (WordShape "a")
       shapify "B5" `shouldBe` (WordShape "A9")
   describe "create challenges and validate them" $ do
-    (flip mapM_) listOfAvailableMetrics $ \metric -> do
-      it (show metric) $ do
+    (flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
+      it (show scheme) $ do
         withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
           let spec = defaultGEvalSpecification {
                 gesExpectedDirectory = Just tempDir,
-                gesMetrics = [EvaluationScheme metric []],
+                gesMetrics = [scheme],
                 gesPrecision = Just 4 }
           createChallenge True tempDir spec
           validationChallenge tempDir spec
+  describe "test sample outputs" $ do
+    (flip mapM_) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
+      it (show scheme) $ do
+        withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
+          let spec = defaultGEvalSpecification {
+                gesExpectedDirectory = Just tempDir,
+                gesMetrics = [scheme] }
+          createChallenge True tempDir spec
+          let outFile = tempDir </> "test-A" </> "out.tsv"
+          writeFile outFile (outContents metric)
+          obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
+          obtainedScore `shouldBe` (expectedScore scheme)
   describe "submit" $ do
     it "current branch" $ do
       runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
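The new "test sample outputs" suite ties the metadata back to the evaluator: for each described scheme it creates a challenge in a temporary directory, writes outContents as the submission, runs the full pipeline via runGEval, and requires the obtained score to equal expectedScore exactly. This keeps the examples shown by --list-metrics from drifting away from what the evaluation code actually computes.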