Start working on --list-metrics options

Filip Gralinski 2019-08-21 23:44:18 +02:00
parent 98b398d34f
commit dab2646798
8 changed files with 180 additions and 49 deletions

View File

@@ -17,6 +17,7 @@ library
   hs-source-dirs:      src
   exposed-modules:     GEval.Core
                      , GEval.Metric
+                     , GEval.MetricsMeta
                      , GEval.EvaluationScheme
                      , GEval.CreateChallenge
                      , GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
                      , silently
                      , vector
                      , statistics
+                     , filepath
   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
   default-language:    Haskell2010

View File

@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
                          | LineByLine | WorstFeatures
                          | Diff FilePath | MostWorseningFeatures FilePath
                          | PrintVersion | JustTokenize | Submit
-                         | Validate
+                         | Validate | ListMetrics

 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest

View File

@@ -1,7 +1,8 @@
 {-# LANGUAGE QuasiQuotes #-}

 module GEval.CreateChallenge
-       (createChallenge)
+       (createChallenge,
+        testExpectedContents)
        where

 import GEval.Metric

View File

@@ -1,5 +1,5 @@
 module GEval.EvaluationScheme
-  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
+  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
   where

 import GEval.Metric

View File

@@ -5,7 +5,6 @@ module GEval.Metric
          MetricOrdering(..),
          defaultLogLossHashedSize,
          getMetricOrdering,
-         listOfAvailableMetrics,
          bestPossibleValue,
          perfectOutLineFromExpectedLine,
          fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
               | SoftFMeasure Double | ProbabilisticSoftFMeasure Double
               deriving (Eq)

--- | the list of available metrics, to be shown to the user or to be
--- | used for tests
-listOfAvailableMetrics :: [Metric]
-listOfAvailableMetrics = [RMSE,
-                          MSE,
-                          MAE,
-                          SMAPE,
-                          Pearson,
-                          Spearman,
-                          Accuracy,
-                          LogLoss,
-                          Likelihood,
-                          FMeasure 1.0,
-                          FMeasure 2.0,
-                          FMeasure 0.25,
-                          MacroFMeasure 1.0,
-                          MacroFMeasure 2.0,
-                          MacroFMeasure 0.25,
-                          MultiLabelFMeasure 1.0,
-                          MultiLabelFMeasure 2.0,
-                          MultiLabelFMeasure 0.25,
-                          MultiLabelLikelihood,
-                          MAP,
-                          BLEU,
-                          GLEU,
-                          WER,
-                          NMI,
-                          ClippEU,
-                          LogLossHashed defaultLogLossHashedSize,
-                          LikelihoodHashed defaultLogLossHashedSize,
-                          BIOF1,
-                          BIOF1Labels,
-                          TokenAccuracy,
-                          SoftFMeasure 1.0,
-                          SoftFMeasure 2.0,
-                          SoftFMeasure 0.25,
-                          ProbabilisticSoftFMeasure 1.0,
-                          ProbabilisticSoftFMeasure 2.0,
-                          ProbabilisticSoftFMeasure 0.25,
-                          CharMatch]
-
 instance Show Metric where
   show RMSE = "RMSE"
   show MSE = "MSE"

src/GEval/MetricsMeta.hs (new file, 133 lines)

View File

@@ -0,0 +1,133 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}

module GEval.MetricsMeta
  (listOfAvailableMetrics,
   listOfAvailableEvaluationSchemes,
   extraInfo,
   isEvaluationSchemeDescribed,
   getEvaluationSchemeDescription,
   outContents,
   expectedScore,
   allMetricsDescription)
  where

import GEval.Common
import GEval.Metric
import GEval.EvaluationScheme
import GEval.CreateChallenge (testExpectedContents)
import GEval.PrecisionRecall (weightedHarmonicMean)

import Text.Regex.PCRE.Heavy
import Data.Either (fromRight)
import Data.String.Here

import Data.List (intercalate)
import Text.Printf
-- | The list of available metrics, to be shown to the user or to be
-- used for tests.
listOfAvailableMetrics :: [Metric]
listOfAvailableMetrics = [RMSE,
                          MSE,
                          MAE,
                          SMAPE,
                          Pearson,
                          Spearman,
                          Accuracy,
                          LogLoss,
                          Likelihood,
                          FMeasure 1.0,
                          FMeasure 2.0,
                          FMeasure 0.25,
                          MacroFMeasure 1.0,
                          MacroFMeasure 2.0,
                          MacroFMeasure 0.25,
                          MultiLabelFMeasure 1.0,
                          MultiLabelFMeasure 2.0,
                          MultiLabelFMeasure 0.25,
                          MultiLabelLikelihood,
                          MAP,
                          BLEU,
                          GLEU,
                          WER,
                          NMI,
                          ClippEU,
                          LogLossHashed defaultLogLossHashedSize,
                          LikelihoodHashed defaultLogLossHashedSize,
                          BIOF1,
                          BIOF1Labels,
                          TokenAccuracy,
                          SoftFMeasure 1.0,
                          SoftFMeasure 2.0,
                          SoftFMeasure 0.25,
                          ProbabilisticSoftFMeasure 1.0,
                          ProbabilisticSoftFMeasure 2.0,
                          ProbabilisticSoftFMeasure 0.25,
                          CharMatch]
extraInfo :: EvaluationScheme -> Maybe String
extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
extraInfo (EvaluationScheme BLEU [LowerCasing,
                                  RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
extraInfo _ = Nothing

-- As we have just started describing metrics (or, to be precise,
-- evaluation schemes), we need to keep track of which metrics are
-- described and which are not. When all the metrics are described,
-- this function should be removed.
isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
isEvaluationSchemeDescribed _ = False

isMetricDescribed :: Metric -> Bool
isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed _ = False

getEvaluationSchemeDescription :: EvaluationScheme -> String
getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric

getMetricDescription :: Metric -> String
getMetricDescription (SoftFMeasure _) =
  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
if a label `foo` is expected for the span 2-9 and this label is returned, but with
the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
|]
outContents :: Metric -> String
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
inwords:1-3 indigits:5
|]

expectedScore :: EvaluationScheme -> MetricValue
expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
  where precision = 0.25
        recall = 0.75

listOfAvailableEvaluationSchemes :: [EvaluationScheme]
listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
                                   ++ [
                                     EvaluationScheme BLEU [LowerCasing,
                                                            RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
                                   ]
allMetricsDescription :: String
allMetricsDescription =
  intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes

formatEvaluationSchemeDescription :: EvaluationScheme -> String
formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
  where description = if isEvaluationSchemeDescribed scheme
                      then (getEvaluationSchemeDescription scheme)
                           ++ "\nExample\n"
                           ++ (pasteLines "Expected output" "Sample output")
                           ++ concat (map (\(exp, out) -> pasteLines exp out)
                                          $ zip (lines $ testExpectedContents metric)
                                                (lines $ outContents metric))
                           ++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
                      else noDescription
        noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
IF YOU WANT TO HAVE IT DESCRIBED|]

pasteLines :: String -> String -> String
pasteLines a b = printf "%-35s %s\n" a b
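
Editorial note (not part of the commit): as a sanity check on the Soft-F description above, the following standalone Haskell sketch re-derives the 1/4 recall and 2/5 precision from the 2-9 vs. 8-12 example and the resulting F1 of roughly 0.31. The span helpers and fBetaScore are local stand-ins for GEval's weightedHarmonicMean, assumed here to be the usual weighted harmonic mean of precision and recall.

-- Standalone sketch; compile with ghc or run with runghc.
overlap :: (Int, Int) -> (Int, Int) -> Int
overlap (a1, b1) (a2, b2) = max 0 (min b1 b2 - max a1 a2 + 1)

spanLength :: (Int, Int) -> Int
spanLength (a, b) = b - a + 1

-- F_beta as the weighted harmonic mean of precision and recall
fBetaScore :: Double -> Double -> Double -> Double
fBetaScore beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

main :: IO ()
main = do
  let expectedSpan = (2, 9)    -- label `foo` expected on span 2-9
      returnedSpan = (8, 12)   -- the system returned span 8-12
      hit = fromIntegral (overlap expectedSpan returnedSpan)
      recall = hit / fromIntegral (spanLength expectedSpan)     -- 2/8 = 0.25
      precision = hit / fromIntegral (spanLength returnedSpan)  -- 2/5 = 0.40
  putStrLn $ "recall = " ++ show recall ++ ", precision = " ++ show precision
  putStrLn $ "Soft-F1 = " ++ show (fBetaScore 1.0 precision recall)  -- ~0.3077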

View File

@@ -27,6 +27,7 @@ import Data.Monoid ((<>))
 import GEval.Core
 import GEval.EvaluationScheme
+import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
 import GEval.Common
 import GEval.CreateChallenge
 import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
 import GEval.Selector
 import GEval.Validation

+import Data.List (intercalate)
+
 import Data.Conduit.SmartSource

 fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
               (flag' Validate
                  ( long "validate"
                    <> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
+              <|>
+              (flag' ListMetrics
+                 ( long "list-metrics"
+                   <> help "List all metrics with their descriptions"))
               )
         <*> ((flag' FirstTheWorst
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
 sel Nothing m = m
 sel (Just m) _ = m

 metricReader :: Parser [EvaluationScheme]
 metricReader = many $ option auto  -- actually `some` should be used instead of `many`, the problem is that
                ( long "metric"     -- --metric might be in the config.txt file...
                  <> short 'm'
                  <> metavar "METRIC"
-                 <> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
+                 <> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
+                            (\s -> (show s) ++ (case extraInfo s of
+                                                  Just eI -> " (" ++ eI ++ ")"
+                                                  Nothing -> ""))
+                            listOfAvailableEvaluationSchemes)))
+
+-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )

 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
 runGEval''' (Just Validate) _ _ spec _ _ = do
   validateChallenge spec
   return Nothing
+runGEval''' (Just ListMetrics) _ _ _ _ _ = do
+  listMetrics
+  return Nothing

 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
    to validate a directory CHALLENGE representing a Gonito challenge.
|]
  exitFailure
+
+listMetrics :: IO ()
+listMetrics = putStrLn allMetricsDescription
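
Editorial note (not part of the commit): the rewritten --metric help string is assembled by mapping show and extraInfo over listOfAvailableEvaluationSchemes, appending the extraInfo note in parentheses when present. A small illustrative Haskell sketch of that assembly follows; renderEntry and the hard-coded (name, note) pairs are stand-ins, not GEval code.

import Data.List (intercalate)

-- Illustrative only: mirrors the help-text construction in the hunk above,
-- using plain (name, note) pairs instead of real EvaluationScheme values.
renderEntry :: (String, Maybe String) -> String
renderEntry (name, Just note) = name ++ " (" ++ note ++ ")"
renderEntry (name, Nothing)   = name

main :: IO ()
main = putStrLn $ "Metric to be used, e.g.: " ++ intercalate ", "
         (map renderEntry
              [ ("RMSE", Nothing)
              , ("GLEU", Just "\"Google GLEU\" not the grammar correction metric") ])
-- prints: Metric to be used, e.g.: RMSE, GLEU ("Google GLEU" not the grammar correction metric)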

View File

@@ -4,6 +4,7 @@
 import Test.Hspec

 import GEval.Metric
+import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict
 import Data.Conduit.List (consume)

+import System.FilePath
+
 import System.Directory
 import System.Process
 import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
       shapify "a" `shouldBe` (WordShape "a")
       shapify "B5" `shouldBe` (WordShape "A9")
   describe "create challenges and validate them" $ do
-    (flip mapM_) listOfAvailableMetrics $ \metric -> do
-      it (show metric) $ do
+    (flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
+      it (show scheme) $ do
         withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
           let spec = defaultGEvalSpecification {
                 gesExpectedDirectory = Just tempDir,
-                gesMetrics = [EvaluationScheme metric []],
+                gesMetrics = [scheme],
                 gesPrecision = Just 4 }
           createChallenge True tempDir spec
           validationChallenge tempDir spec
+  describe "test sample outputs" $ do
+    (flip mapM_) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
+      it (show scheme) $ do
+        withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
+          let spec = defaultGEvalSpecification {
+                gesExpectedDirectory = Just tempDir,
+                gesMetrics = [scheme] }
+          createChallenge True tempDir spec
+          let outFile = tempDir </> "test-A" </> "out.tsv"
+          writeFile outFile (outContents metric)
+          obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
+          obtainedScore `shouldBe` (expectedScore scheme)
   describe "submit" $ do
     it "current branch" $ do
       runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"