Start working on --list-metrics options

Filip Gralinski 2019-08-21 23:44:18 +02:00
parent 98b398d34f
commit dab2646798
8 changed files with 180 additions and 49 deletions


@@ -17,6 +17,7 @@ library
hs-source-dirs: src
exposed-modules: GEval.Core
, GEval.Metric
, GEval.MetricsMeta
, GEval.EvaluationScheme
, GEval.CreateChallenge
, GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
, silently
, vector
, statistics
, filepath
ghc-options: -threaded -rtsopts -with-rtsopts=-N
default-language: Haskell2010


@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
| LineByLine | WorstFeatures
| Diff FilePath | MostWorseningFeatures FilePath
| PrintVersion | JustTokenize | Submit
| Validate
| Validate | ListMetrics
data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest


@@ -1,7 +1,8 @@
{-# LANGUAGE QuasiQuotes #-}
module GEval.CreateChallenge
(createChallenge)
(createChallenge,
testExpectedContents)
where
import GEval.Metric


@@ -1,5 +1,5 @@
module GEval.EvaluationScheme
(EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
(EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
where
import GEval.Metric


@@ -5,7 +5,6 @@ module GEval.Metric
MetricOrdering(..),
defaultLogLossHashedSize,
getMetricOrdering,
listOfAvailableMetrics,
bestPossibleValue,
perfectOutLineFromExpectedLine,
fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
| SoftFMeasure Double | ProbabilisticSoftFMeasure Double
deriving (Eq)
-- | the list of available metrics, to be shown to the user or to be
-- | used for tests
listOfAvailableMetrics :: [Metric]
listOfAvailableMetrics = [RMSE,
MSE,
MAE,
SMAPE,
Pearson,
Spearman,
Accuracy,
LogLoss,
Likelihood,
FMeasure 1.0,
FMeasure 2.0,
FMeasure 0.25,
MacroFMeasure 1.0,
MacroFMeasure 2.0,
MacroFMeasure 0.25,
MultiLabelFMeasure 1.0,
MultiLabelFMeasure 2.0,
MultiLabelFMeasure 0.25,
MultiLabelLikelihood,
MAP,
BLEU,
GLEU,
WER,
NMI,
ClippEU,
LogLossHashed defaultLogLossHashedSize,
LikelihoodHashed defaultLogLossHashedSize,
BIOF1,
BIOF1Labels,
TokenAccuracy,
SoftFMeasure 1.0,
SoftFMeasure 2.0,
SoftFMeasure 0.25,
ProbabilisticSoftFMeasure 1.0,
ProbabilisticSoftFMeasure 2.0,
ProbabilisticSoftFMeasure 0.25,
CharMatch]
instance Show Metric where
show RMSE = "RMSE"
show MSE = "MSE"

src/GEval/MetricsMeta.hs (new file, 133 lines)

@@ -0,0 +1,133 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module GEval.MetricsMeta
(listOfAvailableMetrics,
listOfAvailableEvaluationSchemes,
extraInfo,
isEvaluationSchemeDescribed,
getEvaluationSchemeDescription,
outContents,
expectedScore,
allMetricsDescription)
where
import GEval.Common
import GEval.Metric
import GEval.EvaluationScheme
import GEval.CreateChallenge (testExpectedContents)
import GEval.PrecisionRecall (weightedHarmonicMean)
import Text.Regex.PCRE.Heavy
import Data.Either (fromRight)
import Data.String.Here
import Data.List (intercalate)
import Text.Printf
-- | the list of available metrics, to be shown to the user or to be
-- | used for tests
listOfAvailableMetrics :: [Metric]
listOfAvailableMetrics = [RMSE,
MSE,
MAE,
SMAPE,
Pearson,
Spearman,
Accuracy,
LogLoss,
Likelihood,
FMeasure 1.0,
FMeasure 2.0,
FMeasure 0.25,
MacroFMeasure 1.0,
MacroFMeasure 2.0,
MacroFMeasure 0.25,
MultiLabelFMeasure 1.0,
MultiLabelFMeasure 2.0,
MultiLabelFMeasure 0.25,
MultiLabelLikelihood,
MAP,
BLEU,
GLEU,
WER,
NMI,
ClippEU,
LogLossHashed defaultLogLossHashedSize,
LikelihoodHashed defaultLogLossHashedSize,
BIOF1,
BIOF1Labels,
TokenAccuracy,
SoftFMeasure 1.0,
SoftFMeasure 2.0,
SoftFMeasure 0.25,
ProbabilisticSoftFMeasure 1.0,
ProbabilisticSoftFMeasure 2.0,
ProbabilisticSoftFMeasure 0.25,
CharMatch]
extraInfo :: EvaluationScheme -> Maybe String
extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
extraInfo (EvaluationScheme BLEU [LowerCasing,
RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
extraInfo _ = Nothing
-- As we have just started describing metrics (or, to be precise,
-- evaluation schemes), we need to keep track of which metrics are
-- described and which are not.
-- When all the metrics are described, this function should be
-- removed.
isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
isEvaluationSchemeDescribed _ = False
isMetricDescribed :: Metric -> Bool
isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed _ = False
getEvaluationSchemeDescription :: EvaluationScheme -> String
getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
getMetricDescription :: Metric -> String
getMetricDescription (SoftFMeasure _) =
[i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
if a label `foo` is expected for the span 2-9 and this label is returned but with
the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
|]
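-- An illustrative sketch, not part of this commit: the fractions in the
-- description above come from interval overlap. The expected span 2-9 covers
-- 8 positions, the returned span 8-12 covers 5, and they overlap on 2, hence
-- recall 2/8 = 1/4 and precision 2/5. Combining them with the weighted
-- harmonic mean (the same helper used by expectedScore below, assumed to
-- compute the usual F_beta combination) gives roughly 0.31 for beta = 1.
-- `softF1Example` is a hypothetical name, not a GEval function.
softF1Example :: MetricValue
softF1Example = weightedHarmonicMean 1.0 precision recall
  where precision = 2 / 5
        recall = 1 / 4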
outContents :: Metric -> String
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
inwords:1-3 indigits:5
|]
expectedScore :: EvaluationScheme -> MetricValue
expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
where precision = 0.25
recall = 0.75
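-- Added commentary, not in the original commit: precision is hard-coded to
-- 0.25 and recall to 0.75 above, so for beta = 1 the weighted harmonic mean
-- is 2 * 0.25 * 0.75 / (0.25 + 0.75) = 0.375, the value the Spec test
-- expects the sample output to score under Soft-F1.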
listOfAvailableEvaluationSchemes :: [EvaluationScheme]
listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
++ [
EvaluationScheme BLEU [LowerCasing,
RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
]
allMetricsDescription :: String
allMetricsDescription =
intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes
formatEvaluationSchemeDescription :: EvaluationScheme -> String
formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
where description = if isEvaluationSchemeDescribed scheme
then (getEvaluationSchemeDescription scheme)
++ "\nExample\n"
++ (pasteLines "Expected output" "Sample output")
++ concat (map (\(exp, out) -> pasteLines exp out) $ zip (lines $ testExpectedContents metric)
(lines $ outContents metric))
++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
else noDescription
noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
IF YOU WANT TO HAVE IT DESCRIBED|]
pasteLines :: String -> String -> String
pasteLines a b = printf "%-35s %s\n" a b
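-- To illustrate (added commentary, not part of this commit): for a described
-- scheme such as Soft-F1, the text assembled by formatEvaluationSchemeDescription
-- comes out roughly as follows (spacing approximate; the expected-output column
-- comes from testExpectedContents, which is not shown in this diff):
--
--   Soft-F1
--   "Soft" F-measure on intervals, i.e. partial "hits" are considered. ...
--
--   Example
--   Expected output                     Sample output
--   <expected line 1>                   inwords:1-4
--   <expected line 2>                   inwords:1-3 indigits:5
--
--   Metric value: 0.3750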


@@ -27,6 +27,7 @@ import Data.Monoid ((<>))
import GEval.Core
import GEval.EvaluationScheme
import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
import GEval.Common
import GEval.CreateChallenge
import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
import GEval.Selector
import GEval.Validation
import Data.List (intercalate)
import Data.Conduit.SmartSource
fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
(flag' Validate
( long "validate"
<> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
<|>
(flag' ListMetrics
( long "list-metrics"
<> help "List all metrics with their descriptions"))
)
<*> ((flag' FirstTheWorst
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
sel Nothing m = m
sel (Just m) _ = m
metricReader :: Parser [EvaluationScheme]
metricReader = many $ option auto -- actually `some` should be used instead of `many`, the problem is that
( long "metric" -- --metric might be in the config.txt file...
<> short 'm'
<> metavar "METRIC"
<> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
<> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
(\s -> (show s) ++ (case extraInfo s of
Just eI -> " (" ++ eI ++ ")"
Nothing -> ""))
listOfAvailableEvaluationSchemes)))
-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
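-- Added for illustration (approximate; the exact form depends on the Show
-- instances and on extraInfo): the help string built above starts roughly like
--   Metric to be used, e.g.: RMSE, MSE, MAE, SMAPE, Pearson, Spearman,
--   Accuracy, LogLoss, ..., GLEU ("Google GLEU" not the grammar correction
--   metric), ...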
altMetricReader :: Parser (Maybe EvaluationScheme)
altMetricReader = optional $ option auto
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
runGEval''' (Just Validate) _ _ spec _ _ = do
validateChallenge spec
return Nothing
runGEval''' (Just ListMetrics) _ _ _ _ _ = do
listMetrics
return Nothing
getGraphFilename :: Int -> FilePath -> FilePath
getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
to validate a directory CHALLENGE representing a Gonito challenge.
|]
exitFailure
listMetrics :: IO ()
listMetrics = putStrLn allMetricsDescription
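-- Usage note (added commentary, not in the original commit): with the flag
-- defined above, the metric descriptions can be dumped from the command line,
-- assuming the executable is called geval:
--
--   geval --list-metrics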


@@ -4,6 +4,7 @@
import Test.Hspec
import GEval.Metric
import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
import GEval.Core
import GEval.Common
import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict
import Data.Conduit.List (consume)
import System.FilePath
import System.Directory
import System.Process
import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
shapify "a" `shouldBe` (WordShape "a")
shapify "B5" `shouldBe` (WordShape "A9")
describe "create challenges and validate them" $ do
(flip mapM_) listOfAvailableMetrics $ \metric -> do
it (show metric) $ do
(flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
it (show scheme) $ do
withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
let spec = defaultGEvalSpecification {
gesExpectedDirectory = Just tempDir,
gesMetrics = [EvaluationScheme metric []],
gesMetrics = [scheme],
gesPrecision = Just 4 }
createChallenge True tempDir spec
validationChallenge tempDir spec
describe "test sample outputs" $ do
(flip mapM_ ) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
it (show scheme) $ do
withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
let spec = defaultGEvalSpecification {
gesExpectedDirectory = Just tempDir,
gesMetrics = [scheme] }
createChallenge True tempDir spec
let outFile = tempDir </> "test-A" </> "out.tsv"
writeFile outFile (outContents metric)
obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
obtainedScore `shouldBe` (expectedScore scheme)
describe "submit" $ do
it "current branch" $ do
runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"