Start working on --list-metrics options

Filip Gralinski 2019-08-21 23:44:18 +02:00
parent 98b398d34f
commit dab2646798
8 changed files with 180 additions and 49 deletions

View File

@@ -17,6 +17,7 @@ library
   hs-source-dirs:      src
   exposed-modules:     GEval.Core
                      , GEval.Metric
+                     , GEval.MetricsMeta
                      , GEval.EvaluationScheme
                      , GEval.CreateChallenge
                      , GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
                      , silently
                      , vector
                      , statistics
+                     , filepath
   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
   default-language:    Haskell2010

View File

@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
                          | LineByLine | WorstFeatures
                          | Diff FilePath | MostWorseningFeatures FilePath
                          | PrintVersion | JustTokenize | Submit
-                         | Validate
+                         | Validate | ListMetrics

 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest

View File

@@ -1,7 +1,8 @@
 {-# LANGUAGE QuasiQuotes #-}

 module GEval.CreateChallenge
-       (createChallenge)
+       (createChallenge,
+        testExpectedContents)
        where

 import GEval.Metric

View File

@@ -1,5 +1,5 @@
 module GEval.EvaluationScheme
-  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
+  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
   where

 import GEval.Metric

View File

@@ -5,7 +5,6 @@ module GEval.Metric
          MetricOrdering(..),
          defaultLogLossHashedSize,
          getMetricOrdering,
-         listOfAvailableMetrics,
          bestPossibleValue,
          perfectOutLineFromExpectedLine,
          fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
               | SoftFMeasure Double | ProbabilisticSoftFMeasure Double
               deriving (Eq)

--- | the list of available metrics, to be shown to the user or to be
--- | used for tests
-listOfAvailableMetrics :: [Metric]
-listOfAvailableMetrics = [RMSE,
-                          MSE,
-                          MAE,
-                          SMAPE,
-                          Pearson,
-                          Spearman,
-                          Accuracy,
-                          LogLoss,
-                          Likelihood,
-                          FMeasure 1.0,
-                          FMeasure 2.0,
-                          FMeasure 0.25,
-                          MacroFMeasure 1.0,
-                          MacroFMeasure 2.0,
-                          MacroFMeasure 0.25,
-                          MultiLabelFMeasure 1.0,
-                          MultiLabelFMeasure 2.0,
-                          MultiLabelFMeasure 0.25,
-                          MultiLabelLikelihood,
-                          MAP,
-                          BLEU,
-                          GLEU,
-                          WER,
-                          NMI,
-                          ClippEU,
-                          LogLossHashed defaultLogLossHashedSize,
-                          LikelihoodHashed defaultLogLossHashedSize,
-                          BIOF1,
-                          BIOF1Labels,
-                          TokenAccuracy,
-                          SoftFMeasure 1.0,
-                          SoftFMeasure 2.0,
-                          SoftFMeasure 0.25,
-                          ProbabilisticSoftFMeasure 1.0,
-                          ProbabilisticSoftFMeasure 2.0,
-                          ProbabilisticSoftFMeasure 0.25,
-                          CharMatch]
-
 instance Show Metric where
   show RMSE = "RMSE"
   show MSE = "MSE"

src/GEval/MetricsMeta.hs (new file, 133 lines)

View File

@@ -0,0 +1,133 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}

module GEval.MetricsMeta
  (listOfAvailableMetrics,
   listOfAvailableEvaluationSchemes,
   extraInfo,
   isEvaluationSchemeDescribed,
   getEvaluationSchemeDescription,
   outContents,
   expectedScore,
   allMetricsDescription)
  where

import GEval.Common
import GEval.Metric
import GEval.EvaluationScheme
import GEval.CreateChallenge (testExpectedContents)
import GEval.PrecisionRecall (weightedHarmonicMean)

import Text.Regex.PCRE.Heavy
import Data.Either (fromRight)
import Data.String.Here

import Data.List (intercalate)
import Text.Printf
-- | The list of available metrics, to be shown to the user or to be
-- used for tests.
listOfAvailableMetrics :: [Metric]
listOfAvailableMetrics = [RMSE,
                          MSE,
                          MAE,
                          SMAPE,
                          Pearson,
                          Spearman,
                          Accuracy,
                          LogLoss,
                          Likelihood,
                          FMeasure 1.0,
                          FMeasure 2.0,
                          FMeasure 0.25,
                          MacroFMeasure 1.0,
                          MacroFMeasure 2.0,
                          MacroFMeasure 0.25,
                          MultiLabelFMeasure 1.0,
                          MultiLabelFMeasure 2.0,
                          MultiLabelFMeasure 0.25,
                          MultiLabelLikelihood,
                          MAP,
                          BLEU,
                          GLEU,
                          WER,
                          NMI,
                          ClippEU,
                          LogLossHashed defaultLogLossHashedSize,
                          LikelihoodHashed defaultLogLossHashedSize,
                          BIOF1,
                          BIOF1Labels,
                          TokenAccuracy,
                          SoftFMeasure 1.0,
                          SoftFMeasure 2.0,
                          SoftFMeasure 0.25,
                          ProbabilisticSoftFMeasure 1.0,
                          ProbabilisticSoftFMeasure 2.0,
                          ProbabilisticSoftFMeasure 0.25,
                          CharMatch]
extraInfo :: EvaluationScheme -> Maybe String
extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
extraInfo (EvaluationScheme BLEU [LowerCasing,
                                  RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
extraInfo _ = Nothing

-- As we have just started describing metrics (or, to be precise,
-- evaluation schemes), we need to keep track of which metrics are
-- described and which are not. When all the metrics are described,
-- this function should be removed.
isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
isEvaluationSchemeDescribed _ = False

isMetricDescribed :: Metric -> Bool
isMetricDescribed (SoftFMeasure _) = True
isMetricDescribed _ = False

getEvaluationSchemeDescription :: EvaluationScheme -> String
getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric

getMetricDescription :: Metric -> String
getMetricDescription (SoftFMeasure _) =
  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
if a label `foo` is expected for the span 2-9 and this label is returned, but with
the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
|]
outContents :: Metric -> String
outContents (SoftFMeasure _) = [hereLit|inwords:1-4
inwords:1-3 indigits:5
|]

expectedScore :: EvaluationScheme -> MetricValue
expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
  where precision = 0.25
        recall = 0.75

listOfAvailableEvaluationSchemes :: [EvaluationScheme]
listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
                                   ++ [
                                     EvaluationScheme BLEU [LowerCasing,
                                                            RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
                                   ]
allMetricsDescription :: String
allMetricsDescription =
  intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes

formatEvaluationSchemeDescription :: EvaluationScheme -> String
formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
  where description = if isEvaluationSchemeDescribed scheme
                      then (getEvaluationSchemeDescription scheme)
                           ++ "\nExample\n"
                           ++ (pasteLines "Expected output" "Sample output")
                           ++ concat (map (\(exp, out) -> pasteLines exp out)
                                          $ zip (lines $ testExpectedContents metric)
                                                (lines $ outContents metric))
                           ++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
                      else noDescription
        noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
IF YOU WANT TO HAVE IT DESCRIBED|]

pasteLines :: String -> String -> String
pasteLines a b = printf "%-35s %s\n" a b
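
Editorial note (not part of the commit): as a sanity check on the Soft-F description above, the following standalone Haskell sketch re-derives the 1/4 recall and 2/5 precision from the 2-9 vs. 8-12 example and the resulting F1 of roughly 0.31. The span helpers and fBetaScore are local stand-ins for GEval's weightedHarmonicMean, assumed here to be the usual weighted harmonic mean of precision and recall.

-- Standalone sketch; compile with ghc or run with runghc.
overlap :: (Int, Int) -> (Int, Int) -> Int
overlap (a1, b1) (a2, b2) = max 0 (min b1 b2 - max a1 a2 + 1)

spanLength :: (Int, Int) -> Int
spanLength (a, b) = b - a + 1

-- F_beta as the weighted harmonic mean of precision and recall
fBetaScore :: Double -> Double -> Double -> Double
fBetaScore beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

main :: IO ()
main = do
  let expectedSpan = (2, 9)    -- label `foo` expected on span 2-9
      returnedSpan = (8, 12)   -- the system returned span 8-12
      hit = fromIntegral (overlap expectedSpan returnedSpan)
      recall = hit / fromIntegral (spanLength expectedSpan)     -- 2/8 = 0.25
      precision = hit / fromIntegral (spanLength returnedSpan)  -- 2/5 = 0.40
  putStrLn $ "recall = " ++ show recall ++ ", precision = " ++ show precision
  putStrLn $ "Soft-F1 = " ++ show (fBetaScore 1.0 precision recall)  -- ~0.3077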

View File

@@ -27,6 +27,7 @@ import Data.Monoid ((<>))
 import GEval.Core
 import GEval.EvaluationScheme
+import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
 import GEval.Common
 import GEval.CreateChallenge
 import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
 import GEval.Selector
 import GEval.Validation

+import Data.List (intercalate)
+
 import Data.Conduit.SmartSource

 fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
               (flag' Validate
                  ( long "validate"
                    <> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
+              <|>
+              (flag' ListMetrics
+                 ( long "list-metrics"
+                   <> help "List all metrics with their descriptions"))
               )
         <*> ((flag' FirstTheWorst
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
 sel Nothing m = m
 sel (Just m) _ = m

 metricReader :: Parser [EvaluationScheme]
 metricReader = many $ option auto  -- actually `some` should be used instead of `many`, the problem is that
                ( long "metric"     -- --metric might be in the config.txt file...
                  <> short 'm'
                  <> metavar "METRIC"
-                 <> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
+                 <> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
+                            (\s -> (show s) ++ (case extraInfo s of
+                                                  Just eI -> " (" ++ eI ++ ")"
+                                                  Nothing -> ""))
+                            listOfAvailableEvaluationSchemes)))
+
+-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )

 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
 runGEval''' (Just Validate) _ _ spec _ _ = do
   validateChallenge spec
   return Nothing
+runGEval''' (Just ListMetrics) _ _ _ _ _ = do
+  listMetrics
+  return Nothing

 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
    to validate a directory CHALLENGE representing a Gonito challenge.
|]
  exitFailure
+
+listMetrics :: IO ()
+listMetrics = putStrLn allMetricsDescription
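
Editorial note (not part of the commit): the rewritten --metric help string is assembled by mapping show and extraInfo over listOfAvailableEvaluationSchemes, appending the extraInfo note in parentheses when present. A small illustrative Haskell sketch of that assembly follows; renderEntry and the hard-coded (name, note) pairs are stand-ins, not GEval code.

import Data.List (intercalate)

-- Illustrative only: mirrors the help-text construction in the hunk above,
-- using plain (name, note) pairs instead of real EvaluationScheme values.
renderEntry :: (String, Maybe String) -> String
renderEntry (name, Just note) = name ++ " (" ++ note ++ ")"
renderEntry (name, Nothing)   = name

main :: IO ()
main = putStrLn $ "Metric to be used, e.g.: " ++ intercalate ", "
         (map renderEntry
              [ ("RMSE", Nothing)
              , ("GLEU", Just "\"Google GLEU\" not the grammar correction metric") ])
-- prints: Metric to be used, e.g.: RMSE, GLEU ("Google GLEU" not the grammar correction metric)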

View File

@@ -4,6 +4,7 @@
 import Test.Hspec

 import GEval.Metric
+import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict
 import Data.Conduit.List (consume)

+import System.FilePath
+
 import System.Directory
 import System.Process
 import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
       shapify "a" `shouldBe` (WordShape "a")
       shapify "B5" `shouldBe` (WordShape "A9")
   describe "create challenges and validate them" $ do
-    (flip mapM_) listOfAvailableMetrics $ \metric -> do
-      it (show metric) $ do
+    (flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
+      it (show scheme) $ do
         withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
           let spec = defaultGEvalSpecification {
                 gesExpectedDirectory = Just tempDir,
-                gesMetrics = [EvaluationScheme metric []],
+                gesMetrics = [scheme],
                 gesPrecision = Just 4 }
           createChallenge True tempDir spec
           validationChallenge tempDir spec
+  describe "test sample outputs" $ do
+    (flip mapM_) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
+      it (show scheme) $ do
+        withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
+          let spec = defaultGEvalSpecification {
+                gesExpectedDirectory = Just tempDir,
+                gesMetrics = [scheme] }
+          createChallenge True tempDir spec
+          let outFile = tempDir </> "test-A" </> "out.tsv"
+          writeFile outFile (outContents metric)
+          obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
+          obtainedScore `shouldBe` (expectedScore scheme)
   describe "submit" $ do
     it "current branch" $ do
       runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"