Start working on --list-metrics options

parent 98b398d34f
commit dab2646798
geval.cabal
@@ -17,6 +17,7 @@ library
   hs-source-dirs: src
   exposed-modules: GEval.Core
                  , GEval.Metric
+                 , GEval.MetricsMeta
                  , GEval.EvaluationScheme
                  , GEval.CreateChallenge
                  , GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
                , silently
                , vector
                , statistics
+               , filepath
   ghc-options: -threaded -rtsopts -with-rtsopts=-N
   default-language: Haskell2010

src/GEval/Core.hs
@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
                          | LineByLine | WorstFeatures
                          | Diff FilePath | MostWorseningFeatures FilePath
                          | PrintVersion | JustTokenize | Submit
-                         | Validate
+                         | Validate | ListMetrics

 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest

src/GEval/CreateChallenge.hs
@@ -1,7 +1,8 @@
 {-# LANGUAGE QuasiQuotes #-}

 module GEval.CreateChallenge
-       (createChallenge)
+       (createChallenge,
+        testExpectedContents)
        where

 import GEval.Metric
src/GEval/EvaluationScheme.hs
@@ -1,5 +1,5 @@
 module GEval.EvaluationScheme
-  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
+  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
   where

 import GEval.Metric
src/GEval/Metric.hs
@@ -5,7 +5,6 @@ module GEval.Metric
          MetricOrdering(..),
          defaultLogLossHashedSize,
          getMetricOrdering,
-         listOfAvailableMetrics,
          bestPossibleValue,
          perfectOutLineFromExpectedLine,
          fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
             | SoftFMeasure Double | ProbabilisticSoftFMeasure Double
               deriving (Eq)

--- | the list of available metrics, to be shown to the user or to be
--- | used for tests
-listOfAvailableMetrics :: [Metric]
-listOfAvailableMetrics = [RMSE,
-                          MSE,
-                          MAE,
-                          SMAPE,
-                          Pearson,
-                          Spearman,
-                          Accuracy,
-                          LogLoss,
-                          Likelihood,
-                          FMeasure 1.0,
-                          FMeasure 2.0,
-                          FMeasure 0.25,
-                          MacroFMeasure 1.0,
-                          MacroFMeasure 2.0,
-                          MacroFMeasure 0.25,
-                          MultiLabelFMeasure 1.0,
-                          MultiLabelFMeasure 2.0,
-                          MultiLabelFMeasure 0.25,
-                          MultiLabelLikelihood,
-                          MAP,
-                          BLEU,
-                          GLEU,
-                          WER,
-                          NMI,
-                          ClippEU,
-                          LogLossHashed defaultLogLossHashedSize,
-                          LikelihoodHashed defaultLogLossHashedSize,
-                          BIOF1,
-                          BIOF1Labels,
-                          TokenAccuracy,
-                          SoftFMeasure 1.0,
-                          SoftFMeasure 2.0,
-                          SoftFMeasure 0.25,
-                          ProbabilisticSoftFMeasure 1.0,
-                          ProbabilisticSoftFMeasure 2.0,
-                          ProbabilisticSoftFMeasure 0.25,
-                          CharMatch]
-
 instance Show Metric where
   show RMSE = "RMSE"
   show MSE = "MSE"
src/GEval/MetricsMeta.hs (new file, 133 lines)
@@ -0,0 +1,133 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module GEval.MetricsMeta
+  (listOfAvailableMetrics,
+   listOfAvailableEvaluationSchemes,
+   extraInfo,
+   isEvaluationSchemeDescribed,
+   getEvaluationSchemeDescription,
+   outContents,
+   expectedScore,
+   allMetricsDescription)
+  where
+
+import GEval.Common
+import GEval.Metric
+import GEval.EvaluationScheme
+import GEval.CreateChallenge (testExpectedContents)
+import GEval.PrecisionRecall (weightedHarmonicMean)
+
+import Text.Regex.PCRE.Heavy
+import Data.Either (fromRight)
+import Data.String.Here
+
+import Data.List (intercalate)
+import Text.Printf
+
+-- | the list of available metrics, to be shown to the user or to be
+-- | used for tests
+listOfAvailableMetrics :: [Metric]
+listOfAvailableMetrics = [RMSE,
+                          MSE,
+                          MAE,
+                          SMAPE,
+                          Pearson,
+                          Spearman,
+                          Accuracy,
+                          LogLoss,
+                          Likelihood,
+                          FMeasure 1.0,
+                          FMeasure 2.0,
+                          FMeasure 0.25,
+                          MacroFMeasure 1.0,
+                          MacroFMeasure 2.0,
+                          MacroFMeasure 0.25,
+                          MultiLabelFMeasure 1.0,
+                          MultiLabelFMeasure 2.0,
+                          MultiLabelFMeasure 0.25,
+                          MultiLabelLikelihood,
+                          MAP,
+                          BLEU,
+                          GLEU,
+                          WER,
+                          NMI,
+                          ClippEU,
+                          LogLossHashed defaultLogLossHashedSize,
+                          LikelihoodHashed defaultLogLossHashedSize,
+                          BIOF1,
+                          BIOF1Labels,
+                          TokenAccuracy,
+                          SoftFMeasure 1.0,
+                          SoftFMeasure 2.0,
+                          SoftFMeasure 0.25,
+                          ProbabilisticSoftFMeasure 1.0,
+                          ProbabilisticSoftFMeasure 2.0,
+                          ProbabilisticSoftFMeasure 0.25,
+                          CharMatch]
+
+extraInfo :: EvaluationScheme -> Maybe String
+extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
+extraInfo (EvaluationScheme BLEU [LowerCasing,
+                                  RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
+extraInfo _ = Nothing
+
+-- As we have just started describing metrics (or, to be precise,
+-- evaluation schemes), we need to keep track of which metrics are
+-- described and which are not.
+-- When all the metrics are described, this function should be
+-- removed.
+isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
+isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
+isEvaluationSchemeDescribed _ = False
+
+isMetricDescribed :: Metric -> Bool
+isMetricDescribed (SoftFMeasure _) = True
+isMetricDescribed _ = False
+
+getEvaluationSchemeDescription :: EvaluationScheme -> String
+getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
+
+getMetricDescription :: Metric -> String
+getMetricDescription (SoftFMeasure _) =
+  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
+if a label `foo` is expected for the span 2-9 and this label is returned but with
+the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
+|]
+
+outContents :: Metric -> String
+outContents (SoftFMeasure _) = [hereLit|inwords:1-4
+inwords:1-3 indigits:5
+|]
+
+expectedScore :: EvaluationScheme -> MetricValue
+expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
+  where precision = 0.25
+        recall = 0.75
+
+listOfAvailableEvaluationSchemes :: [EvaluationScheme]
+listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
+                                   ++ [
+                                       EvaluationScheme BLEU [LowerCasing,
+                                                              RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
+                                      ]
+
+allMetricsDescription :: String
+allMetricsDescription =
+  intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes
+
+formatEvaluationSchemeDescription :: EvaluationScheme -> String
+formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
+  where description = if isEvaluationSchemeDescribed scheme
+                      then (getEvaluationSchemeDescription scheme)
+                           ++ "\nExample\n"
+                           ++ (pasteLines "Expected output" "Sample output")
+                           ++ concat (map (\(exp, out) -> pasteLines exp out) $ zip (lines $ testExpectedContents metric)
+                                                                                   (lines $ outContents metric))
+                           ++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
+                      else noDescription
+        noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
+IF YOU WANT TO HAVE IT DESCRIBED|]
+
+pasteLines :: String -> String -> String
+pasteLines a b = printf "%-35s %s\n" a b
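The description above pins down the soft-matching arithmetic: an expected span 2-9 covers 8 positions, the returned span 8-12 covers 5, and their overlap of 2 positions gives recall 2/8 = 1/4 and precision 2/5. expectedScore then folds the hard-coded precision 0.25 and recall 0.75 with weightedHarmonicMean, presumably the usual F-beta. A standalone sanity check of those numbers (fBeta here is an assumption standing in for GEval.PrecisionRecall.weightedHarmonicMean):

-- Sketch: the standard F-beta formula assumed to match weightedHarmonicMean.
fBeta :: Double -> Double -> Double -> Double
fBeta beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

main :: IO ()
main = do
  print (fBeta 1.0 0.25 0.75)  -- 0.375, the Soft-F1 sample score
  print (fBeta 2.0 0.25 0.75)  -- 0.9375 / 1.75, roughly 0.5357 for Soft-F2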
src/GEval/OptionsParser.hs
@@ -27,6 +27,7 @@ import Data.Monoid ((<>))

 import GEval.Core
 import GEval.EvaluationScheme
+import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
 import GEval.Common
 import GEval.CreateChallenge
 import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
 import GEval.Selector
 import GEval.Validation

+import Data.List (intercalate)
+
 import Data.Conduit.SmartSource

 fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
            (flag' Validate
             ( long "validate"
               <> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
+           <|>
+           (flag' ListMetrics
+            ( long "list-metrics"
+              <> help "List all metrics with their descriptions"))
           )

   <*> ((flag' FirstTheWorst
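In optparse-applicative, flag' builds a parser that succeeds only when its flag is actually given, so chaining the special commands with <|> selects whichever one appears on the command line. A stripped-down sketch of the same pattern (a hypothetical standalone parser, not GEval's real one):

import Options.Applicative

-- Hypothetical two-command mirror of the Validate/ListMetrics alternation.
data Cmd = DoValidate | DoListMetrics deriving Show

cmdParser :: Parser Cmd
cmdParser =
      flag' DoValidate    (long "validate"     <> help "Validate a challenge")
  <|> flag' DoListMetrics (long "list-metrics" <> help "List all metrics")

main :: IO ()
main = execParser (info (helper <*> cmdParser) fullDesc) >>= print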
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
 sel Nothing m = m
 sel (Just m) _ = m


 metricReader :: Parser [EvaluationScheme]
 metricReader = many $ option auto -- actually `some` should be used instead of `many`, the problem is that
               ( long "metric" -- --metric might be in the config.txt file...
                 <> short 'm'
                 <> metavar "METRIC"
-                <> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
+                <> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
+                           (\s -> (show s) ++ (case extraInfo s of
+                                                 Just eI -> " (" ++ eI ++ ")"
+                                                 Nothing -> ""))
+                           listOfAvailableEvaluationSchemes)))
+
+
+-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )

 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
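The metric catalogue in the --metric help text is now generated from listOfAvailableEvaluationSchemes instead of being maintained by hand, so newly added metrics appear in --help automatically; the old hand-written enumeration survives only as the trailing comment. The case on extraInfo could equally be written with maybe; a sketch of the same formatting logic in isolation (formatScheme and helpText are illustrative names, not GEval functions):

import Data.List (intercalate)

-- Sketch: show each scheme, appending its extra info in parentheses when present.
formatScheme :: Show s => (s -> Maybe String) -> s -> String
formatScheme extra s = show s ++ maybe "" (\e -> " (" ++ e ++ ")") (extra s)

helpText :: Show s => (s -> Maybe String) -> [s] -> String
helpText extra = ("Metric to be used, e.g.: " ++) . intercalate ", " . map (formatScheme extra)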
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
 runGEval''' (Just Validate) _ _ spec _ _ = do
   validateChallenge spec
   return Nothing
+runGEval''' (Just ListMetrics) _ _ _ _ _ = do
+  listMetrics
+  return Nothing

 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
 to validate a directory CHALLENGE representing a Gonito challenge.
 |]
   exitFailure
+
+listMetrics :: IO ()
+listMetrics = putStrLn allMetricsDescription
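End to end, `geval --list-metrics` now prints allMetricsDescription: every available evaluation scheme followed by its description, with a two-column expected/sample output example and the expected metric value for the schemes that are already described, and the NO DESCRIPTION placeholder for the rest.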
test/Spec.hs
@@ -4,6 +4,7 @@
 import Test.Hspec

 import GEval.Metric
+import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict

 import Data.Conduit.List (consume)

+import System.FilePath
+
 import System.Directory
 import System.Process
 import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
       shapify "a" `shouldBe` (WordShape "a")
       shapify "B5" `shouldBe` (WordShape "A9")
   describe "create challenges and validate them" $ do
-    (flip mapM_) listOfAvailableMetrics $ \metric -> do
-      it (show metric) $ do
+    (flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
+      it (show scheme) $ do
         withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
           let spec = defaultGEvalSpecification {
                 gesExpectedDirectory = Just tempDir,
-                gesMetrics = [EvaluationScheme metric []],
+                gesMetrics = [scheme],
                 gesPrecision = Just 4 }
           createChallenge True tempDir spec
           validationChallenge tempDir spec
+  describe "test sample outputs" $ do
+    (flip mapM_) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
+      it (show scheme) $ do
+        withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
+          let spec = defaultGEvalSpecification {
+                gesExpectedDirectory = Just tempDir,
+                gesMetrics = [scheme] }
+          createChallenge True tempDir spec
+          let outFile = tempDir </> "test-A" </> "out.tsv"
+          writeFile outFile (outContents metric)
+          obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
+          obtainedScore `shouldBe` (expectedScore scheme)
   describe "submit" $ do
     it "current branch" $ do
       runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
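The new "test sample outputs" suite ties the metadata back to the evaluator: for each described scheme it creates a challenge in a temporary directory, writes outContents as the submission, runs the full pipeline via runGEval, and requires the obtained score to equal expectedScore exactly. This keeps the examples shown by --list-metrics from drifting away from what the evaluation code actually computes.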