Start working on --list-metrics options

commit dab2646798
parent 98b398d34f
@@ -17,6 +17,7 @@ library
   hs-source-dirs: src
   exposed-modules: GEval.Core
                  , GEval.Metric
+                 , GEval.MetricsMeta
                  , GEval.EvaluationScheme
                  , GEval.CreateChallenge
                  , GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
                , silently
                , vector
                , statistics
+               , filepath
   ghc-options: -threaded -rtsopts -with-rtsopts=-N
   default-language: Haskell2010
@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
                          | LineByLine | WorstFeatures
                          | Diff FilePath | MostWorseningFeatures FilePath
                          | PrintVersion | JustTokenize | Submit
-                         | Validate
+                         | Validate | ListMetrics

 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
@@ -1,7 +1,8 @@
 {-# LANGUAGE QuasiQuotes #-}

 module GEval.CreateChallenge
-       (createChallenge)
+       (createChallenge,
+        testExpectedContents)
        where

 import GEval.Metric
@@ -1,5 +1,5 @@
 module GEval.EvaluationScheme
-       (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
+       (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
        where

 import GEval.Metric
@@ -5,7 +5,6 @@ module GEval.Metric
    MetricOrdering(..),
    defaultLogLossHashedSize,
    getMetricOrdering,
-   listOfAvailableMetrics,
    bestPossibleValue,
    perfectOutLineFromExpectedLine,
    fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
             | SoftFMeasure Double | ProbabilisticSoftFMeasure Double
               deriving (Eq)

--- | the list of available metrics, to be shown to the user or to be
--- | used for tests
-listOfAvailableMetrics :: [Metric]
-listOfAvailableMetrics = [RMSE,
-                          MSE,
-                          MAE,
-                          SMAPE,
-                          Pearson,
-                          Spearman,
-                          Accuracy,
-                          LogLoss,
-                          Likelihood,
-                          FMeasure 1.0,
-                          FMeasure 2.0,
-                          FMeasure 0.25,
-                          MacroFMeasure 1.0,
-                          MacroFMeasure 2.0,
-                          MacroFMeasure 0.25,
-                          MultiLabelFMeasure 1.0,
-                          MultiLabelFMeasure 2.0,
-                          MultiLabelFMeasure 0.25,
-                          MultiLabelLikelihood,
-                          MAP,
-                          BLEU,
-                          GLEU,
-                          WER,
-                          NMI,
-                          ClippEU,
-                          LogLossHashed defaultLogLossHashedSize,
-                          LikelihoodHashed defaultLogLossHashedSize,
-                          BIOF1,
-                          BIOF1Labels,
-                          TokenAccuracy,
-                          SoftFMeasure 1.0,
-                          SoftFMeasure 2.0,
-                          SoftFMeasure 0.25,
-                          ProbabilisticSoftFMeasure 1.0,
-                          ProbabilisticSoftFMeasure 2.0,
-                          ProbabilisticSoftFMeasure 0.25,
-                          CharMatch]
-
 instance Show Metric where
   show RMSE = "RMSE"
   show MSE = "MSE"
src/GEval/MetricsMeta.hs (new file, 133 lines; listOfAvailableMetrics moves here from GEval.Metric)
@@ -0,0 +1,133 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module GEval.MetricsMeta
+  (listOfAvailableMetrics,
+   listOfAvailableEvaluationSchemes,
+   extraInfo,
+   isEvaluationSchemeDescribed,
+   getEvaluationSchemeDescription,
+   outContents,
+   expectedScore,
+   allMetricsDescription)
+  where
+
+import GEval.Common
+import GEval.Metric
+import GEval.EvaluationScheme
+import GEval.CreateChallenge (testExpectedContents)
+import GEval.PrecisionRecall (weightedHarmonicMean)
+
+import Text.Regex.PCRE.Heavy
+import Data.Either (fromRight)
+import Data.String.Here
+
+import Data.List (intercalate)
+import Text.Printf
+
+-- | the list of available metrics, to be shown to the user or to be
+-- | used for tests
+listOfAvailableMetrics :: [Metric]
+listOfAvailableMetrics = [RMSE,
+                          MSE,
+                          MAE,
+                          SMAPE,
+                          Pearson,
+                          Spearman,
+                          Accuracy,
+                          LogLoss,
+                          Likelihood,
+                          FMeasure 1.0,
+                          FMeasure 2.0,
+                          FMeasure 0.25,
+                          MacroFMeasure 1.0,
+                          MacroFMeasure 2.0,
+                          MacroFMeasure 0.25,
+                          MultiLabelFMeasure 1.0,
+                          MultiLabelFMeasure 2.0,
+                          MultiLabelFMeasure 0.25,
+                          MultiLabelLikelihood,
+                          MAP,
+                          BLEU,
+                          GLEU,
+                          WER,
+                          NMI,
+                          ClippEU,
+                          LogLossHashed defaultLogLossHashedSize,
+                          LikelihoodHashed defaultLogLossHashedSize,
+                          BIOF1,
+                          BIOF1Labels,
+                          TokenAccuracy,
+                          SoftFMeasure 1.0,
+                          SoftFMeasure 2.0,
+                          SoftFMeasure 0.25,
+                          ProbabilisticSoftFMeasure 1.0,
+                          ProbabilisticSoftFMeasure 2.0,
+                          ProbabilisticSoftFMeasure 0.25,
+                          CharMatch]
+
+extraInfo :: EvaluationScheme -> Maybe String
+extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
+extraInfo (EvaluationScheme BLEU [LowerCasing,
+                                  RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
+extraInfo _ = Nothing
+
+-- As we have just started describing metrics (or, to be precise,
+-- evaluation schemes), we need to keep track of which metrics are
+-- described and which are not.
+-- When all the metrics are described, this function should be
+-- removed.
+isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
+isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
+isEvaluationSchemeDescribed _ = False
+
+isMetricDescribed :: Metric -> Bool
+isMetricDescribed (SoftFMeasure _) = True
+isMetricDescribed _ = False
+
+getEvaluationSchemeDescription :: EvaluationScheme -> String
+getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
+
+getMetricDescription :: Metric -> String
+getMetricDescription (SoftFMeasure _) =
+  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
+if a label `foo` is expected for the span 2-9 and this label is returned but with
+the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
+|]
+
+outContents :: Metric -> String
+outContents (SoftFMeasure _) = [hereLit|inwords:1-4
+inwords:1-3 indigits:5
+|]
+
+expectedScore :: EvaluationScheme -> MetricValue
+expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
+  where precision = 0.25
+        recall = 0.75
+
+listOfAvailableEvaluationSchemes :: [EvaluationScheme]
+listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
+                                   ++ [
+                                     EvaluationScheme BLEU [LowerCasing,
+                                                            RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
+                                   ]
+
+allMetricsDescription :: String
+allMetricsDescription =
+  intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes
+
+formatEvaluationSchemeDescription :: EvaluationScheme -> String
+formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
+  where description = if isEvaluationSchemeDescribed scheme
+                      then (getEvaluationSchemeDescription scheme)
+                           ++ "\nExample\n"
+                           ++ (pasteLines "Expected output" "Sample output")
+                           ++ concat (map (\(exp, out) -> pasteLines exp out) $ zip (lines $ testExpectedContents metric)
+                                                                                    (lines $ outContents metric))
+                           ++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
+                      else noDescription
+        noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
+IF YOU WANT TO HAVE IT DESCRIBED|]
+
+pasteLines :: String -> String -> String
+pasteLines a b = printf "%-35s %s\n" a b
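The constants in expectedScore and the span arithmetic in the metric description can be checked independently. A minimal sanity-check sketch, assuming weightedHarmonicMean computes the standard F-beta combination (the name fBeta and the standalone main are illustrative, not part of the commit):

```haskell
-- Sanity check for the Soft-F1 numbers above, assuming the standard
-- F-beta formula: (1 + beta^2) * p * r / (beta^2 * p + r).
fBeta :: Double -> Double -> Double -> Double
fBeta beta p r = (1 + beta * beta) * p * r / (beta * beta * p + r)

main :: IO ()
main = do
  -- span example from getMetricDescription: expected span 2-9 (8 units) vs
  -- returned span 8-12 (5 units) overlap on 2 units, so recall is 2/8 = 1/4
  -- and precision is 2/5, matching the description text
  print (fBeta 1.0 (2 / 5) (2 / 8))   -- ~0.3077
  -- the sample-output score asserted by expectedScore: P = 0.25, R = 0.75
  print (fBeta 1.0 0.25 0.75)         -- 0.375, rendered as "Metric value: 0.3750"
```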

@@ -27,6 +27,7 @@ import Data.Monoid ((<>))

 import GEval.Core
 import GEval.EvaluationScheme
+import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
 import GEval.Common
 import GEval.CreateChallenge
 import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
 import GEval.Selector
 import GEval.Validation

+import Data.List (intercalate)
+
 import Data.Conduit.SmartSource

 fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
        (flag' Validate
         ( long "validate"
           <> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
+       <|>
+       (flag' ListMetrics
+        ( long "list-metrics"
+          <> help "List all metrics with their descriptions"))
        )

    <*> ((flag' FirstTheWorst
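For readers unfamiliar with optparse-applicative: flag' builds a parser that succeeds only when its option is present, and <|> tries alternatives in order, which is how --list-metrics slots in next to --validate. A minimal, self-contained sketch of the pattern (the Cmd type and main are simplified stand-ins for GEvalOptions, not the commit's code):

```haskell
import Options.Applicative

data Cmd = Validate | ListMetrics deriving Show

-- Two mutually exclusive switches, each producing its constructor.
cmdParser :: Parser Cmd
cmdParser =
      flag' Validate    (long "validate"     <> help "Validate challenge")
  <|> flag' ListMetrics (long "list-metrics" <> help "List all metrics with their descriptions")

main :: IO ()
main = execParser (info (helper <*> cmdParser) fullDesc) >>= print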
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
 sel Nothing m = m
 sel (Just m) _ = m



 metricReader :: Parser [EvaluationScheme]
 metricReader = many $ option auto -- actually `some` should be used instead of `many`, the problem is that
   ( long "metric" -- --metric might be in the config.txt file...
     <> short 'm'
     <> metavar "METRIC"
-    <> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
+    <> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
+              (\s -> (show s) ++ (case extraInfo s of
+                                    Just eI -> " (" ++ eI ++ ")"
+                                    Nothing -> ""))
+              listOfAvailableEvaluationSchemes)))
+
+
+-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )

 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
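The nested lambda in the new help text is dense when read inline; unfolded, it is equivalent to the following helper (renderEntry and helpExamples are hypothetical names introduced here for illustration; the imports assume the modules as changed in this commit, including a Show instance for EvaluationScheme, which the commit itself relies on via `show scheme`):

```haskell
import Data.List (intercalate)

import GEval.EvaluationScheme (EvaluationScheme)
import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes)

-- One help entry: the scheme's name plus its extra info, if any, e.g.
-- GLEU ("Google GLEU" not the grammar correction metric)
renderEntry :: EvaluationScheme -> String
renderEntry s = show s ++ maybe "" (\eI -> " (" ++ eI ++ ")") (extraInfo s)

-- The full "e.g.:" listing appended to the --metric help text.
helpExamples :: String
helpExamples = intercalate ", " (map renderEntry listOfAvailableEvaluationSchemes)
```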
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
 runGEval''' (Just Validate) _ _ spec _ _ = do
   validateChallenge spec
   return Nothing
+runGEval''' (Just ListMetrics) _ _ _ _ _ = do
+  listMetrics
+  return Nothing

 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
 to validate a directory CHALLENGE representing a Gonito challenge.
 |]
   exitFailure
+
+listMetrics :: IO ()
+listMetrics = putStrLn allMetricsDescription
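With these pieces in place, the new command can be run from the shell. The output below is a sketch of the shape allMetricsDescription produces (scheme name on one line, then either the description with an example or the placeholder, with entries separated by blank lines), not a verbatim capture:

```
$ geval --list-metrics
RMSE
THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
IF YOU WANT TO HAVE IT DESCRIBED


...


Soft-F1
"Soft" F-measure on intervals, i.e. partial "hits" are considered. ...
Example
Expected output                     Sample output
...
Metric value: 0.3750
```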

test/Spec.hs (21 lines changed)
@@ -4,6 +4,7 @@
 import Test.Hspec

 import GEval.Metric
+import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict

 import Data.Conduit.List (consume)

+import System.FilePath
+
 import System.Directory
 import System.Process
 import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
       shapify "a" `shouldBe` (WordShape "a")
       shapify "B5" `shouldBe` (WordShape "A9")
   describe "create challenges and validate them" $ do
-    (flip mapM_) listOfAvailableMetrics $ \metric -> do
-      it (show metric) $ do
+    (flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
+      it (show scheme) $ do
         withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
           let spec = defaultGEvalSpecification {
                 gesExpectedDirectory = Just tempDir,
-                gesMetrics = [EvaluationScheme metric []],
+                gesMetrics = [scheme],
                 gesPrecision = Just 4 }
           createChallenge True tempDir spec
           validationChallenge tempDir spec
+  describe "test sample outputs" $ do
+    (flip mapM_) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
+      it (show scheme) $ do
+        withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
+          let spec = defaultGEvalSpecification {
+                gesExpectedDirectory = Just tempDir,
+                gesMetrics = [scheme] }
+          createChallenge True tempDir spec
+          let outFile = tempDir </> "test-A" </> "out.tsv"
+          writeFile outFile (outContents metric)
+          obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
+          obtainedScore `shouldBe` (expectedScore scheme)
   describe "submit" $ do
     it "current branch" $ do
       runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"