diff --git a/geval.cabal b/geval.cabal
index 7d07ecb..8bf0c46 100644
--- a/geval.cabal
+++ b/geval.cabal
@@ -17,6 +17,7 @@ library
   hs-source-dirs:      src
   exposed-modules:     GEval.Core
                      , GEval.Metric
+                     , GEval.MetricsMeta
                      , GEval.EvaluationScheme
                      , GEval.CreateChallenge
                      , GEval.OptionsParser
@@ -137,6 +138,7 @@ test-suite geval-test
                      , silently
                      , vector
                      , statistics
+                     , filepath
   ghc-options:         -threaded -rtsopts -with-rtsopts=-N
   default-language:    Haskell2010
diff --git a/src/GEval/Core.hs b/src/GEval/Core.hs
index 0feeef4..96edc11 100644
--- a/src/GEval/Core.hs
+++ b/src/GEval/Core.hs
@@ -187,7 +187,7 @@ data GEvalSpecialCommand = Init
                            | LineByLine | WorstFeatures
                            | Diff FilePath | MostWorseningFeatures FilePath
                            | PrintVersion | JustTokenize | Submit
-                           | Validate
+                           | Validate | ListMetrics
 
 data ResultOrdering = KeepTheOriginalOrder | FirstTheWorst | FirstTheBest
 
diff --git a/src/GEval/CreateChallenge.hs b/src/GEval/CreateChallenge.hs
index 34bbd55..61d5a28 100644
--- a/src/GEval/CreateChallenge.hs
+++ b/src/GEval/CreateChallenge.hs
@@ -1,7 +1,8 @@
 {-# LANGUAGE QuasiQuotes #-}
 
 module GEval.CreateChallenge
-       (createChallenge)
+       (createChallenge,
+        testExpectedContents)
        where
 
 import GEval.Metric
diff --git a/src/GEval/EvaluationScheme.hs b/src/GEval/EvaluationScheme.hs
index a3677fe..29840c7 100644
--- a/src/GEval/EvaluationScheme.hs
+++ b/src/GEval/EvaluationScheme.hs
@@ -1,5 +1,5 @@
 module GEval.EvaluationScheme
-  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName)
+  (EvaluationScheme(..), evaluationSchemeMetric, applyPreprocessingOperations, evaluationSchemeName, PreprocessingOperation(..))
   where
 
 import GEval.Metric
diff --git a/src/GEval/Metric.hs b/src/GEval/Metric.hs
index d2e1e40..ecd02b6 100644
--- a/src/GEval/Metric.hs
+++ b/src/GEval/Metric.hs
@@ -5,7 +5,6 @@ module GEval.Metric
      MetricOrdering(..),
      defaultLogLossHashedSize,
      getMetricOrdering,
-     listOfAvailableMetrics,
      bestPossibleValue,
      perfectOutLineFromExpectedLine,
      fixedNumberOfColumnsInExpected,
@@ -32,47 +31,6 @@ data Metric = RMSE | MSE | Pearson | Spearman | BLEU | GLEU | WER | Accuracy | C
               | SoftFMeasure Double | ProbabilisticSoftFMeasure Double
               deriving (Eq)
 
--- | the list of available metrics, to be shown to the user or to be
--- | used for tests
-listOfAvailableMetrics :: [Metric]
-listOfAvailableMetrics = [RMSE,
-                          MSE,
-                          MAE,
-                          SMAPE,
-                          Pearson,
-                          Spearman,
-                          Accuracy,
-                          LogLoss,
-                          Likelihood,
-                          FMeasure 1.0,
-                          FMeasure 2.0,
-                          FMeasure 0.25,
-                          MacroFMeasure 1.0,
-                          MacroFMeasure 2.0,
-                          MacroFMeasure 0.25,
-                          MultiLabelFMeasure 1.0,
-                          MultiLabelFMeasure 2.0,
-                          MultiLabelFMeasure 0.25,
-                          MultiLabelLikelihood,
-                          MAP,
-                          BLEU,
-                          GLEU,
-                          WER,
-                          NMI,
-                          ClippEU,
-                          LogLossHashed defaultLogLossHashedSize,
-                          LikelihoodHashed defaultLogLossHashedSize,
-                          BIOF1,
-                          BIOF1Labels,
-                          TokenAccuracy,
-                          SoftFMeasure 1.0,
-                          SoftFMeasure 2.0,
-                          SoftFMeasure 0.25,
-                          ProbabilisticSoftFMeasure 1.0,
-                          ProbabilisticSoftFMeasure 2.0,
-                          ProbabilisticSoftFMeasure 0.25,
-                          CharMatch]
-
 instance Show Metric where
   show RMSE = "RMSE"
   show MSE = "MSE"
diff --git a/src/GEval/MetricsMeta.hs b/src/GEval/MetricsMeta.hs
new file mode 100644
index 0000000..0a2c398
--- /dev/null
+++ b/src/GEval/MetricsMeta.hs
@@ -0,0 +1,133 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+
+module GEval.MetricsMeta
+  (listOfAvailableMetrics,
+   listOfAvailableEvaluationSchemes,
+   extraInfo,
+   isEvaluationSchemeDescribed,
+   getEvaluationSchemeDescription,
+   outContents,
+   expectedScore,
+   allMetricsDescription)
+  where
+
+import GEval.Common
+import GEval.Metric
+import GEval.EvaluationScheme
+import GEval.CreateChallenge (testExpectedContents)
+import GEval.PrecisionRecall (weightedHarmonicMean)
+
+import Text.Regex.PCRE.Heavy
+import Data.Either (fromRight)
+import Data.String.Here
+
+import Data.List (intercalate)
+import Text.Printf
+
+-- | The list of available metrics, to be shown to the user or to be
+-- used for tests.
+listOfAvailableMetrics :: [Metric]
+listOfAvailableMetrics = [RMSE,
+                          MSE,
+                          MAE,
+                          SMAPE,
+                          Pearson,
+                          Spearman,
+                          Accuracy,
+                          LogLoss,
+                          Likelihood,
+                          FMeasure 1.0,
+                          FMeasure 2.0,
+                          FMeasure 0.25,
+                          MacroFMeasure 1.0,
+                          MacroFMeasure 2.0,
+                          MacroFMeasure 0.25,
+                          MultiLabelFMeasure 1.0,
+                          MultiLabelFMeasure 2.0,
+                          MultiLabelFMeasure 0.25,
+                          MultiLabelLikelihood,
+                          MAP,
+                          BLEU,
+                          GLEU,
+                          WER,
+                          NMI,
+                          ClippEU,
+                          LogLossHashed defaultLogLossHashedSize,
+                          LikelihoodHashed defaultLogLossHashedSize,
+                          BIOF1,
+                          BIOF1Labels,
+                          TokenAccuracy,
+                          SoftFMeasure 1.0,
+                          SoftFMeasure 2.0,
+                          SoftFMeasure 0.25,
+                          ProbabilisticSoftFMeasure 1.0,
+                          ProbabilisticSoftFMeasure 2.0,
+                          ProbabilisticSoftFMeasure 0.25,
+                          CharMatch]
+
+extraInfo :: EvaluationScheme -> Maybe String
+extraInfo (EvaluationScheme GLEU []) = Just "\"Google GLEU\" not the grammar correction metric"
+extraInfo (EvaluationScheme BLEU [LowerCasing,
+                                  RegexpMatch _]) = Just "BLEU on lowercased strings, only Latin characters and digits considered"
+extraInfo _ = Nothing
+
+-- As we have only just started describing metrics (or, to be precise,
+-- evaluation schemes), we need to keep track of which metrics are
+-- already described and which are not.
+-- When all the metrics are described, this function should be removed.
+isEvaluationSchemeDescribed :: EvaluationScheme -> Bool
+isEvaluationSchemeDescribed (EvaluationScheme metric []) = isMetricDescribed metric
+isEvaluationSchemeDescribed _ = False
+
+isMetricDescribed :: Metric -> Bool
+isMetricDescribed (SoftFMeasure _) = True
+isMetricDescribed _ = False
+
+getEvaluationSchemeDescription :: EvaluationScheme -> String
+getEvaluationSchemeDescription (EvaluationScheme metric []) = getMetricDescription metric
+
+getMetricDescription :: Metric -> String
+getMetricDescription (SoftFMeasure _) =
+  [i|"Soft" F-measure on intervals, i.e. partial "hits" are considered. For instance,
+if a label `foo` is expected for the span 2-9 and this label is returned but with
+the span 8-12, it is counted as 1/4 for recall and 2/5 for precision.
+|]
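+
+-- A worked check of the interval example above: the overlap of 2-9 and 8-12
+-- covers 2 of the 8 expected positions (recall 2/8 = 1/4) and 2 of the 5
+-- returned positions (precision 2/5), so, assuming beta = 1, the soft F-score
+-- is the harmonic mean 2 * (1/4) * (2/5) / (1/4 + 2/5), i.e. about 0.31.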
+
+outContents :: Metric -> String
+outContents (SoftFMeasure _) = [hereLit|inwords:1-4
+inwords:1-3 indigits:5
+|]
+
+expectedScore :: EvaluationScheme -> MetricValue
+expectedScore (EvaluationScheme (SoftFMeasure beta) []) = weightedHarmonicMean beta precision recall
+  where precision = 0.25
+        recall = 0.75
+
+listOfAvailableEvaluationSchemes :: [EvaluationScheme]
+listOfAvailableEvaluationSchemes = map (\m -> EvaluationScheme m []) listOfAvailableMetrics
+                                   ++ [
+                                       EvaluationScheme BLEU [LowerCasing,
+                                                              RegexpMatch (fromRight undefined $ compileM "\\s+|[a-z0-9]+" [])]
+                                      ]
+
+allMetricsDescription :: String
+allMetricsDescription =
+  intercalate "\n\n\n" $ map formatEvaluationSchemeDescription listOfAvailableEvaluationSchemes
+
+formatEvaluationSchemeDescription :: EvaluationScheme -> String
+formatEvaluationSchemeDescription scheme@(EvaluationScheme metric _) = show scheme ++ "\n" ++ description
+  where description = if isEvaluationSchemeDescribed scheme
+                      then (getEvaluationSchemeDescription scheme)
+                           ++ "\nExample\n"
+                           ++ (pasteLines "Expected output" "Sample output")
+                           ++ concat (map (\(exp, out) -> pasteLines exp out) $ zip (lines $ testExpectedContents metric)
+                                                                                    (lines $ outContents metric))
+                           ++ "\nMetric value: " ++ (printf "%.4f" $ expectedScore scheme)
+                      else noDescription
+        noDescription = [hereLit|THE METRIC HAS NO DESCRIPTION YET, PLEASE ADD AN ISSUE TO https://gitlab.com/filipg/geval/issues
+IF YOU WANT TO HAVE IT DESCRIBED|]
+
+pasteLines :: String -> String -> String
+pasteLines a b = printf "%-35s %s\n" a b
diff --git a/src/GEval/OptionsParser.hs b/src/GEval/OptionsParser.hs
index e0cfb0f..83c17a5 100644
--- a/src/GEval/OptionsParser.hs
+++ b/src/GEval/OptionsParser.hs
@@ -27,6 +27,7 @@ import Data.Monoid ((<>))
 
 import GEval.Core
 import GEval.EvaluationScheme
+import GEval.MetricsMeta (extraInfo, listOfAvailableEvaluationSchemes, allMetricsDescription)
 import GEval.Common
 import GEval.CreateChallenge
 import GEval.LineByLine
@@ -35,6 +36,8 @@ import GEval.BlackBoxDebugging
 import GEval.Selector
 import GEval.Validation
 
+import Data.List (intercalate)
+
 import Data.Conduit.SmartSource
 
 fullOptionsParser = info (helper <*> optionsParser)
@@ -87,6 +90,10 @@ optionsParser = GEvalOptions
      (flag' Validate
       ( long "validate"
         <> help "Validate challenge, it searches for potential errors in the given challenge path, like missing columns, files or format data."))
+     <|>
+     (flag' ListMetrics
+      ( long "list-metrics"
+        <> help "List all metrics with their descriptions"))
     )
   <*> ((flag' FirstTheWorst
@@ -233,12 +240,21 @@ sel :: Maybe Metric -> Metric -> Metric
 sel Nothing m = m
 sel (Just m) _ = m
+
+
 
 metricReader :: Parser [EvaluationScheme]
 metricReader = many $ option auto               -- actually `some` should be used instead of `many`, the problem is that
                ( long "metric"                  -- --metric might be in the config.txt file...
                  <> short 'm'
                  <> metavar "METRIC"
-                 <> help "Metric to be used - RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
+                 <> help ("Metric to be used, e.g.:" ++ intercalate ", " (map
+                           (\s -> (show s) ++ (case extraInfo s of
+                                                 Just eI -> " (" ++ eI ++ ")"
+                                                 Nothing -> ""))
+                           listOfAvailableEvaluationSchemes)))
+
+
+-- RMSE, MSE, MAE, SMAPE, Pearson, Spearman, Accuracy, LogLoss, Likelihood, F-measure (specify as F1, F2, F0.25, etc.), macro F-measure (specify as Macro-F1, Macro-F2, Macro-F0.25, etc.), multi-label F-measure (specify as MultiLabel-F1, MultiLabel-F2, MultiLabel-F0.25, etc.), MultiLabel-Likelihood, MAP, BLEU, GLEU (\"Google GLEU\" not the grammar correction metric), WER, NMI, ClippEU, LogLossHashed, LikelihoodHashed, BIO-F1, BIO-F1-Labels, TokenAccuracy, soft F-measure (specify as Soft-F1, Soft-F2, Soft-F0.25), probabilistic soft F-measure (specify as Probabilistic-Soft-F1, Probabilistic-Soft-F2, Probabilistic-Soft-F0.25) or CharMatch" )
 
 altMetricReader :: Parser (Maybe EvaluationScheme)
 altMetricReader = optional $ option auto
@@ -341,6 +357,9 @@ runGEval''' (Just Submit) _ _ spec _ _ = do
 runGEval''' (Just Validate) _ _ spec _ _ = do
   validateChallenge spec
   return Nothing
+runGEval''' (Just ListMetrics) _ _ _ _ _ = do
+  listMetrics
+  return Nothing
 
 getGraphFilename :: Int -> FilePath -> FilePath
 getGraphFilename 0 fp = fp
@@ -401,3 +420,6 @@ Run:
 to validate a directory CHALLENGE representing a Gonito challenge.
 |]
   exitFailure
+
+listMetrics :: IO ()
+listMetrics = putStrLn allMetricsDescription
diff --git a/test/Spec.hs b/test/Spec.hs
index bbffbaf..25d95e3 100644
--- a/test/Spec.hs
+++ b/test/Spec.hs
@@ -4,6 +4,7 @@ import Test.Hspec
 
 import GEval.Metric
+import GEval.MetricsMeta (listOfAvailableEvaluationSchemes, isEvaluationSchemeDescribed, expectedScore, outContents)
 import GEval.Core
 import GEval.Common
 import GEval.EvaluationScheme
@@ -33,6 +34,8 @@ import Data.Map.Strict
 
 import Data.Conduit.List (consume)
 
+import System.FilePath
+
 import System.Directory
 import System.Process
 import System.Exit
@@ -508,15 +511,27 @@ main = hspec $ do
       shapify "a" `shouldBe` (WordShape "a")
       shapify "B5" `shouldBe` (WordShape "A9")
   describe "create challenges and validate them" $ do
-    (flip mapM_) listOfAvailableMetrics $ \metric -> do
-      it (show metric) $ do
+    (flip mapM_) listOfAvailableEvaluationSchemes $ \scheme -> do
+      it (show scheme) $ do
        withSystemTempDirectory "geval-validation-test" $ \tempDir -> do
          let spec = defaultGEvalSpecification {
                gesExpectedDirectory = Just tempDir,
-               gesMetrics = [EvaluationScheme metric []],
+               gesMetrics = [scheme],
                gesPrecision = Just 4 }
          createChallenge True tempDir spec
          validationChallenge tempDir spec
+  describe "test sample outputs" $ do
+    (flip mapM_ ) (Prelude.filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes) $ \scheme@(EvaluationScheme metric _) -> do
+      it (show scheme) $ do
+       withSystemTempDirectory "geval-sample-output-test" $ \tempDir -> do
+         let spec = defaultGEvalSpecification {
+               gesExpectedDirectory = Just tempDir,
+               gesMetrics = [scheme] }
+         createChallenge True tempDir spec
+         let outFile = tempDir </> "test-A" </> "out.tsv"
+         writeFile outFile (outContents metric)
+         obtainedScore <- (runGEval ["--expected-directory", tempDir, "--out-directory", tempDir]) >>= extractVal
+         obtainedScore `shouldBe` (expectedScore scheme)
   describe "submit" $ do
     it "current branch" $ do
       runGitTest "branch-test" (\_ -> getCurrentBranch) `shouldReturn` "develop"
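
A minimal usage sketch of the new module, assuming a throwaway scratch program outside the patch; the imported names come from GEval.MetricsMeta as introduced above, and the listing it prints is the same one exposed by the new `geval --list-metrics` flag:

-- Hypothetical scratch module, not part of geval itself.
import GEval.MetricsMeta (allMetricsDescription,
                          listOfAvailableEvaluationSchemes,
                          isEvaluationSchemeDescribed)

main :: IO ()
main = do
  -- Print every evaluation scheme, followed by its description or by the
  -- placeholder text for schemes that have not been described yet.
  putStrLn allMetricsDescription
  -- Count the schemes that already have a description
  -- (with this patch, only the Soft-F variants).
  print $ length $ filter isEvaluationSchemeDescribed listOfAvailableEvaluationSchemes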