Add bootstrap resampling

2020-01-28 23:14:46 +01:00 · 2020-01-28 23:14:46 +01:00 · d2a59e59a5
commit d2a59e59a5
parent 580c141a8e
5 changed files with 24 additions and 12 deletions
--- a/Handler/Evaluate.hs
+++ b/Handler/Evaluate.hs
@ -17,12 +17,14 @@ import qualified Data.Map.Strict as M
 import GEval.Core
 import GEval.EvaluationScheme
 import GEval.Common (MetricValue)
 import GEval.OptionsParser
 import GEval.ParseParams (parseParamsFromFilePath, OutputFileParsed(..))
 import GEval.Common (GEvalException, MetricResult(..), MetricValue)
 import GEval.Formatting (formatTheResult)
 import Options.Applicative
 import Data.Conduit.SmartSource
 import Data.Conduit.Bootstrap (defaultConfidenceLevel, getConfidenceBounds)
 import System.FilePath (takeFileName, dropExtensions, (-<.>))
@ -222,15 +224,17 @@ checkOrInsertEvaluation repoDir chan version out = do
        Right (Left _) -> do
          err chan "Cannot parse options, check the challenge repo"
        Right (Right (_, Just [(_, [result])])) -> do
-          msg chan $ concat [ "Evaluated! Score ", (formatNonScientifically result) ]
+          msg chan $ concat [ "Evaluated! Score ", (T.pack $ formatTheResult Nothing result) ]
          time <- liftIO getCurrentTime
-          _ <- runDB $ insert $ Evaluation {
+          _ <- runDB $ insert $ let (pointResult, errorBound) = extractResult result
-            evaluationTest=outTest out,
+                               in Evaluation {
-            evaluationChecksum=outChecksum out,
+                                   evaluationTest=outTest out,
-            evaluationScore=Just result,
+                                   evaluationChecksum=outChecksum out,
-            evaluationErrorMessage=Nothing,
+                                   evaluationScore=Just pointResult,
-            evaluationStamp=time,
+                                   evaluationErrorBound=errorBound,
-            evaluationVersion=Just version }
+                                   evaluationErrorMessage=Nothing,
                                   evaluationStamp=time,
                                   evaluationVersion=Just version }
          msg chan "Evaluation done"
        Right (Right (_, Just _)) -> do
          err chan "Unexpected multiple results (???)"
@ -239,12 +243,17 @@ checkOrInsertEvaluation repoDir chan version out = do
        Left exception -> do
          err chan $ "Evaluation failed: " ++ (T.pack $ show exception)
 -- | Reduce a 'MetricResult' to a point value plus an optional error bound.
 --
 -- A simple run yields its value with no bound. For bootstrap resampling the
 -- point value is the midpoint of the bounds returned by 'getConfidenceBounds'
 -- at 'defaultConfidenceLevel', and the error bound is half the distance
 -- between those two bounds.
 extractResult :: MetricResult -> (MetricValue, Maybe MetricValue)
 extractResult metricResult = case metricResult of
   SimpleRun value -> (value, Nothing)
   BootstrapResampling samples ->
     let (lo, hi) = getConfidenceBounds defaultConfidenceLevel samples
         midpoint = (hi + lo) / 2.0
         halfWidth = (hi - lo) / 2.0
      in (midpoint, Just halfWidth)
 rawEval :: FilePath
          -> EvaluationScheme
          -> FilePath
          -> Text
          -> FilePath
-          -> IO (Either GEvalException (Either (ParserResult GEvalOptions) (GEvalOptions, Maybe [(SourceSpec, [MetricValue])])))
+          -> IO (Either GEvalException (Either (ParserResult GEvalOptions) (GEvalOptions, Maybe [(SourceSpec, [MetricResult])])))
 rawEval challengeDir metric repoDir name outF = Import.try (runGEvalGetOptions [
                                                          "--alt-metric", (show metric),
                                                          "--expected-directory", challengeDir,
--- a/Handler/Query.hs
+++ b/Handler/Query.hs
@ -301,6 +301,7 @@ lineByLineTable (Entity testId test) theStamp = mempty
          evaluationTest = testId,
          evaluationChecksum = testChecksum test,
          evaluationScore = Just score,
          evaluationErrorBound = Nothing,
          evaluationErrorMessage = Nothing,
          evaluationStamp = theStamp,
          evaluationVersion = Nothing }
--- a/Handler/Shared.hs
+++ b/Handler/Shared.hs
@ -41,6 +41,7 @@ import Text.Regex.TDFA
 import GEval.Core
 import GEval.EvaluationScheme
 import GEval.Formatting (formatTheResultWithErrorBounds)
 import qualified Data.Vector as DV
@ -449,7 +450,7 @@ formatTruncatedScore :: Maybe Int -> Maybe Evaluation -> Text
 -- No precision given: delegate to formatFullScore unchanged.
 formatTruncatedScore Nothing e = formatFullScore e
 -- No evaluation given: delegate to formatFullScore with Nothing.
 formatTruncatedScore _ Nothing  = formatFullScore Nothing
 -- Both given: format the stored score, if any, passing the precision along;
 -- an evaluation without a score falls back to formatFullScore Nothing.
 formatTruncatedScore (Just precision) (Just evaluation) = case evaluationScore evaluation of
-  Just score -> T.pack $ printf "%0.*f" precision score
+  Just score -> T.pack $ formatTheResultWithErrorBounds (Just precision) score (evaluationErrorBound evaluation)
  Nothing -> formatFullScore Nothing
 formatScore :: Maybe Int -> Double -> Text
--- a/config/models
+++ b/config/models
@ -122,6 +122,7 @@ Evaluation
    test TestId
    checksum SHA1
    score Double Maybe
    errorBound Double Maybe
    errorMessage Text Maybe
    stamp UTCTime default=now()
    -- Should be just SHA1 (without Maybe) - Maybe is just a legacy
--- a/gonito.cabal
+++ b/gonito.cabal
@ -128,7 +128,7 @@ library
                 , filemanip
                 , cryptohash
                 , markdown
-                 , geval >= 1.27 && < 1.29
+                 , geval >= 1.31.1 && < 1.32
                 , filepath
                 , yesod-table
                 , regex-tdfa