diff --git a/Handler/Evaluate.hs b/Handler/Evaluate.hs index d24b8f2..df1516e 100644 --- a/Handler/Evaluate.hs +++ b/Handler/Evaluate.hs @@ -17,12 +17,14 @@ import qualified Data.Map.Strict as M import GEval.Core import GEval.EvaluationScheme -import GEval.Common (MetricValue) import GEval.OptionsParser import GEval.ParseParams (parseParamsFromFilePath, OutputFileParsed(..)) +import GEval.Common (GEvalException, MetricResult(..), MetricValue) +import GEval.Formatting (formatTheResult) import Options.Applicative import Data.Conduit.SmartSource +import Data.Conduit.Bootstrap (defaultConfidenceLevel, getConfidenceBounds) import System.FilePath (takeFileName, dropExtensions, (-<.>)) @@ -222,15 +224,17 @@ checkOrInsertEvaluation repoDir chan version out = do Right (Left _) -> do err chan "Cannot parse options, check the challenge repo" Right (Right (_, Just [(_, [result])])) -> do - msg chan $ concat [ "Evaluated! Score ", (formatNonScientifically result) ] + msg chan $ concat [ "Evaluated! Score ", (T.pack $ formatTheResult Nothing result) ] time <- liftIO getCurrentTime - _ <- runDB $ insert $ Evaluation { - evaluationTest=outTest out, - evaluationChecksum=outChecksum out, - evaluationScore=Just result, - evaluationErrorMessage=Nothing, - evaluationStamp=time, - evaluationVersion=Just version } + _ <- runDB $ insert $ let (pointResult, errorBound) = extractResult result + in Evaluation { + evaluationTest=outTest out, + evaluationChecksum=outChecksum out, + evaluationScore=Just pointResult, + evaluationErrorBound=errorBound, + evaluationErrorMessage=Nothing, + evaluationStamp=time, + evaluationVersion=Just version } msg chan "Evaluation done" Right (Right (_, Just _)) -> do err chan "Unexpected multiple results (???)" @@ -239,12 +243,17 @@ checkOrInsertEvaluation repoDir chan version out = do Left exception -> do err chan $ "Evaluation failed: " ++ (T.pack $ show exception) +extractResult :: MetricResult -> (MetricValue, Maybe MetricValue) +extractResult (SimpleRun r) = (r, Nothing) +extractResult (BootstrapResampling vals) = ((upperBound + lowerBound) / 2.0, Just ((upperBound - lowerBound) / 2.0)) + where (lowerBound, upperBound) = getConfidenceBounds defaultConfidenceLevel vals + rawEval :: FilePath -> EvaluationScheme -> FilePath -> Text -> FilePath - -> IO (Either GEvalException (Either (ParserResult GEvalOptions) (GEvalOptions, Maybe [(SourceSpec, [MetricValue])]))) + -> IO (Either GEvalException (Either (ParserResult GEvalOptions) (GEvalOptions, Maybe [(SourceSpec, [MetricResult])]))) rawEval challengeDir metric repoDir name outF = Import.try (runGEvalGetOptions [ "--alt-metric", (show metric), "--expected-directory", challengeDir, diff --git a/Handler/Query.hs b/Handler/Query.hs index 3c9ab77..1061f39 100644 --- a/Handler/Query.hs +++ b/Handler/Query.hs @@ -301,6 +301,7 @@ lineByLineTable (Entity testId test) theStamp = mempty evaluationTest = testId, evaluationChecksum = testChecksum test, evaluationScore = Just score, + evaluationErrorBound = Nothing, evaluationErrorMessage = Nothing, evaluationStamp = theStamp, evaluationVersion = Nothing } diff --git a/Handler/Shared.hs b/Handler/Shared.hs index f8a4b83..952f136 100644 --- a/Handler/Shared.hs +++ b/Handler/Shared.hs @@ -41,6 +41,7 @@ import Text.Regex.TDFA import GEval.Core import GEval.EvaluationScheme +import GEval.Formatting (formatTheResultWithErrorBounds) import qualified Data.Vector as DV @@ -449,7 +450,7 @@ formatTruncatedScore :: Maybe Int -> Maybe Evaluation -> Text formatTruncatedScore Nothing e = formatFullScore e formatTruncatedScore _ Nothing = formatFullScore Nothing formatTruncatedScore (Just precision) (Just evaluation) = case evaluationScore evaluation of - Just score -> T.pack $ printf "%0.*f" precision score + Just score -> T.pack $ formatTheResultWithErrorBounds (Just precision) score (evaluationErrorBound evaluation) Nothing -> formatFullScore Nothing formatScore :: Maybe Int -> Double -> Text diff --git a/config/models b/config/models index 2b0f2e8..6aa0fcb 100644 --- a/config/models +++ b/config/models @@ -122,6 +122,7 @@ Evaluation test TestId checksum SHA1 score Double Maybe + errorBound Double Maybe errorMessage Text Maybe stamp UTCTime default=now() -- Should be just SHA1 (without Maybe) - Maybe is just a legacy diff --git a/gonito.cabal b/gonito.cabal index 6988be0..c8c3bbf 100644 --- a/gonito.cabal +++ b/gonito.cabal @@ -128,7 +128,7 @@ library , filemanip , cryptohash , markdown - , geval >= 1.27 && < 1.29 + , geval >= 1.31.1 && < 1.32 , filepath , yesod-table , regex-tdfa