forked from filipg/gonito
Add bootstrap resampling
This commit is contained in:
parent
580c141a8e
commit
d2a59e59a5
@ -17,12 +17,14 @@ import qualified Data.Map.Strict as M
|
||||
|
||||
import GEval.Core
|
||||
import GEval.EvaluationScheme
|
||||
import GEval.Common (MetricValue)
|
||||
import GEval.OptionsParser
|
||||
import GEval.ParseParams (parseParamsFromFilePath, OutputFileParsed(..))
|
||||
import GEval.Common (GEvalException, MetricResult(..), MetricValue)
|
||||
import GEval.Formatting (formatTheResult)
|
||||
|
||||
import Options.Applicative
|
||||
import Data.Conduit.SmartSource
|
||||
import Data.Conduit.Bootstrap (defaultConfidenceLevel, getConfidenceBounds)
|
||||
|
||||
import System.FilePath (takeFileName, dropExtensions, (-<.>))
|
||||
|
||||
@ -222,15 +224,17 @@ checkOrInsertEvaluation repoDir chan version out = do
|
||||
Right (Left _) -> do
|
||||
err chan "Cannot parse options, check the challenge repo"
|
||||
Right (Right (_, Just [(_, [result])])) -> do
|
||||
msg chan $ concat [ "Evaluated! Score ", (formatNonScientifically result) ]
|
||||
msg chan $ concat [ "Evaluated! Score ", (T.pack $ formatTheResult Nothing result) ]
|
||||
time <- liftIO getCurrentTime
|
||||
_ <- runDB $ insert $ Evaluation {
|
||||
evaluationTest=outTest out,
|
||||
evaluationChecksum=outChecksum out,
|
||||
evaluationScore=Just result,
|
||||
evaluationErrorMessage=Nothing,
|
||||
evaluationStamp=time,
|
||||
evaluationVersion=Just version }
|
||||
_ <- runDB $ insert $ let (pointResult, errorBound) = extractResult result
|
||||
in Evaluation {
|
||||
evaluationTest=outTest out,
|
||||
evaluationChecksum=outChecksum out,
|
||||
evaluationScore=Just pointResult,
|
||||
evaluationErrorBound=errorBound,
|
||||
evaluationErrorMessage=Nothing,
|
||||
evaluationStamp=time,
|
||||
evaluationVersion=Just version }
|
||||
msg chan "Evaluation done"
|
||||
Right (Right (_, Just _)) -> do
|
||||
err chan "Unexpected multiple results (???)"
|
||||
@ -239,12 +243,17 @@ checkOrInsertEvaluation repoDir chan version out = do
|
||||
Left exception -> do
|
||||
err chan $ "Evaluation failed: " ++ (T.pack $ show exception)
|
||||
|
||||
-- | Split a 'MetricResult' into a point value and an optional error bound.
--
-- A 'SimpleRun' value is passed through unchanged with no error bound.
-- For 'BootstrapResampling', the point value is the midpoint of the
-- (lower, upper) pair returned by 'getConfidenceBounds' and the error
-- bound is half the distance between the two bounds.
extractResult :: MetricResult -> (MetricValue, Maybe MetricValue)
|
||||
extractResult (SimpleRun r) = (r, Nothing)
|
||||
-- Midpoint of the bounds is the score; half the interval width is the error bound.
extractResult (BootstrapResampling vals) = ((upperBound + lowerBound) / 2.0, Just ((upperBound - lowerBound) / 2.0))
|
||||
-- Bounds are computed from the resampled values at 'defaultConfidenceLevel'.
where (lowerBound, upperBound) = getConfidenceBounds defaultConfidenceLevel vals
|
||||
|
||||
rawEval :: FilePath
|
||||
-> EvaluationScheme
|
||||
-> FilePath
|
||||
-> Text
|
||||
-> FilePath
|
||||
-> IO (Either GEvalException (Either (ParserResult GEvalOptions) (GEvalOptions, Maybe [(SourceSpec, [MetricValue])])))
|
||||
-> IO (Either GEvalException (Either (ParserResult GEvalOptions) (GEvalOptions, Maybe [(SourceSpec, [MetricResult])])))
|
||||
rawEval challengeDir metric repoDir name outF = Import.try (runGEvalGetOptions [
|
||||
"--alt-metric", (show metric),
|
||||
"--expected-directory", challengeDir,
|
||||
|
@ -301,6 +301,7 @@ lineByLineTable (Entity testId test) theStamp = mempty
|
||||
evaluationTest = testId,
|
||||
evaluationChecksum = testChecksum test,
|
||||
evaluationScore = Just score,
|
||||
evaluationErrorBound = Nothing,
|
||||
evaluationErrorMessage = Nothing,
|
||||
evaluationStamp = theStamp,
|
||||
evaluationVersion = Nothing }
|
||||
|
@ -41,6 +41,7 @@ import Text.Regex.TDFA
|
||||
|
||||
import GEval.Core
|
||||
import GEval.EvaluationScheme
|
||||
import GEval.Formatting (formatTheResultWithErrorBounds)
|
||||
|
||||
import qualified Data.Vector as DV
|
||||
|
||||
@ -449,7 +450,7 @@ formatTruncatedScore :: Maybe Int -> Maybe Evaluation -> Text
|
||||
-- | Render the score of an optional 'Evaluation' as 'Text', using the given
-- number of digits of precision when one is supplied.
--
-- With no precision, or with no evaluation at all, the work is delegated to
-- 'formatFullScore'. An evaluation without a score is also rendered via
-- 'formatFullScore Nothing'.
formatTruncatedScore Nothing e = formatFullScore e
|
||||
formatTruncatedScore _ Nothing = formatFullScore Nothing
|
||||
formatTruncatedScore (Just precision) (Just evaluation) = case evaluationScore evaluation of
|
||||
-- NOTE(review): the two `Just score` alternatives below look like the removed
-- and added sides of this diff hunk shown together — confirm which one is live.
-- This one formats the bare score with printf at the requested precision.
Just score -> T.pack $ printf "%0.*f" precision score
|
||||
-- This one also passes the evaluation's error bound to the formatter.
Just score -> T.pack $ formatTheResultWithErrorBounds (Just precision) score (evaluationErrorBound evaluation)
|
||||
Nothing -> formatFullScore Nothing
|
||||
|
||||
formatScore :: Maybe Int -> Double -> Text
|
||||
|
@ -122,6 +122,7 @@ Evaluation
|
||||
test TestId
|
||||
checksum SHA1
|
||||
score Double Maybe
|
||||
errorBound Double Maybe
|
||||
errorMessage Text Maybe
|
||||
stamp UTCTime default=now()
|
||||
-- Should be just SHA1 (without Maybe) - Maybe is just a legacy
|
||||
|
@ -128,7 +128,7 @@ library
|
||||
, filemanip
|
||||
, cryptohash
|
||||
, markdown
|
||||
, geval >= 1.27 && < 1.29
|
||||
, geval >= 1.31.1 && < 1.32
|
||||
, filepath
|
||||
, yesod-table
|
||||
, regex-tdfa
|
||||
|
Loading…
Reference in New Issue
Block a user