Implement auxiliary calibration function

2019-03-11 11:42:26 +01:00 · 2019-03-11 11:42:26 +01:00 · 8393bec3ae
commit 8393bec3ae
parent 19642db43f
5 changed files with 99 additions and 2 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -41,6 +41,8 @@ library
                     , Text.WordShape
                     , Data.Statistics.Kendall
                     , GEval.Selector
                     , Data.Statistics.Loess
                     , Data.Statistics.Calibration
                     , Paths_geval
  build-depends:       base >= 4.7 && < 5
                     , cond
@ -85,6 +87,8 @@ library
                     , vector-algorithms
                     , aeson
                     , aeson-pretty
                     , numeric-tools
                     , integration
  default-language:    Haskell2010
 executable geval
--- a/src/Data/Statistics/Calibration.hs
+++ b/src/Data/Statistics/Calibration.hs
@ -0,0 +1,46 @@
 module Data.Statistics.Calibration
   (calibration, softCalibration) where
 import Data.Statistics.Loess(loess)
 import Numeric.Tools.Integration
 import Numeric.Integration.TanhSinh
 import Data.List(minimum, maximum)
 import qualified Data.Vector.Unboxed as DVU
 -- | Smallest spread (maximum minus minimum) of the probabilities for which
 -- 'softCalibration' still uses the loess-based estimate; below this value
 -- it falls back to 'handleNarrowBand'.
 minBand :: Double
 minBand = 1.0e-3
 -- | Encode a boolean as a number: 'True' becomes 1.0, 'False' becomes 0.0.
 bool2Double :: Bool -> Double
 bool2Double flag = if flag then 1.0 else 0.0
 -- | Arithmetic mean of a list of doubles.
 -- For an empty list the division is 0 / 0, which yields NaN.
 mean :: [Double] -> Double
 mean values = total / count
  where total = sum values
        count = fromIntegral (length values)
 -- | Width of the range covered by the given values (largest minus smallest).
 -- Partial: 'maximum' and 'minimum' fail on an empty list.
 band :: [Double] -> Double
 band values = upper - lower
  where upper = maximum values
        lower = minimum values
 -- | Calibration score for boolean outcomes paired with predicted
 -- probabilities. Each outcome is turned into 1.0 / 0.0 with 'bool2Double'
 -- and the computation is delegated to 'softCalibration'.
 calibration :: [Bool] -> [Double] -> Double
 calibration results probs = softCalibration (bool2Double <$> results) probs
 -- | Numerically integrate @fun@ over the interval @(a, b)@.
 --
 -- 'simpson' yields a list of results; only the first one is used.
 -- An empty list is reported with an explicit error instead of an
 -- anonymous pattern-match failure, which keeps the @case@ exhaustive.
 integrate :: (Double, Double) -> (Double -> Double) -> Double
 integrate (a, b) fun = case simpson fun a b of
  (r:_) -> result r
  [] -> error "no result from numerical integration in calibration"
 -- | Calibration score for real-valued results paired with probabilities.
 --
 -- * Two empty lists give 1.0.
 -- * Lists of different lengths are rejected with an error. The original
 --   code only caught this when one of the lists was empty; the same
 --   messages are now used for any length mismatch.
 -- * When the probabilities span less than 'minBand', the score comes from
 --   'handleNarrowBand'.
 -- * Otherwise the score is
 --   @1 - min 1 (2 * (highest - lowest) * I)@, where @I@ is the integral over
 --   @[lowest, highest]@ of @|loess probs results x - x|@.
 softCalibration :: [Double] -> [Double] -> Double
 softCalibration [] [] = 1.0
 softCalibration [] _ = error "too few booleans in calibration"
 softCalibration _ [] = error "too few probabilities in calibration"
 softCalibration results probs
  | resultCount < probCount = error "too few booleans in calibration"
  | resultCount > probCount = error "too few probabilities in calibration"
  | band probs < minBand = handleNarrowBand results probs
  | otherwise = 1.0 - min 1.0 (2.0 * (highest - lowest) * integrate (lowest, highest) deviation)
  where resultCount = length results
        probCount = length probs
        lowest = minimum probs
        highest = maximum probs
        -- Convert the lists once, outside the integrand, instead of on
        -- every evaluation of the function being integrated.
        probVector = DVU.fromList probs
        resultVector = DVU.fromList results
        -- Distance between the loess fit at x and the diagonal (value x).
        deviation x = abs (loess probVector resultVector x - x)
 -- | Score used when the probabilities cover (almost) a single value:
 -- one minus the absolute difference between the mean probability and the
 -- mean result.
 handleNarrowBand :: [Double] -> [Double] -> Double
 handleNarrowBand results probs = 1.0 - abs (meanProb - meanResult)
  where meanProb = mean probs
        meanResult = mean results
--- a/src/Data/Statistics/Loess.hs
+++ b/src/Data/Statistics/Loess.hs
@ -0,0 +1,21 @@
 module Data.Statistics.Loess
   (loess) where
 import qualified Statistics.Matrix.Types as SMT
 import Statistics.Regression (ols)
 import Data.Vector.Unboxed((!), zipWith, length, (++), map)
 import Statistics.Matrix(transpose)
 -- | Tricube weight: @(1 - |d|^3)^3@ for @|d| < 1@ and 0 otherwise.
 --
 -- The clamp matters: without it the expression turns negative and grows in
 -- magnitude for @|d| > 1@, so points far from the evaluation point would
 -- receive non-zero (and increasing) weight.
 triCube :: Double -> Double
 triCube d
  | distance >= 1.0 = 0.0
  | otherwise = (1.0 - distance ** 3) ** 3
  where distance = abs d
 -- | Evaluate a locally weighted straight-line fit at @x@.
 --
 -- Every input point gets the weight @triCube (x - input)@. Both the
 -- design-matrix columns (the weights themselves and the weighted inputs)
 -- and the outputs are multiplied by these weights before being handed to
 -- 'ols'; the returned value is @slope * x + intercept@.
 loess :: SMT.Vector -> SMT.Vector -> Double -> Double
 loess inputs outputs x = slope * x + intercept
  where coefficients = ols design weightedOutputs
        -- Coefficient order follows the column order of the design matrix:
        -- index 0 belongs to the weight column, index 1 to the weighted inputs.
        intercept = coefficients ! 0
        slope = coefficients ! 1
        kernel = Data.Vector.Unboxed.map (triCube . (x -)) inputs
        weightedOutputs = Data.Vector.Unboxed.zipWith (*) outputs kernel
        weightedInputs = Data.Vector.Unboxed.zipWith (*) inputs kernel
        sampleCount = Data.Vector.Unboxed.length inputs
        -- Built as a 2 x n matrix from the concatenated vectors, then transposed.
        -- NOTE(review): 1000 is the third field of SMT.Matrix - confirm its
        -- meaning against the statistics version in use.
        design = transpose (SMT.Matrix 2 sampleCount 1000 (kernel Data.Vector.Unboxed.++ weightedInputs))
--- a/stack.yaml
+++ b/stack.yaml
@ -1,5 +1,5 @@
 flags: {}
 packages:
 - '.'
-extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1]
+extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1]
 resolver: lts-11.9
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -54,6 +54,10 @@ import qualified Data.Conduit.Combinators as CC
 import Statistics.Distribution (cumulative)
 import Statistics.Distribution.Normal (normalDistr)
 import Data.Statistics.Kendall (kendall, kendallZ)
 import qualified Data.Vector.Unboxed as DVU
 import qualified Statistics.Matrix.Types as SMT
 import Data.Statistics.Loess (loess)
 import Data.Statistics.Calibration (calibration)
 informationRetrievalBookExample :: [(String, Int)]
 informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
@ -564,7 +568,29 @@ main = hspec $ do
      kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
    it "p-value" $ do
      (2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
-
+  describe "Loess" $ do
    it "simple" $ do
      loess (DVU.fromList [0.2, 0.6, 1.0])
            (DVU.fromList [-0.6, 0.2, 1.0])
            0.4 `shouldBeAlmost` (-0.2)
  describe "Calibration" $ do
    it "empty list" $ do
      calibration [] [] `shouldBeAlmost` 1.0
    it "one element" $ do
      calibration [True] [1.0] `shouldBeAlmost` 1.0
      calibration [False] [0.0] `shouldBeAlmost` 1.0
      calibration [True] [0.0] `shouldBeAlmost` 0.0
      calibration [False] [1.0] `shouldBeAlmost` 0.0
      calibration [True] [0.7] `shouldBeAlmost` 0.7
      calibration [True] [0.3] `shouldBeAlmost` 0.3
      calibration [False] [0.7] `shouldBeAlmost` 0.3
      calibration [False] [0.3] `shouldBeAlmost` 0.7
    it "perfect calibration" $ do
      calibration [True, True, False] [0.5, 1.0, 0.5] `shouldBeAlmost` 1.0
    it "totally wrong" $ do
      calibration [True, False] [0.0, 1.0] `shouldBeAlmost` 0.0
      calibration [True, False, False, True, False] [0.0, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
      calibration [False, True, True, True, True, False, False, True, False] [0.25, 0.25, 0.0, 0.25, 0.25, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
 checkConduitPure conduit inList expList = do
  let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList