Implement auxiliary calibration function

2019-03-11 11:42:26 +01:00 · 2019-03-11 11:42:26 +01:00 · 8393bec3ae
commit 8393bec3ae
parent 19642db43f
5 changed files with 99 additions and 2 deletions
--- a/geval.cabal
+++ b/geval.cabal
@ -41,6 +41,8 @@ library
                     , Text.WordShape
                     , Data.Statistics.Kendall
                     , GEval.Selector
+                     , Data.Statistics.Loess
+                     , Data.Statistics.Calibration
                     , Paths_geval
  build-depends:       base >= 4.7 && < 5
                     , cond
@ -85,6 +87,8 @@ library
                     , vector-algorithms
                     , aeson
                     , aeson-pretty
+                     , numeric-tools
+                     , integration
  default-language:    Haskell2010

 executable geval
--- a/src/Data/Statistics/Calibration.hs
+++ b/src/Data/Statistics/Calibration.hs
@ -0,0 +1,46 @@
+module Data.Statistics.Calibration
+   (calibration, softCalibration) where
+
+import Data.Statistics.Loess(loess)
+import Numeric.Tools.Integration
+import Numeric.Integration.TanhSinh
+import Data.List(minimum, maximum)
+import qualified Data.Vector.Unboxed as DVU
+
+minBand :: Double
+minBand = 0.001
+
+bool2Double :: Bool -> Double
+bool2Double True = 1.0
+bool2Double False = 0.0
+
+mean :: [Double] -> Double
+mean results = (sum results) / (fromIntegral n)
+  where n = length results
+
+band :: [Double] -> Double
+band xs = (maximum xs) - (minimum xs)
+
+calibration :: [Bool] -> [Double] -> Double
+calibration results probs = softCalibration results' probs
+  where results' = map bool2Double results
+
+integrate :: (Double, Double) -> (Double -> Double) -> Double
+integrate (a, b) fun = case simpson fun a b of
+  (r:_) -> result r
+
+softCalibration :: [Double] -> [Double] -> Double
+softCalibration [] [] = 1.0
+softCalibration [] _ = error "too few booleans in calibration"
+softCalibration _ [] = error "too few probabilities in calibration"
+softCalibration results probs
+  | band probs < minBand = handleNarrowBand results probs
+  | otherwise = 1.0 - (min 1.0 (2.0 * (highest - lowest) * (integrate (lowest, highest) (\x -> abs ((loess (DVU.fromList probs) (DVU.fromList results) x) - x)))))
+  where lowest = minimum probs
+        highest = maximum probs
+
+handleNarrowBand :: [Double] -> [Double] -> Double
+handleNarrowBand results probs = 1.0 - deviation
+  where deviation = abs (g - t)
+        g = mean probs
+        t = mean results
--- a/src/Data/Statistics/Loess.hs
+++ b/src/Data/Statistics/Loess.hs
@ -0,0 +1,21 @@
+module Data.Statistics.Loess
+   (loess) where
+
+import qualified Statistics.Matrix.Types as SMT
+import Statistics.Regression (ols)
+import Data.Vector.Unboxed((!), zipWith, length, (++), map)
+import Statistics.Matrix(transpose)
+
+
+triCube :: Double -> Double
+triCube d = (1.0 - (abs d) ** 3) ** 3
+
+loess :: SMT.Vector -> SMT.Vector -> Double -> Double
+loess inputs outputs x = a * x + b
+  where a = params ! 1
+        b = params ! 0
+        params = ols inputMatrix scaledOutputs
+        weights = Data.Vector.Unboxed.map (\v -> triCube (x - v)) inputs
+        scaledOutputs = Data.Vector.Unboxed.zipWith (*) outputs weights
+        scaledInputs = Data.Vector.Unboxed.zipWith (*) inputs weights
+        inputMatrix = transpose (SMT.Matrix 2 (Data.Vector.Unboxed.length inputs) 1000 (weights Data.Vector.Unboxed.++ scaledInputs))
--- a/stack.yaml
+++ b/stack.yaml
@ -1,5 +1,5 @@
 flags: {}
 packages:
 - '.'
-extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1]
+extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1]
 resolver: lts-11.9
--- a/test/Spec.hs
+++ b/test/Spec.hs
@ -54,6 +54,10 @@ import qualified Data.Conduit.Combinators as CC
 import Statistics.Distribution (cumulative)
 import Statistics.Distribution.Normal (normalDistr)
 import Data.Statistics.Kendall (kendall, kendallZ)
+import qualified Data.Vector.Unboxed as DVU
+import qualified Statistics.Matrix.Types as SMT
+import Data.Statistics.Loess (loess)
+import Data.Statistics.Calibration (calibration)

 informationRetrievalBookExample :: [(String, Int)]
 informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3),
@ -564,7 +568,29 @@ main = hspec $ do
      kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742)
    it "p-value" $ do
      (2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827
-
+  describe "Loess" $ do
+    it "simple" $ do
+      loess (DVU.fromList [0.2, 0.6, 1.0])
+            (DVU.fromList [-0.6, 0.2, 1.0])
+            0.4 `shouldBeAlmost` (-0.2)
+  describe "Calibration" $ do
+    it "empty list" $ do
+      calibration [] [] `shouldBeAlmost` 1.0
+    it "one element" $ do
+      calibration [True] [1.0] `shouldBeAlmost` 1.0
+      calibration [False] [0.0] `shouldBeAlmost` 1.0
+      calibration [True] [0.0] `shouldBeAlmost` 0.0
+      calibration [False] [1.0] `shouldBeAlmost` 0.0
+      calibration [True] [0.7] `shouldBeAlmost` 0.7
+      calibration [True] [0.3] `shouldBeAlmost` 0.3
+      calibration [False] [0.7] `shouldBeAlmost` 0.3
+      calibration [False] [0.3] `shouldBeAlmost` 0.7
+    it "perfect calibration" $ do
+      calibration [True, True, False] [0.5, 1.0, 0.5] `shouldBeAlmost` 1.0
+    it "totally wrong" $ do
+      calibration [True, False] [0.0, 1.0] `shouldBeAlmost` 0.0
+      calibration [True, False, False, True, False] [0.0, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0
+      calibration [False, True, True, True, True, False, False, True, False] [0.25, 0.25, 0.0, 0.25, 0.25, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0

 checkConduitPure conduit inList expList = do
  let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList