diff --git a/geval.cabal b/geval.cabal index 256f1a1..dd82c79 100644 --- a/geval.cabal +++ b/geval.cabal @@ -41,6 +41,8 @@ library , Text.WordShape , Data.Statistics.Kendall , GEval.Selector + , Data.Statistics.Loess + , Data.Statistics.Calibration , Paths_geval build-depends: base >= 4.7 && < 5 , cond @@ -85,6 +87,8 @@ library , vector-algorithms , aeson , aeson-pretty + , numeric-tools + , integration default-language: Haskell2010 executable geval diff --git a/src/Data/Statistics/Calibration.hs b/src/Data/Statistics/Calibration.hs new file mode 100644 index 0000000..a4b8a31 --- /dev/null +++ b/src/Data/Statistics/Calibration.hs @@ -0,0 +1,46 @@ +module Data.Statistics.Calibration + (calibration, softCalibration) where + +import Data.Statistics.Loess(loess) +import Numeric.Tools.Integration +import Numeric.Integration.TanhSinh +import Data.List(minimum, maximum) +import qualified Data.Vector.Unboxed as DVU + +minBand :: Double +minBand = 0.001 + +bool2Double :: Bool -> Double +bool2Double True = 1.0 +bool2Double False = 0.0 + +mean :: [Double] -> Double +mean results = (sum results) / (fromIntegral n) + where n = length results + +band :: [Double] -> Double +band xs = (maximum xs) - (minimum xs) + +calibration :: [Bool] -> [Double] -> Double +calibration results probs = softCalibration results' probs + where results' = map bool2Double results + +integrate :: (Double, Double) -> (Double -> Double) -> Double +integrate (a, b) fun = case simpson fun a b of + (r:_) -> result r + +softCalibration :: [Double] -> [Double] -> Double +softCalibration [] [] = 1.0 +softCalibration [] _ = error "too few booleans in calibration" +softCalibration _ [] = error "too few probabilities in calibration" +softCalibration results probs + | band probs < minBand = handleNarrowBand results probs + | otherwise = 1.0 - (min 1.0 (2.0 * (highest - lowest) * (integrate (lowest, highest) (\x -> abs ((loess (DVU.fromList probs) (DVU.fromList results) x) - x))))) + where lowest = minimum probs + highest = maximum probs + +handleNarrowBand :: [Double] -> [Double] -> Double +handleNarrowBand results probs = 1.0 - deviation + where deviation = abs (g - t) + g = mean probs + t = mean results diff --git a/src/Data/Statistics/Loess.hs b/src/Data/Statistics/Loess.hs new file mode 100644 index 0000000..96afa10 --- /dev/null +++ b/src/Data/Statistics/Loess.hs @@ -0,0 +1,21 @@ +module Data.Statistics.Loess + (loess) where + +import qualified Statistics.Matrix.Types as SMT +import Statistics.Regression (ols) +import Data.Vector.Unboxed((!), zipWith, length, (++), map) +import Statistics.Matrix(transpose) + + +triCube :: Double -> Double +triCube d = (1.0 - (abs d) ** 3) ** 3 + +loess :: SMT.Vector -> SMT.Vector -> Double -> Double +loess inputs outputs x = a * x + b + where a = params ! 1 + b = params ! 0 + params = ols inputMatrix scaledOutputs + weights = Data.Vector.Unboxed.map (\v -> triCube (x - v)) inputs + scaledOutputs = Data.Vector.Unboxed.zipWith (*) outputs weights + scaledInputs = Data.Vector.Unboxed.zipWith (*) inputs weights + inputMatrix = transpose (SMT.Matrix 2 (Data.Vector.Unboxed.length inputs) 1000 (weights Data.Vector.Unboxed.++ scaledInputs)) diff --git a/stack.yaml b/stack.yaml index 572ce27..2cf63ca 100644 --- a/stack.yaml +++ b/stack.yaml @@ -1,5 +1,5 @@ flags: {} packages: - '.' -extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1] +extra-deps: [murmur3-1.0.3,naturalcomp-0.0.3,Munkres-0.1,numeric-tools-0.2.0.1] resolver: lts-11.9 diff --git a/test/Spec.hs b/test/Spec.hs index 6c50f1f..024ac28 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -54,6 +54,10 @@ import qualified Data.Conduit.Combinators as CC import Statistics.Distribution (cumulative) import Statistics.Distribution.Normal (normalDistr) import Data.Statistics.Kendall (kendall, kendallZ) +import qualified Data.Vector.Unboxed as DVU +import qualified Statistics.Matrix.Types as SMT +import Data.Statistics.Loess (loess) +import Data.Statistics.Calibration (calibration) informationRetrievalBookExample :: [(String, Int)] informationRetrievalBookExample = [("o", 2), ("o", 2), ("d", 2), ("x", 3), ("d", 3), @@ -564,7 +568,29 @@ main = hspec $ do kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]) `shouldBeAlmost` (-1.0742) it "p-value" $ do (2 * (cumulative (normalDistr 0.0 1.0) $ kendallZ (V.fromList $ Prelude.zip [12, 2, 1, 12, 2] [1, 4, 7, 1, 0]))) `shouldBeAlmost` 0.2827 - + describe "Loess" $ do + it "simple" $ do + loess (DVU.fromList [0.2, 0.6, 1.0]) + (DVU.fromList [-0.6, 0.2, 1.0]) + 0.4 `shouldBeAlmost` (-0.2) + describe "Calibration" $ do + it "empty list" $ do + calibration [] [] `shouldBeAlmost` 1.0 + it "one element" $ do + calibration [True] [1.0] `shouldBeAlmost` 1.0 + calibration [False] [0.0] `shouldBeAlmost` 1.0 + calibration [True] [0.0] `shouldBeAlmost` 0.0 + calibration [False] [1.0] `shouldBeAlmost` 0.0 + calibration [True] [0.7] `shouldBeAlmost` 0.7 + calibration [True] [0.3] `shouldBeAlmost` 0.3 + calibration [False] [0.7] `shouldBeAlmost` 0.3 + calibration [False] [0.3] `shouldBeAlmost` 0.7 + it "perfect calibration" $ do + calibration [True, True, False] [0.5, 1.0, 0.5] `shouldBeAlmost` 1.0 + it "totally wrong" $ do + calibration [True, False] [0.0, 1.0] `shouldBeAlmost` 0.0 + calibration [True, False, False, True, False] [0.0, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0 + calibration [False, True, True, True, True, False, False, True, False] [0.25, 0.25, 0.0, 0.25, 0.25, 1.0, 1.0, 0.5, 0.5] `shouldBeAlmost` 0.0 checkConduitPure conduit inList expList = do let outList = runConduitPure $ CC.yieldMany inList .| conduit .| CC.sinkList