import threading
import pickle
import pytest
from copy import deepcopy
import platform
import sys
import math
import numpy as np
from numpy.testing import assert_allclose, assert_equal, suppress_warnings
from numpy.lib import NumpyVersion
from scipy.stats.sampling import (
TransformedDensityRejection,
DiscreteAliasUrn,
DiscreteGuideTable,
NumericalInversePolynomial,
NumericalInverseHermite,
SimpleRatioUniforms,
UNURANError
)
from scipy import stats
from scipy import special
from scipy.stats import chisquare, cramervonmises
from scipy.stats._distr_params import distdiscrete, distcont
from scipy._lib._util import check_random_state
# common test data: this data can be shared between all the tests.
# Normal distribution shared between all the continuous methods
class StandardNormal:
def pdf(self, x):
# normalization constant needed for NumericalInverseHermite
return 1./np.sqrt(2.*np.pi) * np.exp(-0.5 * x*x)
def dpdf(self, x):
return 1./np.sqrt(2.*np.pi) * -x * np.exp(-0.5 * x*x)
def cdf(self, x):
return special.ndtr(x)
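# Illustrative usage of the generators exercised in this file (sketch only;
# the distribution and seed here are arbitrary):
#
#     rng = NumericalInversePolynomial(StandardNormal(), random_state=123)
#     samples = rng.rvs(100)                # 100 variates from the target PDF
#     quantiles = rng.ppf([0.1, 0.5, 0.9])  # numerical inverse of the CDF
#
# The other generators follow the same ``rvs`` pattern but require different
# inputs (e.g. ``dpdf`` for TransformedDensityRejection, a probability vector
# or ``pmf`` for the discrete methods).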
all_methods = [
("TransformedDensityRejection", {"dist": StandardNormal()}),
("DiscreteAliasUrn", {"dist": [0.02, 0.18, 0.8]}),
("DiscreteGuideTable", {"dist": [0.02, 0.18, 0.8]}),
("NumericalInversePolynomial", {"dist": StandardNormal()}),
("NumericalInverseHermite", {"dist": StandardNormal()}),
("SimpleRatioUniforms", {"dist": StandardNormal(), "mode": 0})
]
if (sys.implementation.name == 'pypy'
and sys.implementation.version < (7, 3, 10)):
# changed in PyPy for v7.3.10
floaterr = r"unsupported operand type for float\(\): 'list'"
else:
floaterr = r"must be real number, not list"
# Make sure an internal error occurs in UNU.RAN when invalid callbacks are
# passed. Moreover, different generators throw different error messages.
# So, in case of an `UNURANError`, we do not validate the error message.
bad_pdfs_common = [
# Negative PDF
(lambda x: -x, UNURANError, r"..."),
# Returning wrong type
(lambda x: [], TypeError, floaterr),
# Undefined name inside the function
(lambda x: foo, NameError, r"name 'foo' is not defined"), # type: ignore[name-defined] # noqa
# Infinite value returned => Overflow error.
(lambda x: np.inf, UNURANError, r"..."),
# NaN value => internal error in UNU.RAN
(lambda x: np.nan, UNURANError, r"..."),
# signature of PDF wrong
(lambda: 1.0, TypeError, r"takes 0 positional arguments but 1 was given")
]
# same approach for dpdf
bad_dpdf_common = [
# Infinite value returned.
(lambda x: np.inf, UNURANError, r"..."),
# NaN value => internal error in UNU.RAN
(lambda x: np.nan, UNURANError, r"..."),
# Returning wrong type
(lambda x: [], TypeError, floaterr),
# Undefined name inside the function
(lambda x: foo, NameError, r"name 'foo' is not defined"), # type: ignore[name-defined] # noqa
# signature of dPDF wrong
(lambda: 1.0, TypeError, r"takes 0 positional arguments but 1 was given")
]
# same approach for logpdf
bad_logpdfs_common = [
# Returning wrong type
(lambda x: [], TypeError, floaterr),
# Undefined name inside the function
(lambda x: foo, NameError, r"name 'foo' is not defined"), # type: ignore[name-defined] # noqa
# Infinite value returned => Overflow error.
(lambda x: np.inf, UNURANError, r"..."),
# NaN value => internal error in UNU.RAN
(lambda x: np.nan, UNURANError, r"..."),
# signature of logpdf wrong
(lambda: 1.0, TypeError, r"takes 0 positional arguments but 1 was given")
]
bad_pv_common = [
([], r"must contain at least one element"),
([[1.0, 0.0]], r"wrong number of dimensions \(expected 1, got 2\)"),
([0.2, 0.4, np.nan, 0.8], r"must contain only finite / non-nan values"),
([0.2, 0.4, np.inf, 0.8], r"must contain only finite / non-nan values"),
([0.0, 0.0], r"must contain at least one non-zero value"),
]
# size of the domains is incorrect
bad_sized_domains = [
# > 2 elements in the domain
((1, 2, 3), ValueError, r"must be a length 2 tuple"),
# empty domain
((), ValueError, r"must be a length 2 tuple")
]
# domain values are incorrect
bad_domains = [
((2, 1), UNURANError, r"left >= right"),
((1, 1), UNURANError, r"left >= right"),
]
# infinite and nan values present in domain.
inf_nan_domains = [
# left >= right
((10, 10), UNURANError, r"left >= right"),
((np.inf, np.inf), UNURANError, r"left >= right"),
((-np.inf, -np.inf), UNURANError, r"left >= right"),
((np.inf, -np.inf), UNURANError, r"left >= right"),
# Also include nans in some of the domains.
((-np.inf, np.nan), ValueError, r"only non-nan values"),
((np.nan, np.inf), ValueError, r"only non-nan values")
]
# `nan` values present in domain. Some distributions don't support
# infinite tails, so don't mix the nan values with infinities.
nan_domains = [
((0, np.nan), ValueError, r"only non-nan values"),
((np.nan, np.nan), ValueError, r"only non-nan values")
]
# all the methods should throw errors for nan, bad sized, and bad valued
# domains.
@pytest.mark.parametrize("domain, err, msg",
bad_domains + bad_sized_domains +
nan_domains) # type: ignore[operator]
@pytest.mark.parametrize("method, kwargs", all_methods)
def test_bad_domain(domain, err, msg, method, kwargs):
Method = getattr(stats.sampling, method)
with pytest.raises(err, match=msg):
Method(**kwargs, domain=domain)
@pytest.mark.parametrize("method, kwargs", all_methods)
def test_random_state(method, kwargs):
Method = getattr(stats.sampling, method)
# simple seed that works for any version of NumPy
seed = 123
rng1 = Method(**kwargs, random_state=seed)
rng2 = Method(**kwargs, random_state=seed)
assert_equal(rng1.rvs(100), rng2.rvs(100))
# global seed
np.random.seed(123)
rng1 = Method(**kwargs)
rvs1 = rng1.rvs(100)
np.random.seed(None)
rng2 = Method(**kwargs, random_state=123)
rvs2 = rng2.rvs(100)
assert_equal(rvs1, rvs2)
# Generator seed for new NumPy
# when a RandomState is given, it should take the bitgen_t
# member of the class and create a Generator instance.
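    # i.e. the two constructions below should produce identical streams, since
    # both ultimately draw from the same MT19937 bit generator seeded with 123.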
seed1 = np.random.RandomState(np.random.MT19937(123))
seed2 = np.random.Generator(np.random.MT19937(123))
rng1 = Method(**kwargs, random_state=seed1)
rng2 = Method(**kwargs, random_state=seed2)
assert_equal(rng1.rvs(100), rng2.rvs(100))
def test_set_random_state():
rng1 = TransformedDensityRejection(StandardNormal(), random_state=123)
rng2 = TransformedDensityRejection(StandardNormal())
rng2.set_random_state(123)
assert_equal(rng1.rvs(100), rng2.rvs(100))
rng = TransformedDensityRejection(StandardNormal(), random_state=123)
rvs1 = rng.rvs(100)
rng.set_random_state(123)
rvs2 = rng.rvs(100)
assert_equal(rvs1, rvs2)
def test_threading_behaviour():
# Test if the API is thread-safe.
# This verifies if the lock mechanism and the use of `PyErr_Occurred`
# is correct.
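    # Each thread builds its own generator whose PDF raises a distinct
    # message; the assertions below check that an exception raised inside one
    # thread's callback is delivered to that same thread and not to the other.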
errors = {"err1": None, "err2": None}
class Distribution:
def __init__(self, pdf_msg):
self.pdf_msg = pdf_msg
def pdf(self, x):
if 49.9 < x < 50.0:
raise ValueError(self.pdf_msg)
return x
def dpdf(self, x):
return 1
def func1():
dist = Distribution('foo')
rng = TransformedDensityRejection(dist, domain=(10, 100),
random_state=12)
try:
rng.rvs(100000)
except ValueError as e:
errors['err1'] = e.args[0]
def func2():
dist = Distribution('bar')
rng = TransformedDensityRejection(dist, domain=(10, 100),
random_state=2)
try:
rng.rvs(100000)
except ValueError as e:
errors['err2'] = e.args[0]
t1 = threading.Thread(target=func1)
t2 = threading.Thread(target=func2)
t1.start()
t2.start()
t1.join()
t2.join()
assert errors['err1'] == 'foo'
assert errors['err2'] == 'bar'
@pytest.mark.parametrize("method, kwargs", all_methods)
def test_pickle(method, kwargs):
Method = getattr(stats.sampling, method)
rng1 = Method(**kwargs, random_state=123)
obj = pickle.dumps(rng1)
rng2 = pickle.loads(obj)
assert_equal(rng1.rvs(100), rng2.rvs(100))
@pytest.mark.parametrize("size", [None, 0, (0, ), 1, (10, 3), (2, 3, 4, 5),
(0, 0), (0, 1)])
def test_rvs_size(size):
# As the `rvs` method is present in the base class and shared between
# all the classes, we can just test with one of the methods.
rng = TransformedDensityRejection(StandardNormal())
if size is None:
assert np.isscalar(rng.rvs(size))
else:
if np.isscalar(size):
size = (size, )
assert rng.rvs(size).shape == size
def test_with_scipy_distribution():
# test if the setup works with SciPy's rv_frozen distributions
dist = stats.norm()
urng = np.random.default_rng(0)
rng = NumericalInverseHermite(dist, random_state=urng)
u = np.linspace(0, 1, num=100)
check_cont_samples(rng, dist, dist.stats())
assert_allclose(dist.ppf(u), rng.ppf(u))
# test if it works with `loc` and `scale`
dist = stats.norm(loc=10., scale=5.)
rng = NumericalInverseHermite(dist, random_state=urng)
check_cont_samples(rng, dist, dist.stats())
assert_allclose(dist.ppf(u), rng.ppf(u))
# check for discrete distributions
dist = stats.binom(10, 0.2)
rng = DiscreteAliasUrn(dist, random_state=urng)
domain = dist.support()
pv = dist.pmf(np.arange(domain[0], domain[1]+1))
check_discr_samples(rng, pv, dist.stats())
def check_cont_samples(rng, dist, mv_ex):
rvs = rng.rvs(100000)
mv = rvs.mean(), rvs.var()
# test the moments only if the variance is finite
if np.isfinite(mv_ex[1]):
assert_allclose(mv, mv_ex, rtol=1e-7, atol=1e-1)
    # Cramér-von Mises test for goodness-of-fit
rvs = rng.rvs(500)
dist.cdf = np.vectorize(dist.cdf)
pval = cramervonmises(rvs, dist.cdf).pvalue
assert pval > 0.1
def check_discr_samples(rng, pv, mv_ex):
rvs = rng.rvs(100000)
# test if the first few moments match
mv = rvs.mean(), rvs.var()
assert_allclose(mv, mv_ex, rtol=1e-3, atol=1e-1)
# normalize
pv = pv / pv.sum()
# chi-squared test for goodness-of-fit
obs_freqs = np.zeros_like(pv)
_, freqs = np.unique(rvs, return_counts=True)
freqs = freqs / freqs.sum()
obs_freqs[:freqs.size] = freqs
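    # note: this indexing assumes that the observed values are the leading
    # support points in sorted order (np.unique returns them sorted)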
pval = chisquare(obs_freqs, pv).pvalue
assert pval > 0.1
def test_warning_center_not_in_domain():
    # UNU.RAN warns if the center (either the one provided or the one computed
    # without taking the domain into account) lies outside of the domain
msg = "102 : center moved into domain of distribution"
with pytest.warns(RuntimeWarning, match=msg):
NumericalInversePolynomial(StandardNormal(), center=0, domain=(3, 5))
with pytest.warns(RuntimeWarning, match=msg):
NumericalInversePolynomial(StandardNormal(), domain=(3, 5))
@pytest.mark.parametrize('method', ["SimpleRatioUniforms",
"NumericalInversePolynomial",
"TransformedDensityRejection"])
def test_error_mode_not_in_domain(method):
    # UNU.RAN raises an error if the mode is not in the domain.
    # This differs from the center case above: the mode must be the exact
    # location of the peak, whereas the center may only be an approximation.
Method = getattr(stats.sampling, method)
msg = "17 : mode not in domain"
with pytest.raises(UNURANError, match=msg):
Method(StandardNormal(), mode=0, domain=(3, 5))
@pytest.mark.parametrize('method', ["NumericalInverseHermite",
"NumericalInversePolynomial"])
class TestQRVS:
def test_input_validation(self, method):
match = "`qmc_engine` must be an instance of..."
with pytest.raises(ValueError, match=match):
Method = getattr(stats.sampling, method)
gen = Method(StandardNormal())
gen.qrvs(qmc_engine=0)
# issues with QMCEngines and old NumPy
Method = getattr(stats.sampling, method)
gen = Method(StandardNormal())
match = "`d` must be consistent with dimension of `qmc_engine`."
with pytest.raises(ValueError, match=match):
gen.qrvs(d=3, qmc_engine=stats.qmc.Halton(2))
qrngs = [None, stats.qmc.Sobol(1, seed=0), stats.qmc.Halton(3, seed=0)]
# `size=None` should not add anything to the shape, `size=1` should
sizes = [(None, tuple()), (1, (1,)), (4, (4,)),
((4,), (4,)), ((2, 4), (2, 4))] # type: ignore
# Neither `d=None` nor `d=1` should add anything to the shape
ds = [(None, tuple()), (1, tuple()), (3, (3,))]
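    # For example, `size=(2, 4)` with a 3-dimensional engine should give
    # `qrvs.shape == (2, 4, 3)`; the expected shape below is built as
    # `size_out + d_out`.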
@pytest.mark.parametrize('qrng', qrngs)
@pytest.mark.parametrize('size_in, size_out', sizes)
@pytest.mark.parametrize('d_in, d_out', ds)
def test_QRVS_shape_consistency(self, qrng, size_in, size_out,
d_in, d_out, method):
w32 = sys.platform == "win32" and platform.architecture()[0] == "32bit"
if w32 and method == "NumericalInversePolynomial":
pytest.xfail("NumericalInversePolynomial.qrvs fails for Win "
"32-bit")
dist = StandardNormal()
Method = getattr(stats.sampling, method)
gen = Method(dist)
# If d and qrng.d are inconsistent, an error is raised
if d_in is not None and qrng is not None and qrng.d != d_in:
match = "`d` must be consistent with dimension of `qmc_engine`."
with pytest.raises(ValueError, match=match):
gen.qrvs(size_in, d=d_in, qmc_engine=qrng)
return
# Sometimes d is really determined by qrng
if d_in is None and qrng is not None and qrng.d != 1:
d_out = (qrng.d,)
shape_expected = size_out + d_out
qrng2 = deepcopy(qrng)
qrvs = gen.qrvs(size=size_in, d=d_in, qmc_engine=qrng)
if size_in is not None:
assert qrvs.shape == shape_expected
if qrng2 is not None:
uniform = qrng2.random(np.prod(size_in) or 1)
qrvs2 = stats.norm.ppf(uniform).reshape(shape_expected)
assert_allclose(qrvs, qrvs2, atol=1e-12)
def test_QRVS_size_tuple(self, method):
# QMCEngine samples are always of shape (n, d). When `size` is a tuple,
# we set `n = prod(size)` in the call to qmc_engine.random, transform
# the sample, and reshape it to the final dimensions. When we reshape,
# we need to be careful, because the _columns_ of the sample returned
# by a QMCEngine are "independent"-ish, but the elements within the
# columns are not. We need to make sure that this doesn't get mixed up
# by reshaping: qrvs[..., i] should remain "independent"-ish of
# qrvs[..., i+1], but the elements within qrvs[..., i] should be
# transformed from the same low-discrepancy sequence.
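        # Concretely, with size=(3, 4) and d=5 below, the engine draws
        # prod(size) = 12 points of dimension 5; after the PPF transform the
        # flat (12, 5) sample is reshaped to (3, 4, 5), so qrvs[..., i] is
        # column i of the flat sample reshaped to (3, 4).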
dist = StandardNormal()
Method = getattr(stats.sampling, method)
gen = Method(dist)
size = (3, 4)
d = 5
qrng = stats.qmc.Halton(d, seed=0)
qrng2 = stats.qmc.Halton(d, seed=0)
uniform = qrng2.random(np.prod(size))
qrvs = gen.qrvs(size=size, d=d, qmc_engine=qrng)
qrvs2 = stats.norm.ppf(uniform)
for i in range(d):
sample = qrvs[..., i]
sample2 = qrvs2[:, i].reshape(size)
assert_allclose(sample, sample2, atol=1e-12)
class TestTransformedDensityRejection:
# Simple Custom Distribution
class dist0:
def pdf(self, x):
return 3/4 * (1-x*x)
def dpdf(self, x):
return 3/4 * (-2*x)
def cdf(self, x):
return 3/4 * (x - x**3/3 + 2/3)
def support(self):
return -1, 1
    # Normal distribution with standard deviation 0.1 (unnormalized PDF)
class dist1:
def pdf(self, x):
return stats.norm._pdf(x / 0.1)
def dpdf(self, x):
return -x / 0.01 * stats.norm._pdf(x / 0.1)
def cdf(self, x):
return stats.norm._cdf(x / 0.1)
    # PDF whose transformed density is piecewise linear under
    # T(x) = -1/sqrt(x), with a location shift. Taken from UNU.RAN test suite
    # (from file t_tdr_ps.c)
class dist2:
def __init__(self, shift):
self.shift = shift
def pdf(self, x):
x -= self.shift
y = 1. / (abs(x) + 1.)
return 0.5 * y * y
def dpdf(self, x):
x -= self.shift
y = 1. / (abs(x) + 1.)
y = y * y * y
return y if (x < 0.) else -y
def cdf(self, x):
x -= self.shift
if x <= 0.:
return 0.5 / (1. - x)
else:
return 1. - 0.5 / (1. + x)
dists = [dist0(), dist1(), dist2(0.), dist2(10000.)]
# exact mean and variance of the distributions in the list dists
    mv0 = [0., 1./5.]  # variance of 3/4*(1 - x**2) on [-1, 1] is 1/5
mv1 = [0., 0.01]
mv2 = [0., np.inf]
mv3 = [10000., np.inf]
mvs = [mv0, mv1, mv2, mv3]
@pytest.mark.parametrize("dist, mv_ex",
zip(dists, mvs))
def test_basic(self, dist, mv_ex):
with suppress_warnings() as sup:
# filter the warnings thrown by UNU.RAN
sup.filter(RuntimeWarning)
rng = TransformedDensityRejection(dist, random_state=42)
check_cont_samples(rng, dist, mv_ex)
# PDF 0 everywhere => bad construction points
bad_pdfs = [(lambda x: 0, UNURANError, r"50 : bad construction points.")]
bad_pdfs += bad_pdfs_common # type: ignore[arg-type]
@pytest.mark.parametrize("pdf, err, msg", bad_pdfs)
def test_bad_pdf(self, pdf, err, msg):
class dist:
pass
dist.pdf = pdf
dist.dpdf = lambda x: 1 # an arbitrary dPDF
with pytest.raises(err, match=msg):
TransformedDensityRejection(dist)
@pytest.mark.parametrize("dpdf, err, msg", bad_dpdf_common)
def test_bad_dpdf(self, dpdf, err, msg):
class dist:
pass
dist.pdf = lambda x: x
dist.dpdf = dpdf
with pytest.raises(err, match=msg):
TransformedDensityRejection(dist, domain=(1, 10))
# test domains with inf + nan in them. need to write a custom test for
# this because not all methods support infinite tails.
@pytest.mark.parametrize("domain, err, msg", inf_nan_domains)
def test_inf_nan_domains(self, domain, err, msg):
with pytest.raises(err, match=msg):
TransformedDensityRejection(StandardNormal(), domain=domain)
@pytest.mark.parametrize("construction_points", [-1, 0, 0.1])
def test_bad_construction_points_scalar(self, construction_points):
with pytest.raises(ValueError, match=r"`construction_points` must be "
r"a positive integer."):
TransformedDensityRejection(
StandardNormal(), construction_points=construction_points
)
def test_bad_construction_points_array(self):
# empty array
construction_points = []
with pytest.raises(ValueError, match=r"`construction_points` must "
r"either be a "
r"scalar or a non-empty array."):
TransformedDensityRejection(
StandardNormal(), construction_points=construction_points
)
# construction_points not monotonically increasing
construction_points = [1, 1, 1, 1, 1, 1]
with pytest.warns(RuntimeWarning, match=r"33 : starting points not "
r"strictly monotonically "
r"increasing"):
TransformedDensityRejection(
StandardNormal(), construction_points=construction_points
)
# construction_points containing nans
construction_points = [np.nan, np.nan, np.nan]
with pytest.raises(UNURANError, match=r"50 : bad construction "
r"points."):
TransformedDensityRejection(
StandardNormal(), construction_points=construction_points
)
# construction_points out of domain
construction_points = [-10, 10]
with pytest.warns(RuntimeWarning, match=r"50 : starting point out of "
r"domain"):
TransformedDensityRejection(
StandardNormal(), domain=(-3, 3),
construction_points=construction_points
)
@pytest.mark.parametrize("c", [-1., np.nan, np.inf, 0.1, 1.])
def test_bad_c(self, c):
msg = r"`c` must either be -0.5 or 0."
with pytest.raises(ValueError, match=msg):
            TransformedDensityRejection(StandardNormal(), c=c)
u = [np.linspace(0, 1, num=1000), [], [[]], [np.nan],
[-np.inf, np.nan, np.inf], 0,
[[np.nan, 0.5, 0.1], [0.2, 0.4, np.inf], [-2, 3, 4]]]
@pytest.mark.parametrize("u", u)
def test_ppf_hat(self, u):
# Increase the `max_squeeze_hat_ratio` so the ppf_hat is more
# accurate.
rng = TransformedDensityRejection(StandardNormal(),
max_squeeze_hat_ratio=0.9999)
# Older versions of NumPy throw RuntimeWarnings for comparisons
# with nan.
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in greater")
sup.filter(RuntimeWarning, "invalid value encountered in "
"greater_equal")
sup.filter(RuntimeWarning, "invalid value encountered in less")
sup.filter(RuntimeWarning, "invalid value encountered in "
"less_equal")
res = rng.ppf_hat(u)
expected = stats.norm.ppf(u)
assert_allclose(res, expected, rtol=1e-3, atol=1e-5)
assert res.shape == expected.shape
def test_bad_dist(self):
# Empty distribution
class dist:
...
msg = r"`pdf` required but not found."
with pytest.raises(ValueError, match=msg):
TransformedDensityRejection(dist)
# dPDF not present in dist
class dist:
pdf = lambda x: 1-x*x # noqa: E731
msg = r"`dpdf` required but not found."
with pytest.raises(ValueError, match=msg):
TransformedDensityRejection(dist)
class TestDiscreteAliasUrn:
# DAU fails on these probably because of large domains and small
# computation errors in PMF. Mean/SD match but chi-squared test fails.
basic_fail_dists = {
        'nchypergeom_fisher',  # numerical errors on tails
        'nchypergeom_wallenius',  # numerical errors on tails
'randint' # fails on 32-bit ubuntu
}
@pytest.mark.parametrize("distname, params", distdiscrete)
def test_basic(self, distname, params):
if distname in self.basic_fail_dists:
msg = ("DAU fails on these probably because of large domains "
"and small computation errors in PMF.")
pytest.skip(msg)
if not isinstance(distname, str):
dist = distname
else:
dist = getattr(stats, distname)
dist = dist(*params)
domain = dist.support()
if not np.isfinite(domain[1] - domain[0]):
# DAU only works with finite domain. So, skip the distributions
# with infinite tails.
pytest.skip("DAU only works with a finite domain.")
k = np.arange(domain[0], domain[1]+1)
pv = dist.pmf(k)
mv_ex = dist.stats('mv')
rng = DiscreteAliasUrn(dist, random_state=42)
check_discr_samples(rng, pv, mv_ex)
# Can't use bad_pmf_common here as we evaluate PMF early on to avoid
# unhelpful errors from UNU.RAN.
bad_pmf = [
# inf returned
(lambda x: np.inf, ValueError,
r"must contain only finite / non-nan values"),
# nan returned
(lambda x: np.nan, ValueError,
r"must contain only finite / non-nan values"),
# all zeros
(lambda x: 0.0, ValueError,
r"must contain at least one non-zero value"),
# Undefined name inside the function
(lambda x: foo, NameError, # type: ignore[name-defined] # noqa
r"name 'foo' is not defined"),
# Returning wrong type.
(lambda x: [], ValueError,
r"setting an array element with a sequence."),
# probabilities < 0
(lambda x: -x, UNURANError,
r"50 : probability < 0"),
# signature of PMF wrong
(lambda: 1.0, TypeError,
r"takes 0 positional arguments but 1 was given")
]
@pytest.mark.parametrize("pmf, err, msg", bad_pmf)
def test_bad_pmf(self, pmf, err, msg):
class dist:
pass
dist.pmf = pmf
with pytest.raises(err, match=msg):
DiscreteAliasUrn(dist, domain=(1, 10))
@pytest.mark.parametrize("pv", [[0.18, 0.02, 0.8],
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]])
def test_sampling_with_pv(self, pv):
pv = np.asarray(pv, dtype=np.float64)
rng = DiscreteAliasUrn(pv, random_state=123)
rvs = rng.rvs(100_000)
pv = pv / pv.sum()
variates = np.arange(0, len(pv))
# test if the first few moments match
m_expected = np.average(variates, weights=pv)
v_expected = np.average((variates - m_expected) ** 2, weights=pv)
mv_expected = m_expected, v_expected
check_discr_samples(rng, pv, mv_expected)
@pytest.mark.parametrize("pv, msg", bad_pv_common)
def test_bad_pv(self, pv, msg):
with pytest.raises(ValueError, match=msg):
DiscreteAliasUrn(pv)
# DAU doesn't support infinite tails. So, it should throw an error when
# inf is present in the domain.
inf_domain = [(-np.inf, np.inf), (np.inf, np.inf), (-np.inf, -np.inf),
(0, np.inf), (-np.inf, 0)]
@pytest.mark.parametrize("domain", inf_domain)
def test_inf_domain(self, domain):
with pytest.raises(ValueError, match=r"must be finite"):
DiscreteAliasUrn(stats.binom(10, 0.2), domain=domain)
def test_bad_urn_factor(self):
with pytest.warns(RuntimeWarning, match=r"relative urn size < 1."):
DiscreteAliasUrn([0.5, 0.5], urn_factor=-1)
def test_bad_args(self):
msg = (r"`domain` must be provided when the "
r"probability vector is not available.")
class dist:
def pmf(self, x):
return x
with pytest.raises(ValueError, match=msg):
DiscreteAliasUrn(dist)
class TestNumericalInversePolynomial:
# Simple Custom Distribution
class dist0:
def pdf(self, x):
return 3/4 * (1-x*x)
def cdf(self, x):
return 3/4 * (x - x**3/3 + 2/3)
def support(self):
return -1, 1
    # Normal distribution with standard deviation 0.1 (unnormalized PDF)
class dist1:
def pdf(self, x):
return stats.norm._pdf(x / 0.1)
def cdf(self, x):
return stats.norm._cdf(x / 0.1)
    # Sin 2 distribution
    #          /  0.05 + 0.45*(1 + sin(2 Pi x))   if |x| <= 1
    # f(x) = <
    #          \  0                               otherwise
    # Taken from UNU.RAN test suite (from file t_pinv.c)
class dist2:
def pdf(self, x):
return 0.05 + 0.45 * (1 + np.sin(2*np.pi*x))
def cdf(self, x):
return (0.05*(x + 1) +
0.9*(1. + 2.*np.pi*(1 + x) - np.cos(2.*np.pi*x)) /
(4.*np.pi))
def support(self):
return -1, 1
    # Sin 10 distribution
    #          /  0.2 * (0.05 + 0.45*(1 + sin(2 Pi x)))   if |x| <= 5
    # f(x) = <
    #          \  0                                       otherwise
    # Taken from UNU.RAN test suite (from file t_pinv.c)
class dist3:
def pdf(self, x):
return 0.2 * (0.05 + 0.45 * (1 + np.sin(2*np.pi*x)))
def cdf(self, x):
return x/10. + 0.5 + 0.09/(2*np.pi) * (np.cos(10*np.pi) -
np.cos(2*np.pi*x))
def support(self):
return -5, 5
dists = [dist0(), dist1(), dist2(), dist3()]
# exact mean and variance of the distributions in the list dists
    mv0 = [0., 1./5.]  # variance of 3/4*(1 - x**2) on [-1, 1] is 1/5
mv1 = [0., 0.01]
mv2 = [-0.45/np.pi, 2/3*0.5 - 0.45**2/np.pi**2]
mv3 = [-0.45/np.pi, 0.2 * 250/3 * 0.5 - 0.45**2/np.pi**2]
mvs = [mv0, mv1, mv2, mv3]
@pytest.mark.parametrize("dist, mv_ex",
zip(dists, mvs))
def test_basic(self, dist, mv_ex):
rng = NumericalInversePolynomial(dist, random_state=42)
check_cont_samples(rng, dist, mv_ex)
very_slow_dists = ['studentized_range', 'trapezoid', 'triang', 'vonmises',
'levy_stable', 'kappa4', 'ksone', 'kstwo', 'levy_l',
'gausshyper', 'anglit']
# for these distributions, some assertions fail due to minor
# numerical differences. They can be avoided either by changing
# the seed or by increasing the u_resolution.
fail_dists = ['ncf', 'pareto', 'chi2', 'fatiguelife', 'halfgennorm',
'gibrat', 'lognorm', 'ncx2', 't']
@pytest.mark.xslow
@pytest.mark.parametrize("distname, params", distcont)
def test_basic_all_scipy_dists(self, distname, params):
if distname in self.very_slow_dists:
pytest.skip(f"PINV too slow for {distname}")
if distname in self.fail_dists:
pytest.skip(f"PINV fails for {distname}")
dist = (getattr(stats, distname)
if isinstance(distname, str)
else distname)
dist = dist(*params)
with suppress_warnings() as sup:
sup.filter(RuntimeWarning)
rng = NumericalInversePolynomial(dist, random_state=42)
check_cont_samples(rng, dist, [dist.mean(), dist.var()])
@pytest.mark.parametrize("pdf, err, msg", bad_pdfs_common)
def test_bad_pdf(self, pdf, err, msg):
class dist:
pass
dist.pdf = pdf
with pytest.raises(err, match=msg):
NumericalInversePolynomial(dist, domain=[0, 5])
@pytest.mark.parametrize("logpdf, err, msg", bad_logpdfs_common)
def test_bad_logpdf(self, logpdf, err, msg):
class dist:
pass
dist.logpdf = logpdf
with pytest.raises(err, match=msg):
NumericalInversePolynomial(dist, domain=[0, 5])
# test domains with inf + nan in them. need to write a custom test for
# this because not all methods support infinite tails.
@pytest.mark.parametrize("domain, err, msg", inf_nan_domains)
def test_inf_nan_domains(self, domain, err, msg):
with pytest.raises(err, match=msg):
NumericalInversePolynomial(StandardNormal(), domain=domain)
u = [
# test if quantile 0 and 1 return -inf and inf respectively and check
# the correctness of the PPF for equidistant points between 0 and 1.
np.linspace(0, 1, num=10000),
# test the PPF method for empty arrays
[], [[]],
# test if nans and infs return nan result.
[np.nan], [-np.inf, np.nan, np.inf],
# test if a scalar is returned for a scalar input.
0,
# test for arrays with nans, values greater than 1 and less than 0,
# and some valid values.
[[np.nan, 0.5, 0.1], [0.2, 0.4, np.inf], [-2, 3, 4]]
]
@pytest.mark.parametrize("u", u)
def test_ppf(self, u):
dist = StandardNormal()
rng = NumericalInversePolynomial(dist, u_resolution=1e-14)
# Older versions of NumPy throw RuntimeWarnings for comparisons
# with nan.
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in greater")
sup.filter(RuntimeWarning, "invalid value encountered in "
"greater_equal")
sup.filter(RuntimeWarning, "invalid value encountered in less")
sup.filter(RuntimeWarning, "invalid value encountered in "
"less_equal")
res = rng.ppf(u)
expected = stats.norm.ppf(u)
assert_allclose(res, expected, rtol=1e-11, atol=1e-11)
assert res.shape == expected.shape
x = [np.linspace(-10, 10, num=10000), [], [[]], [np.nan],
[-np.inf, np.nan, np.inf], 0,
[[np.nan, 0.5, 0.1], [0.2, 0.4, np.inf], [-np.inf, 3, 4]]]
@pytest.mark.parametrize("x", x)
def test_cdf(self, x):
dist = StandardNormal()
rng = NumericalInversePolynomial(dist, u_resolution=1e-14)
# Older versions of NumPy throw RuntimeWarnings for comparisons
# with nan.
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in greater")
sup.filter(RuntimeWarning, "invalid value encountered in "
"greater_equal")
sup.filter(RuntimeWarning, "invalid value encountered in less")
sup.filter(RuntimeWarning, "invalid value encountered in "
"less_equal")
res = rng.cdf(x)
expected = stats.norm.cdf(x)
assert_allclose(res, expected, rtol=1e-11, atol=1e-11)
assert res.shape == expected.shape
def test_u_error(self):
dist = StandardNormal()
rng = NumericalInversePolynomial(dist, u_resolution=1e-10)
max_error, mae = rng.u_error()
assert max_error < 1e-10
assert mae <= max_error
rng = NumericalInversePolynomial(dist, u_resolution=1e-14)
max_error, mae = rng.u_error()
assert max_error < 1e-14
assert mae <= max_error
bad_orders = [1, 4.5, 20, np.inf, np.nan]
bad_u_resolution = [1e-20, 1e-1, np.inf, np.nan]
@pytest.mark.parametrize("order", bad_orders)
def test_bad_orders(self, order):
dist = StandardNormal()
msg = r"`order` must be an integer in the range \[3, 17\]."
with pytest.raises(ValueError, match=msg):
NumericalInversePolynomial(dist, order=order)
@pytest.mark.parametrize("u_resolution", bad_u_resolution)
def test_bad_u_resolution(self, u_resolution):
msg = r"`u_resolution` must be between 1e-15 and 1e-5."
with pytest.raises(ValueError, match=msg):
NumericalInversePolynomial(StandardNormal(),
u_resolution=u_resolution)
def test_bad_args(self):
class BadDist:
def cdf(self, x):
return stats.norm._cdf(x)
dist = BadDist()
msg = r"Either of the methods `pdf` or `logpdf` must be specified"
with pytest.raises(ValueError, match=msg):
rng = NumericalInversePolynomial(dist)
dist = StandardNormal()
rng = NumericalInversePolynomial(dist)
msg = r"`sample_size` must be greater than or equal to 1000."
with pytest.raises(ValueError, match=msg):
rng.u_error(10)
class Distribution:
def pdf(self, x):
return np.exp(-0.5 * x*x)
dist = Distribution()
rng = NumericalInversePolynomial(dist)
msg = r"Exact CDF required but not found."
with pytest.raises(ValueError, match=msg):
rng.u_error()
def test_logpdf_pdf_consistency(self):
# 1. check that PINV works with pdf and logpdf only
# 2. check that generated ppf is the same (up to a small tolerance)
class MyDist:
pass
        # create generator from dist with only pdf
dist_pdf = MyDist()
dist_pdf.pdf = lambda x: math.exp(-x*x/2)
rng1 = NumericalInversePolynomial(dist_pdf)
# create dist with only logpdf
dist_logpdf = MyDist()
dist_logpdf.logpdf = lambda x: -x*x/2
rng2 = NumericalInversePolynomial(dist_logpdf)
q = np.linspace(1e-5, 1-1e-5, num=100)
assert_allclose(rng1.ppf(q), rng2.ppf(q))
class TestNumericalInverseHermite:
    #          /  (1 + sin(2 Pi x)) / 2   if |x| <= 1
    # f(x) = <
    #          \  0                       otherwise
    # Taken from UNU.RAN test suite (from file t_hinv.c)
class dist0:
def pdf(self, x):
return 0.5*(1. + np.sin(2.*np.pi*x))
def dpdf(self, x):
return np.pi*np.cos(2.*np.pi*x)
def cdf(self, x):
return (1. + 2.*np.pi*(1 + x) - np.cos(2.*np.pi*x)) / (4.*np.pi)
def support(self):
return -1, 1
    #          /  max(sin(2 Pi x), 0) * Pi/2   if -1 < x < 0.5
    # f(x) = <
    #          \  0                            otherwise
    # Taken from UNU.RAN test suite (from file t_hinv.c)
class dist1:
def pdf(self, x):
if (x <= -0.5):
return np.sin((2. * np.pi) * x) * 0.5 * np.pi
if (x < 0.):
return 0.
if (x <= 0.5):
return np.sin((2. * np.pi) * x) * 0.5 * np.pi
def dpdf(self, x):
if (x <= -0.5):
return np.cos((2. * np.pi) * x) * np.pi * np.pi
if (x < 0.):
return 0.
if (x <= 0.5):
return np.cos((2. * np.pi) * x) * np.pi * np.pi
def cdf(self, x):
if (x <= -0.5):
return 0.25 * (1 - np.cos((2. * np.pi) * x))
if (x < 0.):
return 0.5
if (x <= 0.5):
return 0.75 - 0.25 * np.cos((2. * np.pi) * x)
def support(self):
return -1, 0.5
dists = [dist0(), dist1()]
# exact mean and variance of the distributions in the list dists
mv0 = [-1/(2*np.pi), 1/3 - 1/(4*np.pi*np.pi)]
mv1 = [-1/4, 3/8-1/(2*np.pi*np.pi) - 1/16]
mvs = [mv0, mv1]
@pytest.mark.parametrize("dist, mv_ex",
zip(dists, mvs))
@pytest.mark.parametrize("order", [3, 5])
def test_basic(self, dist, mv_ex, order):
rng = NumericalInverseHermite(dist, order=order, random_state=42)
check_cont_samples(rng, dist, mv_ex)
# test domains with inf + nan in them. need to write a custom test for
# this because not all methods support infinite tails.
@pytest.mark.parametrize("domain, err, msg", inf_nan_domains)
def test_inf_nan_domains(self, domain, err, msg):
with pytest.raises(err, match=msg):
NumericalInverseHermite(StandardNormal(), domain=domain)
def basic_test_all_scipy_dists(self, distname, shapes):
slow_dists = {'ksone', 'kstwo', 'levy_stable', 'skewnorm'}
fail_dists = {'beta', 'gausshyper', 'geninvgauss', 'ncf', 'nct',
'norminvgauss', 'genhyperbolic', 'studentized_range',
'vonmises', 'kappa4', 'invgauss', 'wald'}
if distname in slow_dists:
pytest.skip("Distribution is too slow")
if distname in fail_dists:
# specific reasons documented in gh-13319
# https://github.com/scipy/scipy/pull/13319#discussion_r626188955
pytest.xfail("Fails - usually due to inaccurate CDF/PDF")
np.random.seed(0)
dist = getattr(stats, distname)(*shapes)
fni = NumericalInverseHermite(dist)
x = np.random.rand(10)
p_tol = np.max(np.abs(dist.ppf(x)-fni.ppf(x))/np.abs(dist.ppf(x)))
u_tol = np.max(np.abs(dist.cdf(fni.ppf(x)) - x))
assert p_tol < 1e-8
assert u_tol < 1e-12
@pytest.mark.filterwarnings('ignore::RuntimeWarning')
@pytest.mark.xslow
@pytest.mark.parametrize(("distname", "shapes"), distcont)
def test_basic_all_scipy_dists(self, distname, shapes):
# if distname == "truncnorm":
# pytest.skip("Tested separately")
self.basic_test_all_scipy_dists(distname, shapes)
@pytest.mark.filterwarnings('ignore::RuntimeWarning')
def test_basic_truncnorm_gh17155(self):
self.basic_test_all_scipy_dists("truncnorm", (0.1, 2))
def test_input_validation(self):
match = r"`order` must be either 1, 3, or 5."
with pytest.raises(ValueError, match=match):
NumericalInverseHermite(StandardNormal(), order=2)
match = "`cdf` required but not found"
with pytest.raises(ValueError, match=match):
NumericalInverseHermite("norm")
match = "could not convert string to float"
with pytest.raises(ValueError, match=match):
NumericalInverseHermite(StandardNormal(),
u_resolution='ekki')
rngs = [None, 0, np.random.RandomState(0)]
rngs.append(np.random.default_rng(0)) # type: ignore
sizes = [(None, tuple()), (8, (8,)), ((4, 5, 6), (4, 5, 6))]
@pytest.mark.parametrize('rng', rngs)
@pytest.mark.parametrize('size_in, size_out', sizes)
def test_RVS(self, rng, size_in, size_out):
dist = StandardNormal()
fni = NumericalInverseHermite(dist)
rng2 = deepcopy(rng)
rvs = fni.rvs(size=size_in, random_state=rng)
if size_in is not None:
assert rvs.shape == size_out
if rng2 is not None:
rng2 = check_random_state(rng2)
uniform = rng2.uniform(size=size_in)
rvs2 = stats.norm.ppf(uniform)
assert_allclose(rvs, rvs2)
def test_inaccurate_CDF(self):
# CDF function with inaccurate tail cannot be inverted; see gh-13319
# https://github.com/scipy/scipy/pull/13319#discussion_r626188955
shapes = (2.3098496451481823, 0.6268795430096368)
match = ("98 : one or more intervals very short; possibly due to "
"numerical problems with a pole or very flat tail")
# fails with default tol
with pytest.warns(RuntimeWarning, match=match):
NumericalInverseHermite(stats.beta(*shapes))
# no error with coarser tol
NumericalInverseHermite(stats.beta(*shapes), u_resolution=1e-8)
def test_custom_distribution(self):
dist1 = StandardNormal()
fni1 = NumericalInverseHermite(dist1)
dist2 = stats.norm()
fni2 = NumericalInverseHermite(dist2)
assert_allclose(fni1.rvs(random_state=0), fni2.rvs(random_state=0))
u = [
        # check the correctness of the PPF for equidistant points
        # between 0 and 1.
np.linspace(0., 1., num=10000),
# test the PPF method for empty arrays
[], [[]],
# test if nans and infs return nan result.
[np.nan], [-np.inf, np.nan, np.inf],
# test if a scalar is returned for a scalar input.
0,
# test for arrays with nans, values greater than 1 and less than 0,
# and some valid values.
[[np.nan, 0.5, 0.1], [0.2, 0.4, np.inf], [-2, 3, 4]]
]
@pytest.mark.parametrize("u", u)
def test_ppf(self, u):
dist = StandardNormal()
rng = NumericalInverseHermite(dist, u_resolution=1e-12)
# Older versions of NumPy throw RuntimeWarnings for comparisons
# with nan.
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in greater")
sup.filter(RuntimeWarning, "invalid value encountered in "
"greater_equal")
sup.filter(RuntimeWarning, "invalid value encountered in less")
sup.filter(RuntimeWarning, "invalid value encountered in "
"less_equal")
res = rng.ppf(u)
expected = stats.norm.ppf(u)
assert_allclose(res, expected, rtol=1e-9, atol=3e-10)
assert res.shape == expected.shape
def test_u_error(self):
dist = StandardNormal()
rng = NumericalInverseHermite(dist, u_resolution=1e-10)
max_error, mae = rng.u_error()
assert max_error < 1e-10
assert mae <= max_error
with suppress_warnings() as sup:
# ignore warning about u-resolution being too small.
sup.filter(RuntimeWarning)
rng = NumericalInverseHermite(dist, u_resolution=1e-14)
max_error, mae = rng.u_error()
assert max_error < 1e-14
assert mae <= max_error
class TestDiscreteGuideTable:
basic_fail_dists = {
'nchypergeom_fisher', # numerical errors on tails
'nchypergeom_wallenius', # numerical errors on tails
'randint' # fails on 32-bit ubuntu
}
def test_guide_factor_gt3_raises_warning(self):
pv = [0.1, 0.3, 0.6]
urng = np.random.default_rng()
with pytest.warns(RuntimeWarning):
DiscreteGuideTable(pv, random_state=urng, guide_factor=7)
def test_guide_factor_zero_raises_warning(self):
pv = [0.1, 0.3, 0.6]
urng = np.random.default_rng()
with pytest.warns(RuntimeWarning):
DiscreteGuideTable(pv, random_state=urng, guide_factor=0)
def test_negative_guide_factor_raises_warning(self):
        # A negative guide factor is caught by the UNU.RAN wrapper itself,
        # which already emits a useful warning.
# Here we just test that a warning is raised.
pv = [0.1, 0.3, 0.6]
urng = np.random.default_rng()
with pytest.warns(RuntimeWarning):
DiscreteGuideTable(pv, random_state=urng, guide_factor=-1)
@pytest.mark.parametrize("distname, params", distdiscrete)
def test_basic(self, distname, params):
if distname in self.basic_fail_dists:
msg = ("DGT fails on these probably because of large domains "
"and small computation errors in PMF.")
pytest.skip(msg)
if not isinstance(distname, str):
dist = distname
else:
dist = getattr(stats, distname)
dist = dist(*params)
domain = dist.support()
if not np.isfinite(domain[1] - domain[0]):
# DGT only works with finite domain. So, skip the distributions
# with infinite tails.
pytest.skip("DGT only works with a finite domain.")
k = np.arange(domain[0], domain[1]+1)
pv = dist.pmf(k)
mv_ex = dist.stats('mv')
rng = DiscreteGuideTable(dist, random_state=42)
check_discr_samples(rng, pv, mv_ex)
u = [
        # check the correctness of the PPF for equidistant points
        # between 0 and 1.
np.linspace(0, 1, num=10000),
# test the PPF method for empty arrays
[], [[]],
# test if nans and infs return nan result.
[np.nan], [-np.inf, np.nan, np.inf],
# test if a scalar is returned for a scalar input.
0,
# test for arrays with nans, values greater than 1 and less than 0,
# and some valid values.
[[np.nan, 0.5, 0.1], [0.2, 0.4, np.inf], [-2, 3, 4]]
]
@pytest.mark.parametrize('u', u)
def test_ppf(self, u):
n, p = 4, 0.1
dist = stats.binom(n, p)
rng = DiscreteGuideTable(dist, random_state=42)
# Older versions of NumPy throw RuntimeWarnings for comparisons
# with nan.
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in greater")
sup.filter(RuntimeWarning, "invalid value encountered in "
"greater_equal")
sup.filter(RuntimeWarning, "invalid value encountered in less")
sup.filter(RuntimeWarning, "invalid value encountered in "
"less_equal")
res = rng.ppf(u)
expected = stats.binom.ppf(u, n, p)
assert_equal(res.shape, expected.shape)
assert_equal(res, expected)
@pytest.mark.parametrize("pv, msg", bad_pv_common)
def test_bad_pv(self, pv, msg):
with pytest.raises(ValueError, match=msg):
DiscreteGuideTable(pv)
# DGT doesn't support infinite tails. So, it should throw an error when
# inf is present in the domain.
inf_domain = [(-np.inf, np.inf), (np.inf, np.inf), (-np.inf, -np.inf),
(0, np.inf), (-np.inf, 0)]
@pytest.mark.parametrize("domain", inf_domain)
def test_inf_domain(self, domain):
with pytest.raises(ValueError, match=r"must be finite"):
DiscreteGuideTable(stats.binom(10, 0.2), domain=domain)
class TestSimpleRatioUniforms:
    # PDF whose transformed density is piecewise linear under
    # T(x) = -1/sqrt(x), with a location shift. Taken from UNU.RAN test suite
    # (from file t_srou.c)
class dist:
def __init__(self, shift):
self.shift = shift
self.mode = shift
def pdf(self, x):
x -= self.shift
y = 1. / (abs(x) + 1.)
return 0.5 * y * y
def cdf(self, x):
x -= self.shift
if x <= 0.:
return 0.5 / (1. - x)
else:
return 1. - 0.5 / (1. + x)
dists = [dist(0.), dist(10000.)]
# exact mean and variance of the distributions in the list dists
mv1 = [0., np.inf]
mv2 = [10000., np.inf]
mvs = [mv1, mv2]
@pytest.mark.parametrize("dist, mv_ex",
zip(dists, mvs))
def test_basic(self, dist, mv_ex):
rng = SimpleRatioUniforms(dist, mode=dist.mode, random_state=42)
check_cont_samples(rng, dist, mv_ex)
rng = SimpleRatioUniforms(dist, mode=dist.mode,
cdf_at_mode=dist.cdf(dist.mode),
random_state=42)
check_cont_samples(rng, dist, mv_ex)
# test domains with inf + nan in them. need to write a custom test for
# this because not all methods support infinite tails.
@pytest.mark.parametrize("domain, err, msg", inf_nan_domains)
def test_inf_nan_domains(self, domain, err, msg):
with pytest.raises(err, match=msg):
SimpleRatioUniforms(StandardNormal(), domain=domain)
def test_bad_args(self):
# pdf_area < 0
with pytest.raises(ValueError, match=r"`pdf_area` must be > 0"):
SimpleRatioUniforms(StandardNormal(), mode=0, pdf_area=-1)