3RNN/Lib/site-packages/scipy/stats/tests/test_discrete_basic.py

549 lines
20 KiB
Python
Raw Normal View History

2024-05-26 19:49:15 +02:00
import numpy.testing as npt
from numpy.testing import assert_allclose
import numpy as np
import pytest
from scipy import stats
from .common_tests import (check_normalization, check_moment,
check_mean_expect,
check_var_expect, check_skew_expect,
check_kurt_expect, check_entropy,
check_private_entropy, check_edge_support,
check_named_args, check_random_state_property,
check_pickling, check_rvs_broadcast,
check_freezing,)
from scipy.stats._distr_params import distdiscrete, invdistdiscrete
from scipy.stats._distn_infrastructure import rv_discrete_frozen
# A small hand-written sample distribution given as (support points,
# probabilities); the probabilities sum to 1.
vals = ([1, 2, 3, 4], [0.1, 0.2, 0.3, 0.4])
# Add the sample distribution as an extra case with no shape arguments.  Its
# "name" slot holds an rv_discrete instance instead of a string, which is why
# the tests below fall back when looking the name up in `stats` fails.
# NOTE(review): `+=` presumably extends the imported `distdiscrete` list in
# place, so the addition would be visible to other importers - confirm.
distdiscrete += [[stats.rv_discrete(values=vals), ()]]
# For these distributions, test_discrete_basic only runs with test mode full
distslow = {'zipfian', 'nhypergeom'}
def cases_test_discrete_basic():
    """Generate the parameter sets for `test_discrete_basic`.

    Yields
    ------
    case : tuple or pytest.param
        ``(distname, arg, first_case)`` for every entry of `distdiscrete`.
        ``first_case`` is True only the first time a given distribution is
        encountered, so the per-distribution checks run once.  Entries whose
        name is in `distslow` are wrapped in ``pytest.param`` and marked
        ``slow``.
    """
    seen = set()
    for distname, arg in distdiscrete:
        # Compute the flag once so that slow and regular cases agree; the slow
        # branch must receive a real boolean, not the (always truthy) name.
        first_case = distname not in seen
        if distname in distslow:
            yield pytest.param(distname, arg, first_case,
                               marks=pytest.mark.slow)
        else:
            yield distname, arg, first_case
        seen.add(distname)
@pytest.mark.parametrize('distname,arg,first_case', cases_test_discrete_basic())
def test_discrete_basic(distname, arg, first_case):
    """Run the basic consistency checks on one discrete distribution case.

    Every case gets the cdf/ppf, pmf/cdf, sf/isf, edge-support and chi-square
    checks.  The remaining checks (named arguments, docstring, random state,
    pickling, freezing, entropy) only run when `first_case` is true.
    """
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` is not a string but a distribution instance.
        distfn, distname = distname, 'sample distribution'

    np.random.seed(9765456)
    sample = distfn.rvs(*arg, size=2000)
    observed_support = np.unique(sample)
    # The values are not used; the call still has to succeed and return a
    # (mean, variance) pair.
    _mean, _var = distfn.stats(*arg)

    check_cdf_ppf(distfn, arg, observed_support, distname + ' cdf_ppf')
    check_pmf_cdf(distfn, arg, distname)
    check_oth(distfn, arg, observed_support, distname + ' oth')
    check_edge_support(distfn, arg)

    significance = 0.01
    check_discrete_chisquare(distfn, arg, sample, significance,
                             distname + ' chisquare')

    if not first_case:
        return

    locscale_defaults = (0,)
    method_names = ('pmf', 'logpmf', 'cdf', 'logcdf', 'logsf')
    meths = [getattr(distfn, name) for name in method_names]

    # The evaluation point must lie inside the support; the default of 1
    # does not work for every distribution, hence the overrides.
    k_overrides = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0,
                   'nchypergeom_wallenius': 6}
    k = k_overrides.get(distname, 1)
    check_named_args(distfn, k, arg, locscale_defaults, meths)

    if distname != 'sample distribution':
        check_scale_docstring(distfn)
    check_random_state_property(distfn, arg)
    check_pickling(distfn, arg)
    check_freezing(distfn, arg)

    # Entropy: the private implementation is only checked when the
    # distribution overrides the generic one.
    check_entropy(distfn, arg, distname)
    if distfn.__class__._entropy != stats.rv_discrete._entropy:
        check_private_entropy(distfn, arg, stats.rv_discrete)
@pytest.mark.parametrize('distname,arg', distdiscrete)
def test_moments(distname, arg):
    """Cross-check `stats`, `moment` and `expect` for one distribution case."""
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` is a distribution instance rather than a name.
        distfn, distname = distname, 'sample distribution'

    mean, var, skew, kurt = distfn.stats(*arg, moments='mvsk')

    check_normalization(distfn, arg, distname)

    # `stats` versus `moment` and the expectation-based checks
    check_moment(distfn, arg, mean, var, distname)
    check_mean_expect(distfn, arg, mean, distname)
    check_var_expect(distfn, arg, mean, var, distname)
    check_skew_expect(distfn, arg, mean, var, skew, distname)

    # RuntimeWarnings are silenced for these two distributions only.
    silence_runtime = distname in ['zipf', 'betanbinom']
    with np.testing.suppress_warnings() as sup:
        if silence_runtime:
            sup.filter(RuntimeWarning)
        check_kurt_expect(distfn, arg, mean, var, kurt, distname)

    # Moments of the frozen distribution: first and second raw moment.
    second_raw_moment = var + mean*mean
    check_moment_frozen(distfn, arg, mean, 1)
    check_moment_frozen(distfn, arg, second_raw_moment, 2)
@pytest.mark.parametrize('dist,shape_args', distdiscrete)
def test_rvs_broadcast(dist, shape_args):
    """Check that `rvs` broadcasts shape parameters and `loc` together."""
    # When shape_only is True, the _rvs method of the distribution draws
    # more than one random number per variate.  The result of rvs with
    # broadcasting or a nontrivial size is then not necessarily equal to the
    # numpy.vectorize'd version of rvs(), so only the shapes of the results
    # can be compared, not the values.
    # Membership in the list below is an implementation detail of each
    # distribution, not a requirement.  If the implementation of the rvs()
    # method of a distribution changes, this test might have to change too.
    multi_draw_dists = ['betabinom', 'betanbinom', 'skellam', 'yulesimon',
                        'dlaplace', 'nchypergeom_fisher',
                        'nchypergeom_wallenius']
    shape_only = dist in multi_draw_dists

    try:
        distfunc = getattr(stats, dist)
    except TypeError:
        # `dist` is a distribution instance; build a readable label for it.
        distfunc = dist
        dist = f'rv_discrete(values=({dist.xk!r}, {dist.pk!r}))'

    loc = np.zeros(2)
    allargs = []
    bshape = []
    # One array per shape parameter, each varying along its own axis.
    for k in range(distfunc.numargs):
        leading = k + 3
        param_shape = (leading,) + (1,)*(k + 1)
        allargs.append(np.full(param_shape, shape_args[k]))
        bshape.insert(0, leading)
    allargs.append(loc)
    bshape.append(loc.size)

    # bshape now holds the expected shape when loc and the shape parameters
    # are all broadcast together.
    check_rvs_broadcast(
        distfunc, dist, allargs, bshape, shape_only, [np.dtype(int)]
    )
@pytest.mark.parametrize('dist,args', distdiscrete)
def test_ppf_with_loc(dist, args):
    """ppf(0) and ppf(1) must follow the support when `loc` shifts it."""
    try:
        distfn = getattr(stats, dist)
    except TypeError:
        distfn = dist

    # Check with a negative, no, and a positive relocation.
    np.random.seed(1942349)
    negative_loc = np.random.randint(-10, -1)
    positive_loc = np.random.randint(1, 10)

    lower, upper = distfn.support(*args)
    for loc in (negative_loc, 0, positive_loc):
        expected = [lower - 1 + loc, upper + loc]
        observed = [distfn.ppf(0.0, *args, loc=loc),
                    distfn.ppf(1.0, *args, loc=loc)]
        npt.assert_array_equal(expected, observed)
@pytest.mark.parametrize('dist, args', distdiscrete)
def test_isf_with_loc(dist, args):
    """isf(0) and isf(1) must follow the support for scalar and array `loc`."""
    try:
        distfn = getattr(stats, dist)
    except TypeError:
        distfn = dist

    # Scalar relocations: one negative, none, one positive.
    np.random.seed(1942349)
    negative_loc = np.random.randint(-10, -1)
    positive_loc = np.random.randint(1, 10)
    lower, upper = distfn.support(*args)
    for loc in (negative_loc, 0, positive_loc):
        expected = upper + loc, lower - 1 + loc
        observed = (distfn.isf(0., *args, loc=loc),
                    distfn.isf(1., *args, loc=loc))
        npt.assert_array_equal(expected, observed)

    # Array relocations, to exercise broadcasting.
    negative_locs = np.random.randint(-10, -1, size=(5, 3))
    zero_locs = np.zeros((5, 3))
    positive_locs = np.random.randint(1, 10, size=(5, 3))
    lower, upper = distfn.support(*args)
    for loc in (negative_locs, zero_locs, positive_locs):
        expected = upper + loc, lower - 1 + loc
        observed = (distfn.isf(0., *args, loc=loc),
                    distfn.isf(1., *args, loc=loc))
        npt.assert_array_equal(expected, observed)
def check_cdf_ppf(distfn, arg, supp, msg):
    """Check that `ppf` inverts `cdf` on the points in `supp`.

    `supp` is assumed to be an array of integers in the support of `distfn`
    (not necessarily all of them), and the PMF of every such value is
    assumed to exceed 1e-8.  `msg` prefixes the assertion messages.
    """
    # cdf is a step function, and ppf(q) = min{k : cdf(k) >= q, k integer}
    cdf_at_supp = distfn.cdf(supp, *arg)

    # In very rare cases the finite-precision evaluation of ppf(cdf(supp))
    # is off by one in some element.  Nudging the CDF values down by 15 ULPs
    # helps to avoid this.
    nudged = cdf_at_supp - 15*np.spacing(cdf_at_supp)
    npt.assert_array_equal(distfn.ppf(nudged, *arg), supp,
                           msg + '-roundtrip')

    # Same round trip with the CDF values decreased by 1e-8.
    lowered = distfn.cdf(supp, *arg) - 1e-8
    npt.assert_array_equal(distfn.ppf(lowered, *arg), supp,
                           msg + '-roundtrip')

    if hasattr(distfn, 'xk'):
        return

    # Slightly above cdf(k), ppf must step to the next point of the support.
    _, upper = distfn.support(*arg)
    below_upper = supp[supp < upper]
    raised = distfn.cdf(below_upper, *arg) + 1e-8
    npt.assert_array_equal(distfn.ppf(raised, *arg),
                           below_upper + distfn.inc, msg + ' ppf-cdf-next')
def check_pmf_cdf(distfn, arg, distname):
    """Check that the cdf increments agree with the pmf.

    The comparison runs over `distfn.xk` when that attribute exists, and
    otherwise over ten consecutive integers starting just below ppf(0.01).
    The pmf must also vanish between those points, both for the
    distribution itself and for a frozen copy shifted by ``loc=0.5``.
    """
    if hasattr(distfn, 'xk'):
        index = distfn.xk
    else:
        first = int(distfn.ppf(0.01, *arg) - 1)
        index = list(range(first, first + 10))

    cdf_values = distfn.cdf(index, *arg)
    pmf_running = distfn.pmf(index, *arg).cumsum()

    # skellam gets a looser tolerance (ncx2 accuracy)
    tol = 1e-5 if distname == 'skellam' else 1e-10
    npt.assert_allclose(cdf_values - cdf_values[0],
                        pmf_running - pmf_running[0],
                        atol=tol, rtol=tol)

    # The pmf must be zero halfway between consecutive points.
    k = np.asarray(index)
    midpoints = k[:-1] + np.diff(k)/2
    npt.assert_equal(distfn.pmf(midpoints, *arg), 0)

    # Repeat with a frozen distribution and a nonzero loc.
    shift = 0.5
    frozen = distfn(*arg, loc=shift)
    npt.assert_allclose(frozen.pmf(k[1:] + shift),
                        np.diff(frozen.cdf(k + shift)))
    npt.assert_equal(frozen.pmf(midpoints + shift), 0)
def check_moment_frozen(distfn, arg, m, k):
    """Assert that the `k`-th moment of the frozen distribution equals `m`."""
    frozen = distfn(*arg)
    observed = frozen.moment(k)
    npt.assert_allclose(observed, m, atol=1e-10, rtol=1e-10)
def check_oth(distfn, arg, supp, msg):
    """Check `sf` and `isf` against `cdf` and `ppf`.

    `msg` is accepted for symmetry with the other checkers; it is not used.
    """
    tolerances = {'atol': 1e-10, 'rtol': 1e-10}

    # sf must be the complement of cdf on the given points
    survival = distfn.sf(supp, *arg)
    complement = 1. - distfn.cdf(supp, *arg)
    npt.assert_allclose(survival, complement, **tolerances)

    # isf(q) must agree with ppf(1 - q)
    q = np.linspace(0.01, 0.99, 20)
    npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg),
                        **tolerances)

    # isf(0.5) must sit between the points on either side of it
    median_sf = distfn.isf(0.5, *arg)
    npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5)
    npt.assert_(distfn.cdf(median_sf + 1, *arg) > 0.5)
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    Parameters
    ----------
    distfn : distribution object
        discrete distribution providing ``support`` and ``cdf``
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample to compare against the distribution
    alpha : float
        significance level, threshold for p-value
    msg : str
        text identifying the case, included in the failure message

    Raises
    ------
    AssertionError
        If the p-value of the chisquare test is not greater than `alpha`.
        Nothing is returned when the test passes.
    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    _a, _b = distfn.support(*arg)
    # clip the scanned range so that unbounded supports stay finite
    lo = int(max(_a, -1000))
    high = int(min(_b, 1000)) + 1
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in range(lo, high):
        current = distfn.cdf(ii, *arg)
        # the small slack accepts bins whose mass is `wsupp` up to rounding
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            if current > (1 - wsupp):
                break
    # whatever mass is left goes into one final bin up to the upper bound
    if distsupp[-1] < _b:
        distsupp.append(_b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = _a

    # find sample frequencies and perform chisquare test
    freq, _ = np.histogram(rvs, histsupp)
    _, pval = stats.chisquare(freq, len(rvs)*distmass)

    npt.assert_(
        pval > alpha,
        f'chisquare - test for {msg} at arg = {str(arg)} with pval = {str(pval)}'
    )
def check_scale_docstring(distfn):
    """Assert that the docstring of `distfn` does not mention 'scale'."""
    doc = distfn.__doc__
    if doc is None:
        # Docstrings can be stripped if interpreter is run with -OO
        return
    npt.assert_('scale' not in doc)
@pytest.mark.parametrize('method', ['pmf', 'logpmf', 'cdf', 'logcdf',
                                    'sf', 'logsf', 'ppf', 'isf'])
@pytest.mark.parametrize('distname, args', distdiscrete)
def test_methods_with_lists(method, distname, args):
    """Check that `method` of a discrete distribution accepts Python lists.

    The list-valued call is compared with element-wise scalar calls of the
    same method.  Cases whose "name" is a distribution instance rather than
    a string are skipped silently.
    """
    try:
        dist = getattr(stats, distname)
    except TypeError:
        return
    # ppf and isf take probabilities; every other method takes support points.
    if method in ['ppf', 'isf']:
        z = [0.1, 0.2]
    else:
        z = [0, 1]
    p2 = [[p]*2 for p in args]
    loc = [0, 1]
    # Exercise the method under test; calling `pmf` unconditionally would
    # make the parametrization over `method` meaningless.
    func = getattr(dist, method)
    result = func(z, *p2, loc=loc)
    npt.assert_allclose(result,
                        [func(*v) for v in zip(z, *p2, loc)],
                        rtol=1e-15, atol=1e-15)
@pytest.mark.parametrize('distname, args', invdistdiscrete)
def test_cdf_gh13280_regression(distname, args):
    """The cdf must be nan everywhere when the shape parameters are invalid."""
    dist = getattr(stats, distname)
    points = np.arange(-2, 15)
    observed = dist.cdf(points, *args)
    npt.assert_equal(observed, np.nan)
def cases_test_discrete_integer_shapes():
    """Generate the parameter sets for `test_integer_shapes`.

    Yields
    ------
    case : tuple
        ``(distname, shape_name, shapes)`` for each shape parameter flagged
        as integral by ``_shape_info()``, using the first entry of each
        distribution in `distdiscrete`.  Entries whose name is not a string
        are skipped.
    """
    # distributions parameters that are only allowed to be integral when
    # fitting, but are allowed to be real as input to PDF, etc.
    integrality_exceptions = {'nbinom': {'n'}, 'betanbinom': {'n'}}

    seen = set()
    for distname, shapes in distdiscrete:
        if distname in seen:
            continue
        seen.add(distname)

        try:
            dist = getattr(stats, distname)
        except TypeError:
            continue

        # Look the exceptions up once per distribution, not once per shape.
        exempt = integrality_exceptions.get(distname, set())
        for shape in dist._shape_info():
            if shape.name in exempt or not shape.integrality:
                continue
            yield distname, shape.name, shapes
@pytest.mark.parametrize('distname, shapename, shapes',
                         cases_test_discrete_integer_shapes())
def test_integer_shapes(distname, shapename, shapes):
    """A non-integral value of an integral shape parameter must give nan pmf."""
    dist = getattr(stats, distname)
    names = [info.name for info in dist._shape_info()]
    i = names.index(shapename)  # this element of params must be integral

    valid_shape = shapes[i]
    invalid_shape = valid_shape - 0.5  # arbitrary non-integral value
    new_valid_shape = valid_shape - 1

    # One row per candidate value of the parameter under test.
    shapes_copy = list(shapes)
    shapes_copy[i] = [[valid_shape], [invalid_shape], [new_valid_shape]]

    a, b = dist.support(*shapes)
    x = np.round(np.linspace(a, b, 5))

    pmf = dist.pmf(x, *shapes_copy)
    valid_row, invalid_row, new_valid_row = pmf[0, :], pmf[1, :], pmf[2, :]
    assert not np.any(np.isnan(valid_row))
    assert np.all(np.isnan(invalid_row))
    assert not np.any(np.isnan(new_valid_row))
def test_frozen_attributes():
    """Frozen discrete distributions expose pmf, not pdf (gh-14827)."""
    # gh-14827 reported that all frozen distributions had both pmf and pdf
    # attributes; continuous should have pdf and discrete should have pmf.
    message = "'rv_discrete_frozen' object has no attribute"
    with pytest.raises(AttributeError, match=message):
        stats.binom(10, 0.5).pdf
    with pytest.raises(AttributeError, match=message):
        stats.binom(10, 0.5).logpdf
    # Freezing must still produce a discrete frozen distribution when the
    # generic object has a `pdf` attribute.
    stats.binom.pdf = "herring"
    try:
        frozen_binom = stats.binom(10, 0.5)
        assert isinstance(frozen_binom, rv_discrete_frozen)
    finally:
        # Always undo the monkeypatch so a failure here cannot leak the
        # attribute into other tests.
        delattr(stats.binom, 'pdf')
@pytest.mark.parametrize('distname, shapes', distdiscrete)
def test_interval(distname, shapes):
    """`ppf`, `isf` and `interval` must agree at the ends of the support."""
    # gh-11026 reported that `interval` returns incorrect values when
    # `confidence=1`.  The values were not incorrect, but it was not
    # intuitive that the left end of the interval extends beyond the support
    # of the distribution.  Confirm this behavior for all distributions.
    dist = getattr(stats, distname) if isinstance(distname, str) else distname
    lower, upper = dist.support(*shapes)
    expected = (lower - 1, upper)
    npt.assert_equal(dist.ppf([0, 1], *shapes), expected)
    npt.assert_equal(dist.isf([1, 0], *shapes), expected)
    npt.assert_equal(dist.interval(1, *shapes), expected)
@pytest.mark.xfail_on_32bit("Sensible to machine precision")
def test_rv_sample():
    """Check every public method of a distribution built from `values`.

    A random ten-point distribution is created with ``rv_discrete(values=...)``
    and each method is compared with a result computed directly from the
    support points `xk` and probabilities `pk`.
    """
    # Thoroughly test rv_sample and check that gh-3758 is resolved

    # Generate a random discrete distribution
    rng = np.random.default_rng(98430143469)
    xk = np.sort(rng.random(10) * 10)
    pk = rng.random(10)
    pk /= np.sum(pk)
    dist = stats.rv_discrete(values=(xk, pk))

    # Generate points to the left and right of xk
    # (midpoints towards the neighbouring point; 0 and xk[-1]+1 pad the ends)
    xk_left = (np.array([0] + xk[:-1].tolist()) + xk)/2
    xk_right = (np.array(xk[1:].tolist() + [xk[-1]+1]) + xk)/2

    # Generate points to the left and right of cdf
    # (midpoints towards the neighbouring cdf value; 0 and 1 pad the ends)
    cdf2 = np.cumsum(pk)
    cdf2_left = (np.array([0] + cdf2[:-1].tolist()) + cdf2)/2
    cdf2_right = (np.array(cdf2[1:].tolist() + [1]) + cdf2)/2

    # support - leftmost and rightmost xk
    a, b = dist.support()
    assert_allclose(a, xk[0])
    assert_allclose(b, xk[-1])

    # pmf - supported only on the xk
    assert_allclose(dist.pmf(xk), pk)
    assert_allclose(dist.pmf(xk_right), 0)
    assert_allclose(dist.pmf(xk_left), 0)

    # logpmf is log of the pmf; log(0) = -np.inf
    with np.errstate(divide='ignore'):
        assert_allclose(dist.logpmf(xk), np.log(pk))
        assert_allclose(dist.logpmf(xk_right), -np.inf)
        assert_allclose(dist.logpmf(xk_left), -np.inf)

    # cdf - the cumulative sum of the pmf
    assert_allclose(dist.cdf(xk), cdf2)
    assert_allclose(dist.cdf(xk_right), cdf2)
    # just left of each xk the cdf still has the previous value
    assert_allclose(dist.cdf(xk_left), [0]+cdf2[:-1].tolist())

    # logcdf is the log of the cdf; divide warnings from log(0) are silenced
    with np.errstate(divide='ignore'):
        assert_allclose(dist.logcdf(xk), np.log(dist.cdf(xk)),
                        atol=1e-15)
        assert_allclose(dist.logcdf(xk_right), np.log(dist.cdf(xk_right)),
                        atol=1e-15)
        assert_allclose(dist.logcdf(xk_left), np.log(dist.cdf(xk_left)),
                        atol=1e-15)

    # sf is 1-cdf
    assert_allclose(dist.sf(xk), 1-dist.cdf(xk))
    assert_allclose(dist.sf(xk_right), 1-dist.cdf(xk_right))
    assert_allclose(dist.sf(xk_left), 1-dist.cdf(xk_left))

    # logsf is the log of the sf; divide warnings from log(0) are silenced
    with np.errstate(divide='ignore'):
        assert_allclose(dist.logsf(xk), np.log(dist.sf(xk)),
                        atol=1e-15)
        assert_allclose(dist.logsf(xk_right), np.log(dist.sf(xk_right)),
                        atol=1e-15)
        assert_allclose(dist.logsf(xk_left), np.log(dist.sf(xk_left)),
                        atol=1e-15)

    # ppf
    assert_allclose(dist.ppf(cdf2), xk)
    assert_allclose(dist.ppf(cdf2_left), xk)
    # the last element of cdf2_right is left out of this comparison
    assert_allclose(dist.ppf(cdf2_right)[:-1], xk[1:])
    assert_allclose(dist.ppf(0), a - 1)
    assert_allclose(dist.ppf(1), b)

    # isf
    sf2 = dist.sf(xk)
    assert_allclose(dist.isf(sf2), xk)
    assert_allclose(dist.isf(1-cdf2_left), dist.ppf(cdf2_left))
    assert_allclose(dist.isf(1-cdf2_right), dist.ppf(cdf2_right))
    assert_allclose(dist.isf(0), b)
    assert_allclose(dist.isf(1), a - 1)

    # interval is (ppf(alpha/2), isf(alpha/2))
    ps = np.linspace(0.01, 0.99, 10)
    int2 = dist.ppf(ps/2), dist.isf(ps/2)
    assert_allclose(dist.interval(1-ps), int2)
    assert_allclose(dist.interval(0), dist.median())
    assert_allclose(dist.interval(1), (a-1, b))

    # median is simply ppf(0.5)
    med2 = dist.ppf(0.5)
    assert_allclose(dist.median(), med2)

    # all four stats (mean, var, skew, and kurtosis) from the definitions
    mean2 = np.sum(xk*pk)
    var2 = np.sum((xk - mean2)**2 * pk)
    skew2 = np.sum((xk - mean2)**3 * pk) / var2**(3/2)
    # excess kurtosis, hence the "- 3"
    kurt2 = np.sum((xk - mean2)**4 * pk) / var2**2 - 3
    assert_allclose(dist.mean(), mean2)
    assert_allclose(dist.std(), np.sqrt(var2))
    assert_allclose(dist.var(), var2)
    assert_allclose(dist.stats(moments='mvsk'), (mean2, var2, skew2, kurt2))

    # noncentral moment against definition
    mom3 = np.sum((xk**3) * pk)
    assert_allclose(dist.moment(3), mom3)

    # expect - check against moments
    assert_allclose(dist.expect(lambda x: 1), 1)
    assert_allclose(dist.expect(), mean2)
    assert_allclose(dist.expect(lambda x: x**3), mom3)

    # entropy is the negative of the expected value of log(p)
    with np.errstate(divide='ignore'):
        assert_allclose(-dist.expect(lambda x: dist.logpmf(x)), dist.entropy())

    # RVS is just ppf of uniform random variates
    # (the generator is re-seeded so both draws consume the same stream)
    rng = np.random.default_rng(98430143469)
    rvs = dist.rvs(size=100, random_state=rng)
    rng = np.random.default_rng(98430143469)
    rvs0 = dist.ppf(rng.random(size=100))
    assert_allclose(rvs, rvs0)