2022 lines
77 KiB
Python
2022 lines
77 KiB
Python
from collections import namedtuple
|
|
from dataclasses import dataclass
|
|
from math import comb
|
|
import numpy as np
|
|
import warnings
|
|
from itertools import combinations
|
|
import scipy.stats
|
|
from scipy.optimize import shgo
|
|
from . import distributions
|
|
from ._common import ConfidenceInterval
|
|
from ._continuous_distns import chi2, norm
|
|
from scipy.special import gamma, kv, gammaln
|
|
from scipy.fft import ifft
|
|
from ._stats_pythran import _a_ij_Aij_Dij2
|
|
from ._stats_pythran import (
|
|
_concordant_pairs as _P, _discordant_pairs as _Q
|
|
)
|
|
from ._axis_nan_policy import _axis_nan_policy_factory
|
|
from scipy.stats import _stats_py
|
|
|
|
__all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd',
|
|
'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp',
|
|
'tukey_hsd', 'poisson_means_test']
|
|
|
|
# Result container returned by `epps_singleton_2samp`; fields are, in order,
# the test statistic and its p-value.
Epps_Singleton_2sampResult = namedtuple(
    'Epps_Singleton_2sampResult', ['statistic', 'pvalue']
)
|
|
|
|
|
|
@_axis_nan_policy_factory(Epps_Singleton_2sampResult, n_samples=2, too_small=4)
def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have more
        than one dimension. Samples can have different lengths.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function is
        to be evaluated. It should be positive distinct numbers. The default
        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
        one dimension.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does
    not assume a continuous distribution. In [1]_, the authors conclude
    that the test also has a higher power than the KS test in many
    examples. They recommend the use of the ES test for discrete samples as
    well as continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering
    various distributions and finding good values that lead to a high power
    of the test in general. Table III in [1]_ gives the optimal values for
    the distributions tested in that study. The values of `t` are scaled by
    the semi-interquartile range in the implementation, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """
    # The decorator has already converted `x` and `y` to arrays.
    t = np.asarray(t)

    # --- validate the two samples ---
    nx, ny = len(x), len(y)
    if (nx < 5) or (ny < 5):
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         f'= {nx} and len(y) = {ny}.')
    if not np.isfinite(x).all():
        raise ValueError('x must not contain nonfinite values.')
    if not np.isfinite(y).all():
        raise ValueError('y must not contain nonfinite values.')
    n = nx + ny

    # --- validate the evaluation points ---
    if t.ndim > 1:
        raise ValueError(f't must be 1d, but t.ndim equals {t.ndim}.')
    if np.less_equal(t, 0).any():
        raise ValueError('t must contain positive elements only.')

    # Rescale t by the semi-interquartile range of the pooled sample, as
    # proposed in [1]. `iqr` is imported locally to avoid a circular import.
    from scipy.stats import iqr
    pooled = np.hstack((x, y))
    semi_iqr = iqr(pooled) / 2
    ts = np.reshape(t, (-1, 1)) / semi_iqr

    def _ecf_features(sample):
        # Real and imaginary parts of exp(i*ts*sample), one row per
        # observation; resulting shape is (len(sample), 2*len(t)).
        return np.vstack((np.cos(ts*sample), np.sin(ts*sample))).T

    # --- covariance estimate of the ES test ---
    gx = _ecf_features(x)
    gy = _ecf_features(y)
    cov_x = np.cov(gx.T, bias=True)  # the test uses the biased cov-estimate
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2*len(t):
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.',  # see p. 183 in [1]_
                      stacklevel=2)

    # --- test statistic w, asymptotically chi-square distributed, df=r ---
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    quad_form = np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))
    w = n*quad_form

    # Small-sample correction: applied only when both samples are below 25.
    if max(nx, ny) < 25:
        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    p = chi2.sf(w, r)

    return Epps_Singleton_2sampResult(w, p)
|
|
|
|
|
|
def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'):
    r"""
    Performs the Poisson means test, AKA the "E-test".

    This is a test of the null hypothesis that the difference between means of
    two Poisson distributions is `diff`. The samples are provided as the
    number of events `k1` and `k2` observed within measurement intervals
    (e.g. of time, space, number of observations) of sizes `n1` and `n2`.

    Parameters
    ----------
    k1 : int
        Number of events observed from distribution 1.
    n1: float
        Size of sample from distribution 1.
    k2 : int
        Number of events observed from distribution 2.
    n2 : float
        Size of sample from distribution 2.
    diff : float, default=0
        The hypothesized difference in means between the distributions
        underlying the samples.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. The value is matched
        case-insensitively.
        The following options are available (default is 'two-sided'):

          * 'two-sided': the difference between distribution means is not
            equal to `diff`
          * 'less': the difference between distribution means is less than
            `diff`
          * 'greater': the difference between distribution means is greater
            than `diff`

    Returns
    -------
    statistic : float
        The test statistic (see [1]_ equation 3.3).
    pvalue : float
        The probability of achieving such an extreme value of the test
        statistic under the null hypothesis.

    Notes
    -----

    Let:

    .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1)

    be a random variable independent of

    .. math:: X_2  \sim \mbox{Poisson}(\mathtt{n2}\lambda_2)

    and let ``k1`` and ``k2`` be the observed values of :math:`X_1`
    and :math:`X_2`, respectively. Then `poisson_means_test` uses the number
    of observed events ``k1`` and ``k2`` from samples of size ``n1`` and
    ``n2``, respectively, to test the null hypothesis that

    .. math::
       H_0: \lambda_1 - \lambda_2 = \mathtt{diff}

    A benefit of the E-test is that it has good power for small sample sizes,
    which can reduce sampling costs [1]_. It has been evaluated and determined
    to be more powerful than the comparable C-test, sometimes referred to as
    the Poisson exact test.

    References
    ----------
    .. [1]  Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for
       comparing two Poisson means. Journal of Statistical Planning and
       Inference, 119(1), 23-35.

    .. [2]  Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in
       testing samples from Poisson series: With an application to testing
       clover seed for dodder. Biometrika, 31(3/4), 313-323.

    Examples
    --------

    Suppose that a gardener wishes to test the number of dodder (weed) seeds
    in a sack of clover seeds that they buy from a seed company. It has
    previously been established that the number of dodder seeds in clover
    follows the Poisson distribution.

    A 100 gram sample is drawn from the sack before being shipped to the
    gardener. The sample is analyzed, and it is found to contain no dodder
    seeds; that is, `k1` is 0. However, upon arrival, the gardener draws
    another 100 gram sample from the sack. This time, three dodder seeds are
    found in the sample; that is, `k2` is 3. The gardener would like to
    know if the difference is significant and not due to chance. The
    null hypothesis is that the difference between the two samples is merely
    due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}`
    where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the
    difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`.
    The gardener selects a significance level of 5% to reject the null
    hypothesis in favor of the alternative [2]_.

    >>> import scipy.stats as stats
    >>> res = stats.poisson_means_test(0, 100, 3, 100)
    >>> res.statistic, res.pvalue
    (-1.7320508075688772, 0.08837900929018157)

    The p-value is .088, indicating a near 9% chance of observing a value of
    the test statistic under the null hypothesis. This exceeds 5%, so the
    gardener does not reject the null hypothesis as the difference cannot be
    regarded as significant at this level.
    """

    _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative)

    # Input validation accepts `alternative` case-insensitively, so normalise
    # it here; otherwise e.g. 'Less' would pass validation and then silently
    # fall through to the 'greater' branch of the indicator below.
    alternative = alternative.lower()

    # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4)
    lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2))

    # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this
    # case the null hypothesis cannot be rejected ... [and] it is not necessary
    # to compute the p-value". [1] page 26 below eq. (3.6).
    if lmbd_hat2 <= 0:
        return _stats_py.SignificanceResult(0, 1)

    # The unbiased variance estimate [1] (3.2)
    var = k1 / (n1 ** 2) + k2 / (n2 ** 2)

    # The _observed_ pivot statistic from the input. It follows the
    # unnumbered equation following equation (3.3) This is used later in
    # comparison with the computed pivot statistics in an indicator function.
    t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var)

    # Equation (3.5) of [1] is lengthy, so it is broken into several parts,
    # beginning here. Note that the probability mass function of poisson is
    # exp^(-\mu)*\mu^k/k!, so this is called with shape \mu, noted here as
    # nlmbd_hat*. The strategy for evaluating the double summation in
    # (3.5) is to create two arrays of the values of the two products inside
    # the summation and then broadcast them together into a matrix, and then
    # sum across the entire matrix.

    # Compute constants (as seen in the first and second separated products in
    # (3.5).). (This is the shape (\mu) parameter of the poisson distribution.)
    nlmbd_hat1 = n1 * (lmbd_hat2 + diff)
    nlmbd_hat2 = n2 * lmbd_hat2

    # Determine summation bounds for tail ends of distribution rather than
    # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner
    # sum.
    x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1)
    x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2)

    # Construct arrays to function as the x_1 and x_2 counters on the summation
    # in (3.5). `x1` is in columns and `x2` is in rows to allow for
    # broadcasting.
    x1 = np.arange(x1_lb, x1_ub + 1)
    x2 = np.arange(x2_lb, x2_ub + 1)[:, None]

    # These are the two products in equation (3.5) with `prob_x1` being the
    # first (left side) and `prob_x2` being the second (right side). (To
    # make as clear as possible: the 1st contains a "+ d" term, the 2nd does
    # not.)
    prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1)
    prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2)

    # compute constants for use in the "pivot statistic" per the
    # unnumbered equation following (3.3).
    lmbd_x1 = x1 / n1
    lmbd_x2 = x2 / n2
    lmbds_diff = lmbd_x1 - lmbd_x2 - diff
    var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2

    # This is the 'pivot statistic' for use in the indicator of the summation
    # (left side of "I[.]"). The (x1, x2) = (0, 0) cell has zero variance, so
    # the resulting divide/invalid warnings are suppressed.
    with np.errstate(invalid='ignore', divide='ignore'):
        t_x1x2 = lmbds_diff / np.sqrt(var_x1x2)

    # `[indicator]` implements the "I[.] ... the indicator function" per
    # the paragraph following equation (3.5).
    if alternative == 'two-sided':
        indicator = np.abs(t_x1x2) >= np.abs(t_k1k2)
    elif alternative == 'less':
        indicator = t_x1x2 <= t_k1k2
    else:
        indicator = t_x1x2 >= t_k1k2

    # Multiply all combinations of the products together, exclude terms
    # based on the `indicator` and then sum. (3.5)
    pvalue = np.sum((prob_x1 * prob_x2)[indicator])
    return _stats_py.SignificanceResult(t_k1k2, pvalue)
|
|
|
|
|
|
def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative):
|
|
# """check for valid types and values of input to `poisson_mean_test`."""
|
|
if k1 != int(k1) or k2 != int(k2):
|
|
raise TypeError('`k1` and `k2` must be integers.')
|
|
|
|
count_err = '`k1` and `k2` must be greater than or equal to 0.'
|
|
if k1 < 0 or k2 < 0:
|
|
raise ValueError(count_err)
|
|
|
|
if n1 <= 0 or n2 <= 0:
|
|
raise ValueError('`n1` and `n2` must be greater than 0.')
|
|
|
|
if diff < 0:
|
|
raise ValueError('diff must be greater than or equal to 0.')
|
|
|
|
alternatives = {'two-sided', 'less', 'greater'}
|
|
if alternative.lower() not in alternatives:
|
|
raise ValueError(f"Alternative must be one of '{alternatives}'.")
|
|
|
|
|
|
class CramerVonMisesResult:
    """Plain container holding the ``statistic`` and ``pvalue`` of a test."""

    def __init__(self, statistic, pvalue):
        self.statistic, self.pvalue = statistic, pvalue

    def __repr__(self):
        # Use the runtime class name so that subclasses print correctly.
        cls_name = self.__class__.__name__
        fields = f"statistic={self.statistic}, pvalue={self.pvalue}"
        return f"{cls_name}({fields})"
|
|
|
|
|
|
def _psi1_mod(x):
|
|
"""
|
|
psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996).
|
|
This implements a modified version by excluding the term V(x) / 12
|
|
(here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x)
|
|
twice in _cdf_cvm.
|
|
|
|
Implementation based on MAPLE code of Julian Faraway and R code of the
|
|
function pCvM in the package goftest (v1.1.1), permission granted
|
|
by Adrian Baddeley. Main difference in the implementation: the code
|
|
here keeps adding terms of the series until the terms are small enough.
|
|
"""
|
|
|
|
def _ed2(y):
|
|
z = y**2 / 4
|
|
b = kv(1/4, z) + kv(3/4, z)
|
|
return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi)
|
|
|
|
def _ed3(y):
|
|
z = y**2 / 4
|
|
c = np.exp(-z) / np.sqrt(np.pi)
|
|
return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z))
|
|
|
|
def _Ak(k, x):
|
|
m = 2*k + 1
|
|
sx = 2 * np.sqrt(x)
|
|
y1 = x**(3/4)
|
|
y2 = x**(5/4)
|
|
|
|
e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1)
|
|
e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2)
|
|
e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2)
|
|
e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1)
|
|
e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1)
|
|
|
|
return e1 + e2 + e3 + e4 + e5
|
|
|
|
x = np.asarray(x)
|
|
tot = np.zeros_like(x, dtype='float')
|
|
cond = np.ones_like(x, dtype='bool')
|
|
k = 0
|
|
while np.any(cond):
|
|
z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1))
|
|
tot[cond] = tot[cond] + z
|
|
cond[cond] = np.abs(z) >= 1e-7
|
|
k += 1
|
|
|
|
return tot
|
|
|
|
|
|
def _cdf_cvm_inf(x):
|
|
"""
|
|
Calculate the cdf of the Cramér-von Mises statistic (infinite sample size).
|
|
|
|
See equation 1.2 in Csörgő, S. and Faraway, J. (1996).
|
|
|
|
Implementation based on MAPLE code of Julian Faraway and R code of the
|
|
function pCvM in the package goftest (v1.1.1), permission granted
|
|
by Adrian Baddeley. Main difference in the implementation: the code
|
|
here keeps adding terms of the series until the terms are small enough.
|
|
|
|
The function is not expected to be accurate for large values of x, say
|
|
x > 4, when the cdf is very close to 1.
|
|
"""
|
|
x = np.asarray(x)
|
|
|
|
def term(x, k):
|
|
# this expression can be found in [2], second line of (1.3)
|
|
u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x))
|
|
y = 4*k + 1
|
|
q = y**2 / (16*x)
|
|
b = kv(0.25, q)
|
|
return u * np.sqrt(y) * np.exp(-q) * b
|
|
|
|
tot = np.zeros_like(x, dtype='float')
|
|
cond = np.ones_like(x, dtype='bool')
|
|
k = 0
|
|
while np.any(cond):
|
|
z = term(x[cond], k)
|
|
tot[cond] = tot[cond] + z
|
|
cond[cond] = np.abs(z) >= 1e-7
|
|
k += 1
|
|
|
|
return tot
|
|
|
|
|
|
def _cdf_cvm(x, n=None):
    """
    Calculate the cdf of the Cramér-von Mises statistic for a finite sample
    size n. If n is None, use the asymptotic cdf (n=inf).

    See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples,
    1.2 for the asymptotic cdf.

    The function is not expected to be accurate for large values of x, say
    x > 2, when the cdf is very close to 1 and it might return values > 1
    in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it
    is not accurate for small values of n, especially close to the bounds of
    the distribution's domain, [1/(12*n), n/3], where the value jumps to 0
    and 1, respectively. These are limitations of the approximation by Csörgő
    and Faraway (1996) implemented in this function.
    """
    x = np.asarray(x)
    if n is None:
        cdf = _cdf_cvm_inf(x)
    else:
        # The statistic is supported on [1/(12*n), n/3], see 1.1 in [2].
        # Entries at or below the lower bound keep the initial value 0.
        cdf = np.zeros_like(x, dtype='float')
        inside = (1./(12*n) < x) & (x < n/3.)
        x_inside = x[inside]
        # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12
        # therefore, we need to add it here
        asymptotic_part = _cdf_cvm_inf(x_inside) * (1 + 1./(12*n))
        cdf[inside] = asymptotic_part + _psi1_mod(x_inside) / n
        cdf[x >= n/3] = 1

    # A 0-d result is unwrapped to a scalar; arrays are returned unchanged.
    return cdf[()] if cdf.ndim == 0 else cdf
|
|
|
|
|
|
def _cvm_result_to_tuple(res):
|
|
return res.statistic, res.pvalue
|
|
|
|
|
|
@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=1, too_small=1,
                          result_to_tuple=_cvm_result_to_tuple)
def cramervonmises(rvs, cdf, args=()):
    """Perform the one-sample Cramér-von Mises test for goodness of fit.

    This performs a test of the goodness of fit of a cumulative distribution
    function (cdf) :math:`F` compared to the empirical distribution function
    :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are
    assumed to be independent and identically distributed ([1]_).
    The null hypothesis is that the :math:`X_i` have cumulative distribution
    :math:`F`.

    Parameters
    ----------
    rvs : array_like
        A 1-D array of observed values of the random variables :math:`X_i`.
    cdf : str or callable
        The cumulative distribution function :math:`F` to test the
        observations against. If a string, it should be the name of a
        distribution in `scipy.stats`. If a callable, that callable is used
        to calculate the cdf: ``cdf(x, *args) -> float``.
    args : tuple, optional
        Distribution parameters. These are assumed to be known; see Notes.

    Returns
    -------
    res : object with attributes
        statistic : float
            Cramér-von Mises statistic.
        pvalue : float
            The p-value.

    See Also
    --------
    kstest, cramervonmises_2samp

    Notes
    -----
    .. versionadded:: 1.6.0

    The p-value relies on the approximation given by equation 1.8 in [2]_.
    It is important to keep in mind that the p-value is only accurate if
    one tests a simple hypothesis, i.e. the parameters of the reference
    distribution are known. If the parameters are estimated from the data
    (composite hypothesis), the computed p-value is not reliable.

    References
    ----------
    .. [1] Cramér-von Mises criterion, Wikipedia,
           https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion
    .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic
           Distribution of Cramér-von Mises Statistics. Journal of the
           Royal Statistical Society, pp. 221-234.

    Examples
    --------

    Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs``
    were, in fact, drawn from the standard normal distribution. We choose a
    significance level of ``alpha=0.05``.

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng(165417232101553420507139617764912913465)
    >>> x = stats.norm.rvs(size=500, random_state=rng)
    >>> res = stats.cramervonmises(x, 'norm')
    >>> res.statistic, res.pvalue
    (0.1072085112565724, 0.5508482238203407)

    The p-value exceeds our chosen significance level, so we do not
    reject the null hypothesis that the observed sample is drawn from the
    standard normal distribution.

    Now suppose we wish to check whether the same samples shifted by 2.1 is
    consistent with being drawn from a normal distribution with a mean of 2.

    >>> y = x + 2.1
    >>> res = stats.cramervonmises(y, 'norm', args=(2,))
    >>> res.statistic, res.pvalue
    (0.8364446265294695, 0.00596286797008283)

    Here we have used the `args` keyword to specify the mean (``loc``)
    of the normal distribution to test the data against. This is equivalent
    to the following, in which we create a frozen normal distribution with
    mean 2, then pass its ``cdf`` method as an argument.

    >>> frozen_dist = stats.norm(loc=2)
    >>> res = stats.cramervonmises(y, frozen_dist.cdf)
    >>> res.statistic, res.pvalue
    (0.8364446265294695, 0.00596286797008283)

    In either case, we would reject the null hypothesis that the observed
    sample is drawn from a normal distribution with a mean of 2 (and default
    variance of 1) because the p-value is less than our chosen
    significance level.

    """
    # A distribution name is resolved to that distribution's cdf method.
    if isinstance(cdf, str):
        cdf = getattr(distributions, cdf).cdf

    ordered = np.sort(np.asarray(rvs))

    if ordered.size <= 1:
        raise ValueError('The sample must contain at least two observations.')

    n = len(ordered)
    theoretical = cdf(ordered, *args)

    # Plotting positions (2i - 1) / (2n) for i = 1, ..., n.
    positions = (2*np.arange(1, n+1) - 1)/(2*n)
    squared_gaps = (positions - theoretical)**2
    w = 1/(12*n) + np.sum(squared_gaps)

    # avoid small negative values that can occur due to the approximation
    upper_tail = 1. - _cdf_cvm(w, n)
    p = max(0, upper_tail)

    return CramerVonMisesResult(statistic=w, pvalue=p)
|
|
|
|
|
|
def _get_wilcoxon_distr(n):
|
|
"""
|
|
Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum
|
|
of ranks of positive differences).
|
|
Returns an array with the probabilities of all the possible ranks
|
|
r = 0, ..., n*(n+1)/2
|
|
"""
|
|
c = np.ones(1, dtype=np.float64)
|
|
for k in range(1, n + 1):
|
|
prev_c = c
|
|
c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.float64)
|
|
m = len(prev_c)
|
|
c[:m] = prev_c * 0.5
|
|
c[-m:] += prev_c * 0.5
|
|
return c
|
|
|
|
|
|
def _get_wilcoxon_distr2(n):
|
|
"""
|
|
Distribution of probability of the Wilcoxon ranksum statistic r_plus (sum
|
|
of ranks of positive differences).
|
|
Returns an array with the probabilities of all the possible ranks
|
|
r = 0, ..., n*(n+1)/2
|
|
This is a slower reference function
|
|
References
|
|
----------
|
|
.. [1] 1. Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon
|
|
Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343.
|
|
"""
|
|
ai = np.arange(1, n+1)[:, None]
|
|
t = n*(n+1)/2
|
|
q = 2*t
|
|
j = np.arange(q)
|
|
theta = 2*np.pi/q*j
|
|
phi_sp = np.prod(np.cos(theta*ai), axis=0)
|
|
phi_s = np.exp(1j*theta*t) * phi_sp
|
|
p = np.real(ifft(phi_s))
|
|
res = np.zeros(int(t)+1)
|
|
res[:-1:] = p[::2]
|
|
res[0] /= 2
|
|
res[-1] = res[0]
|
|
return res
|
|
|
|
|
|
def _tau_b(A):
|
|
"""Calculate Kendall's tau-b and p-value from contingency table."""
|
|
# See [2] 2.2 and 4.2
|
|
|
|
# contingency table must be truly 2D
|
|
if A.shape[0] == 1 or A.shape[1] == 1:
|
|
return np.nan, np.nan
|
|
|
|
NA = A.sum()
|
|
PA = _P(A)
|
|
QA = _Q(A)
|
|
Sri2 = (A.sum(axis=1)**2).sum()
|
|
Scj2 = (A.sum(axis=0)**2).sum()
|
|
denominator = (NA**2 - Sri2)*(NA**2 - Scj2)
|
|
|
|
tau = (PA-QA)/(denominator)**0.5
|
|
|
|
numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA)
|
|
s02_tau_b = numerator/denominator
|
|
if s02_tau_b == 0: # Avoid divide by zero
|
|
return tau, 0
|
|
Z = tau/s02_tau_b**0.5
|
|
p = 2*norm.sf(abs(Z)) # 2-sided p-value
|
|
|
|
return tau, p
|
|
|
|
|
|
def _somers_d(A, alternative='two-sided'):
|
|
"""Calculate Somers' D and p-value from contingency table."""
|
|
# See [3] page 1740
|
|
|
|
# contingency table must be truly 2D
|
|
if A.shape[0] <= 1 or A.shape[1] <= 1:
|
|
return np.nan, np.nan
|
|
|
|
NA = A.sum()
|
|
NA2 = NA**2
|
|
PA = _P(A)
|
|
QA = _Q(A)
|
|
Sri2 = (A.sum(axis=1)**2).sum()
|
|
|
|
d = (PA - QA)/(NA2 - Sri2)
|
|
|
|
S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA
|
|
|
|
with np.errstate(divide='ignore'):
|
|
Z = (PA - QA)/(4*(S))**0.5
|
|
|
|
p = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
|
|
|
|
return d, p
|
|
|
|
|
|
@dataclass
class SomersDResult:
    """Result object returned by `somersd`."""

    # The Somers' D statistic.
    statistic: float
    # The p-value of the associated hypothesis test.
    pvalue: float
    # The contingency table the statistic was computed from.
    table: np.ndarray
|
|
|
|
|
|
def somersd(x, y=None, alternative='two-sided'):
    r"""Calculates Somers' D, an asymmetric measure of ordinal association.

    Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the
    correspondence between two rankings. Both statistics consider the
    difference between the number of concordant and discordant pairs in two
    rankings :math:`X` and :math:`Y`, and both are normalized such that values
    close  to 1 indicate strong agreement and values close to -1 indicate
    strong disagreement. They differ in how they are normalized. To show the
    relationship, Somers' :math:`D` can be defined in terms of Kendall's
    :math:`\tau_a`:

    .. math::
        D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)}

    Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the
    second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of
    :math:`n` rankings can also be viewed as an :math:`r \times s` contingency
    table in which element :math:`i, j` is the number of rank pairs with rank
    :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`.
    Accordingly, `somersd` also allows the input data to be supplied as a
    single, 2D contingency table instead of as two separate, 1D rankings.

    Note that the definition of Somers' :math:`D` is asymmetric: in general,
    :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers'
    :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent
    variable, and the "column" variable :math:`Y` is dependent. For Somers'
    :math:`D(X|Y)`, swap the input lists or transpose the input table.

    Parameters
    ----------
    x : array_like
        1D array of rankings, treated as the (row) independent variable.
        Alternatively, a 2D contingency table.
    y : array_like, optional
        If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the
        same length, treated as the (column) dependent variable.
        If `x` is 2D, `y` is ignored.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:
        * 'two-sided': the rank correlation is nonzero
        * 'less': the rank correlation is negative (less than zero)
        * 'greater':  the rank correlation is positive (greater than zero)

    Returns
    -------
    res : SomersDResult
        A `SomersDResult` object with the following fields:

            statistic : float
               The Somers' :math:`D` statistic.
            pvalue : float
               The p-value for a hypothesis test whose null
               hypothesis is an absence of association, :math:`D=0`.
               See notes for more information.
            table : 2D array
               The contingency table formed from rankings `x` and `y` (or the
               provided contingency table, if `x` is a 2D array)

    See Also
    --------
    kendalltau : Calculates Kendall's tau, another correlation measure.
    weightedtau : Computes a weighted version of Kendall's tau.
    spearmanr : Calculates a Spearman rank-order correlation coefficient.
    pearsonr : Calculates a Pearson correlation coefficient.

    Notes
    -----
    This function follows the contingency table approach of [2]_ and
    [3]_. *p*-values are computed based on an asymptotic approximation of
    the test statistic distribution under the null hypothesis :math:`D=0`.

    Theoretically, hypothesis tests based on Kendall's :math:`\tau` and Somers'
    :math:`D` should be identical.
    However, the *p*-values returned by `kendalltau` are based
    on the null hypothesis of *independence* between :math:`X` and :math:`Y`
    (i.e. the population from which pairs in :math:`X` and :math:`Y` are
    sampled contains equal numbers of all possible pairs), which is more
    specific than the null hypothesis :math:`D=0` used here. If the null
    hypothesis of independence is desired, it is acceptable to use the
    *p*-value returned by `kendalltau` with the statistic returned by
    `somersd` and vice versa. For more information, see [2]_.

    Contingency tables are formatted according to the convention used by
    SAS and R: the first ranking supplied (``x``) is the "row" variable, and
    the second ranking supplied (``y``) is the "column" variable. This is
    opposite the convention of Somers' original paper [1]_.

    References
    ----------
    .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for
           Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6,
           pp. 799--811, 1962.

    .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of
           Tests for Correlation in Two-Way Contingency Tables", *Journal of
           the American Statistical Association* Vol. 72, No. 358, pp.
           309--315, 1977.

    .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)",
           *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009.

    .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS
           Statistics Tutorials and Statistical Guides*,
           https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php,
           Accessed July 31, 2020.

    Examples
    --------
    We calculate Somers' D for the example given in [4]_, in which a hotel
    chain owner seeks to determine the association between hotel room
    cleanliness and customer satisfaction. The independent variable, hotel
    room cleanliness, is ranked on an ordinal scale: "below average (1)",
    "average (2)", or "above average (3)". The dependent variable, customer
    satisfaction, is ranked on a second scale: "very dissatisfied (1)",
    "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)",
    "moderately satisfied (4)", or "very satisfied (5)". 189 customers
    respond to the survey, and the results are cast into a contingency table
    with the hotel room cleanliness as the "row" variable and customer
    satisfaction as the "column" variable.

    +-----+-----+-----+-----+-----+-----+
    |     | (1) | (2) | (3) | (4) | (5) |
    +=====+=====+=====+=====+=====+=====+
    | (1) | 27  | 25  | 14  | 7   | 0   |
    +-----+-----+-----+-----+-----+-----+
    | (2) | 7   | 14  | 18  | 35  | 12  |
    +-----+-----+-----+-----+-----+-----+
    | (3) | 1   | 3   | 2   | 7   | 17  |
    +-----+-----+-----+-----+-----+-----+

    For example, 27 customers assigned their room a cleanliness ranking of
    "below average (1)" and a corresponding satisfaction of "very
    dissatisfied (1)". We perform the analysis as follows.

    >>> from scipy.stats import somersd
    >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
    >>> res = somersd(table)
    >>> res.statistic
    0.6032766111513396
    >>> res.pvalue
    1.0007091191074533e-27

    The value of the Somers' D statistic is approximately 0.6, indicating
    a positive correlation between room cleanliness and customer satisfaction
    in the sample.
    The *p*-value is very small, indicating a very small probability of
    observing such an extreme value of the statistic under the null
    hypothesis that the statistic of the entire population (from which
    our sample of 189 customers is drawn) is zero. This supports the
    alternative hypothesis that the true value of Somers' D for the population
    is nonzero.

    """
    x, y = np.array(x), np.array(y)

    # Anything other than a 1D ranking or a 2D table is rejected up front.
    if x.ndim not in (1, 2):
        raise ValueError("x must be either a 1D or 2D array")

    if x.ndim == 1:
        # Two rankings: build the contingency table from them.
        if x.size != y.size:
            raise ValueError("Rankings must be of equal length.")
        table = scipy.stats.contingency.crosstab(x, y)[1]
    else:
        # `x` already is the contingency table; `y` is ignored.
        if (x < 0).any():
            raise ValueError("All elements of the contingency table must be "
                             "non-negative.")
        if (x != x.astype(int)).any():
            raise ValueError("All elements of the contingency table must be "
                             "integer.")
        if np.count_nonzero(x) < 2:
            raise ValueError("At least two elements of the contingency table "
                             "must be nonzero.")
        table = x

    # The table type is converted to a float to avoid an integer overflow
    d, p = _somers_d(table.astype(float), alternative)

    # add alias for consistency with other correlation functions
    res = SomersDResult(d, p, table)
    res.correlation = d
    return res
|
|
|
|
|
|
# This could be combined with `_all_partitions` in `_resampling.py`
|
|
def _all_partitions(nx, ny):
|
|
"""
|
|
Partition a set of indices into two fixed-length sets in all possible ways
|
|
|
|
Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and
|
|
ny in all possible ways (ignoring order of elements).
|
|
"""
|
|
z = np.arange(nx+ny)
|
|
for c in combinations(z, nx):
|
|
x = np.array(c)
|
|
mask = np.ones(nx+ny, bool)
|
|
mask[x] = False
|
|
y = z[mask]
|
|
yield x, y
|
|
|
|
|
|
def _compute_log_combinations(n):
|
|
"""Compute all log combination of C(n, k)."""
|
|
gammaln_arr = gammaln(np.arange(n + 1) + 1)
|
|
return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1]
|
|
|
|
|
|
@dataclass
class BarnardExactResult:
    """Result object returned by `barnard_exact`.

    Attributes
    ----------
    statistic : float
        The observed Wald statistic. `barnard_exact` sets it to NaN when a
        column of the input table sums to zero.
    pvalue : float
        The p-value of the test.
    """
    statistic: float
    pvalue: float
|
|
|
|
|
|
def barnard_exact(table, alternative="two-sided", pooled=True, n=32):
    r"""Perform a Barnard exact test on a 2x2 contingency table.

    Parameters
    ----------
    table : array_like of ints
        A 2x2 contingency table. Elements should be non-negative integers.

    alternative : {'two-sided', 'less', 'greater'}, optional
        Selects the null and alternative hypotheses, which are described in
        the Notes section below. Default is 'two-sided'.

    pooled : bool, optional
        If ``True`` (default), the score statistic is computed with a pooled
        variance (as in Student's t-test, for example); if ``False``, with
        an unpooled variance (as in Welch's t-test).

    n : int, optional
        Number of sampling points used in the construction of the sampling
        method. Since `scipy.stats.qmc.Sobol` is used to select sample
        points, this argument is automatically converted to the next higher
        power of 2. Must be positive. Default is 32, which is enough to
        reach good precision in most cases; more points come at a
        performance cost.

    Returns
    -------
    ber : BarnardExactResult
        A result object with the following attributes.

        statistic : float
            The Wald statistic with pooled or unpooled variance, depending
            on the user choice of `pooled`.

        pvalue : float
            P-value, the probability of obtaining a distribution at least as
            extreme as the one that was actually observed, assuming that the
            null hypothesis is true.

    See Also
    --------
    chi2_contingency : Chi-square test of independence of variables in a
        contingency table.
    fisher_exact : Fisher exact test on a 2x2 contingency table.
    boschloo_exact : Boschloo's exact test on a 2x2 contingency table,
        which is a uniformly more powerful alternative to Fisher's exact
        test.

    Notes
    -----
    Barnard's test is an exact test used in the analysis of contingency
    tables. It examines the association of two categorical variables, and
    is a more powerful alternative than Fisher's exact test
    for 2x2 contingency tables.

    Let :math:`X_0` be a 2x2 matrix representing the observed sample,
    where each column stores the binomial experiment, as in the example
    below, and let :math:`p_1, p_2` be the theoretical binomial
    probabilities for :math:`x_{11}` and :math:`x_{12}`. With Barnard's
    exact test, three different null hypotheses can be tested:

    - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`,
      with `alternative` = "less"

    - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`,
      with `alternative` = "greater"

    - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`,
      with `alternative` = "two-sided" (default one)

    The test is based on the Wald statistic [3]_ with pooled or unpooled
    variance. Under the default assumption that both variances are equal
    (``pooled = True``), the statistic is computed as:

    .. math::

        T(X) = \frac{
            \hat{p}_1 - \hat{p}_2
        }{
            \sqrt{
                \hat{p}(1 - \hat{p})
                (\frac{1}{c_1} +
                \frac{1}{c_2})
            }
        }

    with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of
    :math:`p_1, p_2` and :math:`p`, the latter being the combined probability,
    given the assumption that :math:`p_1 = p_2`.

    If this assumption is invalid (``pooled = False``), the statistic is:

    .. math::

        T(X) = \frac{
            \hat{p}_1 - \hat{p}_2
        }{
            \sqrt{
                \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} +
                \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2}
            }
        }

    The p-value is then computed as:

    .. math::

        \sum
            \binom{c_1}{x_{11}}
            \binom{c_2}{x_{12}}
            \pi^{x_{11} + x_{12}}
            (1 - \pi)^{t - x_{11} - x_{12}}

    where the sum is over all 2x2 contingency tables :math:`X` such that:
    * :math:`T(X) \leq T(X_0)` when `alternative` = "less",
    * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or
    * :math:`T(X) \geq |T(X_0)|` when `alternative` = "two-sided".
    Above, :math:`c_1, c_2` are the sum of the columns 1 and 2,
    and :math:`t` the total (sum of the 4 sample's element).

    The returned p-value is the maximum p-value taken over the nuisance
    parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`.

    This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the
    number of sample points.

    References
    ----------
    .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*.
           34.1/2 (1947): 123-138. :doi:`dpgkg3`

    .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus
           unconditional exact tests for comparing two binomials."
           *Cytel Software Corporation* 675 (2003): 1-5.

    .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test

    Examples
    --------
    An example use of Barnard's test is presented in [2]_.

        Consider the following example of a vaccine efficacy study
        (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were
        inoculated with a recombinant DNA influenza vaccine and the 15 were
        inoculated with a placebo. Twelve of the 15 subjects in the placebo
        group (80%) eventually became infected with influenza whereas for the
        vaccine group, only 7 of the 15 subjects (47%) became infected. The
        data are tabulated as a 2 x 2 table::

                Vaccine  Placebo
            Yes     7        12
            No      8        3

    When working with statistical hypothesis testing, we usually use a
    threshold probability or significance level upon which we decide
    to reject the null hypothesis :math:`H_0`. Suppose we choose the common
    significance level of 5%.

    Our alternative hypothesis is that the vaccine will lower the chance of
    becoming infected with the virus; that is, the probability :math:`p_1` of
    catching the virus with the vaccine will be *less than* the probability
    :math:`p_2` of catching the virus without the vaccine.  Therefore, we call
    `barnard_exact` with the ``alternative="less"`` option:

    >>> import scipy.stats as stats
    >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less")
    >>> res.statistic
    -1.894...
    >>> res.pvalue
    0.03407...

    Under the null hypothesis that the vaccine will not lower the chance of
    becoming infected, the probability of obtaining test results at least as
    extreme as the observed data is approximately 3.4%. Since this p-value is
    less than our chosen significance level, we have evidence to reject
    :math:`H_0` in favor of the alternative.

    Suppose we had used Fisher's exact test instead:

    >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less")
    >>> pvalue
    0.0640...

    With the same threshold significance of 5%, we would not have been able
    to reject the null hypothesis in favor of the alternative. As stated in
    [2]_, Barnard's test is uniformly more powerful than Fisher's exact test
    because Barnard's test does not condition on any margin. Fisher's test
    should only be used when both sets of marginals are fixed.

    """
    if n <= 0:
        raise ValueError(
            f"Number of points `n` must be strictly positive, found {n!r}"
        )

    table = np.asarray(table, dtype=np.int64)

    if table.shape != (2, 2):
        raise ValueError("The input `table` must be of shape (2, 2).")

    if np.any(table < 0):
        raise ValueError("All values in `table` must be nonnegative.")

    column_totals = table.sum(axis=0)
    if 0 in column_totals:
        # If both values in column are zero, the p-value is 1 and
        # the score's statistic is NaN.
        return BarnardExactResult(np.nan, 1.0)

    c1, c2 = column_totals

    # Sample space of the two binomial experiments: rows run over the
    # possible counts in column 1, columns over those in column 2.
    k1 = np.arange(c1 + 1, dtype=np.int64).reshape(-1, 1)
    k2 = np.arange(c2 + 1, dtype=np.int64).reshape(1, -1)
    k_total = k1 + k2

    # The Wald statistic is evaluated for every combination of k1 and k2.
    prop_1, prop_2 = k1 / c1, k2 / c2

    if pooled:
        prop_pooled = k_total / (c1 + c2)
        variance = prop_pooled * (1 - prop_pooled) * (1 / c1 + 1 / c2)
    else:
        variance = prop_1 * (1 - prop_1) / c1 + prop_2 * (1 - prop_2) / c2

    # Zero variances are expected here; silence the resulting warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        wald = (prop_1 - prop_2) / np.sqrt(variance)

    # Equal proportions give 0/0 above; the statistic there is defined as 0.
    wald[prop_1 == prop_2] = 0

    observed = wald[table[0, 0], table[0, 1]]

    if alternative == "less":
        as_extreme = wald <= observed
    elif alternative == "greater":
        as_extreme = wald >= observed
    elif alternative == "two-sided":
        as_extreme = np.abs(wald) >= abs(observed)
    else:
        raise ValueError(
            "`alternative` should be one of {'two-sided', 'less', 'greater'},"
            f" found {alternative!r}"
        )

    log_comb = (_compute_log_combinations(c1)[k1]
                + _compute_log_combinations(c2)[k2])

    result = shgo(
        _get_binomial_log_p_value_with_nuisance_param,
        args=(k_total, log_comb, as_extreme),
        bounds=((0, 1),),
        n=n,
        sampling_method="sobol",
    )

    # result.fun is the negative log pvalue and therefore needs to be
    # changed before return
    p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
    return BarnardExactResult(observed, p_value)
|
|
|
|
|
|
@dataclass
class BoschlooExactResult:
    """Result object returned by `boschloo_exact`.

    Attributes
    ----------
    statistic : float
        The statistic used in Boschloo's test, i.e. a Fisher exact test
        p-value. `boschloo_exact` sets it to NaN when a column of the input
        table sums to zero.
    pvalue : float
        The p-value of the test. `boschloo_exact` sets it to NaN when a
        column of the input table sums to zero.
    """
    statistic: float
    pvalue: float
|
|
|
|
|
|
def boschloo_exact(table, alternative="two-sided", n=32):
    r"""Perform Boschloo's exact test on a 2x2 contingency table.

    Parameters
    ----------
    table : array_like of ints
        A 2x2 contingency table. Elements should be non-negative integers.

    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the null and alternative hypotheses. Default is 'two-sided'.
        Please see explanations in the Notes section below.

    n : int, optional
        Number of sampling points used in the construction of the sampling
        method. Note that this argument will automatically be converted to
        the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
        select sample points. Default is 32. Must be positive. In most cases,
        32 points is enough to reach good precision. More points comes at
        performance cost.

    Returns
    -------
    ber : BoschlooExactResult
        A result object with the following attributes.

        statistic : float
            The statistic used in Boschloo's test; that is, the p-value
            from Fisher's exact test.

        pvalue : float
            P-value, the probability of obtaining a distribution at least as
            extreme as the one that was actually observed, assuming that the
            null hypothesis is true.

    Raises
    ------
    ValueError
        If `n` is not strictly positive, if `table` does not have shape
        (2, 2) or contains negative values, or if `alternative` is not one
        of the accepted options.

    See Also
    --------
    chi2_contingency : Chi-square test of independence of variables in a
        contingency table.
    fisher_exact : Fisher exact test on a 2x2 contingency table.
    barnard_exact : Barnard's exact test, which is a more powerful alternative
        than Fisher's exact test for 2x2 contingency tables.

    Notes
    -----
    Boschloo's test is an exact test used in the analysis of contingency
    tables. It examines the association of two categorical variables, and
    is a uniformly more powerful alternative to Fisher's exact test
    for 2x2 contingency tables.

    Boschloo's exact test uses the p-value of Fisher's exact test as a
    statistic, and Boschloo's p-value is the probability under the null
    hypothesis of observing such an extreme value of this statistic.

    Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
    where each column stores the binomial experiment, as in the example
    below. Let's also define :math:`p_1, p_2` the theoretical binomial
    probabilities for  :math:`x_{11}` and :math:`x_{12}`. When using
    Boschloo exact test, we can assert three different alternative hypotheses:

    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`,
      with `alternative` = "less"

    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`,
      with `alternative` = "greater"

    - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`,
      with `alternative` = "two-sided" (default)

    There are multiple conventions for computing a two-sided p-value when the
    null distribution is asymmetric. Here, we apply the convention that the
    p-value of a two-sided test is twice the minimum of the p-values of the
    one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a
    different convention, so for a given `table`, the statistic reported by
    `boschloo_exact` may differ from the p-value reported by `fisher_exact`
    when ``alternative='two-sided'``.

    If either column of `table` sums to zero, both the statistic and the
    p-value are returned as NaN.

    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] R.D. Boschloo. "Raised conditional level of significance for the
       2 x 2-table when testing the equality of two probabilities",
       Statistica Neerlandica, 24(1), 1970

    .. [2] "Boschloo's test", Wikipedia,
       https://en.wikipedia.org/wiki/Boschloo%27s_test

    .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction",
       Human Resource Management, 43(4), 395-407, 2004,
       :doi:`10.1002/hrm.20032`.

    Examples
    --------
    In the following example, we consider the article "Employee
    attitudes and job satisfaction" [3]_
    which reports the results of a survey from 63 scientists and 117 college
    professors. Of the 63 scientists, 31 said they were very satisfied with
    their jobs, whereas 74 of the college professors were very satisfied
    with their work. Is this significant evidence that college
    professors are happier with their work than scientists?
    The following table summarizes the data mentioned above::

                         college professors   scientists
        Very Satisfied   74                     31
        Dissatisfied     43                     32

    When working with statistical hypothesis testing, we usually use a
    threshold probability or significance level upon which we decide
    to reject the null hypothesis :math:`H_0`. Suppose we choose the common
    significance level of 5%.

    Our alternative hypothesis is that college professors are truly more
    satisfied with their work than scientists. Therefore, we expect
    :math:`p_1` the proportion of very satisfied college professors to be
    greater than :math:`p_2`, the proportion of very satisfied scientists.
    We thus call `boschloo_exact` with the ``alternative="greater"`` option:

    >>> import scipy.stats as stats
    >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater")
    >>> res.statistic
    0.0483...
    >>> res.pvalue
    0.0355...

    Under the null hypothesis that scientists are happier in their work than
    college professors, the probability of obtaining test
    results at least as extreme as the observed data is approximately 3.55%.
    Since this p-value is less than our chosen significance level, we have
    evidence to reject :math:`H_0` in favor of the alternative hypothesis.

    """
    hypergeom = distributions.hypergeom

    if n <= 0:
        raise ValueError(
            "Number of points `n` must be strictly positive,"
            f" found {n!r}"
        )

    table = np.asarray(table, dtype=np.int64)

    if not table.shape == (2, 2):
        raise ValueError("The input `table` must be of shape (2, 2).")

    if np.any(table < 0):
        raise ValueError("All values in `table` must be nonnegative.")

    if 0 in table.sum(axis=0):
        # If both values in a column are zero, both the statistic and the
        # p-value are reported as NaN.
        return BoschlooExactResult(np.nan, np.nan)

    total_col_1, total_col_2 = table.sum(axis=0)
    total = total_col_1 + total_col_2
    x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1)
    x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1)
    x1_sum_x2 = x1 + x2

    if alternative == 'less':
        pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T
    elif alternative == 'greater':
        # Same formula as the 'less' case, but with the second column.
        pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T
    elif alternative == 'two-sided':
        boschloo_less = boschloo_exact(table, alternative="less", n=n)
        boschloo_greater = boschloo_exact(table, alternative="greater", n=n)

        res = (
            boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue
            else boschloo_greater
        )

        # Two-sided p-value is defined as twice the minimum of the one-sided
        # p-values
        pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1)
        return BoschlooExactResult(res.statistic, pvalue)
    else:
        # The first fragment must NOT be an f-string: its braces are literal
        # text, and an f-string would evaluate them as a set and print its
        # arbitrarily ordered repr.
        msg = (
            "`alternative` should be one of {'two-sided', 'less', 'greater'},"
            f" found {alternative!r}"
        )
        raise ValueError(msg)

    fisher_stat = pvalues[table[0, 0], table[0, 1]]

    # fisher_stat * (1+1e-13) guards us from small numerical error. It is
    # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0
    # For more thorough explanations, see gh-14178
    index_arr = pvalues <= fisher_stat * (1+1e-13)

    # `pvalues` was transposed above, so transpose the sample-space arrays
    # to the same orientation before indexing with them.
    x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T
    x1_log_comb = _compute_log_combinations(total_col_1)
    x2_log_comb = _compute_log_combinations(total_col_2)
    x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]

    result = shgo(
        _get_binomial_log_p_value_with_nuisance_param,
        args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
        bounds=((0, 1),),
        n=n,
        sampling_method="sobol",
    )

    # result.fun is the negative log pvalue and therefore needs to be
    # changed before return
    p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
    return BoschlooExactResult(fisher_stat, p_value)
|
|
|
|
|
|
def _get_binomial_log_p_value_with_nuisance_param(
|
|
nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr
|
|
):
|
|
r"""
|
|
Compute the log pvalue in respect of a nuisance parameter considering
|
|
a 2x2 sample space.
|
|
|
|
Parameters
|
|
----------
|
|
nuisance_param : float
|
|
nuisance parameter used in the computation of the maximisation of
|
|
the p-value. Must be between 0 and 1
|
|
|
|
x1_sum_x2 : ndarray
|
|
Sum of x1 and x2 inside barnard_exact
|
|
|
|
x1_sum_x2_log_comb : ndarray
|
|
sum of the log combination of x1 and x2
|
|
|
|
index_arr : ndarray of boolean
|
|
|
|
Returns
|
|
-------
|
|
p_value : float
|
|
Return the maximum p-value considering every nuisance parameter
|
|
between 0 and 1
|
|
|
|
Notes
|
|
-----
|
|
|
|
Both Barnard's test and Boschloo's test iterate over a nuisance parameter
|
|
:math:`\pi \in [0, 1]` to find the maximum p-value. To search this
|
|
maxima, this function return the negative log pvalue with respect to the
|
|
nuisance parameter passed in params. This negative log p-value is then
|
|
used in `shgo` to find the minimum negative pvalue which is our maximum
|
|
pvalue.
|
|
|
|
Also, to compute the different combination used in the
|
|
p-values' computation formula, this function uses `gammaln` which is
|
|
more tolerant for large value than `scipy.special.comb`. `gammaln` gives
|
|
a log combination. For the little precision loss, performances are
|
|
improved a lot.
|
|
"""
|
|
t1, t2 = x1_sum_x2.shape
|
|
n = t1 + t2 - 2
|
|
with np.errstate(divide="ignore", invalid="ignore"):
|
|
log_nuisance = np.log(
|
|
nuisance_param,
|
|
out=np.zeros_like(nuisance_param),
|
|
where=nuisance_param >= 0,
|
|
)
|
|
log_1_minus_nuisance = np.log(
|
|
1 - nuisance_param,
|
|
out=np.zeros_like(nuisance_param),
|
|
where=1 - nuisance_param >= 0,
|
|
)
|
|
|
|
nuisance_power_x1_x2 = log_nuisance * x1_sum_x2
|
|
nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0
|
|
|
|
nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2)
|
|
nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0
|
|
|
|
tmp_log_values_arr = (
|
|
x1_sum_x2_log_comb
|
|
+ nuisance_power_x1_x2
|
|
+ nuisance_power_n_minus_x1_x2
|
|
)
|
|
|
|
tmp_values_from_index = tmp_log_values_arr[index_arr]
|
|
|
|
# To avoid dividing by zero in log function and getting inf value,
|
|
# values are centered according to the max
|
|
max_value = tmp_values_from_index.max()
|
|
|
|
# To have better result's precision, the log pvalue is taken here.
|
|
# Indeed, pvalue is included inside [0, 1] interval. Passing the
|
|
# pvalue to log makes the interval a lot bigger ([-inf, 0]), and thus
|
|
# help us to achieve better precision
|
|
with np.errstate(divide="ignore", invalid="ignore"):
|
|
log_probs = np.exp(tmp_values_from_index - max_value).sum()
|
|
log_pvalue = max_value + np.log(
|
|
log_probs,
|
|
out=np.full_like(log_probs, -np.inf),
|
|
where=log_probs > 0,
|
|
)
|
|
|
|
# Since shgo find the minima, minus log pvalue is returned
|
|
return -log_pvalue
|
|
|
|
|
|
def _pval_cvm_2samp_exact(s, m, n):
|
|
"""
|
|
Compute the exact p-value of the Cramer-von Mises two-sample test
|
|
for a given value s of the test statistic.
|
|
m and n are the sizes of the samples.
|
|
|
|
[1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for
|
|
the Cramér-Von Mises Two-Sample Test", J. Stat. Soft.,
|
|
vol. 17, no. 8, pp. 1-15, Dec. 2006.
|
|
[2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises
|
|
Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist.
|
|
33(3), 1148-1159, (September, 1962)
|
|
"""
|
|
|
|
# [1, p. 3]
|
|
lcm = np.lcm(m, n)
|
|
# [1, p. 4], below eq. 3
|
|
a = lcm // m
|
|
b = lcm // n
|
|
# Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$
|
|
# Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2]
|
|
mn = m * n
|
|
zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2)
|
|
|
|
# bound maximum value that may appear in `gs` (remember both rows!)
|
|
zeta_bound = lcm**2 * (m + n) # bound elements in row 1
|
|
combinations = comb(m + n, m) # sum of row 2
|
|
max_gs = max(zeta_bound, combinations)
|
|
dtype = np.min_scalar_type(max_gs)
|
|
|
|
# the frequency table of $g_{u, v}^+$ defined in [1, p. 6]
|
|
gs = ([np.array([[0], [1]], dtype=dtype)]
|
|
+ [np.empty((2, 0), dtype=dtype) for _ in range(m)])
|
|
for u in range(n + 1):
|
|
next_gs = []
|
|
tmp = np.empty((2, 0), dtype=dtype)
|
|
for v, g in enumerate(gs):
|
|
# Calculate g recursively with eq. 11 in [1]. Even though it
|
|
# doesn't look like it, this also does 12/13 (all of Algorithm 1).
|
|
vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True)
|
|
tmp = np.concatenate([
|
|
np.stack([vi, tmp[1, i0] + g[1, i1]]),
|
|
np.delete(tmp, i0, 1),
|
|
np.delete(g, i1, 1)
|
|
], 1)
|
|
res = (a * v - b * u) ** 2
|
|
tmp[0] += res.astype(dtype)
|
|
next_gs.append(tmp)
|
|
gs = next_gs
|
|
value, freq = gs[m]
|
|
return np.float64(np.sum(freq[value >= zeta]) / combinations)
|
|
|
|
|
|
@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=2, too_small=1,
                          result_to_tuple=_cvm_result_to_tuple)
def cramervonmises_2samp(x, y, method='auto'):
    """Perform the two-sample Cramér-von Mises test for goodness of fit.

    This is the two-sample version of the Cramér-von Mises test ([1]_):
    for two independent samples :math:`X_1, ..., X_n` and
    :math:`Y_1, ..., Y_m`, the null hypothesis is that the samples
    come from the same (unspecified) continuous distribution.

    Parameters
    ----------
    x : array_like
        A 1-D array of observed values of the random variables :math:`X_i`.
    y : array_like
        A 1-D array of observed values of the random variables :math:`Y_i`.
    method : {'auto', 'asymptotic', 'exact'}, optional
        How the p-value is computed; the options are described in the
        Notes. The default is 'auto'.

    Returns
    -------
    res : object with attributes
        statistic : float
            Cramér-von Mises statistic.
        pvalue : float
            The p-value.

    See Also
    --------
    cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp

    Notes
    -----
    .. versionadded:: 1.7.0

    The statistic is computed according to equation 9 in [2]_. The
    calculation of the p-value depends on the keyword `method`:

    - ``asymptotic``: The p-value is approximated by using the limiting
      distribution of the test statistic.
    - ``exact``: The exact p-value is computed by enumerating all
      possible combinations of the test statistic, see [2]_.

    If ``method='auto'``, the exact approach is used
    if both samples contain equal to or less than 20 observations,
    otherwise the asymptotic distribution is used.

    If the underlying distribution is not continuous, the p-value is likely to
    be conservative (Section 6.2 in [3]_). When ranking the data to compute
    the test statistic, midranks are used if there are ties.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion
    .. [2] Anderson, T.W. (1962). On the distribution of the two-sample
           Cramer-von-Mises criterion. The Annals of Mathematical
           Statistics, pp. 1148-1159.
    .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971.

    Examples
    --------

    Suppose we wish to test whether two samples generated by
    ``scipy.stats.norm.rvs`` have the same distribution. We choose a
    significance level of alpha=0.05.

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = stats.norm.rvs(size=100, random_state=rng)
    >>> y = stats.norm.rvs(size=70, random_state=rng)
    >>> res = stats.cramervonmises_2samp(x, y)
    >>> res.statistic, res.pvalue
    (0.29376470588235293, 0.1412873014573014)

    The p-value exceeds our chosen significance level, so we do not
    reject the null hypothesis that the observed samples are drawn from the
    same distribution.

    For small sample sizes, one can compute the exact p-values:

    >>> x = stats.norm.rvs(size=7, random_state=rng)
    >>> y = stats.t.rvs(df=2, size=6, random_state=rng)
    >>> res = stats.cramervonmises_2samp(x, y, method='exact')
    >>> res.statistic, res.pvalue
    (0.197802197802198, 0.31643356643356646)

    The p-value based on the asymptotic distribution is a good approximation
    even though the sample size is small.

    >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic')
    >>> res.statistic, res.pvalue
    (0.197802197802198, 0.2966041181527128)

    Independent of the method, one would not reject the null hypothesis at the
    chosen significance level in this example.

    """
    xa = np.sort(np.asarray(x))
    ya = np.sort(np.asarray(y))

    if min(xa.size, ya.size) <= 1:
        raise ValueError('x and y must contain at least two observations.')
    if method not in ('auto', 'exact', 'asymptotic'):
        raise ValueError('method must be either auto, exact or asymptotic.')

    nx, ny = len(xa), len(ya)

    if method == 'auto':
        # Exact enumeration is only used when both samples are small.
        method = 'exact' if max(nx, ny) <= 20 else 'asymptotic'

    # ranks of x and y in the pooled sample; in case of ties, use midrank
    # (see [1])
    pooled_ranks = scipy.stats.rankdata(np.concatenate([xa, ya]),
                                        method='average')
    rx, ry = pooled_ranks[:nx], pooled_ranks[nx:]

    # compute U (eq. 10 in [2])
    u = (nx * np.sum((rx - np.arange(1, nx+1))**2)
         + ny * np.sum((ry - np.arange(1, ny+1))**2))

    # compute T (eq. 9 in [2])
    k, N = nx*ny, nx + ny
    t = u / (k*N) - (4*k - 1)/(6*N)

    if method == 'exact':
        return CramerVonMisesResult(
            statistic=t, pvalue=_pval_cvm_2samp_exact(u, nx, ny)
        )

    # compute expected value and variance of T (eq. 11 and 14 in [2])
    et = (1 + 1/N)/6
    vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k) / (45 * N**2 * 4 * k)

    # computed the normalized statistic (eq. 15 in [2])
    tn = 1/6 + (t - et) / np.sqrt(45 * vt)

    # approximate distribution of tn with limiting distribution
    # of the one-sample test statistic
    # if tn < 0.003, the _cdf_cvm_inf(tn) < 1.28*1e-18, return 1.0 directly
    p = 1.0 if tn < 0.003 else max(0, 1. - _cdf_cvm_inf(tn))

    return CramerVonMisesResult(statistic=t, pvalue=p)
|
|
|
|
|
|
class TukeyHSDResult:
    """Result of `scipy.stats.tukey_hsd`.

    Attributes
    ----------
    statistic : float ndarray
        The computed statistic of the test for each comparison. The element
        at index ``(i, j)`` is the statistic for the comparison between groups
        ``i`` and ``j``.
    pvalue : float ndarray
        The associated p-value from the studentized range distribution. The
        element at index ``(i, j)`` is the p-value for the comparison
        between groups ``i`` and ``j``.

    Notes
    -----
    The string representation of this object displays the most recently
    calculated confidence interval, and if none have been previously
    calculated, it will evaluate ``confidence_interval()``.

    References
    ----------
    .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
           Method."
           https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
           28 November 2020.
    """

    def __init__(self, statistic, pvalue, _nobs, _ntreatments, _stand_err):
        self.statistic = statistic
        self.pvalue = pvalue
        # NOTE: the two private count slots are named the "wrong way round"
        # relative to what they hold. `tukey_hsd` passes the number of
        # treatments in the `_nobs` position and the total number of
        # observations in the `_ntreatments` position, and
        # `confidence_interval` relies on exactly that. The names are kept
        # unchanged so existing positional/keyword callers keep working.
        self._ntreatments = _ntreatments
        self._nobs = _nobs
        self._stand_err = _stand_err
        # Cache of the most recently computed confidence interval and the
        # confidence level it was computed for.
        self._ci = None
        self._ci_cl = None

    def __str__(self):
        """Return a table of all pairwise comparisons.

        The table shows the confidence interval from the most recent call to
        `confidence_interval`; if it has never been called, it is evaluated
        here with the default confidence level of .95.
        """
        if self._ci is None:
            self.confidence_interval(confidence_level=.95)

        ngroups = self.pvalue.shape[0]
        # Collect the pieces and join once instead of growing a string with
        # repeated `+=` inside the nested loop.
        pieces = [
            "Tukey's HSD Pairwise Group Comparisons"
            f" ({self._ci_cl*100:.1f}% Confidence Interval)\n",
            "Comparison Statistic p-value Lower CI Upper CI\n",
        ]
        for i in range(ngroups):
            for j in range(ngroups):
                # self-comparisons carry no information; leave them out
                if i != j:
                    pieces.append(
                        f" ({i} - {j}) {self.statistic[i, j]:>10.3f}"
                        f"{self.pvalue[i, j]:>10.3f}"
                        f"{self._ci.low[i, j]:>10.3f}"
                        f"{self._ci.high[i, j]:>10.3f}\n"
                    )
        return "".join(pieces)

    def confidence_interval(self, confidence_level=.95):
        """Compute the confidence interval for the specified confidence level.

        Parameters
        ----------
        confidence_level : float, optional
            Confidence level for the computed confidence intervals of the
            pairwise differences between group means. Must be strictly
            between 0 and 1. Default is .95.

        Returns
        -------
        ci : ``ConfidenceInterval`` object
            The object has attributes ``low`` and ``high`` that hold the
            lower and upper bounds of the confidence intervals for each
            comparison. The high and low values are accessible for each
            comparison at index ``(i, j)`` between groups ``i`` and ``j``.

        Raises
        ------
        ValueError
            If `confidence_level` is not strictly between 0 and 1.

        Notes
        -----
        The result is cached: calling this method again with the same
        `confidence_level` returns the previously computed object.

        References
        ----------
        .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1.
               Tukey's Method."
               https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
               28 November 2020.

        Examples
        --------
        >>> from scipy.stats import tukey_hsd
        >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
        >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
        >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
        >>> result = tukey_hsd(group0, group1, group2)
        >>> ci = result.confidence_interval()
        >>> ci.low
        array([[-3.649159, -8.249159, -3.909159],
               [ 0.950841, -3.649159,  0.690841],
               [-3.389159, -7.989159, -3.649159]])
        >>> ci.high
        array([[ 3.649159, -0.950841,  3.389159],
               [ 8.249159,  3.649159,  7.989159],
               [ 3.909159, -0.690841,  3.649159]])
        """
        # check to see if the supplied confidence level matches that of the
        # previously computed CI.
        if (self._ci is not None and self._ci_cl is not None and
                confidence_level == self._ci_cl):
            return self._ci

        if not 0 < confidence_level < 1:
            raise ValueError("Confidence level must be between 0 and 1.")
        # determine the critical value of the studentized range using the
        # appropriate confidence level, number of treatments, and degrees
        # of freedom as determined by the number of data less the number of
        # treatments. ("Confidence limits for Tukey's method")[1]. Note that
        # in the cases of unequal sample sizes there will be a criterion for
        # each group comparison.
        # Because of the swapped slot names (see `__init__`), `self._nobs`
        # is the number of treatments and `self._ntreatments - self._nobs`
        # is the degrees of freedom here.
        params = (confidence_level, self._nobs, self._ntreatments - self._nobs)
        srd = distributions.studentized_range.ppf(*params)
        # also called maximum critical value, the Tukey criterion is the
        # studentized range critical value * the square root of mean square
        # error over the sample size.
        tukey_criterion = srd * self._stand_err
        # the confidence limits are determined by the
        # `mean_differences` +- `tukey_criterion`
        upper_conf = self.statistic + tukey_criterion
        lower_conf = self.statistic - tukey_criterion
        self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf)
        self._ci_cl = confidence_level
        return self._ci
|
|
|
|
|
|
def _tukey_hsd_iv(args):
|
|
if (len(args)) < 2:
|
|
raise ValueError("There must be more than 1 treatment.")
|
|
args = [np.asarray(arg) for arg in args]
|
|
for arg in args:
|
|
if arg.ndim != 1:
|
|
raise ValueError("Input samples must be one-dimensional.")
|
|
if arg.size <= 1:
|
|
raise ValueError("Input sample size must be greater than one.")
|
|
if np.isinf(arg).any():
|
|
raise ValueError("Input samples must be finite.")
|
|
return args
|
|
|
|
|
|
def tukey_hsd(*args):
    """Perform Tukey's HSD test for equality of means over multiple treatments.

    Tukey's honestly significant difference (HSD) test performs pairwise
    comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`)
    assesses whether the true means underlying each sample are identical,
    Tukey's HSD is a post hoc test used to compare the mean of each sample
    to the mean of each other sample.

    The null hypothesis is that the distributions underlying the samples all
    have the same mean. The test statistic, which is computed for every
    possible pairing of samples, is simply the difference between the sample
    means. For each pair, the p-value is the probability under the null
    hypothesis (and other assumptions; see notes) of observing such an extreme
    value of the statistic, considering that many pairwise comparisons are
    being performed. Confidence intervals for the difference between each pair
    of means are also available.

    Parameters
    ----------
    sample1, sample2, ... : array_like
        The sample measurements for each group. There must be at least
        two arguments.

    Returns
    -------
    result : `~scipy.stats._result_classes.TukeyHSDResult` instance
        The return value is an object with the following attributes:

        statistic : float ndarray
            The computed statistic of the test for each comparison. The element
            at index ``(i, j)`` is the statistic for the comparison between
            groups ``i`` and ``j``.
        pvalue : float ndarray
            The computed p-value of the test for each comparison. The element
            at index ``(i, j)`` is the p-value for the comparison between
            groups ``i`` and ``j``.

        The object has the following methods:

        confidence_interval(confidence_level=0.95):
            Compute the confidence interval for the specified confidence level.

    See Also
    --------
    dunnett : performs comparison of means against a control group.

    Notes
    -----
    The use of this test relies on several assumptions.

    1. The observations are independent within and among groups.
    2. The observations within each group are normally distributed.
    3. The distributions from which the samples are drawn have the same finite
       variance.

    The original formulation of the test was for samples of equal size [6]_.
    In case of unequal sample sizes, the test uses the Tukey-Kramer method
    [4]_.

    References
    ----------
    .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
           Method."
           https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
           28 November 2020.
    .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant
           Difference (HSD) Test."
           https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf
    .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS
           Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm.
    .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group
           Means with Unequal Numbers of Replications." Biometrics, vol. 12,
           no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469.
           Accessed 25 May 2021.
    .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3.
           The ANOVA table and tests of hypotheses about means"
           https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm,
           2 June 2021.
    .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of
           Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR,
           www.jstor.org/stable/3001913. Accessed 14 June 2021.


    Examples
    --------
    Here are some data comparing the time to relief of three brands of
    headache medicine, reported in minutes. Data adapted from [3]_.

    >>> import numpy as np
    >>> from scipy.stats import tukey_hsd
    >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
    >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
    >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]

    We would like to see if the means between any of the groups are
    significantly different. First, visually examine a box and whisker plot.

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(1, 1)
    >>> ax.boxplot([group0, group1, group2])
    >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP
    >>> ax.set_ylabel("mean") # doctest: +SKIP
    >>> plt.show()

    From the box and whisker plot, we can see overlap in the interquartile
    range of the first group with those of the second and third groups, but
    we can apply the ``tukey_hsd`` test to determine if the difference
    between means is significant. We set a significance level of .05 to
    reject the null hypothesis.

    >>> res = tukey_hsd(group0, group1, group2)
    >>> print(res)
    Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
    Comparison Statistic p-value Lower CI Upper CI
    (0 - 1)     -4.600     0.014    -8.249    -0.951
    (0 - 2)     -0.260     0.980    -3.909     3.389
    (1 - 0)      4.600     0.014     0.951     8.249
    (1 - 2)      4.340     0.020     0.691     7.989
    (2 - 0)      0.260     0.980    -3.389     3.909
    (2 - 1)     -4.340     0.020    -7.989    -0.691

    The null hypothesis is that each group has the same mean. The p-value for
    comparisons between ``group0`` and ``group1`` as well as ``group1`` and
    ``group2`` do not exceed .05, so we reject the null hypothesis that they
    have the same means. The p-value of the comparison between ``group0``
    and ``group2`` exceeds .05, so we fail to reject the null hypothesis that
    there is not a significant difference between their means.

    We can also compute the confidence interval associated with our chosen
    confidence level.

    >>> conf = res.confidence_interval(confidence_level=.99)
    >>> for ((i, j), l) in np.ndenumerate(conf.low):
    ...     # filter out self comparisons
    ...     if i != j:
    ...         h = conf.high[i,j]
    ...         print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}")
    (0 - 1) -9.480  0.280
    (0 - 2) -5.140  4.620
    (1 - 0) -0.280  9.480
    (1 - 2) -0.540  9.220
    (2 - 0) -4.620  5.140
    (2 - 1) -9.220  0.540
    """
    samples = _tukey_hsd_iv(args)
    k = len(samples)
    group_means = np.asarray([np.mean(sample) for sample in samples])
    sizes = np.asarray([sample.size for sample in samples])
    n_total = np.sum(sizes)
    # error degrees of freedom: all observations less one per treatment
    dof = n_total - k

    # Mean square error [5] (also called "mean square error within"):
    # per-group sample variances pooled with weights ``size - 1``.
    group_vars = [np.var(sample, ddof=1) for sample in samples]
    mse = np.sum(group_vars * (sizes - 1)) / dof

    # How the standard error is scaled depends on whether all treatments
    # have the same size. See ("Unequal sample sizes")[1].
    if np.unique(sizes).size != 1:
        # Differing sizes: every comparison gets its own value, built as a
        # matrix by broadcasting. [3], verified against [4] (page 308).
        scale = 1 / sizes + 1 / sizes[:, None]
    else:
        # Equal sizes: a single scalar covers every comparison [1].
        scale = 2 / sizes[0]

    # The standard error feeds both the Tukey criterion (confidence
    # intervals) and the p-values.
    stand_err = np.sqrt(scale * mse / 2)

    # The test statistic is the matrix of pairwise differences of means;
    # entry (i, j) is mean_i - mean_j.
    mean_differences = group_means[:, None] - group_means

    # Studentize the absolute differences and evaluate the survival function
    # of the studentized range distribution to obtain the p-values.
    q_stat = np.abs(mean_differences) / stand_err
    pvalues = distributions.studentized_range.sf(q_stat, k, dof)

    # Positional order matters: the result class takes the number of
    # treatments in its `_nobs` slot and the total number of observations in
    # its `_ntreatments` slot.
    return TukeyHSDResult(mean_differences, pvalues, k, n_total, stand_err)
|