# Inzynierka_Gwiazdy/machine_learning/Lib/site-packages/scipy/stats/_variation.py
import numpy as np
from numpy.core.multiarray import normalize_axis_index
from scipy._lib._util import _nan_allsame, _contains_nan
from ._stats_py import _chk_asarray


def _nanvariation(a, *, axis=0, ddof=0, keepdims=False):
    """
    Private version of `variation` that ignores nan.

    `a` must be a numpy array.
    `axis` is assumed to be normalized, i.e. 0 <= axis < a.ndim.
    """
    #
    # In theory, this should be as simple as something like
    #     nanstd(a, ddof=ddof, axis=axis, keepdims=keepdims) /
    #         nanmean(a, axis=axis, keepdims=keepdims)
    # In practice, annoying issues arise. Specifically, numpy
    # generates warnings in certain edge cases that we don't want
    # to propagate to the user. Unfortunately, there does not
    # appear to be a thread-safe way to filter out the warnings,
    # so we have to do the calculation in a way that doesn't
    # generate numpy warnings.
    #
    # Let N be the number of non-nan inputs in a slice.
    # Conditions that generate nan:
    #   * empty input (i.e. N = 0)
    #   * All non-nan values are 0
    #   * N < ddof
    #   * N == ddof and the input is constant
    # Conditions that generate inf:
    #   * non-constant input and either
    #     * the mean is 0, or
    #     * N == ddof
    #
a_isnan = np.isnan(a)
all_nan = a_isnan.all(axis=axis, keepdims=True)
all_nan_full = np.broadcast_to(all_nan, a.shape)
all_zero = (a_isnan | (a == 0)).all(axis=axis, keepdims=True) & ~all_nan
# ngood is the number of non-nan values in each slice.
ngood = (a.shape[axis] -
np.expand_dims(np.count_nonzero(a_isnan, axis=axis), axis))
# The return value is nan where ddof > ngood.
ddof_too_big = ddof > ngood
# If ddof == ngood, the return value is nan where the input is constant and
# inf otherwise.
ddof_equal_n = ddof == ngood
is_const = _nan_allsame(a, axis=axis, keepdims=True)
a2 = a.copy()
# If an entire slice is nan, `np.nanmean` will generate a warning,
# so we replace those nan's with 1.0 before computing the mean.
# We'll fix the corresponding output later.
a2[all_nan_full] = 1.0
mean_a = np.nanmean(a2, axis=axis, keepdims=True)
# If ddof >= ngood (the number of non-nan values in the slice), `np.nanstd`
# will generate a warning, so set all the values in such a slice to 1.0.
# We'll fix the corresponding output later.
a2[np.broadcast_to(ddof_too_big, a2.shape) | ddof_equal_n] = 1.0
with np.errstate(invalid='ignore'):
std_a = np.nanstd(a2, axis=axis, ddof=ddof, keepdims=True)
del a2
sum_zero = np.nansum(a, axis=axis, keepdims=True) == 0
# Where the sum along the axis is 0, replace mean_a with 1. This avoids
# division by zero. We'll fix the corresponding output later.
mean_a[sum_zero] = 1.0
# Here--finally!--is the calculation of the variation.
result = std_a / mean_a
# Now fix the values that were given fake data to avoid warnings.
result[~is_const & sum_zero] = np.inf
signed_inf_mask = ~is_const & ddof_equal_n
result[signed_inf_mask] = np.sign(mean_a[signed_inf_mask]) * np.inf
nan_mask = all_zero | all_nan | ddof_too_big | (ddof_equal_n & is_const)
result[nan_mask] = np.nan
if not keepdims:
result = np.squeeze(result, axis=axis)
if result.shape == ():
result = result[()]
return result
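

# A minimal illustrative sketch (an exposition-only addition, not part of the
# SciPy sources): ignoring the warning-avoidance machinery above,
# `_nanvariation` computes the same values as this naive formulation, except
# in the edge cases listed in its comments (empty slices, all-zero slices,
# zero means, and ddof >= the number of non-nan values), where the naive
# version emits RuntimeWarnings while the version above does not.
def _naive_nanvariation(a, *, axis=0, ddof=0, keepdims=False):
    # nan-aware standard deviation divided by the nan-aware mean.
    return (np.nanstd(a, axis=axis, ddof=ddof, keepdims=keepdims)
            / np.nanmean(a, axis=axis, keepdims=keepdims))
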
def variation(a, axis=0, nan_policy='propagate', ddof=0, *, keepdims=False):
"""
Compute the coefficient of variation.
The coefficient of variation is the standard deviation divided by the
mean. This function is equivalent to::
np.std(x, axis=axis, ddof=ddof) / np.mean(x)
The default for ``ddof`` is 0, but many definitions of the coefficient
of variation use the square root of the unbiased sample variance
for the sample standard deviation, which corresponds to ``ddof=1``.
The function does not take the absolute value of the mean of the data,
so the return value is negative if the mean is negative.
Parameters
----------
a : array_like
Input array.
axis : int or None, optional
Axis along which to calculate the coefficient of variation.
Default is 0. If None, compute over the whole array `a`.
nan_policy : {'propagate', 'raise', 'omit'}, optional
Defines how to handle when input contains ``nan``.
The following options are available:
* 'propagate': return ``nan``
* 'raise': raise an exception
* 'omit': perform the calculation with ``nan`` values omitted
The default is 'propagate'.
ddof : int, optional
Gives the "Delta Degrees Of Freedom" used when computing the
standard deviation. The divisor used in the calculation of the
standard deviation is ``N - ddof``, where ``N`` is the number of
elements. `ddof` must be less than ``N``; if it isn't, the result
will be ``nan`` or ``inf``, depending on ``N`` and the values in
the array. By default `ddof` is zero for backwards compatibility,
but it is recommended to use ``ddof=1`` to ensure that the sample
standard deviation is computed as the square root of the unbiased
sample variance.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left in the
result as dimensions with size one. With this option, the result
will broadcast correctly against the input array.
Returns
-------
variation : ndarray
The calculated variation along the requested axis.
Notes
-----
There are several edge cases that are handled without generating a
warning:
* If both the mean and the standard deviation are zero, ``nan``
is returned.
* If the mean is zero and the standard deviation is nonzero, ``inf``
is returned.
* If the input has length zero (either because the array has zero
length, or all the input values are ``nan`` and ``nan_policy`` is
``'omit'``), ``nan`` is returned.
* If the input contains ``inf``, ``nan`` is returned.
References
----------
.. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
Probability and Statistics Tables and Formulae. Chapman & Hall: New
York. 2000.
Examples
--------
>>> import numpy as np
>>> from scipy.stats import variation
>>> variation([1, 2, 3, 4, 5], ddof=1)
0.5270462766947299
Compute the variation along a given dimension of an array that contains
a few ``nan`` values:
>>> x = np.array([[ 10.0, np.nan, 11.0, 19.0, 23.0, 29.0, 98.0],
... [ 29.0, 30.0, 32.0, 33.0, 35.0, 56.0, 57.0],
... [np.nan, np.nan, 12.0, 13.0, 16.0, 16.0, 17.0]])
>>> variation(x, axis=1, ddof=1, nan_policy='omit')
array([1.05109361, 0.31428986, 0.146483 ])
"""
a, axis = _chk_asarray(a, axis)
axis = normalize_axis_index(axis, ndim=a.ndim)
n = a.shape[axis]
contains_nan, nan_policy = _contains_nan(a, nan_policy)
if contains_nan and nan_policy == 'omit':
return _nanvariation(a, axis=axis, ddof=ddof, keepdims=keepdims)
if a.size == 0 or ddof > n:
# Handle as a special case to avoid spurious warnings.
# The return values, if any, are all nan.
shp = list(a.shape)
if keepdims:
shp[axis] = 1
else:
del shp[axis]
if len(shp) == 0:
result = np.nan
else:
result = np.full(shp, fill_value=np.nan)
return result
mean_a = a.mean(axis, keepdims=True)
if ddof == n:
# Another special case. Result is either inf or nan.
std_a = a.std(axis=axis, ddof=0, keepdims=True)
result = np.full_like(std_a, fill_value=np.nan)
        # Non-constant slices get an inf with the sign of the slice mean;
        # constant slices keep the nan fill value.
        signed_inf = np.sign(mean_a) * np.inf
        result.flat[std_a.flat > 0] = signed_inf.flat[std_a.flat > 0]
if result.shape == ():
result = result[()]
return result
with np.errstate(divide='ignore', invalid='ignore'):
std_a = a.std(axis, ddof=ddof, keepdims=True)
result = std_a / mean_a
if not keepdims:
result = np.squeeze(result, axis=axis)
if result.shape == ():
result = result[()]
return result
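

# Illustrative usage (an exposition-only addition, not part of the SciPy
# sources): exercises the behaviour documented in the Notes section of
# `variation` above. Run this file directly to see the printed results.
if __name__ == "__main__":
    # Ordinary case: sample coefficient of variation of 1..5.
    print(variation([1, 2, 3, 4, 5], ddof=1))   # ~0.527
    # Zero mean with nonzero spread -> inf, with no warning emitted.
    print(variation([2.0, -2.0]))               # inf
    # Mean and standard deviation both zero -> nan.
    print(variation([0.0, 0.0, 0.0]))           # nan
    # Zero-length input -> nan.
    print(variation([]))                        # nan
    # nan values are ignored when nan_policy='omit'.
    print(variation([1.0, np.nan, 3.0, 5.0], nan_policy='omit', ddof=1))  # ~0.667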