import numpy as np from numpy.core.multiarray import normalize_axis_index from scipy._lib._util import _nan_allsame, _contains_nan from ._stats_py import _chk_asarray def _nanvariation(a, *, axis=0, ddof=0, keepdims=False): """ Private version of `variation` that ignores nan. `a` must be a numpy array. `axis` is assumed to be normalized, i.e. 0 <= axis < a.ndim. """ # # In theory, this should be as simple as something like # nanstd(a, ddof=ddof, axis=axis, keepdims=keepdims) / # nanmean(a, axis=axis, keepdims=keepdims) # In practice, annoying issues arise. Specifically, numpy # generates warnings in certain edge cases that we don't want # to propagate to the user. Unfortunately, there does not # appear to be a thread-safe way to filter out the warnings, # so we have to do the calculation in a way that doesn't # generate numpy warnings. # # Let N be the number of non-nan inputs in a slice. # Conditions that generate nan: # * empty input (i.e. N = 0) # * All non-nan values 0 # * N < ddof # * N == ddof and the input is constant # Conditions that generate inf: # * non-constant input and either # * the mean is 0, or # * N == ddof # a_isnan = np.isnan(a) all_nan = a_isnan.all(axis=axis, keepdims=True) all_nan_full = np.broadcast_to(all_nan, a.shape) all_zero = (a_isnan | (a == 0)).all(axis=axis, keepdims=True) & ~all_nan # ngood is the number of non-nan values in each slice. ngood = (a.shape[axis] - np.expand_dims(np.count_nonzero(a_isnan, axis=axis), axis)) # The return value is nan where ddof > ngood. ddof_too_big = ddof > ngood # If ddof == ngood, the return value is nan where the input is constant and # inf otherwise. ddof_equal_n = ddof == ngood is_const = _nan_allsame(a, axis=axis, keepdims=True) a2 = a.copy() # If an entire slice is nan, `np.nanmean` will generate a warning, # so we replace those nan's with 1.0 before computing the mean. # We'll fix the corresponding output later. a2[all_nan_full] = 1.0 mean_a = np.nanmean(a2, axis=axis, keepdims=True) # If ddof >= ngood (the number of non-nan values in the slice), `np.nanstd` # will generate a warning, so set all the values in such a slice to 1.0. # We'll fix the corresponding output later. a2[np.broadcast_to(ddof_too_big, a2.shape) | ddof_equal_n] = 1.0 with np.errstate(invalid='ignore'): std_a = np.nanstd(a2, axis=axis, ddof=ddof, keepdims=True) del a2 sum_zero = np.nansum(a, axis=axis, keepdims=True) == 0 # Where the sum along the axis is 0, replace mean_a with 1. This avoids # division by zero. We'll fix the corresponding output later. mean_a[sum_zero] = 1.0 # Here--finally!--is the calculation of the variation. result = std_a / mean_a # Now fix the values that were given fake data to avoid warnings. result[~is_const & sum_zero] = np.inf signed_inf_mask = ~is_const & ddof_equal_n result[signed_inf_mask] = np.sign(mean_a[signed_inf_mask]) * np.inf nan_mask = all_zero | all_nan | ddof_too_big | (ddof_equal_n & is_const) result[nan_mask] = np.nan if not keepdims: result = np.squeeze(result, axis=axis) if result.shape == (): result = result[()] return result def variation(a, axis=0, nan_policy='propagate', ddof=0, *, keepdims=False): """ Compute the coefficient of variation. The coefficient of variation is the standard deviation divided by the mean. This function is equivalent to:: np.std(x, axis=axis, ddof=ddof) / np.mean(x) The default for ``ddof`` is 0, but many definitions of the coefficient of variation use the square root of the unbiased sample variance for the sample standard deviation, which corresponds to ``ddof=1``. The function does not take the absolute value of the mean of the data, so the return value is negative if the mean is negative. Parameters ---------- a : array_like Input array. axis : int or None, optional Axis along which to calculate the coefficient of variation. Default is 0. If None, compute over the whole array `a`. nan_policy : {'propagate', 'raise', 'omit'}, optional Defines how to handle when input contains ``nan``. The following options are available: * 'propagate': return ``nan`` * 'raise': raise an exception * 'omit': perform the calculation with ``nan`` values omitted The default is 'propagate'. ddof : int, optional Gives the "Delta Degrees Of Freedom" used when computing the standard deviation. The divisor used in the calculation of the standard deviation is ``N - ddof``, where ``N`` is the number of elements. `ddof` must be less than ``N``; if it isn't, the result will be ``nan`` or ``inf``, depending on ``N`` and the values in the array. By default `ddof` is zero for backwards compatibility, but it is recommended to use ``ddof=1`` to ensure that the sample standard deviation is computed as the square root of the unbiased sample variance. keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the result will broadcast correctly against the input array. Returns ------- variation : ndarray The calculated variation along the requested axis. Notes ----- There are several edge cases that are handled without generating a warning: * If both the mean and the standard deviation are zero, ``nan`` is returned. * If the mean is zero and the standard deviation is nonzero, ``inf`` is returned. * If the input has length zero (either because the array has zero length, or all the input values are ``nan`` and ``nan_policy`` is ``'omit'``), ``nan`` is returned. * If the input contains ``inf``, ``nan`` is returned. References ---------- .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard Probability and Statistics Tables and Formulae. Chapman & Hall: New York. 2000. Examples -------- >>> import numpy as np >>> from scipy.stats import variation >>> variation([1, 2, 3, 4, 5], ddof=1) 0.5270462766947299 Compute the variation along a given dimension of an array that contains a few ``nan`` values: >>> x = np.array([[ 10.0, np.nan, 11.0, 19.0, 23.0, 29.0, 98.0], ... [ 29.0, 30.0, 32.0, 33.0, 35.0, 56.0, 57.0], ... [np.nan, np.nan, 12.0, 13.0, 16.0, 16.0, 17.0]]) >>> variation(x, axis=1, ddof=1, nan_policy='omit') array([1.05109361, 0.31428986, 0.146483 ]) """ a, axis = _chk_asarray(a, axis) axis = normalize_axis_index(axis, ndim=a.ndim) n = a.shape[axis] contains_nan, nan_policy = _contains_nan(a, nan_policy) if contains_nan and nan_policy == 'omit': return _nanvariation(a, axis=axis, ddof=ddof, keepdims=keepdims) if a.size == 0 or ddof > n: # Handle as a special case to avoid spurious warnings. # The return values, if any, are all nan. shp = list(a.shape) if keepdims: shp[axis] = 1 else: del shp[axis] if len(shp) == 0: result = np.nan else: result = np.full(shp, fill_value=np.nan) return result mean_a = a.mean(axis, keepdims=True) if ddof == n: # Another special case. Result is either inf or nan. std_a = a.std(axis=axis, ddof=0, keepdims=True) result = np.full_like(std_a, fill_value=np.nan) result.flat[std_a.flat > 0] = (np.sign(mean_a) * np.inf).flat if result.shape == (): result = result[()] return result with np.errstate(divide='ignore', invalid='ignore'): std_a = a.std(axis, ddof=ddof, keepdims=True) result = std_a / mean_a if not keepdims: result = np.squeeze(result, axis=axis) if result.shape == (): result = result[()] return result