# projektAI/venv/Lib/site-packages/pandas/core/nanops.py
from __future__ import annotations
import functools
import itertools
import operator
from typing import Any, Optional, Tuple, Union, cast
import warnings
import numpy as np
from pandas._config import get_option
from pandas._libs import NaT, Timedelta, iNaT, lib
from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar
from pandas.compat._optional import import_optional_dependency
from pandas.core.dtypes.common import (
get_dtype,
is_any_int_dtype,
is_bool_dtype,
is_complex,
is_datetime64_any_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_timedelta64_dtype,
needs_i8_conversion,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
from pandas.core.construction import extract_array
bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn")
_BOTTLENECK_INSTALLED = bn is not None
_USE_BOTTLENECK = False
def set_use_bottleneck(v: bool = True) -> None:
# set/unset to use bottleneck
global _USE_BOTTLENECK
if _BOTTLENECK_INSTALLED:
_USE_BOTTLENECK = v
set_use_bottleneck(get_option("compute.use_bottleneck"))
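# Illustrative sketch (not part of the module body): the toggle above is driven
# by the "compute.use_bottleneck" option, so callers would typically flip it via
# pd.set_option rather than calling set_use_bottleneck directly:
#
#   >>> import pandas as pd
#   >>> pd.set_option("compute.use_bottleneck", False)  # force numpy fallbacks
#   >>> pd.set_option("compute.use_bottleneck", True)   # re-enable if installed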
class disallow:
def __init__(self, *dtypes):
super().__init__()
self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
def check(self, obj) -> bool:
return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
def __call__(self, f: F) -> F:
@functools.wraps(f)
def _f(*args, **kwargs):
obj_iter = itertools.chain(args, kwargs.values())
if any(self.check(obj) for obj in obj_iter):
f_name = f.__name__.replace("nan", "")
raise TypeError(
f"reduction operation '{f_name}' not allowed for this dtype"
)
try:
with np.errstate(invalid="ignore"):
return f(*args, **kwargs)
except ValueError as e:
# we want to transform an object array
# ValueError message to the more typical TypeError
# e.g. this is normally a disallowed function on
# object arrays that contain strings
if is_object_dtype(args[0]):
raise TypeError(e) from e
raise
return cast(F, _f)
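# Illustrative sketch (hypothetical ``nanfoo``, not part of the module): used
# as a decorator, ``disallow`` rejects inputs whose dtype matches any of the
# given dtypes before the reduction runs:
#
#   >>> @disallow("M8")
#   ... def nanfoo(values):
#   ...     return values.sum()
#   >>> nanfoo(np.array(["2021-01-01"], dtype="M8[ns]"))
#   Traceback (most recent call last):
#       ...
#   TypeError: reduction operation 'foo' not allowed for this dtype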
class bottleneck_switch:
def __init__(self, name=None, **kwargs):
self.name = name
self.kwargs = kwargs
def __call__(self, alt: F) -> F:
bn_name = self.name or alt.__name__
try:
bn_func = getattr(bn, bn_name)
except (AttributeError, NameError): # pragma: no cover
bn_func = None
@functools.wraps(alt)
def f(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
**kwds,
):
if len(self.kwargs) > 0:
for k, v in self.kwargs.items():
if k not in kwds:
kwds[k] = v
if values.size == 0 and kwds.get("min_count") is None:
# We are empty, returning NA for our type
# Only applies for the default `min_count` of None
# since that affects how empty arrays are handled.
# TODO(GH-18976) update all the nanops methods to
# correctly handle empty inputs and remove this check.
# It *may* just be `var`
return _na_for_min_count(values, axis)
if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
if kwds.get("mask", None) is None:
# `mask` is not recognised by bottleneck, would raise
# TypeError if called
kwds.pop("mask", None)
result = bn_func(values, axis=axis, **kwds)
# prefer to treat inf/-inf as NA, but must compute the func
# twice :(
if _has_infs(result):
result = alt(values, axis=axis, skipna=skipna, **kwds)
else:
result = alt(values, axis=axis, skipna=skipna, **kwds)
else:
result = alt(values, axis=axis, skipna=skipna, **kwds)
return result
return cast(F, f)
def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
    # Bottleneck chokes on datetime64, PeriodDtype (or any EA)
if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
# GH 15507
# bottleneck does not properly upcast during the sum
# so can overflow
# GH 9422
# further we also want to preserve NaN when all elements
# are NaN, unlike bottleneck/numpy which consider this
# to be 0
if name in ["nansum", "nanprod"]:
return False
return True
return False
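# Illustrative sketch of the dispatch decision above:
#
#   >>> _bn_ok_dtype(np.dtype("float64"), "nanmean")  # numeric, not sum/prod
#   True
#   >>> _bn_ok_dtype(np.dtype("float64"), "nansum")   # GH 15507: may overflow
#   False
#   >>> _bn_ok_dtype(np.dtype("M8[ns]"), "nanmean")   # needs i8 conversion
#   False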
def _has_infs(result) -> bool:
if isinstance(result, np.ndarray):
if result.dtype == "f8":
return lib.has_infs_f8(result.ravel("K"))
elif result.dtype == "f4":
return lib.has_infs_f4(result.ravel("K"))
try:
return np.isinf(result).any()
except (TypeError, NotImplementedError):
# if it doesn't support infs, then it can't have infs
return False
def _get_fill_value(
dtype: DtypeObj, fill_value: Optional[Scalar] = None, fill_value_typ=None
):
""" return the correct fill value for the dtype of the values """
if fill_value is not None:
return fill_value
if _na_ok_dtype(dtype):
if fill_value_typ is None:
return np.nan
else:
if fill_value_typ == "+inf":
return np.inf
else:
return -np.inf
else:
if fill_value_typ is None:
return iNaT
else:
if fill_value_typ == "+inf":
# need the max int here
return np.iinfo(np.int64).max
else:
return iNaT
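# Illustrative sketch of the fill values chosen above:
#
#   >>> _get_fill_value(np.dtype("float64"))                        # NaN-capable
#   nan
#   >>> _get_fill_value(np.dtype("float64"), fill_value_typ="+inf")
#   inf
#   >>> _get_fill_value(np.dtype("M8[ns]"), fill_value_typ="+inf")  # max int64
#   9223372036854775807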
def _maybe_get_mask(
values: np.ndarray, skipna: bool, mask: Optional[np.ndarray]
) -> Optional[np.ndarray]:
"""
Compute a mask if and only if necessary.
    This function computes a mask only when one is needed. Otherwise, the
    provided mask (potentially None) is returned unchanged.
    A mask is never necessary if the values array is of boolean or integer
    dtype, as these are incapable of storing NaNs. If a NaN-capable dtype
    that is nonetheless stored as integer data (e.g. timedelta64) is passed,
    a mask must be provided.
    If skipna is False and the dtype does not require i8 conversion, a new
    mask is not computed.
    The mask is computed using isna().
Parameters
----------
values : ndarray
input array to potentially compute mask for
skipna : bool
boolean for whether NaNs should be skipped
mask : Optional[ndarray]
nan-mask if known
Returns
-------
Optional[np.ndarray]
"""
if mask is None:
if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
# Boolean data cannot contain nulls, so signal via mask being None
return None
if skipna or needs_i8_conversion(values.dtype):
mask = isna(values)
return mask
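# Illustrative sketch: integer/boolean arrays cannot hold NaN, so no mask is
# computed for them; NaN-capable dtypes get an isna() mask when skipna is set:
#
#   >>> _maybe_get_mask(np.array([1, 2]), skipna=True, mask=None) is None
#   True
#   >>> _maybe_get_mask(np.array([1.0, np.nan]), skipna=True, mask=None)
#   array([False,  True])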
def _get_values(
values: np.ndarray,
skipna: bool,
fill_value: Any = None,
fill_value_typ: Optional[str] = None,
mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
"""
Utility to get the values view, mask, dtype, dtype_max, and fill_value.
If both mask and fill_value/fill_value_typ are not None and skipna is True,
the values array will be copied.
For input arrays of boolean or integer dtypes, copies will only occur if a
precomputed mask, a fill_value/fill_value_typ, and skipna=True are
provided.
Parameters
----------
values : ndarray
input array to potentially compute mask for
skipna : bool
boolean for whether NaNs should be skipped
fill_value : Any
value to fill NaNs with
fill_value_typ : str
Set to '+inf' or '-inf' to handle dtype-specific infinities
mask : Optional[np.ndarray]
nan-mask if known
Returns
-------
values : ndarray
Potential copy of input value array
mask : Optional[ndarray[bool]]
Mask for values, if deemed necessary to compute
dtype : np.dtype
dtype for values
dtype_max : np.dtype
platform independent dtype
fill_value : Any
fill value used
"""
    # _get_values is only called from within nanops, and in all cases
    # with a scalar fill_value. This guarantee is important for the
    # np.where call below
assert is_scalar(fill_value)
values = extract_array(values, extract_numpy=True)
mask = _maybe_get_mask(values, skipna, mask)
dtype = values.dtype
datetimelike = False
if needs_i8_conversion(values.dtype):
# changing timedelta64/datetime64 to int64 needs to happen after
# finding `mask` above
values = np.asarray(values.view("i8"))
datetimelike = True
dtype_ok = _na_ok_dtype(dtype)
# get our fill value (in case we need to provide an alternative
# dtype for it)
fill_value = _get_fill_value(
dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
)
if skipna and (mask is not None) and (fill_value is not None):
if mask.any():
if dtype_ok or datetimelike:
values = values.copy()
np.putmask(values, mask, fill_value)
else:
# np.where will promote if needed
values = np.where(~mask, values, fill_value)
# return a platform independent precision dtype
dtype_max = dtype
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
dtype_max = np.dtype(np.int64)
elif is_float_dtype(dtype):
dtype_max = np.dtype(np.float64)
return values, mask, dtype, dtype_max, fill_value
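# Illustrative sketch: for a float array with a missing value, the masked slot
# is filled and the mask is returned alongside the (possibly copied) values:
#
#   >>> vals, mask, dtype, dtype_max, fv = _get_values(
#   ...     np.array([1.0, np.nan, 3.0]), skipna=True, fill_value=0
#   ... )
#   >>> vals
#   array([1., 0., 3.])
#   >>> mask
#   array([False,  True, False])
#   >>> dtype_max
#   dtype('float64')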
def _na_ok_dtype(dtype: DtypeObj) -> bool:
if needs_i8_conversion(dtype):
return False
return not issubclass(dtype.type, np.integer)
def _wrap_results(result, dtype: np.dtype, fill_value=None):
""" wrap our results if needed """
if result is NaT:
pass
elif is_datetime64_any_dtype(dtype):
if fill_value is None:
# GH#24293
fill_value = iNaT
if not isinstance(result, np.ndarray):
assert not isna(fill_value), "Expected non-null fill_value"
if result == fill_value:
result = np.nan
if isna(result):
result = np.datetime64("NaT", "ns")
else:
result = np.int64(result).view("datetime64[ns]")
else:
# If we have float dtype, taking a view will give the wrong result
result = result.astype(dtype)
elif is_timedelta64_dtype(dtype):
if not isinstance(result, np.ndarray):
if result == fill_value:
result = np.nan
# raise if we have a timedelta64[ns] which is too large
if np.fabs(result) > np.iinfo(np.int64).max:
raise ValueError("overflow in timedelta operation")
result = Timedelta(result, unit="ns")
else:
result = result.astype("m8[ns]").view(dtype)
return result
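# Illustrative sketch: a scalar result of a timedelta reduction is wrapped back
# into a Timedelta (assuming a nanosecond result within int64 range):
#
#   >>> _wrap_results(86400 * 10 ** 9, np.dtype("m8[ns]"))
#   Timedelta('1 days 00:00:00')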
def _datetimelike_compat(func: F) -> F:
"""
If we have datetime64 or timedelta64 values, ensure we have a correct
mask before calling the wrapped function, then cast back afterwards.
"""
@functools.wraps(func)
def new_func(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
**kwargs,
):
orig_values = values
datetimelike = values.dtype.kind in ["m", "M"]
if datetimelike and mask is None:
mask = isna(values)
result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
if datetimelike:
result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
if not skipna:
result = _mask_datetimelike_result(result, axis, mask, orig_values)
return result
return cast(F, new_func)
def _na_for_min_count(
values: np.ndarray, axis: Optional[int]
) -> Union[Scalar, np.ndarray]:
"""
Return the missing value for `values`.
Parameters
----------
values : ndarray
axis : int or None
axis for the reduction, required if values.ndim > 1.
Returns
-------
result : scalar or ndarray
For 1-D values, returns a scalar of the correct missing type.
For 2-D values, returns a 1-D array where each element is missing.
"""
# we either return np.nan or pd.NaT
if is_numeric_dtype(values):
values = values.astype("float64")
fill_value = na_value_for_dtype(values.dtype)
if fill_value is NaT:
fill_value = values.dtype.type("NaT", "ns")
if values.ndim == 1:
return fill_value
elif axis is None:
return fill_value
else:
result_shape = values.shape[:axis] + values.shape[axis + 1 :]
result = np.full(result_shape, fill_value, dtype=values.dtype)
return result
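# Illustrative sketch: a 2-D reduction over axis 0 yields one missing value per
# remaining column:
#
#   >>> _na_for_min_count(np.array([[1, 2, 3], [4, 5, 6]]), axis=0)
#   array([nan, nan, nan])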
def nanany(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> bool:
"""
Check if any elements along an axis evaluate to True.
Parameters
----------
values : ndarray
axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : bool
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, 2])
>>> nanops.nanany(s)
True
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([np.nan])
>>> nanops.nanany(s)
False
"""
values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
return values.any(axis)
def nanall(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> bool:
"""
Check if all elements along an axis evaluate to True.
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : bool
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, 2, np.nan])
>>> nanops.nanall(s)
True
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, 0])
>>> nanops.nanall(s)
False
"""
values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
return values.all(axis)
@disallow("M8")
@_datetimelike_compat
def nansum(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
min_count: int = 0,
mask: Optional[np.ndarray] = None,
) -> float:
"""
Sum the elements along an axis ignoring NaNs
Parameters
----------
values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : dtype
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, 2, np.nan])
>>> nanops.nansum(s)
3.0
"""
values, mask, dtype, dtype_max, _ = _get_values(
values, skipna, fill_value=0, mask=mask
)
dtype_sum = dtype_max
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
dtype_sum = np.float64
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
return the_sum
def _mask_datetimelike_result(
result: Union[np.ndarray, np.datetime64, np.timedelta64],
axis: Optional[int],
mask: np.ndarray,
orig_values: np.ndarray,
):
if isinstance(result, np.ndarray):
# we need to apply the mask
result = result.astype("i8").view(orig_values.dtype)
axis_mask = mask.any(axis=axis)
result[axis_mask] = iNaT
else:
if mask.any():
result = NaT
return result
@disallow(PeriodDtype)
@bottleneck_switch()
@_datetimelike_compat
def nanmean(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> float:
"""
Compute the mean of the element along an axis ignoring NaNs
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
float
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, 2, np.nan])
>>> nanops.nanmean(s)
1.5
"""
values, mask, dtype, dtype_max, _ = _get_values(
values, skipna, fill_value=0, mask=mask
)
dtype_sum = dtype_max
dtype_count = np.float64
# not using needs_i8_conversion because that includes period
if dtype.kind in ["m", "M"]:
dtype_sum = np.float64
elif is_integer_dtype(dtype):
dtype_sum = np.float64
elif is_float_dtype(dtype):
dtype_sum = dtype
dtype_count = dtype
count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
if axis is not None and getattr(the_sum, "ndim", False):
count = cast(np.ndarray, count)
with np.errstate(all="ignore"):
# suppress division by zero warnings
the_mean = the_sum / count
ct_mask = count == 0
if ct_mask.any():
the_mean[ct_mask] = np.nan
else:
the_mean = the_sum / count if count > 0 else np.nan
return the_mean
@bottleneck_switch()
def nanmedian(values, *, axis=None, skipna=True, mask=None):
"""
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : float
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, np.nan, 2, 2])
>>> nanops.nanmedian(s)
2.0
"""
def get_median(x):
mask = notna(x)
if not skipna and not mask.all():
return np.nan
with warnings.catch_warnings():
# Suppress RuntimeWarning about All-NaN slice
warnings.filterwarnings("ignore", "All-NaN slice encountered")
res = np.nanmedian(x[mask])
return res
values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask)
if not is_float_dtype(values.dtype):
try:
values = values.astype("f8")
except ValueError as err:
# e.g. "could not convert string to float: 'a'"
raise TypeError from err
if mask is not None:
values[mask] = np.nan
if axis is None:
values = values.ravel("K")
notempty = values.size
# an array from a frame
if values.ndim > 1:
# there's a non-empty array to apply over otherwise numpy raises
if notempty:
if not skipna:
res = np.apply_along_axis(get_median, axis, values)
else:
# fastpath for the skipna case
with warnings.catch_warnings():
# Suppress RuntimeWarning about All-NaN slice
warnings.filterwarnings("ignore", "All-NaN slice encountered")
res = np.nanmedian(values, axis)
else:
# must return the correct shape, but median is not defined for the
# empty set so return nans of shape "everything but the passed axis"
# since "axis" is where the reduction would occur if we had a nonempty
# array
res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan)
else:
# otherwise return a scalar value
res = get_median(values) if notempty else np.nan
return _wrap_results(res, dtype)
def get_empty_reduction_result(
shape: Tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any
) -> np.ndarray:
"""
The result from a reduction on an empty ndarray.
Parameters
----------
shape : Tuple[int]
axis : int
dtype : np.dtype
fill_value : Any
Returns
-------
np.ndarray
"""
shp = np.array(shape)
dims = np.arange(len(shape))
ret = np.empty(shp[dims != axis], dtype=dtype)
ret.fill(fill_value)
return ret
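# Illustrative sketch: reducing an empty (0, 3) array over axis 0 leaves a
# length-3 result filled with the given fill value:
#
#   >>> get_empty_reduction_result((0, 3), axis=0, dtype=np.float64,
#   ...                            fill_value=np.nan)
#   array([nan, nan, nan])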
def _get_counts_nanvar(
    values_shape: Tuple[int, ...],
mask: Optional[np.ndarray],
axis: Optional[int],
ddof: int,
dtype: Dtype = float,
) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]:
"""
Get the count of non-null values along an axis, accounting
for degrees of freedom.
Parameters
----------
    values_shape : Tuple[int, ...]
shape tuple from values ndarray, used if mask is None
mask : Optional[ndarray[bool]]
locations in values that should be considered missing
axis : Optional[int]
axis to count along
ddof : int
degrees of freedom
dtype : type, optional
type to use for count
Returns
-------
count : scalar or array
d : scalar or array
"""
dtype = get_dtype(dtype)
    count = _get_counts(values_shape, mask, axis, dtype=dtype)
d = count - dtype.type(ddof)
# always return NaN, never inf
if is_scalar(count):
if count <= ddof:
count = np.nan
d = np.nan
else:
mask2: np.ndarray = count <= ddof
if mask2.any():
np.putmask(d, mask2, np.nan)
np.putmask(count, mask2, np.nan)
return count, d
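# Illustrative sketch: with one of four values missing and ddof=1, the count is
# 3 and the divisor is 2:
#
#   >>> mask = np.array([False, True, False, False])
#   >>> _get_counts_nanvar((4,), mask, axis=None, ddof=1)
#   (3.0, 2.0)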
@bottleneck_switch(ddof=1)
def nanstd(values, *, axis=None, skipna=True, ddof=1, mask=None):
"""
Compute the standard deviation along given axis while ignoring NaNs
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
where N represents the number of elements.
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : float
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, np.nan, 2, 3])
>>> nanops.nanstd(s)
1.0
"""
if values.dtype == "M8[ns]":
values = values.view("m8[ns]")
orig_dtype = values.dtype
values, mask, _, _, _ = _get_values(values, skipna, mask=mask)
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
return _wrap_results(result, orig_dtype)
@disallow("M8", "m8")
@bottleneck_switch(ddof=1)
def nanvar(values, *, axis=None, skipna=True, ddof=1, mask=None):
"""
Compute the variance along given axis while ignoring NaNs
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
where N represents the number of elements.
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : float
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, np.nan, 2, 3])
>>> nanops.nanvar(s)
1.0
"""
values = extract_array(values, extract_numpy=True)
dtype = values.dtype
mask = _maybe_get_mask(values, skipna, mask)
if is_any_int_dtype(dtype):
values = values.astype("f8")
if mask is not None:
values[mask] = np.nan
if is_float_dtype(values.dtype):
count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
else:
count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
if skipna and mask is not None:
values = values.copy()
np.putmask(values, mask, 0)
# xref GH10242
# Compute variance via two-pass algorithm, which is stable against
# cancellation errors and relatively accurate for small numbers of
# observations.
#
# See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
if axis is not None:
avg = np.expand_dims(avg, axis)
sqr = _ensure_numeric((avg - values) ** 2)
if mask is not None:
np.putmask(sqr, mask, 0)
result = sqr.sum(axis=axis, dtype=np.float64) / d
# Return variance as np.float64 (the datatype used in the accumulator),
# unless we were dealing with a float array, in which case use the same
# precision as the original values array.
if is_float_dtype(dtype):
result = result.astype(dtype)
return result
@disallow("M8", "m8")
def nansem(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
ddof: int = 1,
mask: Optional[np.ndarray] = None,
) -> float:
"""
Compute the standard error in the mean along given axis while ignoring NaNs
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
where N represents the number of elements.
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : float64
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, np.nan, 2, 3])
>>> nanops.nansem(s)
0.5773502691896258
"""
# This checks if non-numeric-like data is passed with numeric_only=False
# and raises a TypeError otherwise
nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
values = values.astype("f8")
count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
return np.sqrt(var) / np.sqrt(count)
def _nanminmax(meth, fill_value_typ):
@bottleneck_switch(name="nan" + meth)
@_datetimelike_compat
def reduction(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> Dtype:
values, mask, dtype, dtype_max, fill_value = _get_values(
values, skipna, fill_value_typ=fill_value_typ, mask=mask
)
if (axis is not None and values.shape[axis] == 0) or values.size == 0:
try:
result = getattr(values, meth)(axis, dtype=dtype_max)
result.fill(np.nan)
except (AttributeError, TypeError, ValueError):
result = np.nan
else:
result = getattr(values, meth)(axis)
result = _maybe_null_out(result, axis, mask, values.shape)
return result
return reduction
nanmin = _nanminmax("min", fill_value_typ="+inf")
nanmax = _nanminmax("max", fill_value_typ="-inf")
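# Illustrative sketch: the generated reductions ignore NaNs by filling with the
# opposite infinity before reducing:
#
#   >>> nanmin(np.array([1.0, np.nan, 3.0]))
#   1.0
#   >>> nanmax(np.array([1.0, np.nan, 3.0]))
#   3.0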
@disallow("O")
def nanargmax(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> Union[int, np.ndarray]:
"""
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : int or ndarray[int]
The index/indices of max value in specified axis or -1 in the NA case
Examples
--------
>>> import pandas.core.nanops as nanops
>>> arr = np.array([1, 2, 3, np.nan, 4])
>>> nanops.nanargmax(arr)
4
>>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
>>> arr[2:, 2] = np.nan
>>> arr
array([[ 0., 1., 2.],
[ 3., 4., 5.],
[ 6., 7., nan],
[ 9., 10., nan]])
>>> nanops.nanargmax(arr, axis=1)
array([2, 2, 1, 1], dtype=int64)
"""
values, mask, _, _, _ = _get_values(values, True, fill_value_typ="-inf", mask=mask)
result = values.argmax(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
@disallow("O")
def nanargmin(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> Union[int, np.ndarray]:
"""
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : int or ndarray[int]
The index/indices of min value in specified axis or -1 in the NA case
Examples
--------
>>> import pandas.core.nanops as nanops
>>> arr = np.array([1, 2, 3, np.nan, 4])
>>> nanops.nanargmin(arr)
0
>>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
>>> arr[2:, 0] = np.nan
>>> arr
array([[ 0., 1., 2.],
[ 3., 4., 5.],
[nan, 7., 8.],
[nan, 10., 11.]])
>>> nanops.nanargmin(arr, axis=1)
array([0, 0, 1, 1], dtype=int64)
"""
values, mask, _, _, _ = _get_values(values, True, fill_value_typ="+inf", mask=mask)
result = values.argmin(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
@disallow("M8", "m8")
def nanskew(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> float:
"""
Compute the sample skewness.
The statistic computed here is the adjusted Fisher-Pearson standardized
moment coefficient G1. The algorithm computes this coefficient directly
from the second and third central moment.
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : float64
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, np.nan, 1, 2])
>>> nanops.nanskew(s)
1.7320508075688787
"""
values = extract_array(values, extract_numpy=True)
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
values = values.astype("f8")
count = _get_counts(values.shape, mask, axis)
else:
count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
if skipna and mask is not None:
values = values.copy()
np.putmask(values, mask, 0)
mean = values.sum(axis, dtype=np.float64) / count
if axis is not None:
mean = np.expand_dims(mean, axis)
adjusted = values - mean
if skipna and mask is not None:
np.putmask(adjusted, mask, 0)
adjusted2 = adjusted ** 2
adjusted3 = adjusted2 * adjusted
m2 = adjusted2.sum(axis, dtype=np.float64)
m3 = adjusted3.sum(axis, dtype=np.float64)
# floating point error
#
    # GH#18044: calc_skew in _libs/windows.pyx follows this behavior
    # to fix the floating point error by treating m2 < 1e-14 as zero
m2 = _zero_out_fperr(m2)
m3 = _zero_out_fperr(m3)
with np.errstate(invalid="ignore", divide="ignore"):
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)
dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)
if isinstance(result, np.ndarray):
result = np.where(m2 == 0, 0, result)
result[count < 3] = np.nan
return result
else:
result = 0 if m2 == 0 else result
if count < 3:
return np.nan
return result
@disallow("M8", "m8")
def nankurt(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
mask: Optional[np.ndarray] = None,
) -> float:
"""
Compute the sample excess kurtosis
The statistic computed here is the adjusted Fisher-Pearson standardized
moment coefficient G2, computed directly from the second and fourth
central moment.
Parameters
----------
values : ndarray
    axis : int, optional
skipna : bool, default True
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
result : float64
Unless input is a float array, in which case use the same
precision as the input array.
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, np.nan, 1, 3, 2])
>>> nanops.nankurt(s)
-1.2892561983471076
"""
values = extract_array(values, extract_numpy=True)
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
values = values.astype("f8")
count = _get_counts(values.shape, mask, axis)
else:
count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
if skipna and mask is not None:
values = values.copy()
np.putmask(values, mask, 0)
mean = values.sum(axis, dtype=np.float64) / count
if axis is not None:
mean = np.expand_dims(mean, axis)
adjusted = values - mean
if skipna and mask is not None:
np.putmask(adjusted, mask, 0)
adjusted2 = adjusted ** 2
adjusted4 = adjusted2 ** 2
m2 = adjusted2.sum(axis, dtype=np.float64)
m4 = adjusted4.sum(axis, dtype=np.float64)
with np.errstate(invalid="ignore", divide="ignore"):
adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
numer = count * (count + 1) * (count - 1) * m4
denom = (count - 2) * (count - 3) * m2 ** 2
# floating point error
#
    # GH#18044: calc_kurt in _libs/windows.pyx follows this behavior
    # to fix the floating point error by treating denom < 1e-14 as zero
numer = _zero_out_fperr(numer)
denom = _zero_out_fperr(denom)
if not isinstance(denom, np.ndarray):
# if ``denom`` is a scalar, check these corner cases first before
# doing division
if count < 4:
return np.nan
if denom == 0:
return 0
with np.errstate(invalid="ignore", divide="ignore"):
result = numer / denom - adj
dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)
if isinstance(result, np.ndarray):
result = np.where(denom == 0, 0, result)
result[count < 4] = np.nan
return result
@disallow("M8", "m8")
def nanprod(
values: np.ndarray,
*,
axis: Optional[int] = None,
skipna: bool = True,
min_count: int = 0,
mask: Optional[np.ndarray] = None,
) -> float:
"""
Parameters
----------
values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count : int, default 0
mask : ndarray[bool], optional
nan-mask if known
Returns
-------
Dtype
        The product of all elements on a given axis (NaNs are treated as 1).
Examples
--------
>>> import pandas.core.nanops as nanops
>>> s = pd.Series([1, 2, 3, np.nan])
>>> nanops.nanprod(s)
6.0
"""
mask = _maybe_get_mask(values, skipna, mask)
if skipna and mask is not None:
values = values.copy()
values[mask] = 1
result = values.prod(axis)
return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count)
def _maybe_arg_null_out(
result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool
) -> Union[np.ndarray, int]:
# helper function for nanargmin/nanargmax
if mask is None:
return result
if axis is None or not getattr(result, "ndim", False):
if skipna:
if mask.all():
result = -1
else:
if mask.any():
result = -1
else:
if skipna:
na_mask = mask.all(axis)
else:
na_mask = mask.any(axis)
if na_mask.any():
result[na_mask] = -1
return result
def _get_counts(
values_shape: Tuple[int, ...],
mask: Optional[np.ndarray],
axis: Optional[int],
dtype: Dtype = float,
) -> Union[int, float, np.ndarray]:
"""
Get the count of non-null values along an axis
Parameters
----------
values_shape : tuple of int
shape tuple from values ndarray, used if mask is None
mask : Optional[ndarray[bool]]
locations in values that should be considered missing
axis : Optional[int]
axis to count along
dtype : type, optional
type to use for count
Returns
-------
count : scalar or array
"""
dtype = get_dtype(dtype)
if axis is None:
if mask is not None:
n = mask.size - mask.sum()
else:
n = np.prod(values_shape)
return dtype.type(n)
if mask is not None:
count = mask.shape[axis] - mask.sum(axis)
else:
count = values_shape[axis]
if is_scalar(count):
return dtype.type(count)
try:
return count.astype(dtype)
except AttributeError:
return np.array(count, dtype=dtype)
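# Illustrative sketch: without a mask, the count falls back to the shape along
# the reduction axis:
#
#   >>> _get_counts((2, 3), None, axis=1)
#   3.0
#   >>> _get_counts((2, 3), np.zeros((2, 3), dtype=bool), axis=0)
#   array([2., 2., 2.])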
def _maybe_null_out(
result: np.ndarray,
axis: Optional[int],
mask: Optional[np.ndarray],
shape: Tuple[int, ...],
min_count: int = 1,
) -> Union[np.ndarray, float]:
"""
Returns
-------
    np.ndarray or float
        The input ``result`` with entries set to the appropriate NA value
        wherever the count of non-null values falls below ``min_count``.
"""
if mask is not None and axis is not None and getattr(result, "ndim", False):
null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
if np.any(null_mask):
if is_numeric_dtype(result):
if np.iscomplexobj(result):
result = result.astype("c16")
else:
result = result.astype("f8")
result[null_mask] = np.nan
else:
# GH12941, use None to auto cast null
result[null_mask] = None
elif result is not NaT:
if check_below_min_count(shape, mask, min_count):
result = np.nan
return result
def check_below_min_count(
shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int
) -> bool:
"""
Check for the `min_count` keyword. Returns True if below `min_count` (when
missing value should be returned from the reduction).
Parameters
----------
shape : tuple
The shape of the values (`values.shape`).
mask : ndarray or None
Boolean numpy array (typically of same shape as `shape`) or None.
min_count : int
Keyword passed through from sum/prod call.
Returns
-------
bool
"""
if min_count > 0:
if mask is None:
# no missing values, only check size
non_nulls = np.prod(shape)
else:
non_nulls = mask.size - mask.sum()
if non_nulls < min_count:
return True
return False
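# Illustrative sketch:
#
#   >>> check_below_min_count((4,), None, min_count=5)  # only 4 values present
#   True
#   >>> check_below_min_count((4,), None, min_count=0)  # no requirement
#   False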
def _zero_out_fperr(arg):
    # GH#18044: this behavior is referenced to fix the rolling skew/kurt issue
if isinstance(arg, np.ndarray):
with np.errstate(invalid="ignore"):
return np.where(np.abs(arg) < 1e-14, 0, arg)
else:
return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
@disallow("M8", "m8")
def nancorr(
a: np.ndarray, b: np.ndarray, *, method="pearson", min_periods: Optional[int] = None
):
"""
    a, b : ndarrays of the same length
"""
if len(a) != len(b):
raise AssertionError("Operands to nancorr must have same size")
if min_periods is None:
min_periods = 1
valid = notna(a) & notna(b)
if not valid.all():
a = a[valid]
b = b[valid]
if len(a) < min_periods:
return np.nan
f = get_corr_func(method)
return f(a, b)
def get_corr_func(method):
if method == "kendall":
from scipy.stats import kendalltau
def func(a, b):
return kendalltau(a, b)[0]
return func
elif method == "spearman":
from scipy.stats import spearmanr
def func(a, b):
return spearmanr(a, b)[0]
return func
elif method == "pearson":
def func(a, b):
return np.corrcoef(a, b)[0, 1]
return func
elif callable(method):
return method
raise ValueError(
f"Unknown method '{method}', expected one of "
"'kendall', 'spearman', 'pearson', or callable"
)
@disallow("M8", "m8")
def nancov(
a: np.ndarray,
b: np.ndarray,
*,
min_periods: Optional[int] = None,
ddof: Optional[int] = 1,
):
if len(a) != len(b):
raise AssertionError("Operands to nancov must have same size")
if min_periods is None:
min_periods = 1
valid = notna(a) & notna(b)
if not valid.all():
a = a[valid]
b = b[valid]
if len(a) < min_periods:
return np.nan
return np.cov(a, b, ddof=ddof)[0, 1]
def _ensure_numeric(x):
if isinstance(x, np.ndarray):
if is_integer_dtype(x) or is_bool_dtype(x):
x = x.astype(np.float64)
elif is_object_dtype(x):
try:
x = x.astype(np.complex128)
except (TypeError, ValueError):
try:
x = x.astype(np.float64)
except ValueError as err:
# GH#29941 we get here with object arrays containing strs
raise TypeError(f"Could not convert {x} to numeric") from err
else:
if not np.any(np.imag(x)):
x = x.real
elif not (is_float(x) or is_integer(x) or is_complex(x)):
try:
x = float(x)
except ValueError:
# e.g. "1+1j" or "foo"
try:
x = complex(x)
except ValueError as err:
# e.g. "foo"
raise TypeError(f"Could not convert {x} to numeric") from err
return x
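# Illustrative sketch: object arrays are coerced to numeric, and values that
# cannot be converted raise TypeError:
#
#   >>> _ensure_numeric(np.array(["1", "2"], dtype=object))
#   array([1., 2.])
#   >>> _ensure_numeric("foo")
#   Traceback (most recent call last):
#       ...
#   TypeError: Could not convert foo to numeric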
# NA-friendly array comparisons
def make_nancomp(op):
def f(x, y):
xmask = isna(x)
ymask = isna(y)
mask = xmask | ymask
with np.errstate(all="ignore"):
result = op(x, y)
if mask.any():
if is_bool_dtype(result):
result = result.astype("O")
np.putmask(result, mask, np.nan)
return result
return f
nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
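# Illustrative sketch: the generated comparisons propagate NA by upcasting the
# boolean result to object and masking:
#
#   >>> naneq(np.array([1.0, np.nan]), np.array([1.0, 2.0]))
#   array([True, nan], dtype=object)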
def _nanpercentile_1d(
values: np.ndarray, mask: np.ndarray, q, na_value: Scalar, interpolation
) -> Union[Scalar, np.ndarray]:
"""
Wrapper for np.percentile that skips missing values, specialized to
1-dimensional case.
Parameters
----------
values : array over which to find quantiles
mask : ndarray[bool]
locations in values that should be considered missing
q : scalar or array of quantile indices to find
na_value : scalar
value to return for empty or all-null values
interpolation : str
Returns
-------
quantiles : scalar or array
"""
# mask is Union[ExtensionArray, ndarray]
values = values[~mask]
if len(values) == 0:
if lib.is_scalar(q):
return na_value
else:
return np.array([na_value] * len(q), dtype=values.dtype)
return np.percentile(values, q, interpolation=interpolation)
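# Illustrative sketch: the missing slot is dropped before delegating to
# np.percentile:
#
#   >>> arr = np.array([1.0, 2.0, np.nan, 4.0])
#   >>> _nanpercentile_1d(arr, isna(arr), q=50, na_value=np.nan,
#   ...                   interpolation="linear")
#   2.0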
def nanpercentile(
values: np.ndarray,
q,
*,
axis: int,
na_value,
mask: np.ndarray,
ndim: int,
interpolation,
):
"""
Wrapper for np.percentile that skips missing values.
Parameters
----------
values : array over which to find quantiles
q : scalar or array of quantile indices to find
axis : {0, 1}
na_value : scalar
value to return for empty or all-null values
mask : ndarray[bool]
locations in values that should be considered missing
ndim : {1, 2}
interpolation : str
Returns
-------
quantiles : scalar or array
"""
if values.dtype.kind in ["m", "M"]:
# need to cast to integer to avoid rounding errors in numpy
result = nanpercentile(
values.view("i8"),
q=q,
axis=axis,
na_value=na_value.view("i8"),
mask=mask,
ndim=ndim,
interpolation=interpolation,
)
# Note: we have to do `astype` and not view because in general we
# have float result at this point, not i8
return result.astype(values.dtype)
if not lib.is_scalar(mask) and mask.any():
if ndim == 1:
return _nanpercentile_1d(
values, mask, q, na_value, interpolation=interpolation
)
else:
# for nonconsolidatable blocks mask is 1D, but values 2D
if mask.ndim < values.ndim:
mask = mask.reshape(values.shape)
if axis == 0:
values = values.T
mask = mask.T
result = [
_nanpercentile_1d(val, m, q, na_value, interpolation=interpolation)
for (val, m) in zip(list(values), list(mask))
]
result = np.array(result, dtype=values.dtype, copy=False).T
return result
else:
return np.percentile(values, q, axis=axis, interpolation=interpolation)
def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
"""
Cumulative function with skipna support.
Parameters
----------
values : np.ndarray or ExtensionArray
accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
skipna : bool
Returns
-------
np.ndarray or ExtensionArray
"""
mask_a, mask_b = {
np.cumprod: (1.0, np.nan),
np.maximum.accumulate: (-np.inf, np.nan),
np.cumsum: (0.0, np.nan),
np.minimum.accumulate: (np.inf, np.nan),
}[accum_func]
# We will be applying this function to block values
if values.dtype.kind in ["m", "M"]:
# GH#30460, GH#29058
# numpy 1.18 started sorting NaTs at the end instead of beginning,
# so we need to work around to maintain backwards-consistency.
orig_dtype = values.dtype
# We need to define mask before masking NaTs
mask = isna(values)
if accum_func == np.minimum.accumulate:
# Note: the accum_func comparison fails as an "is" comparison
y = values.view("i8")
y[mask] = np.iinfo(np.int64).max
changed = True
else:
y = values
changed = False
result = accum_func(y.view("i8"), axis=0)
if skipna:
result[mask] = iNaT
elif accum_func == np.minimum.accumulate:
# Restore NaTs that we masked previously
nz = (~np.asarray(mask)).nonzero()[0]
if len(nz):
# everything up to the first non-na entry stays NaT
result[: nz[0]] = iNaT
if changed:
# restore NaT elements
y[mask] = iNaT # TODO: could try/finally for this?
if isinstance(values, np.ndarray):
result = result.view(orig_dtype)
else:
# DatetimeArray
result = type(values)._simple_new( # type: ignore[attr-defined]
result, dtype=orig_dtype
)
elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
vals = values.copy()
mask = isna(vals)
vals[mask] = mask_a
result = accum_func(vals, axis=0)
result[mask] = mask_b
else:
result = accum_func(values, axis=0)
return result
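# Illustrative sketch: with skipna=True, missing positions are neutralized for
# the accumulation and then restored afterwards:
#
#   >>> na_accum_func(np.array([1.0, np.nan, 2.0]), np.cumsum, skipna=True)
#   array([ 1., nan,  3.])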