LSR/env/lib/python3.6/site-packages/pandas/tests/test_nanops.py
2020-06-04 17:24:47 +02:00

1075 lines
37 KiB
Python

from functools import partial
import operator
import warnings
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
from pandas import Series, isna
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
import pandas.core.nanops as nanops
use_bn = nanops._USE_BOTTLENECK
has_c16 = hasattr(np, "complex128")
class TestnanopsDataFrame:
def setup_method(self, method):
np.random.seed(11235)
nanops._USE_BOTTLENECK = False
arr_shape = (11, 7)
self.arr_float = np.random.randn(*arr_shape)
self.arr_float1 = np.random.randn(*arr_shape)
self.arr_complex = self.arr_float + self.arr_float1 * 1j
self.arr_int = np.random.randint(-10, 10, arr_shape)
self.arr_bool = np.random.randint(0, 2, arr_shape) == 0
self.arr_str = np.abs(self.arr_float).astype("S")
self.arr_utf = np.abs(self.arr_float).astype("U")
self.arr_date = np.random.randint(0, 20000, arr_shape).astype("M8[ns]")
self.arr_tdelta = np.random.randint(0, 20000, arr_shape).astype("m8[ns]")
self.arr_nan = np.tile(np.nan, arr_shape)
self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan])
self.arr_float1_nan = np.vstack([self.arr_float1, self.arr_nan])
self.arr_nan_float1 = np.vstack([self.arr_nan, self.arr_float1])
self.arr_nan_nan = np.vstack([self.arr_nan, self.arr_nan])
self.arr_inf = self.arr_float * np.inf
self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf])
self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf])
self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, self.arr_inf])
self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, self.arr_inf])
self.arr_obj = np.vstack(
[
self.arr_float.astype("O"),
self.arr_int.astype("O"),
self.arr_bool.astype("O"),
self.arr_complex.astype("O"),
self.arr_str.astype("O"),
self.arr_utf.astype("O"),
self.arr_date.astype("O"),
self.arr_tdelta.astype("O"),
]
)
with np.errstate(invalid="ignore"):
self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j
self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj])
self.arr_nan_infj = self.arr_inf * 1j
self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj])
self.arr_float_2d = self.arr_float
self.arr_float1_2d = self.arr_float1
self.arr_nan_2d = self.arr_nan
self.arr_float_nan_2d = self.arr_float_nan
self.arr_float1_nan_2d = self.arr_float1_nan
self.arr_nan_float1_2d = self.arr_nan_float1
self.arr_float_1d = self.arr_float[:, 0]
self.arr_float1_1d = self.arr_float1[:, 0]
self.arr_nan_1d = self.arr_nan[:, 0]
self.arr_float_nan_1d = self.arr_float_nan[:, 0]
self.arr_float1_nan_1d = self.arr_float1_nan[:, 0]
self.arr_nan_float1_1d = self.arr_nan_float1[:, 0]
def teardown_method(self, method):
nanops._USE_BOTTLENECK = use_bn
def check_results(self, targ, res, axis, check_dtype=True):
res = getattr(res, "asm8", res)
res = getattr(res, "values", res)
# timedeltas are a beast here
def _coerce_tds(targ, res):
if hasattr(targ, "dtype") and targ.dtype == "m8[ns]":
if len(targ) == 1:
targ = targ[0].item()
res = res.item()
else:
targ = targ.view("i8")
return targ, res
try:
if (
axis != 0
and hasattr(targ, "shape")
and targ.ndim
and targ.shape != res.shape
):
res = np.split(res, [targ.shape[0]], axis=0)[0]
except (ValueError, IndexError):
targ, res = _coerce_tds(targ, res)
try:
tm.assert_almost_equal(targ, res, check_dtype=check_dtype)
except AssertionError:
# handle timedelta dtypes
if hasattr(targ, "dtype") and targ.dtype == "m8[ns]":
targ, res = _coerce_tds(targ, res)
tm.assert_almost_equal(targ, res, check_dtype=check_dtype)
return
# There are sometimes rounding errors with
# complex and object dtypes.
# If it isn't one of those, re-raise the error.
if not hasattr(res, "dtype") or res.dtype.kind not in ["c", "O"]:
raise
# convert object dtypes to something that can be split into
# real and imaginary parts
if res.dtype.kind == "O":
if targ.dtype.kind != "O":
res = res.astype(targ.dtype)
else:
cast_dtype = "c16" if has_c16 else "f8"
res = res.astype(cast_dtype)
targ = targ.astype(cast_dtype)
# there should never be a case where numpy returns an object
# but nanops doesn't, so make that an exception
elif targ.dtype.kind == "O":
raise
tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype)
tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype)
def check_fun_data(
self,
testfunc,
targfunc,
testarval,
targarval,
check_dtype=True,
empty_targfunc=None,
**kwargs,
):
for axis in list(range(targarval.ndim)) + [None]:
for skipna in [False, True]:
targartempval = targarval if skipna else testarval
if skipna and empty_targfunc and isna(targartempval).all():
targ = empty_targfunc(targartempval, axis=axis, **kwargs)
else:
targ = targfunc(targartempval, axis=axis, **kwargs)
res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs)
self.check_results(targ, res, axis, check_dtype=check_dtype)
if skipna:
res = testfunc(testarval, axis=axis, **kwargs)
self.check_results(targ, res, axis, check_dtype=check_dtype)
if axis is None:
res = testfunc(testarval, skipna=skipna, **kwargs)
self.check_results(targ, res, axis, check_dtype=check_dtype)
if skipna and axis is None:
res = testfunc(testarval, **kwargs)
self.check_results(targ, res, axis, check_dtype=check_dtype)
if testarval.ndim <= 1:
return
# Recurse on lower-dimension
testarval2 = np.take(testarval, 0, axis=-1)
targarval2 = np.take(targarval, 0, axis=-1)
self.check_fun_data(
testfunc,
targfunc,
testarval2,
targarval2,
check_dtype=check_dtype,
empty_targfunc=empty_targfunc,
**kwargs,
)
def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs):
targar = testar
if testar.endswith("_nan") and hasattr(self, testar[:-4]):
targar = testar[:-4]
testarval = getattr(self, testar)
targarval = getattr(self, targar)
self.check_fun_data(
testfunc,
targfunc,
testarval,
targarval,
empty_targfunc=empty_targfunc,
**kwargs,
)
def check_funs(
self,
testfunc,
targfunc,
allow_complex=True,
allow_all_nan=True,
allow_date=True,
allow_tdelta=True,
allow_obj=True,
**kwargs,
):
self.check_fun(testfunc, targfunc, "arr_float", **kwargs)
self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs)
self.check_fun(testfunc, targfunc, "arr_int", **kwargs)
self.check_fun(testfunc, targfunc, "arr_bool", **kwargs)
objs = [
self.arr_float.astype("O"),
self.arr_int.astype("O"),
self.arr_bool.astype("O"),
]
if allow_all_nan:
self.check_fun(testfunc, targfunc, "arr_nan", **kwargs)
if allow_complex:
self.check_fun(testfunc, targfunc, "arr_complex", **kwargs)
self.check_fun(testfunc, targfunc, "arr_complex_nan", **kwargs)
if allow_all_nan:
self.check_fun(testfunc, targfunc, "arr_nan_nanj", **kwargs)
objs += [self.arr_complex.astype("O")]
if allow_date:
targfunc(self.arr_date)
self.check_fun(testfunc, targfunc, "arr_date", **kwargs)
objs += [self.arr_date.astype("O")]
if allow_tdelta:
try:
targfunc(self.arr_tdelta)
except TypeError:
pass
else:
self.check_fun(testfunc, targfunc, "arr_tdelta", **kwargs)
objs += [self.arr_tdelta.astype("O")]
if allow_obj:
self.arr_obj = np.vstack(objs)
# some nanops handle object dtypes better than their numpy
# counterparts, so the numpy functions need to be given something
# else
if allow_obj == "convert":
targfunc = partial(
self._badobj_wrap, func=targfunc, allow_complex=allow_complex
)
self.check_fun(testfunc, targfunc, "arr_obj", **kwargs)
def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
if value.dtype.kind == "O":
if allow_complex:
value = value.astype("c16")
else:
value = value.astype("f8")
return func(value, **kwargs)
@pytest.mark.parametrize(
"nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)]
)
def test_nan_funcs(self, nan_op, np_op):
# TODO: allow tdelta, doesn't break tests
self.check_funs(
nan_op, np_op, allow_all_nan=False, allow_date=False, allow_tdelta=False
)
def test_nansum(self):
self.check_funs(
nanops.nansum,
np.sum,
allow_date=False,
check_dtype=False,
empty_targfunc=np.nansum,
)
def test_nanmean(self):
self.check_funs(
nanops.nanmean,
np.mean,
allow_complex=False, # TODO: allow this, doesn't break test
allow_obj=False,
allow_date=False,
)
def test_nanmean_overflow(self):
# GH 10155
# In the previous implementation mean can overflow for int dtypes, it
# is now consistent with numpy
for a in [2 ** 55, -(2 ** 55), 20150515061816532]:
s = Series(a, index=range(500), dtype=np.int64)
result = s.mean()
np_result = s.values.mean()
assert result == a
assert result == np_result
assert result.dtype == np.float64
@pytest.mark.parametrize(
"dtype",
[
np.int16,
np.int32,
np.int64,
np.float32,
np.float64,
getattr(np, "float128", None),
],
)
def test_returned_dtype(self, dtype):
if dtype is None:
# no float128 available
return
s = Series(range(10), dtype=dtype)
group_a = ["mean", "std", "var", "skew", "kurt"]
group_b = ["min", "max"]
for method in group_a + group_b:
result = getattr(s, method)()
if is_integer_dtype(dtype) and method in group_a:
assert result.dtype == np.float64
else:
assert result.dtype == dtype
def test_nanmedian(self):
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
self.check_funs(
nanops.nanmedian,
np.median,
allow_complex=False,
allow_date=False,
allow_obj="convert",
)
@pytest.mark.parametrize("ddof", range(3))
def test_nanvar(self, ddof):
self.check_funs(
nanops.nanvar,
np.var,
allow_complex=False,
allow_date=False,
allow_obj="convert",
ddof=ddof,
)
@pytest.mark.parametrize("ddof", range(3))
def test_nanstd(self, ddof):
self.check_funs(
nanops.nanstd,
np.std,
allow_complex=False,
allow_date=False,
allow_obj="convert",
ddof=ddof,
)
@td.skip_if_no_scipy
@pytest.mark.parametrize("ddof", range(3))
def test_nansem(self, ddof):
from scipy.stats import sem
with np.errstate(invalid="ignore"):
self.check_funs(
nanops.nansem,
sem,
allow_complex=False,
allow_date=False,
allow_tdelta=False,
allow_obj="convert",
ddof=ddof,
)
@pytest.mark.parametrize(
"nan_op,np_op", [(nanops.nanmin, np.min), (nanops.nanmax, np.max)]
)
def test_nanops_with_warnings(self, nan_op, np_op):
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
self.check_funs(nan_op, np_op, allow_obj=False)
def _argminmax_wrap(self, value, axis=None, func=None):
res = func(value, axis)
nans = np.min(value, axis)
nullnan = isna(nans)
if res.ndim:
res[nullnan] = -1
elif (
hasattr(nullnan, "all")
and nullnan.all()
or not hasattr(nullnan, "all")
and nullnan
):
res = -1
return res
def test_nanargmax(self):
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
func = partial(self._argminmax_wrap, func=np.argmax)
self.check_funs(nanops.nanargmax, func, allow_obj=False)
def test_nanargmin(self):
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
func = partial(self._argminmax_wrap, func=np.argmin)
self.check_funs(nanops.nanargmin, func, allow_obj=False)
def _skew_kurt_wrap(self, values, axis=None, func=None):
if not isinstance(values.dtype.type, np.floating):
values = values.astype("f8")
result = func(values, axis=axis, bias=False)
# fix for handling cases where all elements in an axis are the same
if isinstance(result, np.ndarray):
result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0
return result
elif np.max(values) == np.min(values):
return 0.0
return result
@td.skip_if_no_scipy
def test_nanskew(self):
from scipy.stats import skew
func = partial(self._skew_kurt_wrap, func=skew)
with np.errstate(invalid="ignore"):
self.check_funs(
nanops.nanskew,
func,
allow_complex=False,
allow_date=False,
allow_tdelta=False,
)
@td.skip_if_no_scipy
def test_nankurt(self):
from scipy.stats import kurtosis
func1 = partial(kurtosis, fisher=True)
func = partial(self._skew_kurt_wrap, func=func1)
with np.errstate(invalid="ignore"):
self.check_funs(
nanops.nankurt,
func,
allow_complex=False,
allow_date=False,
allow_tdelta=False,
)
def test_nanprod(self):
self.check_funs(
nanops.nanprod,
np.prod,
allow_date=False,
allow_tdelta=False,
empty_targfunc=np.nanprod,
)
def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs)
res01 = checkfun(
self.arr_float_2d,
self.arr_float1_2d,
min_periods=len(self.arr_float_2d) - 1,
**kwargs,
)
tm.assert_almost_equal(targ0, res00)
tm.assert_almost_equal(targ0, res01)
res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, **kwargs)
res11 = checkfun(
self.arr_float_nan_2d,
self.arr_float1_nan_2d,
min_periods=len(self.arr_float_2d) - 1,
**kwargs,
)
tm.assert_almost_equal(targ1, res10)
tm.assert_almost_equal(targ1, res11)
targ2 = np.nan
res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs)
res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, **kwargs)
res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs)
res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, **kwargs)
res24 = checkfun(
self.arr_float_nan_2d,
self.arr_nan_float1_2d,
min_periods=len(self.arr_float_2d) - 1,
**kwargs,
)
res25 = checkfun(
self.arr_float_2d,
self.arr_float1_2d,
min_periods=len(self.arr_float_2d) + 1,
**kwargs,
)
tm.assert_almost_equal(targ2, res20)
tm.assert_almost_equal(targ2, res21)
tm.assert_almost_equal(targ2, res22)
tm.assert_almost_equal(targ2, res23)
tm.assert_almost_equal(targ2, res24)
tm.assert_almost_equal(targ2, res25)
def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs):
res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs)
res01 = checkfun(
self.arr_float_1d,
self.arr_float1_1d,
min_periods=len(self.arr_float_1d) - 1,
**kwargs,
)
tm.assert_almost_equal(targ0, res00)
tm.assert_almost_equal(targ0, res01)
res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, **kwargs)
res11 = checkfun(
self.arr_float_nan_1d,
self.arr_float1_nan_1d,
min_periods=len(self.arr_float_1d) - 1,
**kwargs,
)
tm.assert_almost_equal(targ1, res10)
tm.assert_almost_equal(targ1, res11)
targ2 = np.nan
res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs)
res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs)
res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs)
res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, **kwargs)
res24 = checkfun(
self.arr_float_nan_1d,
self.arr_nan_float1_1d,
min_periods=len(self.arr_float_1d) - 1,
**kwargs,
)
res25 = checkfun(
self.arr_float_1d,
self.arr_float1_1d,
min_periods=len(self.arr_float_1d) + 1,
**kwargs,
)
tm.assert_almost_equal(targ2, res20)
tm.assert_almost_equal(targ2, res21)
tm.assert_almost_equal(targ2, res22)
tm.assert_almost_equal(targ2, res23)
tm.assert_almost_equal(targ2, res24)
tm.assert_almost_equal(targ2, res25)
def test_nancorr(self):
targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1)
targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
def test_nancorr_pearson(self):
targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson")
targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
@td.skip_if_no_scipy
def test_nancorr_kendall(self):
from scipy.stats import kendalltau
targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0]
targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall")
targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0]
targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall")
@td.skip_if_no_scipy
def test_nancorr_spearman(self):
from scipy.stats import spearmanr
targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0]
targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="spearman")
targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0]
targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman")
@td.skip_if_no_scipy
def test_invalid_method(self):
targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'"
with pytest.raises(ValueError, match=msg):
self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo")
def test_nancov(self):
targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1]
targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
self.check_nancorr_nancov_2d(nanops.nancov, targ0, targ1)
targ0 = np.cov(self.arr_float_1d, self.arr_float1_1d)[0, 1]
targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1)
def check_nancomp(self, checkfun, targ0):
arr_float = self.arr_float
arr_float1 = self.arr_float1
arr_nan = self.arr_nan
arr_nan_nan = self.arr_nan_nan
arr_float_nan = self.arr_float_nan
arr_float1_nan = self.arr_float1_nan
arr_nan_float1 = self.arr_nan_float1
while targ0.ndim:
res0 = checkfun(arr_float, arr_float1)
tm.assert_almost_equal(targ0, res0)
if targ0.ndim > 1:
targ1 = np.vstack([targ0, arr_nan])
else:
targ1 = np.hstack([targ0, arr_nan])
res1 = checkfun(arr_float_nan, arr_float1_nan)
tm.assert_numpy_array_equal(targ1, res1, check_dtype=False)
targ2 = arr_nan_nan
res2 = checkfun(arr_float_nan, arr_nan_float1)
tm.assert_numpy_array_equal(targ2, res2, check_dtype=False)
# Lower dimension for next step in the loop
arr_float = np.take(arr_float, 0, axis=-1)
arr_float1 = np.take(arr_float1, 0, axis=-1)
arr_nan = np.take(arr_nan, 0, axis=-1)
arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1)
arr_float_nan = np.take(arr_float_nan, 0, axis=-1)
arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1)
arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1)
targ0 = np.take(targ0, 0, axis=-1)
@pytest.mark.parametrize(
"op,nanop",
[
(operator.eq, nanops.naneq),
(operator.ne, nanops.nanne),
(operator.gt, nanops.nangt),
(operator.ge, nanops.nange),
(operator.lt, nanops.nanlt),
(operator.le, nanops.nanle),
],
)
def test_nan_comparison(self, op, nanop):
targ0 = op(self.arr_float, self.arr_float1)
self.check_nancomp(nanop, targ0)
def check_bool(self, func, value, correct):
while getattr(value, "ndim", True):
res0 = func(value)
if correct:
assert res0
else:
assert not res0
if not hasattr(value, "ndim"):
break
# Reduce dimension for next step in the loop
value = np.take(value, 0, axis=-1)
def test__has_infs(self):
pairs = [
("arr_complex", False),
("arr_int", False),
("arr_bool", False),
("arr_str", False),
("arr_utf", False),
("arr_complex", False),
("arr_complex_nan", False),
("arr_nan_nanj", False),
("arr_nan_infj", True),
("arr_complex_nan_infj", True),
]
pairs_float = [
("arr_float", False),
("arr_nan", False),
("arr_float_nan", False),
("arr_nan_nan", False),
("arr_float_inf", True),
("arr_inf", True),
("arr_nan_inf", True),
("arr_float_nan_inf", True),
("arr_nan_nan_inf", True),
]
for arr, correct in pairs:
val = getattr(self, arr)
self.check_bool(nanops._has_infs, val, correct)
for arr, correct in pairs_float:
val = getattr(self, arr)
self.check_bool(nanops._has_infs, val, correct)
self.check_bool(nanops._has_infs, val.astype("f4"), correct)
self.check_bool(nanops._has_infs, val.astype("f2"), correct)
def test__bn_ok_dtype(self):
assert nanops._bn_ok_dtype(self.arr_float.dtype, "test")
assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test")
assert nanops._bn_ok_dtype(self.arr_int.dtype, "test")
assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test")
assert nanops._bn_ok_dtype(self.arr_str.dtype, "test")
assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test")
assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test")
assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test")
assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test")
class TestEnsureNumeric:
def test_numeric_values(self):
# Test integer
assert nanops._ensure_numeric(1) == 1
# Test float
assert nanops._ensure_numeric(1.1) == 1.1
# Test complex
assert nanops._ensure_numeric(1 + 2j) == 1 + 2j
def test_ndarray(self):
# Test numeric ndarray
values = np.array([1, 2, 3])
assert np.allclose(nanops._ensure_numeric(values), values)
# Test object ndarray
o_values = values.astype(object)
assert np.allclose(nanops._ensure_numeric(o_values), values)
# Test convertible string ndarray
s_values = np.array(["1", "2", "3"], dtype=object)
assert np.allclose(nanops._ensure_numeric(s_values), values)
# Test non-convertible string ndarray
s_values = np.array(["foo", "bar", "baz"], dtype=object)
msg = r"could not convert string to float: '(foo|baz)'"
with pytest.raises(ValueError, match=msg):
nanops._ensure_numeric(s_values)
def test_convertable_values(self):
assert np.allclose(nanops._ensure_numeric("1"), 1.0)
assert np.allclose(nanops._ensure_numeric("1.1"), 1.1)
assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j)
def test_non_convertable_values(self):
msg = "Could not convert foo to numeric"
with pytest.raises(TypeError, match=msg):
nanops._ensure_numeric("foo")
# with the wrong type, python raises TypeError for us
msg = "argument must be a string or a number"
with pytest.raises(TypeError, match=msg):
nanops._ensure_numeric({})
with pytest.raises(TypeError, match=msg):
nanops._ensure_numeric([])
class TestNanvarFixedValues:
# xref GH10242
def setup_method(self, method):
# Samples from a normal distribution.
self.variance = variance = 3.0
self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
def test_nanvar_all_finite(self):
samples = self.samples
actual_variance = nanops.nanvar(samples)
tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2)
def test_nanvar_nans(self):
samples = np.nan * np.ones(2 * self.samples.shape[0])
samples[::2] = self.samples
actual_variance = nanops.nanvar(samples, skipna=True)
tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2)
actual_variance = nanops.nanvar(samples, skipna=False)
tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2)
def test_nanstd_nans(self):
samples = np.nan * np.ones(2 * self.samples.shape[0])
samples[::2] = self.samples
actual_std = nanops.nanstd(samples, skipna=True)
tm.assert_almost_equal(actual_std, self.variance ** 0.5, check_less_precise=2)
actual_std = nanops.nanvar(samples, skipna=False)
tm.assert_almost_equal(actual_std, np.nan, check_less_precise=2)
def test_nanvar_axis(self):
# Generate some sample data.
samples_norm = self.samples
samples_unif = self.prng.uniform(size=samples_norm.shape[0])
samples = np.vstack([samples_norm, samples_unif])
actual_variance = nanops.nanvar(samples, axis=1)
tm.assert_almost_equal(
actual_variance, np.array([self.variance, 1.0 / 12]), check_less_precise=2
)
def test_nanvar_ddof(self):
n = 5
samples = self.prng.uniform(size=(10000, n + 1))
samples[:, -1] = np.nan # Force use of our own algorithm.
variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
# The unbiased estimate.
var = 1.0 / 12
tm.assert_almost_equal(variance_1, var, check_less_precise=2)
# The underestimated variance.
tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, check_less_precise=2)
# The overestimated variance.
tm.assert_almost_equal(
variance_2, (n - 1.0) / (n - 2.0) * var, check_less_precise=2
)
def test_ground_truth(self):
# Test against values that were precomputed with Numpy.
samples = np.empty((4, 4))
samples[:3, :3] = np.array(
[
[0.97303362, 0.21869576, 0.55560287],
[0.72980153, 0.03109364, 0.99155171],
[0.09317602, 0.60078248, 0.15871292],
]
)
samples[3] = samples[:, 3] = np.nan
# Actual variances along axis=0, 1 for ddof=0, 1, 2
variance = np.array(
[
[
[0.13762259, 0.05619224, 0.11568816],
[0.20643388, 0.08428837, 0.17353224],
[0.41286776, 0.16857673, 0.34706449],
],
[
[0.09519783, 0.16435395, 0.05082054],
[0.14279674, 0.24653093, 0.07623082],
[0.28559348, 0.49306186, 0.15246163],
],
]
)
# Test nanvar.
for axis in range(2):
for ddof in range(3):
var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
tm.assert_almost_equal(var[:3], variance[axis, ddof])
assert np.isnan(var[3])
# Test nanstd.
for axis in range(2):
for ddof in range(3):
std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5)
assert np.isnan(std[3])
def test_nanstd_roundoff(self):
# Regression test for GH 10242 (test data taken from GH 10489). Ensure
# that variance is stable.
data = Series(766897346 * np.ones(10))
for ddof in range(3):
result = data.std(ddof=ddof)
assert result == 0.0
@property
def prng(self):
return np.random.RandomState(1234)
class TestNanskewFixedValues:
# xref GH 11974
def setup_method(self, method):
# Test data + skewness value (computed with scipy.stats.skew)
self.samples = np.sin(np.linspace(0, 1, 200))
self.actual_skew = -0.1875895205961754
def test_constant_series(self):
# xref GH 11974
for val in [3075.2, 3075.3, 3075.5]:
data = val * np.ones(300)
skew = nanops.nanskew(data)
assert skew == 0.0
def test_all_finite(self):
alpha, beta = 0.3, 0.1
left_tailed = self.prng.beta(alpha, beta, size=100)
assert nanops.nanskew(left_tailed) < 0
alpha, beta = 0.1, 0.3
right_tailed = self.prng.beta(alpha, beta, size=100)
assert nanops.nanskew(right_tailed) > 0
def test_ground_truth(self):
skew = nanops.nanskew(self.samples)
tm.assert_almost_equal(skew, self.actual_skew)
def test_axis(self):
samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))])
skew = nanops.nanskew(samples, axis=1)
tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan]))
def test_nans(self):
samples = np.hstack([self.samples, np.nan])
skew = nanops.nanskew(samples, skipna=False)
assert np.isnan(skew)
def test_nans_skipna(self):
samples = np.hstack([self.samples, np.nan])
skew = nanops.nanskew(samples, skipna=True)
tm.assert_almost_equal(skew, self.actual_skew)
@property
def prng(self):
return np.random.RandomState(1234)
class TestNankurtFixedValues:
# xref GH 11974
def setup_method(self, method):
# Test data + kurtosis value (computed with scipy.stats.kurtosis)
self.samples = np.sin(np.linspace(0, 1, 200))
self.actual_kurt = -1.2058303433799713
def test_constant_series(self):
# xref GH 11974
for val in [3075.2, 3075.3, 3075.5]:
data = val * np.ones(300)
kurt = nanops.nankurt(data)
assert kurt == 0.0
def test_all_finite(self):
alpha, beta = 0.3, 0.1
left_tailed = self.prng.beta(alpha, beta, size=100)
assert nanops.nankurt(left_tailed) < 0
alpha, beta = 0.1, 0.3
right_tailed = self.prng.beta(alpha, beta, size=100)
assert nanops.nankurt(right_tailed) > 0
def test_ground_truth(self):
kurt = nanops.nankurt(self.samples)
tm.assert_almost_equal(kurt, self.actual_kurt)
def test_axis(self):
samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))])
kurt = nanops.nankurt(samples, axis=1)
tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan]))
def test_nans(self):
samples = np.hstack([self.samples, np.nan])
kurt = nanops.nankurt(samples, skipna=False)
assert np.isnan(kurt)
def test_nans_skipna(self):
samples = np.hstack([self.samples, np.nan])
kurt = nanops.nankurt(samples, skipna=True)
tm.assert_almost_equal(kurt, self.actual_kurt)
@property
def prng(self):
return np.random.RandomState(1234)
class TestDatetime64NaNOps:
@pytest.mark.parametrize("tz", [None, "UTC"])
@pytest.mark.xfail(reason="disabled")
# Enabling mean changes the behavior of DataFrame.mean
# See https://github.com/pandas-dev/pandas/issues/24752
def test_nanmean(self, tz):
dti = pd.date_range("2016-01-01", periods=3, tz=tz)
expected = dti[1]
for obj in [dti, DatetimeArray(dti), Series(dti)]:
result = nanops.nanmean(obj)
assert result == expected
dti2 = dti.insert(1, pd.NaT)
for obj in [dti2, DatetimeArray(dti2), Series(dti2)]:
result = nanops.nanmean(obj)
assert result == expected
def test_use_bottleneck():
if nanops._BOTTLENECK_INSTALLED:
pd.set_option("use_bottleneck", True)
assert pd.get_option("use_bottleneck")
pd.set_option("use_bottleneck", False)
assert not pd.get_option("use_bottleneck")
pd.set_option("use_bottleneck", use_bn)
@pytest.mark.parametrize(
"numpy_op, expected",
[
(np.sum, 10),
(np.nansum, 10),
(np.mean, 2.5),
(np.nanmean, 2.5),
(np.median, 2.5),
(np.nanmedian, 2.5),
(np.min, 1),
(np.max, 4),
(np.nanmin, 1),
(np.nanmax, 4),
],
)
def test_numpy_ops(numpy_op, expected):
# GH8383
result = numpy_op(pd.Series([1, 2, 3, 4]))
assert result == expected
@pytest.mark.parametrize(
"operation",
[
nanops.nanany,
nanops.nanall,
nanops.nansum,
nanops.nanmean,
nanops.nanmedian,
nanops.nanstd,
nanops.nanvar,
nanops.nansem,
nanops.nanargmax,
nanops.nanargmin,
nanops.nanmax,
nanops.nanmin,
nanops.nanskew,
nanops.nankurt,
nanops.nanprod,
],
)
def test_nanops_independent_of_mask_param(operation):
# GH22764
s = pd.Series([1, 2, np.nan, 3, np.nan, 4])
mask = s.isna()
median_expected = operation(s)
median_result = operation(s, mask=mask)
assert median_expected == median_result