1765 lines
62 KiB
Python
1765 lines
62 KiB
Python
|
from datetime import timedelta
|
||
|
from decimal import Decimal
|
||
|
import re
|
||
|
|
||
|
from dateutil.tz import tzlocal
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas.compat import is_platform_windows
|
||
|
import pandas.util._test_decorators as td
|
||
|
|
||
|
from pandas.core.dtypes.common import is_categorical_dtype
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
DataFrame,
|
||
|
Index,
|
||
|
Series,
|
||
|
Timestamp,
|
||
|
date_range,
|
||
|
isna,
|
||
|
notna,
|
||
|
to_datetime,
|
||
|
to_timedelta,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
from pandas.core import (
|
||
|
algorithms,
|
||
|
nanops,
|
||
|
)
|
||
|
|
||
|
|
||
|
def assert_stat_op_calc(
|
||
|
opname,
|
||
|
alternative,
|
||
|
frame,
|
||
|
has_skipna=True,
|
||
|
check_dtype=True,
|
||
|
check_dates=False,
|
||
|
rtol=1e-5,
|
||
|
atol=1e-8,
|
||
|
skipna_alternative=None,
|
||
|
):
|
||
|
"""
|
||
|
Check that operator opname works as advertised on frame
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
opname : str
|
||
|
Name of the operator to test on frame
|
||
|
alternative : function
|
||
|
Function that opname is tested against; i.e. "frame.opname()" should
|
||
|
equal "alternative(frame)".
|
||
|
frame : DataFrame
|
||
|
The object that the tests are executed on
|
||
|
has_skipna : bool, default True
|
||
|
Whether the method "opname" has the kwarg "skip_na"
|
||
|
check_dtype : bool, default True
|
||
|
Whether the dtypes of the result of "frame.opname()" and
|
||
|
"alternative(frame)" should be checked.
|
||
|
check_dates : bool, default false
|
||
|
Whether opname should be tested on a Datetime Series
|
||
|
rtol : float, default 1e-5
|
||
|
Relative tolerance.
|
||
|
atol : float, default 1e-8
|
||
|
Absolute tolerance.
|
||
|
skipna_alternative : function, default None
|
||
|
NaN-safe version of alternative
|
||
|
"""
|
||
|
f = getattr(frame, opname)
|
||
|
|
||
|
if check_dates:
|
||
|
df = DataFrame({"b": date_range("1/1/2001", periods=2)})
|
||
|
with tm.assert_produces_warning(None):
|
||
|
result = getattr(df, opname)()
|
||
|
assert isinstance(result, Series)
|
||
|
|
||
|
df["a"] = range(len(df))
|
||
|
with tm.assert_produces_warning(None):
|
||
|
result = getattr(df, opname)()
|
||
|
assert isinstance(result, Series)
|
||
|
assert len(result)
|
||
|
|
||
|
if has_skipna:
|
||
|
|
||
|
def wrapper(x):
|
||
|
return alternative(x.values)
|
||
|
|
||
|
skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative)
|
||
|
result0 = f(axis=0, skipna=False)
|
||
|
result1 = f(axis=1, skipna=False)
|
||
|
tm.assert_series_equal(
|
||
|
result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol
|
||
|
)
|
||
|
tm.assert_series_equal(
|
||
|
result1,
|
||
|
frame.apply(wrapper, axis=1),
|
||
|
rtol=rtol,
|
||
|
atol=atol,
|
||
|
)
|
||
|
else:
|
||
|
skipna_wrapper = alternative
|
||
|
|
||
|
result0 = f(axis=0)
|
||
|
result1 = f(axis=1)
|
||
|
tm.assert_series_equal(
|
||
|
result0,
|
||
|
frame.apply(skipna_wrapper),
|
||
|
check_dtype=check_dtype,
|
||
|
rtol=rtol,
|
||
|
atol=atol,
|
||
|
)
|
||
|
|
||
|
if opname in ["sum", "prod"]:
|
||
|
expected = frame.apply(skipna_wrapper, axis=1)
|
||
|
tm.assert_series_equal(
|
||
|
result1, expected, check_dtype=False, rtol=rtol, atol=atol
|
||
|
)
|
||
|
|
||
|
# check dtypes
|
||
|
if check_dtype:
|
||
|
lcd_dtype = frame.values.dtype
|
||
|
assert lcd_dtype == result0.dtype
|
||
|
assert lcd_dtype == result1.dtype
|
||
|
|
||
|
# bad axis
|
||
|
with pytest.raises(ValueError, match="No axis named 2"):
|
||
|
f(axis=2)
|
||
|
|
||
|
# all NA case
|
||
|
if has_skipna:
|
||
|
all_na = frame * np.NaN
|
||
|
r0 = getattr(all_na, opname)(axis=0)
|
||
|
r1 = getattr(all_na, opname)(axis=1)
|
||
|
if opname in ["sum", "prod"]:
|
||
|
unit = 1 if opname == "prod" else 0 # result for empty sum/prod
|
||
|
expected = Series(unit, index=r0.index, dtype=r0.dtype)
|
||
|
tm.assert_series_equal(r0, expected)
|
||
|
expected = Series(unit, index=r1.index, dtype=r1.dtype)
|
||
|
tm.assert_series_equal(r1, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameAnalytics:
|
||
|
# ---------------------------------------------------------------------
|
||
|
# Reductions
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
@pytest.mark.parametrize(
|
||
|
"opname",
|
||
|
[
|
||
|
"count",
|
||
|
"sum",
|
||
|
"mean",
|
||
|
"product",
|
||
|
"median",
|
||
|
"min",
|
||
|
"max",
|
||
|
"nunique",
|
||
|
"var",
|
||
|
"std",
|
||
|
"sem",
|
||
|
pytest.param("skew", marks=td.skip_if_no_scipy),
|
||
|
pytest.param("kurt", marks=td.skip_if_no_scipy),
|
||
|
],
|
||
|
)
|
||
|
def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
|
||
|
if (opname in ("sum", "min", "max") and axis == 0) or opname in (
|
||
|
"count",
|
||
|
"nunique",
|
||
|
):
|
||
|
getattr(float_string_frame, opname)(axis=axis)
|
||
|
else:
|
||
|
msg = "|".join(
|
||
|
[
|
||
|
"Could not convert",
|
||
|
"could not convert",
|
||
|
"can't multiply sequence by non-int",
|
||
|
"unsupported operand type",
|
||
|
"not supported between instances of",
|
||
|
]
|
||
|
)
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
getattr(float_string_frame, opname)(axis=axis)
|
||
|
if opname != "nunique":
|
||
|
getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
|
||
|
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
@pytest.mark.parametrize(
|
||
|
"opname",
|
||
|
[
|
||
|
"count",
|
||
|
"sum",
|
||
|
"mean",
|
||
|
"product",
|
||
|
"median",
|
||
|
"min",
|
||
|
"max",
|
||
|
"var",
|
||
|
"std",
|
||
|
"sem",
|
||
|
pytest.param("skew", marks=td.skip_if_no_scipy),
|
||
|
pytest.param("kurt", marks=td.skip_if_no_scipy),
|
||
|
],
|
||
|
)
|
||
|
def test_stat_op_api_float_frame(self, float_frame, axis, opname):
|
||
|
getattr(float_frame, opname)(axis=axis, numeric_only=False)
|
||
|
|
||
|
def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
|
||
|
def count(s):
|
||
|
return notna(s).sum()
|
||
|
|
||
|
def nunique(s):
|
||
|
return len(algorithms.unique1d(s.dropna()))
|
||
|
|
||
|
def var(x):
|
||
|
return np.var(x, ddof=1)
|
||
|
|
||
|
def std(x):
|
||
|
return np.std(x, ddof=1)
|
||
|
|
||
|
def sem(x):
|
||
|
return np.std(x, ddof=1) / np.sqrt(len(x))
|
||
|
|
||
|
assert_stat_op_calc(
|
||
|
"nunique",
|
||
|
nunique,
|
||
|
float_frame_with_na,
|
||
|
has_skipna=False,
|
||
|
check_dtype=False,
|
||
|
check_dates=True,
|
||
|
)
|
||
|
|
||
|
# GH#32571: rol needed for flaky CI builds
|
||
|
# mixed types (with upcasting happening)
|
||
|
assert_stat_op_calc(
|
||
|
"sum",
|
||
|
np.sum,
|
||
|
mixed_float_frame.astype("float32"),
|
||
|
check_dtype=False,
|
||
|
rtol=1e-3,
|
||
|
)
|
||
|
|
||
|
assert_stat_op_calc(
|
||
|
"sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
|
||
|
)
|
||
|
assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
|
||
|
assert_stat_op_calc(
|
||
|
"product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod
|
||
|
)
|
||
|
|
||
|
assert_stat_op_calc("var", var, float_frame_with_na)
|
||
|
assert_stat_op_calc("std", std, float_frame_with_na)
|
||
|
assert_stat_op_calc("sem", sem, float_frame_with_na)
|
||
|
|
||
|
assert_stat_op_calc(
|
||
|
"count",
|
||
|
count,
|
||
|
float_frame_with_na,
|
||
|
has_skipna=False,
|
||
|
check_dtype=False,
|
||
|
check_dates=True,
|
||
|
)
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na):
|
||
|
def skewness(x):
|
||
|
from scipy.stats import skew
|
||
|
|
||
|
if len(x) < 3:
|
||
|
return np.nan
|
||
|
return skew(x, bias=False)
|
||
|
|
||
|
def kurt(x):
|
||
|
from scipy.stats import kurtosis
|
||
|
|
||
|
if len(x) < 4:
|
||
|
return np.nan
|
||
|
return kurtosis(x, bias=False)
|
||
|
|
||
|
assert_stat_op_calc("skew", skewness, float_frame_with_na)
|
||
|
assert_stat_op_calc("kurt", kurt, float_frame_with_na)
|
||
|
|
||
|
def test_median(self, float_frame_with_na, int_frame):
|
||
|
def wrapper(x):
|
||
|
if isna(x).any():
|
||
|
return np.nan
|
||
|
return np.median(x)
|
||
|
|
||
|
assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True)
|
||
|
assert_stat_op_calc(
|
||
|
"median", wrapper, int_frame, check_dtype=False, check_dates=True
|
||
|
)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"]
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"df",
|
||
|
[
|
||
|
DataFrame(
|
||
|
{
|
||
|
"a": [
|
||
|
-0.00049987540199591344,
|
||
|
-0.0016467257772919831,
|
||
|
0.00067695870775883013,
|
||
|
],
|
||
|
"b": [-0, -0, 0.0],
|
||
|
"c": [
|
||
|
0.00031111847529610595,
|
||
|
0.0014902627951905339,
|
||
|
-0.00094099200035979691,
|
||
|
],
|
||
|
},
|
||
|
index=["foo", "bar", "baz"],
|
||
|
dtype="O",
|
||
|
),
|
||
|
DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
|
||
|
],
|
||
|
)
|
||
|
def test_stat_operators_attempt_obj_array(self, method, df, axis):
|
||
|
# GH#676
|
||
|
assert df.values.dtype == np.object_
|
||
|
result = getattr(df, method)(axis=axis)
|
||
|
expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
|
||
|
def test_mixed_ops(self, op):
|
||
|
# GH#16116
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"int": [1, 2, 3, 4],
|
||
|
"float": [1.0, 2.0, 3.0, 4.0],
|
||
|
"str": ["a", "b", "c", "d"],
|
||
|
}
|
||
|
)
|
||
|
msg = "|".join(
|
||
|
[
|
||
|
"Could not convert",
|
||
|
"could not convert",
|
||
|
"can't multiply sequence by non-int",
|
||
|
]
|
||
|
)
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
getattr(df, op)()
|
||
|
|
||
|
with pd.option_context("use_bottleneck", False):
|
||
|
msg = "|".join(
|
||
|
[
|
||
|
"Could not convert",
|
||
|
"could not convert",
|
||
|
"can't multiply sequence by non-int",
|
||
|
]
|
||
|
)
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
getattr(df, op)()
|
||
|
|
||
|
def test_reduce_mixed_frame(self):
|
||
|
# GH 6806
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"bool_data": [True, True, False, False, False],
|
||
|
"int_data": [10, 20, 30, 40, 50],
|
||
|
"string_data": ["a", "b", "c", "d", "e"],
|
||
|
}
|
||
|
)
|
||
|
df.reindex(columns=["bool_data", "int_data", "string_data"])
|
||
|
test = df.sum(axis=0)
|
||
|
tm.assert_numpy_array_equal(
|
||
|
test.values, np.array([2, 150, "abcde"], dtype=object)
|
||
|
)
|
||
|
alt = df.T.sum(axis=1)
|
||
|
tm.assert_series_equal(test, alt)
|
||
|
|
||
|
def test_nunique(self):
|
||
|
df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]})
|
||
|
tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2}))
|
||
|
tm.assert_series_equal(
|
||
|
df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3})
|
||
|
)
|
||
|
tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
|
||
|
tm.assert_series_equal(
|
||
|
df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2})
|
||
|
)
|
||
|
|
||
|
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||
|
def test_mean_mixed_datetime_numeric(self, tz):
|
||
|
# https://github.com/pandas-dev/pandas/issues/24752
|
||
|
df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2})
|
||
|
result = df.mean()
|
||
|
expected = Series([1.0, Timestamp("2000", tz=tz)], index=["A", "B"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("tz", [None, "UTC"])
|
||
|
def test_mean_includes_datetimes(self, tz):
|
||
|
# https://github.com/pandas-dev/pandas/issues/24752
|
||
|
# Behavior in 0.24.0rc1 was buggy.
|
||
|
# As of 2.0 with numeric_only=None we do *not* drop datetime columns
|
||
|
df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2})
|
||
|
result = df.mean()
|
||
|
|
||
|
expected = Series([Timestamp("2000", tz=tz)], index=["A"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_mean_mixed_string_decimal(self):
|
||
|
# GH 11670
|
||
|
# possible bug when calculating mean of DataFrame?
|
||
|
|
||
|
d = [
|
||
|
{"A": 2, "B": None, "C": Decimal("628.00")},
|
||
|
{"A": 1, "B": None, "C": Decimal("383.00")},
|
||
|
{"A": 3, "B": None, "C": Decimal("651.00")},
|
||
|
{"A": 2, "B": None, "C": Decimal("575.00")},
|
||
|
{"A": 4, "B": None, "C": Decimal("1114.00")},
|
||
|
{"A": 1, "B": "TEST", "C": Decimal("241.00")},
|
||
|
{"A": 2, "B": None, "C": Decimal("572.00")},
|
||
|
{"A": 4, "B": None, "C": Decimal("609.00")},
|
||
|
{"A": 3, "B": None, "C": Decimal("820.00")},
|
||
|
{"A": 5, "B": None, "C": Decimal("1223.00")},
|
||
|
]
|
||
|
|
||
|
df = DataFrame(d)
|
||
|
|
||
|
with pytest.raises(TypeError, match="unsupported operand type"):
|
||
|
df.mean()
|
||
|
result = df[["A", "C"]].mean()
|
||
|
expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_var_std(self, datetime_frame):
|
||
|
result = datetime_frame.std(ddof=4)
|
||
|
expected = datetime_frame.apply(lambda x: x.std(ddof=4))
|
||
|
tm.assert_almost_equal(result, expected)
|
||
|
|
||
|
result = datetime_frame.var(ddof=4)
|
||
|
expected = datetime_frame.apply(lambda x: x.var(ddof=4))
|
||
|
tm.assert_almost_equal(result, expected)
|
||
|
|
||
|
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
|
||
|
result = nanops.nanvar(arr, axis=0)
|
||
|
assert not (result < 0).any()
|
||
|
|
||
|
with pd.option_context("use_bottleneck", False):
|
||
|
result = nanops.nanvar(arr, axis=0)
|
||
|
assert not (result < 0).any()
|
||
|
|
||
|
@pytest.mark.parametrize("meth", ["sem", "var", "std"])
|
||
|
def test_numeric_only_flag(self, meth):
|
||
|
# GH 9201
|
||
|
df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"])
|
||
|
# Cast to object to avoid implicit cast when setting entry to "100" below
|
||
|
df1 = df1.astype({"foo": object})
|
||
|
# set one entry to a number in str format
|
||
|
df1.loc[0, "foo"] = "100"
|
||
|
|
||
|
df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"])
|
||
|
# Cast to object to avoid implicit cast when setting entry to "a" below
|
||
|
df2 = df2.astype({"foo": object})
|
||
|
# set one entry to a non-number str
|
||
|
df2.loc[0, "foo"] = "a"
|
||
|
|
||
|
result = getattr(df1, meth)(axis=1, numeric_only=True)
|
||
|
expected = getattr(df1[["bar", "baz"]], meth)(axis=1)
|
||
|
tm.assert_series_equal(expected, result)
|
||
|
|
||
|
result = getattr(df2, meth)(axis=1, numeric_only=True)
|
||
|
expected = getattr(df2[["bar", "baz"]], meth)(axis=1)
|
||
|
tm.assert_series_equal(expected, result)
|
||
|
|
||
|
# df1 has all numbers, df2 has a letter inside
|
||
|
msg = r"unsupported operand type\(s\) for -: 'float' and 'str'"
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
getattr(df1, meth)(axis=1, numeric_only=False)
|
||
|
msg = "could not convert string to float: 'a'"
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
getattr(df2, meth)(axis=1, numeric_only=False)
|
||
|
|
||
|
def test_sem(self, datetime_frame):
|
||
|
result = datetime_frame.sem(ddof=4)
|
||
|
expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x)))
|
||
|
tm.assert_almost_equal(result, expected)
|
||
|
|
||
|
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
|
||
|
result = nanops.nansem(arr, axis=0)
|
||
|
assert not (result < 0).any()
|
||
|
|
||
|
with pd.option_context("use_bottleneck", False):
|
||
|
result = nanops.nansem(arr, axis=0)
|
||
|
assert not (result < 0).any()
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"dropna, expected",
|
||
|
[
|
||
|
(
|
||
|
True,
|
||
|
{
|
||
|
"A": [12],
|
||
|
"B": [10.0],
|
||
|
"C": [1.0],
|
||
|
"D": ["a"],
|
||
|
"E": Categorical(["a"], categories=["a"]),
|
||
|
"F": to_datetime(["2000-1-2"]),
|
||
|
"G": to_timedelta(["1 days"]),
|
||
|
},
|
||
|
),
|
||
|
(
|
||
|
False,
|
||
|
{
|
||
|
"A": [12],
|
||
|
"B": [10.0],
|
||
|
"C": [np.nan],
|
||
|
"D": np.array([np.nan], dtype=object),
|
||
|
"E": Categorical([np.nan], categories=["a"]),
|
||
|
"F": [pd.NaT],
|
||
|
"G": to_timedelta([pd.NaT]),
|
||
|
},
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
{
|
||
|
"H": [8, 9, np.nan, np.nan],
|
||
|
"I": [8, 9, np.nan, np.nan],
|
||
|
"J": [1, np.nan, np.nan, np.nan],
|
||
|
"K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
|
||
|
"L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]),
|
||
|
"M": to_timedelta(["1 days", "nan", "nan", "nan"]),
|
||
|
"N": [0, 1, 2, 3],
|
||
|
},
|
||
|
),
|
||
|
(
|
||
|
False,
|
||
|
{
|
||
|
"H": [8, 9, np.nan, np.nan],
|
||
|
"I": [8, 9, np.nan, np.nan],
|
||
|
"J": [1, np.nan, np.nan, np.nan],
|
||
|
"K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
|
||
|
"L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
|
||
|
"M": to_timedelta(["nan", "1 days", "nan", "nan"]),
|
||
|
"N": [0, 1, 2, 3],
|
||
|
},
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_mode_dropna(self, dropna, expected):
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": [12, 12, 19, 11],
|
||
|
"B": [10, 10, np.nan, 3],
|
||
|
"C": [1, np.nan, np.nan, np.nan],
|
||
|
"D": [np.nan, np.nan, "a", np.nan],
|
||
|
"E": Categorical([np.nan, np.nan, "a", np.nan]),
|
||
|
"F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
|
||
|
"G": to_timedelta(["1 days", "nan", "nan", "nan"]),
|
||
|
"H": [8, 8, 9, 9],
|
||
|
"I": [9, 9, 8, 8],
|
||
|
"J": [1, 1, np.nan, np.nan],
|
||
|
"K": Categorical(["a", np.nan, "a", np.nan]),
|
||
|
"L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]),
|
||
|
"M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
|
||
|
"N": np.arange(4, dtype="int64"),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
result = df[sorted(expected.keys())].mode(dropna=dropna)
|
||
|
expected = DataFrame(expected)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_mode_sortwarning(self):
|
||
|
# Check for the warning that is raised when the mode
|
||
|
# results cannot be sorted
|
||
|
|
||
|
df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
|
||
|
expected = DataFrame({"A": ["a", np.nan]})
|
||
|
|
||
|
with tm.assert_produces_warning(UserWarning):
|
||
|
result = df.mode(dropna=False)
|
||
|
result = result.sort_values(by="A").reset_index(drop=True)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_mode_empty_df(self):
|
||
|
df = DataFrame([], columns=["a", "b"])
|
||
|
result = df.mode()
|
||
|
expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_operators_timedelta64(self):
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": date_range("2012-1-1", periods=3, freq="D"),
|
||
|
"B": date_range("2012-1-2", periods=3, freq="D"),
|
||
|
"C": Timestamp("20120101") - timedelta(minutes=5, seconds=5),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]})
|
||
|
|
||
|
# min
|
||
|
result = diffs.min()
|
||
|
assert result[0] == diffs.loc[0, "A"]
|
||
|
assert result[1] == diffs.loc[0, "B"]
|
||
|
|
||
|
result = diffs.min(axis=1)
|
||
|
assert (result == diffs.loc[0, "B"]).all()
|
||
|
|
||
|
# max
|
||
|
result = diffs.max()
|
||
|
assert result[0] == diffs.loc[2, "A"]
|
||
|
assert result[1] == diffs.loc[2, "B"]
|
||
|
|
||
|
result = diffs.max(axis=1)
|
||
|
assert (result == diffs["A"]).all()
|
||
|
|
||
|
# abs
|
||
|
result = diffs.abs()
|
||
|
result2 = abs(diffs)
|
||
|
expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]})
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
tm.assert_frame_equal(result2, expected)
|
||
|
|
||
|
# mixed frame
|
||
|
mixed = diffs.copy()
|
||
|
mixed["C"] = "foo"
|
||
|
mixed["D"] = 1
|
||
|
mixed["E"] = 1.0
|
||
|
mixed["F"] = Timestamp("20130101")
|
||
|
|
||
|
# results in an object array
|
||
|
result = mixed.min()
|
||
|
expected = Series(
|
||
|
[
|
||
|
pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
|
||
|
pd.Timedelta(timedelta(days=-1)),
|
||
|
"foo",
|
||
|
1,
|
||
|
1.0,
|
||
|
Timestamp("20130101"),
|
||
|
],
|
||
|
index=mixed.columns,
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# excludes non-numeric
|
||
|
result = mixed.min(axis=1, numeric_only=True)
|
||
|
expected = Series([1, 1, 1.0], index=[0, 1, 2])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# works when only those columns are selected
|
||
|
result = mixed[["A", "B"]].min(1)
|
||
|
expected = Series([timedelta(days=-1)] * 3)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = mixed[["A", "B"]].min()
|
||
|
expected = Series(
|
||
|
[timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"]
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# GH 3106
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"time": date_range("20130102", periods=5),
|
||
|
"time2": date_range("20130105", periods=5),
|
||
|
}
|
||
|
)
|
||
|
df["off1"] = df["time2"] - df["time"]
|
||
|
assert df["off1"].dtype == "timedelta64[ns]"
|
||
|
|
||
|
df["off2"] = df["time"] - df["time2"]
|
||
|
df._consolidate_inplace()
|
||
|
assert df["off1"].dtype == "timedelta64[ns]"
|
||
|
assert df["off2"].dtype == "timedelta64[ns]"
|
||
|
|
||
|
def test_std_timedelta64_skipna_false(self):
|
||
|
# GH#37392
|
||
|
tdi = pd.timedelta_range("1 Day", periods=10)
|
||
|
df = DataFrame({"A": tdi, "B": tdi}, copy=True)
|
||
|
df.iloc[-2, -1] = pd.NaT
|
||
|
|
||
|
result = df.std(skipna=False)
|
||
|
expected = Series(
|
||
|
[df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]"
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.std(axis=1, skipna=False)
|
||
|
expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
|
||
|
)
|
||
|
def test_std_datetime64_with_nat(
|
||
|
self, values, skipna, using_array_manager, request
|
||
|
):
|
||
|
# GH#51335
|
||
|
if using_array_manager and (
|
||
|
not skipna or all(value is pd.NaT for value in values)
|
||
|
):
|
||
|
mark = pytest.mark.xfail(
|
||
|
reason="GH#51446: Incorrect type inference on NaT in reduction result"
|
||
|
)
|
||
|
request.node.add_marker(mark)
|
||
|
df = DataFrame({"a": to_datetime(values)})
|
||
|
result = df.std(skipna=skipna)
|
||
|
if not skipna or all(value is pd.NaT for value in values):
|
||
|
expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]")
|
||
|
else:
|
||
|
# 86400000000000ns == 1 day
|
||
|
expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]")
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_sum_corner(self):
|
||
|
empty_frame = DataFrame()
|
||
|
|
||
|
axis0 = empty_frame.sum(0)
|
||
|
axis1 = empty_frame.sum(1)
|
||
|
assert isinstance(axis0, Series)
|
||
|
assert isinstance(axis1, Series)
|
||
|
assert len(axis0) == 0
|
||
|
assert len(axis1) == 0
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"index",
|
||
|
[
|
||
|
tm.makeRangeIndex(0),
|
||
|
tm.makeDateIndex(0),
|
||
|
tm.makeNumericIndex(0, dtype=int),
|
||
|
tm.makeNumericIndex(0, dtype=float),
|
||
|
tm.makeDateIndex(0, freq="M"),
|
||
|
tm.makePeriodIndex(0),
|
||
|
],
|
||
|
)
|
||
|
def test_axis_1_empty(self, all_reductions, index, using_array_manager):
|
||
|
df = DataFrame(columns=["a"], index=index)
|
||
|
result = getattr(df, all_reductions)(axis=1)
|
||
|
if all_reductions in ("any", "all"):
|
||
|
expected_dtype = "bool"
|
||
|
elif all_reductions == "count":
|
||
|
expected_dtype = "int64"
|
||
|
else:
|
||
|
expected_dtype = "object"
|
||
|
expected = Series([], index=index, dtype=expected_dtype)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
|
||
|
@pytest.mark.parametrize("numeric_only", [None, True, False])
|
||
|
def test_sum_prod_nanops(self, method, unit, numeric_only):
|
||
|
idx = ["a", "b", "c"]
|
||
|
df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]})
|
||
|
# The default
|
||
|
result = getattr(df, method)(numeric_only=numeric_only)
|
||
|
expected = Series([unit, unit, unit], index=idx, dtype="float64")
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# min_count=1
|
||
|
result = getattr(df, method)(numeric_only=numeric_only, min_count=1)
|
||
|
expected = Series([unit, unit, np.nan], index=idx)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# min_count=0
|
||
|
result = getattr(df, method)(numeric_only=numeric_only, min_count=0)
|
||
|
expected = Series([unit, unit, unit], index=idx, dtype="float64")
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1)
|
||
|
expected = Series([unit, np.nan, np.nan], index=idx)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# min_count > 1
|
||
|
df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
|
||
|
result = getattr(df, method)(numeric_only=numeric_only, min_count=5)
|
||
|
expected = Series(result, index=["A", "B"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = getattr(df, method)(numeric_only=numeric_only, min_count=6)
|
||
|
expected = Series(result, index=["A", "B"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_sum_nanops_timedelta(self):
|
||
|
# prod isn't defined on timedeltas
|
||
|
idx = ["a", "b", "c"]
|
||
|
df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})
|
||
|
|
||
|
df2 = df.apply(to_timedelta)
|
||
|
|
||
|
# 0 by default
|
||
|
result = df2.sum()
|
||
|
expected = Series([0, 0, 0], dtype="m8[ns]", index=idx)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# min_count=0
|
||
|
result = df2.sum(min_count=0)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# min_count=1
|
||
|
result = df2.sum(min_count=1)
|
||
|
expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_sum_nanops_min_count(self):
|
||
|
# https://github.com/pandas-dev/pandas/issues/39738
|
||
|
df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
|
||
|
result = df.sum(min_count=10)
|
||
|
expected = Series([np.nan, np.nan], index=["x", "y"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
|
||
|
@pytest.mark.parametrize(
|
||
|
"kwargs, expected_result",
|
||
|
[
|
||
|
({"axis": 1, "min_count": 2}, [3.2, 5.3, np.NaN]),
|
||
|
({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]),
|
||
|
({"axis": 1, "skipna": False}, [3.2, 5.3, np.NaN]),
|
||
|
],
|
||
|
)
|
||
|
def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
|
||
|
# GH#46947
|
||
|
df = DataFrame({"a": [1.0, 2.3, 4.4], "b": [2.2, 3, np.nan]}, dtype=float_type)
|
||
|
result = df.sum(**kwargs)
|
||
|
expected = Series(expected_result).astype(float_type)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
|
||
|
@pytest.mark.parametrize(
|
||
|
"kwargs, expected_result",
|
||
|
[
|
||
|
({"axis": 1, "min_count": 2}, [2.0, 4.0, np.NaN]),
|
||
|
({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]),
|
||
|
({"axis": 1, "skipna": False}, [2.0, 4.0, np.NaN]),
|
||
|
],
|
||
|
)
|
||
|
def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
|
||
|
# GH#46947
|
||
|
df = DataFrame(
|
||
|
{"a": [1.0, 2.0, 4.4], "b": [2.0, 2.0, np.nan]}, dtype=float_type
|
||
|
)
|
||
|
result = df.prod(**kwargs)
|
||
|
expected = Series(expected_result).astype(float_type)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_sum_object(self, float_frame):
|
||
|
values = float_frame.values.astype(int)
|
||
|
frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns)
|
||
|
deltas = frame * timedelta(1)
|
||
|
deltas.sum()
|
||
|
|
||
|
def test_sum_bool(self, float_frame):
|
||
|
# ensure this works, bug report
|
||
|
bools = np.isnan(float_frame)
|
||
|
bools.sum(1)
|
||
|
bools.sum(0)
|
||
|
|
||
|
def test_sum_mixed_datetime(self):
|
||
|
# GH#30886
|
||
|
df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex(
|
||
|
[2, 3, 4]
|
||
|
)
|
||
|
with pytest.raises(TypeError, match="does not support reduction 'sum'"):
|
||
|
df.sum()
|
||
|
|
||
|
def test_mean_corner(self, float_frame, float_string_frame):
|
||
|
# unit test when have object data
|
||
|
with pytest.raises(TypeError, match="Could not convert"):
|
||
|
float_string_frame.mean(axis=0)
|
||
|
|
||
|
# xs sum mixed type, just want to know it works...
|
||
|
with pytest.raises(TypeError, match="unsupported operand type"):
|
||
|
float_string_frame.mean(axis=1)
|
||
|
|
||
|
# take mean of boolean column
|
||
|
float_frame["bool"] = float_frame["A"] > 0
|
||
|
means = float_frame.mean(0)
|
||
|
assert means["bool"] == float_frame["bool"].values.mean()
|
||
|
|
||
|
def test_mean_datetimelike(self):
|
||
|
# GH#24757 check that datetimelike are excluded by default, handled
|
||
|
# correctly with numeric_only=True
|
||
|
# As of 2.0, datetimelike are *not* excluded with numeric_only=None
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": np.arange(3),
|
||
|
"B": date_range("2016-01-01", periods=3),
|
||
|
"C": pd.timedelta_range("1D", periods=3),
|
||
|
"D": pd.period_range("2016", periods=3, freq="A"),
|
||
|
}
|
||
|
)
|
||
|
result = df.mean(numeric_only=True)
|
||
|
expected = Series({"A": 1.0})
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"):
|
||
|
df.mean()
|
||
|
|
||
|
def test_mean_datetimelike_numeric_only_false(self):
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": np.arange(3),
|
||
|
"B": date_range("2016-01-01", periods=3),
|
||
|
"C": pd.timedelta_range("1D", periods=3),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
# datetime(tz) and timedelta work
|
||
|
result = df.mean(numeric_only=False)
|
||
|
expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# mean of period is not allowed
|
||
|
df["D"] = pd.period_range("2016", periods=3, freq="A")
|
||
|
|
||
|
with pytest.raises(TypeError, match="mean is not implemented for Period"):
|
||
|
df.mean(numeric_only=False)
|
||
|
|
||
|
def test_mean_extensionarray_numeric_only_true(self):
|
||
|
# https://github.com/pandas-dev/pandas/issues/33256
|
||
|
arr = np.random.randint(1000, size=(10, 5))
|
||
|
df = DataFrame(arr, dtype="Int64")
|
||
|
result = df.mean(numeric_only=True)
|
||
|
expected = DataFrame(arr).mean()
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_stats_mixed_type(self, float_string_frame):
|
||
|
with pytest.raises(TypeError, match="could not convert"):
|
||
|
float_string_frame.std(1)
|
||
|
with pytest.raises(TypeError, match="could not convert"):
|
||
|
float_string_frame.var(1)
|
||
|
with pytest.raises(TypeError, match="unsupported operand type"):
|
||
|
float_string_frame.mean(1)
|
||
|
with pytest.raises(TypeError, match="could not convert"):
|
||
|
float_string_frame.skew(1)
|
||
|
|
||
|
def test_sum_bools(self):
|
||
|
df = DataFrame(index=range(1), columns=range(10))
|
||
|
bools = isna(df)
|
||
|
assert bools.sum(axis=1)[0] == 10
|
||
|
|
||
|
# ----------------------------------------------------------------------
|
||
|
# Index of max / min
|
||
|
|
||
|
@pytest.mark.parametrize("skipna", [True, False])
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
def test_idxmin(self, float_frame, int_frame, skipna, axis):
|
||
|
frame = float_frame
|
||
|
frame.iloc[5:10] = np.nan
|
||
|
frame.iloc[15:20, -2:] = np.nan
|
||
|
for df in [frame, int_frame]:
|
||
|
result = df.idxmin(axis=axis, skipna=skipna)
|
||
|
expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("numeric_only", [True, False])
|
||
|
def test_idxmin_numeric_only(self, numeric_only):
|
||
|
df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
|
||
|
if numeric_only:
|
||
|
result = df.idxmin(numeric_only=numeric_only)
|
||
|
expected = Series([2, 1], index=["a", "b"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
with pytest.raises(TypeError, match="not allowed for this dtype"):
|
||
|
df.idxmin(numeric_only=numeric_only)
|
||
|
|
||
|
def test_idxmin_axis_2(self, float_frame):
|
||
|
frame = float_frame
|
||
|
msg = "No axis named 2 for object type DataFrame"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
frame.idxmin(axis=2)
|
||
|
|
||
|
@pytest.mark.parametrize("skipna", [True, False])
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
def test_idxmax(self, float_frame, int_frame, skipna, axis):
|
||
|
frame = float_frame
|
||
|
frame.iloc[5:10] = np.nan
|
||
|
frame.iloc[15:20, -2:] = np.nan
|
||
|
for df in [frame, int_frame]:
|
||
|
result = df.idxmax(axis=axis, skipna=skipna)
|
||
|
expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("numeric_only", [True, False])
|
||
|
def test_idxmax_numeric_only(self, numeric_only):
|
||
|
df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
|
||
|
if numeric_only:
|
||
|
result = df.idxmax(numeric_only=numeric_only)
|
||
|
expected = Series([1, 0], index=["a", "b"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
with pytest.raises(TypeError, match="not allowed for this dtype"):
|
||
|
df.idxmin(numeric_only=numeric_only)
|
||
|
|
||
|
def test_idxmax_axis_2(self, float_frame):
|
||
|
frame = float_frame
|
||
|
msg = "No axis named 2 for object type DataFrame"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
frame.idxmax(axis=2)
|
||
|
|
||
|
def test_idxmax_mixed_dtype(self):
|
||
|
# don't cast to object, which would raise in nanops
|
||
|
dti = date_range("2016-01-01", periods=3)
|
||
|
|
||
|
# Copying dti is needed for ArrayManager otherwise when we set
|
||
|
# df.loc[0, 3] = pd.NaT below it edits dti
|
||
|
df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})
|
||
|
|
||
|
result = df.idxmax()
|
||
|
expected = Series([1, 0, 2], index=[1, 2, 3])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.idxmin()
|
||
|
expected = Series([0, 2, 0], index=[1, 2, 3])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# with NaTs
|
||
|
df.loc[0, 3] = pd.NaT
|
||
|
result = df.idxmax()
|
||
|
expected = Series([1, 0, 2], index=[1, 2, 3])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.idxmin()
|
||
|
expected = Series([0, 2, 1], index=[1, 2, 3])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# with multi-column dt64 block
|
||
|
df[4] = dti[::-1]
|
||
|
df._consolidate_inplace()
|
||
|
|
||
|
result = df.idxmax()
|
||
|
expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.idxmin()
|
||
|
expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"op, expected_value",
|
||
|
[("idxmax", [0, 4]), ("idxmin", [0, 5])],
|
||
|
)
|
||
|
def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
|
||
|
# GH 40346
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"ID": [100, 100, 100, 200, 200, 200],
|
||
|
"value": [0, 0, 0, 1, 2, 0],
|
||
|
},
|
||
|
dtype="Int64",
|
||
|
)
|
||
|
df = df.groupby("ID")
|
||
|
|
||
|
result = getattr(df, op)()
|
||
|
expected = DataFrame(
|
||
|
{"value": expected_value},
|
||
|
index=Index([100, 200], name="ID", dtype="Int64"),
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_idxmax_dt64_multicolumn_axis1(self):
|
||
|
dti = date_range("2016-01-01", periods=3)
|
||
|
df = DataFrame({3: dti, 4: dti[::-1]}, copy=True)
|
||
|
df.iloc[0, 0] = pd.NaT
|
||
|
|
||
|
df._consolidate_inplace()
|
||
|
|
||
|
result = df.idxmax(axis=1)
|
||
|
expected = Series([4, 3, 3])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.idxmin(axis=1)
|
||
|
expected = Series([4, 3, 4])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# ----------------------------------------------------------------------
|
||
|
# Logical reductions
|
||
|
|
||
|
@pytest.mark.parametrize("opname", ["any", "all"])
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
@pytest.mark.parametrize("bool_only", [False, True])
|
||
|
def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame):
|
||
|
# make sure op works on mixed-type frame
|
||
|
mixed = float_string_frame
|
||
|
mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5
|
||
|
|
||
|
getattr(mixed, opname)(axis=axis, bool_only=bool_only)
|
||
|
|
||
|
@pytest.mark.parametrize("opname", ["any", "all"])
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na):
|
||
|
getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False)
|
||
|
|
||
|
@pytest.mark.parametrize("opname", ["any", "all"])
|
||
|
def test_any_all_bool_frame(self, opname, bool_frame_with_na):
|
||
|
# GH#12863: numpy gives back non-boolean data for object type
|
||
|
# so fill NaNs to compare with pandas behavior
|
||
|
frame = bool_frame_with_na.fillna(True)
|
||
|
alternative = getattr(np, opname)
|
||
|
f = getattr(frame, opname)
|
||
|
|
||
|
def skipna_wrapper(x):
|
||
|
nona = x.dropna().values
|
||
|
return alternative(nona)
|
||
|
|
||
|
def wrapper(x):
|
||
|
return alternative(x.values)
|
||
|
|
||
|
result0 = f(axis=0, skipna=False)
|
||
|
result1 = f(axis=1, skipna=False)
|
||
|
|
||
|
tm.assert_series_equal(result0, frame.apply(wrapper))
|
||
|
tm.assert_series_equal(result1, frame.apply(wrapper, axis=1))
|
||
|
|
||
|
result0 = f(axis=0)
|
||
|
result1 = f(axis=1)
|
||
|
|
||
|
tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
|
||
|
tm.assert_series_equal(
|
||
|
result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False
|
||
|
)
|
||
|
|
||
|
# bad axis
|
||
|
with pytest.raises(ValueError, match="No axis named 2"):
|
||
|
f(axis=2)
|
||
|
|
||
|
# all NA case
|
||
|
all_na = frame * np.NaN
|
||
|
r0 = getattr(all_na, opname)(axis=0)
|
||
|
r1 = getattr(all_na, opname)(axis=1)
|
||
|
if opname == "any":
|
||
|
assert not r0.any()
|
||
|
assert not r1.any()
|
||
|
else:
|
||
|
assert r0.all()
|
||
|
assert r1.all()
|
||
|
|
||
|
def test_any_all_extra(self):
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": [True, False, False],
|
||
|
"B": [True, True, False],
|
||
|
"C": [True, True, True],
|
||
|
},
|
||
|
index=["a", "b", "c"],
|
||
|
)
|
||
|
result = df[["A", "B"]].any(axis=1)
|
||
|
expected = Series([True, True, False], index=["a", "b", "c"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df[["A", "B"]].any(axis=1, bool_only=True)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.all(1)
|
||
|
expected = Series([True, False, False], index=["a", "b", "c"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.all(1, bool_only=True)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# Axis is None
|
||
|
result = df.all(axis=None).item()
|
||
|
assert result is False
|
||
|
|
||
|
result = df.any(axis=None).item()
|
||
|
assert result is True
|
||
|
|
||
|
result = df[["C"]].all(axis=None).item()
|
||
|
assert result is True
|
||
|
|
||
|
@pytest.mark.parametrize("axis", [0, 1])
|
||
|
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
|
||
|
@pytest.mark.parametrize("skipna", [True, False])
|
||
|
def test_any_all_object_dtype(self, axis, bool_agg_func, skipna):
|
||
|
# GH#35450
|
||
|
df = DataFrame(
|
||
|
data=[
|
||
|
[1, np.nan, np.nan, True],
|
||
|
[np.nan, 2, np.nan, True],
|
||
|
[np.nan, np.nan, np.nan, True],
|
||
|
[np.nan, np.nan, "5", np.nan],
|
||
|
]
|
||
|
)
|
||
|
result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna)
|
||
|
expected = Series([True, True, True, True])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
# GH#50947 deprecates this but it is not emitting a warning in some builds.
|
||
|
@pytest.mark.filterwarnings(
|
||
|
"ignore:'any' with datetime64 dtypes is deprecated.*:FutureWarning"
|
||
|
)
|
||
|
def test_any_datetime(self):
|
||
|
# GH 23070
|
||
|
float_data = [1, np.nan, 3, np.nan]
|
||
|
datetime_data = [
|
||
|
Timestamp("1960-02-15"),
|
||
|
Timestamp("1960-02-16"),
|
||
|
pd.NaT,
|
||
|
pd.NaT,
|
||
|
]
|
||
|
df = DataFrame({"A": float_data, "B": datetime_data})
|
||
|
|
||
|
result = df.any(axis=1)
|
||
|
|
||
|
expected = Series([True, True, True, False])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_any_all_bool_only(self):
|
||
|
# GH 25101
|
||
|
df = DataFrame(
|
||
|
{"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}
|
||
|
)
|
||
|
|
||
|
result = df.all(bool_only=True)
|
||
|
expected = Series(dtype=np.bool_, index=[])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"col1": [1, 2, 3],
|
||
|
"col2": [4, 5, 6],
|
||
|
"col3": [None, None, None],
|
||
|
"col4": [False, False, True],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
result = df.all(bool_only=True)
|
||
|
expected = Series({"col4": False})
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"func, data, expected",
|
||
|
[
|
||
|
(np.any, {}, False),
|
||
|
(np.all, {}, True),
|
||
|
(np.any, {"A": []}, False),
|
||
|
(np.all, {"A": []}, True),
|
||
|
(np.any, {"A": [False, False]}, False),
|
||
|
(np.all, {"A": [False, False]}, False),
|
||
|
(np.any, {"A": [True, False]}, True),
|
||
|
(np.all, {"A": [True, False]}, False),
|
||
|
(np.any, {"A": [True, True]}, True),
|
||
|
(np.all, {"A": [True, True]}, True),
|
||
|
(np.any, {"A": [False], "B": [False]}, False),
|
||
|
(np.all, {"A": [False], "B": [False]}, False),
|
||
|
(np.any, {"A": [False, False], "B": [False, True]}, True),
|
||
|
(np.all, {"A": [False, False], "B": [False, True]}, False),
|
||
|
# other types
|
||
|
(np.all, {"A": Series([0.0, 1.0], dtype="float")}, False),
|
||
|
(np.any, {"A": Series([0.0, 1.0], dtype="float")}, True),
|
||
|
(np.all, {"A": Series([0, 1], dtype=int)}, False),
|
||
|
(np.any, {"A": Series([0, 1], dtype=int)}, True),
|
||
|
pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False),
|
||
|
pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False),
|
||
|
pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True),
|
||
|
pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True),
|
||
|
pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True),
|
||
|
pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
|
||
|
pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True),
|
||
|
pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
|
||
|
pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False),
|
||
|
pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
|
||
|
pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
|
||
|
pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
|
||
|
# np.all on Categorical raises, so the reduction drops the
|
||
|
# column, so all is being done on an empty Series, so is True
|
||
|
(np.all, {"A": Series([0, 1], dtype="category")}, True),
|
||
|
(np.any, {"A": Series([0, 1], dtype="category")}, False),
|
||
|
(np.all, {"A": Series([1, 2], dtype="category")}, True),
|
||
|
(np.any, {"A": Series([1, 2], dtype="category")}, False),
|
||
|
# Mix GH#21484
|
||
|
pytest.param(
|
||
|
np.all,
|
||
|
{
|
||
|
"A": Series([10, 20], dtype="M8[ns]"),
|
||
|
"B": Series([10, 20], dtype="m8[ns]"),
|
||
|
},
|
||
|
True,
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_any_all_np_func(self, func, data, expected):
|
||
|
# GH 19976
|
||
|
data = DataFrame(data)
|
||
|
|
||
|
if any(is_categorical_dtype(x) for x in data.dtypes):
|
||
|
with pytest.raises(
|
||
|
TypeError, match="dtype category does not support reduction"
|
||
|
):
|
||
|
func(data)
|
||
|
|
||
|
# method version
|
||
|
with pytest.raises(
|
||
|
TypeError, match="dtype category does not support reduction"
|
||
|
):
|
||
|
getattr(DataFrame(data), func.__name__)(axis=None)
|
||
|
else:
|
||
|
msg = "'(any|all)' with datetime64 dtypes is deprecated"
|
||
|
if data.dtypes.apply(lambda x: x.kind == "M").any():
|
||
|
warn = FutureWarning
|
||
|
else:
|
||
|
warn = None
|
||
|
|
||
|
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
|
||
|
# GH#34479
|
||
|
result = func(data)
|
||
|
assert isinstance(result, np.bool_)
|
||
|
assert result.item() is expected
|
||
|
|
||
|
# method version
|
||
|
with tm.assert_produces_warning(warn, match=msg):
|
||
|
# GH#34479
|
||
|
result = getattr(DataFrame(data), func.__name__)(axis=None)
|
||
|
assert isinstance(result, np.bool_)
|
||
|
assert result.item() is expected
|
||
|
|
||
|
def test_any_all_object(self):
|
||
|
# GH 19976
|
||
|
result = np.all(DataFrame(columns=["a", "b"])).item()
|
||
|
assert result is True
|
||
|
|
||
|
result = np.any(DataFrame(columns=["a", "b"])).item()
|
||
|
assert result is False
|
||
|
|
||
|
def test_any_all_object_bool_only(self):
|
||
|
df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
|
||
|
df._consolidate_inplace()
|
||
|
df["C"] = Series([True, True])
|
||
|
|
||
|
# Categorical of bools is _not_ considered booly
|
||
|
df["D"] = df["C"].astype("category")
|
||
|
|
||
|
# The underlying bug is in DataFrame._get_bool_data, so we check
|
||
|
# that while we're here
|
||
|
res = df._get_bool_data()
|
||
|
expected = df[["C"]]
|
||
|
tm.assert_frame_equal(res, expected)
|
||
|
|
||
|
res = df.all(bool_only=True, axis=0)
|
||
|
expected = Series([True], index=["C"])
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
# operating on a subset of columns should not produce a _larger_ Series
|
||
|
res = df[["B", "C"]].all(bool_only=True, axis=0)
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
assert df.all(bool_only=True, axis=None)
|
||
|
|
||
|
res = df.any(bool_only=True, axis=0)
|
||
|
expected = Series([True], index=["C"])
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
# operating on a subset of columns should not produce a _larger_ Series
|
||
|
res = df[["C"]].any(bool_only=True, axis=0)
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
assert df.any(bool_only=True, axis=None)
|
||
|
|
||
|
# ---------------------------------------------------------------------
|
||
|
# Unsorted
|
||
|
|
||
|
def test_series_broadcasting(self):
|
||
|
# smoke test for numpy warnings
|
||
|
# GH 16378, GH 16306
|
||
|
df = DataFrame([1.0, 1.0, 1.0])
|
||
|
df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
|
||
|
s = Series([1, 1, 1])
|
||
|
s_nan = Series([np.nan, np.nan, 1])
|
||
|
|
||
|
with tm.assert_produces_warning(None):
|
||
|
df_nan.clip(lower=s, axis=0)
|
||
|
for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
|
||
|
getattr(df, op)(s_nan, axis=0)
|
||
|
|
||
|
|
||
|
class TestDataFrameReductions:
|
||
|
def test_min_max_dt64_with_NaT(self):
|
||
|
# Both NaT and Timestamp are in DataFrame.
|
||
|
df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]})
|
||
|
|
||
|
res = df.min()
|
||
|
exp = Series([Timestamp("2012-05-01")], index=["foo"])
|
||
|
tm.assert_series_equal(res, exp)
|
||
|
|
||
|
res = df.max()
|
||
|
exp = Series([Timestamp("2012-05-01")], index=["foo"])
|
||
|
tm.assert_series_equal(res, exp)
|
||
|
|
||
|
# GH12941, only NaTs are in DataFrame.
|
||
|
df = DataFrame({"foo": [pd.NaT, pd.NaT]})
|
||
|
|
||
|
res = df.min()
|
||
|
exp = Series([pd.NaT], index=["foo"])
|
||
|
tm.assert_series_equal(res, exp)
|
||
|
|
||
|
res = df.max()
|
||
|
exp = Series([pd.NaT], index=["foo"])
|
||
|
tm.assert_series_equal(res, exp)
|
||
|
|
||
|
def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture):
|
||
|
# GH#36907
|
||
|
tz = tz_naive_fixture
|
||
|
if isinstance(tz, tzlocal) and is_platform_windows():
|
||
|
pytest.skip(
|
||
|
"GH#37659 OSError raised within tzlocal bc Windows "
|
||
|
"chokes in times before 1970-01-01"
|
||
|
)
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"a": [
|
||
|
Timestamp("2020-01-01 08:00:00", tz=tz),
|
||
|
Timestamp("1920-02-01 09:00:00", tz=tz),
|
||
|
],
|
||
|
"b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
|
||
|
}
|
||
|
)
|
||
|
res = df.min(axis=1, skipna=False)
|
||
|
expected = Series([df.loc[0, "a"], pd.NaT])
|
||
|
assert expected.dtype == df["a"].dtype
|
||
|
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
res = df.max(axis=1, skipna=False)
|
||
|
expected = Series([df.loc[0, "b"], pd.NaT])
|
||
|
assert expected.dtype == df["a"].dtype
|
||
|
|
||
|
tm.assert_series_equal(res, expected)
|
||
|
|
||
|
def test_min_max_dt64_api_consistency_with_NaT(self):
|
||
|
# Calling the following sum functions returned an error for dataframes but
|
||
|
# returned NaT for series. These tests check that the API is consistent in
|
||
|
# min/max calls on empty Series/DataFrames. See GH:33704 for more
|
||
|
# information
|
||
|
df = DataFrame({"x": to_datetime([])})
|
||
|
expected_dt_series = Series(to_datetime([]))
|
||
|
# check axis 0
|
||
|
assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
|
||
|
assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)
|
||
|
|
||
|
# check axis 1
|
||
|
tm.assert_series_equal(df.min(axis=1), expected_dt_series)
|
||
|
tm.assert_series_equal(df.max(axis=1), expected_dt_series)
|
||
|
|
||
|
def test_min_max_dt64_api_consistency_empty_df(self):
|
||
|
# check DataFrame/Series api consistency when calling min/max on an empty
|
||
|
# DataFrame/Series.
|
||
|
df = DataFrame({"x": []})
|
||
|
expected_float_series = Series([], dtype=float)
|
||
|
# check axis 0
|
||
|
assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
|
||
|
assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
|
||
|
# check axis 1
|
||
|
tm.assert_series_equal(df.min(axis=1), expected_float_series)
|
||
|
tm.assert_series_equal(df.min(axis=1), expected_float_series)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"initial",
|
||
|
["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"], # Non-UTC timezone
|
||
|
)
|
||
|
@pytest.mark.parametrize("method", ["min", "max"])
|
||
|
def test_preserve_timezone(self, initial: str, method):
|
||
|
# GH 28552
|
||
|
initial_dt = to_datetime(initial)
|
||
|
expected = Series([initial_dt])
|
||
|
df = DataFrame([expected])
|
||
|
result = getattr(df, method)(axis=1)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["min", "max"])
|
||
|
def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
|
||
|
# GH#51242
|
||
|
val = to_datetime("1900-01-01", utc=True)
|
||
|
df = DataFrame(
|
||
|
{"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
|
||
|
)
|
||
|
op = getattr(df, method)
|
||
|
result = op(axis=1, skipna=skipna)
|
||
|
if skipna:
|
||
|
expected = Series([pd.NaT, val, val])
|
||
|
else:
|
||
|
expected = Series([pd.NaT, pd.NaT, val])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_frame_any_with_timedelta(self):
|
||
|
# GH#17667
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"a": Series([0, 0]),
|
||
|
"t": Series([to_timedelta(0, "s"), to_timedelta(1, "ms")]),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
result = df.any(axis=0)
|
||
|
expected = Series(data=[False, True], index=["a", "t"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.any(axis=1)
|
||
|
expected = Series(data=[False, True])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_reductions_skipna_none_raises(
|
||
|
self, request, frame_or_series, all_reductions
|
||
|
):
|
||
|
if all_reductions == "count":
|
||
|
request.node.add_marker(
|
||
|
pytest.mark.xfail(reason="Count does not accept skipna")
|
||
|
)
|
||
|
obj = frame_or_series([1, 2, 3])
|
||
|
msg = 'For argument "skipna" expected type bool, received type NoneType.'
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
getattr(obj, all_reductions)(skipna=None)
|
||
|
|
||
|
@td.skip_array_manager_invalid_test
|
||
|
def test_reduction_timestamp_smallest_unit(self):
|
||
|
# GH#52524
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"a": Series([Timestamp("2019-12-31")], dtype="datetime64[s]"),
|
||
|
"b": Series(
|
||
|
[Timestamp("2019-12-31 00:00:00.123")], dtype="datetime64[ms]"
|
||
|
),
|
||
|
}
|
||
|
)
|
||
|
result = df.max()
|
||
|
expected = Series(
|
||
|
[Timestamp("2019-12-31"), Timestamp("2019-12-31 00:00:00.123")],
|
||
|
dtype="datetime64[ms]",
|
||
|
index=["a", "b"],
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@td.skip_array_manager_not_yet_implemented
|
||
|
def test_reduction_timedelta_smallest_unit(self):
|
||
|
# GH#52524
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"a": Series([pd.Timedelta("1 days")], dtype="timedelta64[s]"),
|
||
|
"b": Series([pd.Timedelta("1 days")], dtype="timedelta64[ms]"),
|
||
|
}
|
||
|
)
|
||
|
result = df.max()
|
||
|
expected = Series(
|
||
|
[pd.Timedelta("1 days"), pd.Timedelta("1 days")],
|
||
|
dtype="timedelta64[ms]",
|
||
|
index=["a", "b"],
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestNuisanceColumns:
|
||
|
@pytest.mark.parametrize("method", ["any", "all"])
|
||
|
def test_any_all_categorical_dtype_nuisance_column(self, method):
|
||
|
# GH#36076 DataFrame should match Series behavior
|
||
|
ser = Series([0, 1], dtype="category", name="A")
|
||
|
df = ser.to_frame()
|
||
|
|
||
|
# Double-check the Series behavior is to raise
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
getattr(ser, method)()
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
getattr(np, method)(ser)
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
getattr(df, method)(bool_only=False)
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
getattr(df, method)(bool_only=None)
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
getattr(np, method)(df, axis=0)
|
||
|
|
||
|
def test_median_categorical_dtype_nuisance_column(self):
|
||
|
# GH#21020 DataFrame.median should match Series.median
|
||
|
df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
|
||
|
ser = df["A"]
|
||
|
|
||
|
# Double-check the Series behavior is to raise
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
ser.median()
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
df.median(numeric_only=False)
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
df.median()
|
||
|
|
||
|
# same thing, but with an additional non-categorical column
|
||
|
df["B"] = df["A"].astype(int)
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
df.median(numeric_only=False)
|
||
|
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
df.median()
|
||
|
|
||
|
# TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
|
||
|
# of expected.values
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["min", "max"])
|
||
|
def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
|
||
|
# GH#28949 DataFrame.min should behave like Series.min
|
||
|
cat = Categorical(["a", "b", "c", "b"], ordered=False)
|
||
|
ser = Series(cat)
|
||
|
df = ser.to_frame("A")
|
||
|
|
||
|
# Double-check the Series behavior
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(ser, method)()
|
||
|
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(np, method)(ser)
|
||
|
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(df, method)(numeric_only=False)
|
||
|
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(df, method)()
|
||
|
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(np, method)(df, axis=0)
|
||
|
|
||
|
# same thing, but with an additional non-categorical column
|
||
|
df["B"] = df["A"].astype(object)
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(df, method)()
|
||
|
|
||
|
with pytest.raises(TypeError, match="is not ordered for operation"):
|
||
|
getattr(np, method)(df, axis=0)
|
||
|
|
||
|
|
||
|
def test_sum_timedelta64_skipna_false(using_array_manager, request):
|
||
|
# GH#17235
|
||
|
if using_array_manager:
|
||
|
mark = pytest.mark.xfail(
|
||
|
reason="Incorrect type inference on NaT in reduction result"
|
||
|
)
|
||
|
request.node.add_marker(mark)
|
||
|
|
||
|
arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
|
||
|
arr[-1, -1] = "Nat"
|
||
|
|
||
|
df = DataFrame(arr)
|
||
|
assert (df.dtypes == arr.dtype).all()
|
||
|
|
||
|
result = df.sum(skipna=False)
|
||
|
expected = Series([pd.Timedelta(seconds=12), pd.NaT], dtype="m8[s]")
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.sum(axis=0, skipna=False)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.sum(axis=1, skipna=False)
|
||
|
expected = Series(
|
||
|
[
|
||
|
pd.Timedelta(seconds=1),
|
||
|
pd.Timedelta(seconds=5),
|
||
|
pd.Timedelta(seconds=9),
|
||
|
pd.NaT,
|
||
|
],
|
||
|
dtype="m8[s]",
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_mixed_frame_with_integer_sum():
|
||
|
# https://github.com/pandas-dev/pandas/issues/34520
|
||
|
df = DataFrame([["a", 1]], columns=list("ab"))
|
||
|
df = df.astype({"b": "Int64"})
|
||
|
result = df.sum()
|
||
|
expected = Series(["a", 1], index=["a", "b"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("numeric_only", [True, False, None])
|
||
|
@pytest.mark.parametrize("method", ["min", "max"])
|
||
|
def test_minmax_extensionarray(method, numeric_only):
|
||
|
# https://github.com/pandas-dev/pandas/issues/32651
|
||
|
int64_info = np.iinfo("int64")
|
||
|
ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
|
||
|
df = DataFrame({"Int64": ser})
|
||
|
result = getattr(df, method)(numeric_only=numeric_only)
|
||
|
expected = Series(
|
||
|
[getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
|
||
|
def test_frame_mixed_numeric_object_with_timestamp(ts_value):
|
||
|
# GH 13912
|
||
|
df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]})
|
||
|
with pytest.raises(TypeError, match="does not support reduction"):
|
||
|
df.sum()
|
||
|
|
||
|
|
||
|
def test_prod_sum_min_count_mixed_object():
|
||
|
# https://github.com/pandas-dev/pandas/issues/41074
|
||
|
df = DataFrame([1, "a", True])
|
||
|
|
||
|
result = df.prod(axis=0, min_count=1, numeric_only=False)
|
||
|
expected = Series(["a"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
df.sum(axis=0, min_count=1, numeric_only=False)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
|
||
|
@pytest.mark.parametrize("numeric_only", [True, False])
|
||
|
def test_reduction_axis_none_returns_scalar(method, numeric_only):
|
||
|
# GH#21597 As of 2.0, axis=None reduces over all axes.
|
||
|
|
||
|
df = DataFrame(np.random.randn(4, 4))
|
||
|
|
||
|
result = getattr(df, method)(axis=None, numeric_only=numeric_only)
|
||
|
np_arr = df.to_numpy()
|
||
|
if method in {"skew", "kurt"}:
|
||
|
comp_mod = pytest.importorskip("scipy.stats")
|
||
|
if method == "kurt":
|
||
|
method = "kurtosis"
|
||
|
expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
|
||
|
tm.assert_almost_equal(result, expected)
|
||
|
else:
|
||
|
expected = getattr(np, method)(np_arr, axis=None)
|
||
|
assert result == expected
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"kernel",
|
||
|
[
|
||
|
"corr",
|
||
|
"corrwith",
|
||
|
"cov",
|
||
|
"idxmax",
|
||
|
"idxmin",
|
||
|
"kurt",
|
||
|
"max",
|
||
|
"mean",
|
||
|
"median",
|
||
|
"min",
|
||
|
"prod",
|
||
|
"quantile",
|
||
|
"sem",
|
||
|
"skew",
|
||
|
"std",
|
||
|
"sum",
|
||
|
"var",
|
||
|
],
|
||
|
)
|
||
|
def test_fails_on_non_numeric(kernel):
|
||
|
# GH#46852
|
||
|
df = DataFrame({"a": [1, 2, 3], "b": object})
|
||
|
args = (df,) if kernel == "corrwith" else ()
|
||
|
msg = "|".join(
|
||
|
[
|
||
|
"not allowed for this dtype",
|
||
|
"argument must be a string or a number",
|
||
|
"not supported between instances of",
|
||
|
"unsupported operand type",
|
||
|
"argument must be a string or a real number",
|
||
|
]
|
||
|
)
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
getattr(df, kernel)(*args)
|