projektAI/venv/Lib/site-packages/pandas/tests/groupby/test_function.py
2021-06-06 22:13:05 +02:00

1124 lines
35 KiB
Python

import builtins
from io import StringIO
import numpy as np
import pytest
from pandas.errors import UnsupportedFunctionCall
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna
import pandas._testing as tm
import pandas.core.nanops as nanops
from pandas.util import _test_decorators as td
@pytest.fixture(
params=[np.int32, np.int64, np.float32, np.float64],
ids=["np.int32", "np.int64", "np.float32", "np.float64"],
)
def numpy_dtypes_for_minmax(request):
"""
Fixture of numpy dtypes with min and max values used for testing
cummin and cummax
"""
dtype = request.param
min_val = (
np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min
)
max_val = (
np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max
)
return (dtype, min_val, max_val)
@pytest.mark.parametrize("agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
"vals",
[
["foo", "bar", "baz"],
["foo", "", ""],
["", "", ""],
[1, 2, 3],
[1, 0, 0],
[0, 0, 0],
[1.0, 2.0, 3.0],
[1.0, 0.0, 0.0],
[0.0, 0.0, 0.0],
[True, True, True],
[True, False, False],
[False, False, False],
[np.nan, np.nan, np.nan],
],
)
def test_groupby_bool_aggs(agg_func, skipna, vals):
df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})
# Figure out expectation using Python builtin
exp = getattr(builtins, agg_func)(vals)
# edge case for missing data with skipna and 'any'
if skipna and all(isna(vals)) and agg_func == "any":
exp = False
exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key"))
result = getattr(df.groupby("key"), agg_func)(skipna=skipna)
tm.assert_frame_equal(result, exp_df)
def test_max_min_non_numeric():
# #2700
aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})
result = aa.groupby("nn").max()
assert "ss" in result
result = aa.groupby("nn").max(numeric_only=False)
assert "ss" in result
result = aa.groupby("nn").min()
assert "ss" in result
result = aa.groupby("nn").min(numeric_only=False)
assert "ss" in result
def test_min_date_with_nans():
# GH26321
dates = pd.to_datetime(
Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
).dt.date
df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
result = df.groupby("b", as_index=False)["c"].min()["c"]
expected = pd.to_datetime(
Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
).dt.date
tm.assert_series_equal(result, expected)
result = df.groupby("b")["c"].min()
expected.index.name = "b"
tm.assert_series_equal(result, expected)
def test_intercept_builtin_sum():
s = Series([1.0, 2.0, np.nan, 3.0])
grouped = s.groupby([0, 1, 2, 2])
result = grouped.agg(builtins.sum)
result2 = grouped.apply(builtins.sum)
expected = grouped.sum()
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result2, expected)
# @pytest.mark.parametrize("f", [max, min, sum])
# def test_builtins_apply(f):
@pytest.mark.parametrize("f", [max, min, sum])
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key
def test_builtins_apply(keys, f):
# see gh-8155
df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"])
df["jolie"] = np.random.randn(1000)
fname = f.__name__
result = df.groupby(keys).apply(f)
ngroups = len(df.drop_duplicates(subset=keys))
assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
assert result.shape == (ngroups, 3), assert_msg
tm.assert_frame_equal(
result, # numpy's equivalent function
df.groupby(keys).apply(getattr(np, fname)),
)
if f != sum:
expected = df.groupby(keys).agg(fname).reset_index()
expected.set_index(keys, inplace=True, drop=False)
tm.assert_frame_equal(result, expected, check_dtype=False)
tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
class TestNumericOnly:
# make sure that we are passing thru kwargs to our agg functions
@pytest.fixture
def df(self):
# GH3668
# GH5724
df = DataFrame(
{
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": list("abc"),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
"datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
"timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
},
columns=[
"group",
"int",
"float",
"string",
"category_string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
],
)
return df
@pytest.mark.parametrize("method", ["mean", "median"])
def test_averages(self, df, method):
# mean / median
expected_columns_numeric = Index(["int", "float", "category_int"])
gb = df.groupby("group")
expected = DataFrame(
{
"category_int": [7.5, 9],
"float": [4.5, 6.0],
"timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
"int": [1.5, 3],
"datetime": [
Timestamp("2013-01-01 12:00:00"),
Timestamp("2013-01-03 00:00:00"),
],
"datetimetz": [
Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
],
},
index=Index([1, 2], name="group"),
columns=[
"int",
"float",
"category_int",
"datetime",
"datetimetz",
"timedelta",
],
)
result = getattr(gb, method)(numeric_only=False)
tm.assert_frame_equal(result.reindex_like(expected), expected)
expected_columns = expected.columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["min", "max"])
def test_extrema(self, df, method):
# TODO: min, max *should* handle
# categorical (ordered) dtype
expected_columns = Index(
[
"int",
"float",
"string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
)
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last(self, df, method):
expected_columns = Index(
[
"int",
"float",
"string",
"category_string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
)
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["sum", "cumsum"])
def test_sum_cumsum(self, df, method):
expected_columns_numeric = Index(["int", "float", "category_int"])
expected_columns = Index(
["int", "float", "string", "category_int", "timedelta"]
)
if method == "cumsum":
# cumsum loses string
expected_columns = Index(["int", "float", "category_int", "timedelta"])
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["prod", "cumprod"])
def test_prod_cumprod(self, df, method):
expected_columns = Index(["int", "float", "category_int"])
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_cummax(self, df, method):
# like min, max, but don't include strings
expected_columns = Index(
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)
# GH#15561: numeric_only=False set by default like min/max
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
def _check(self, df, method, expected_columns, expected_columns_numeric):
gb = df.groupby("group")
result = getattr(gb, method)()
tm.assert_index_equal(result.columns, expected_columns_numeric)
result = getattr(gb, method)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
class TestGroupByNonCythonPaths:
# GH#5610 non-cython calls should not include the grouper
# Tests for code not expected to go through cython paths.
@pytest.fixture
def df(self):
df = DataFrame(
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
columns=["A", "B", "C"],
)
return df
@pytest.fixture
def gb(self, df):
gb = df.groupby("A")
return gb
@pytest.fixture
def gni(self, df):
gni = df.groupby("A", as_index=False)
return gni
# TODO: non-unique columns, as_index=False
def test_idxmax(self, gb):
# object dtype so idxmax goes through _aggregate_item_by_item
# GH#5610
# non-cython calls should not include the grouper
expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
expected.index.name = "A"
result = gb.idxmax()
tm.assert_frame_equal(result, expected)
def test_idxmin(self, gb):
# object dtype so idxmax goes through _aggregate_item_by_item
# GH#5610
# non-cython calls should not include the grouper
expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
expected.index.name = "A"
result = gb.idxmin()
tm.assert_frame_equal(result, expected)
def test_any(self, gb):
expected = DataFrame(
[[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
)
expected.index.name = "A"
result = gb.any()
tm.assert_frame_equal(result, expected)
def test_mad(self, gb, gni):
# mad
expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
expected.index.name = "A"
result = gb.mad()
tm.assert_frame_equal(result, expected)
expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
result = gni.mad()
tm.assert_frame_equal(result, expected)
def test_describe(self, df, gb, gni):
# describe
expected_index = Index([1, 3], name="A")
expected_col = pd.MultiIndex(
levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
codes=[[0] * 8, list(range(8))],
)
expected = DataFrame(
[
[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
[0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
],
index=expected_index,
columns=expected_col,
)
result = gb.describe()
tm.assert_frame_equal(result, expected)
expected = pd.concat(
[
df[df.A == 1].describe().unstack().to_frame().T,
df[df.A == 3].describe().unstack().to_frame().T,
]
)
expected.index = Index([0, 1])
result = gni.describe()
tm.assert_frame_equal(result, expected)
def test_cython_api2():
# this takes the fast apply path
# cumsum (GH5614)
df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
result = df.groupby("A").cumsum()
tm.assert_frame_equal(result, expected)
# GH 5755 - cumsum is a transformer and should ignore as_index
result = df.groupby("A", as_index=False).cumsum()
tm.assert_frame_equal(result, expected)
# GH 13994
result = df.groupby("A").cumsum(axis=1)
expected = df.cumsum(axis=1)
tm.assert_frame_equal(result, expected)
result = df.groupby("A").cumprod(axis=1)
expected = df.cumprod(axis=1)
tm.assert_frame_equal(result, expected)
def test_cython_median():
df = DataFrame(np.random.randn(1000))
df.values[::2] = np.nan
labels = np.random.randint(0, 50, size=1000).astype(float)
labels[::17] = np.nan
result = df.groupby(labels).median()
exp = df.groupby(labels).agg(nanops.nanmedian)
tm.assert_frame_equal(result, exp)
df = DataFrame(np.random.randn(1000, 5))
rs = df.groupby(labels).agg(np.median)
xp = df.groupby(labels).median()
tm.assert_frame_equal(rs, xp)
def test_median_empty_bins(observed):
df = DataFrame(np.random.randint(0, 44, 500))
grps = range(0, 55, 5)
bins = pd.cut(df[0], grps)
result = df.groupby(bins, observed=observed).median()
expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
)
@pytest.mark.parametrize(
"method,data",
[
("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}),
("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
],
)
def test_groupby_non_arithmetic_agg_types(dtype, method, data):
# GH9311, GH6620
df = DataFrame(
[{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
)
df["b"] = df.b.astype(dtype)
if "args" not in data:
data["args"] = []
if "out_type" in data:
out_type = data["out_type"]
else:
out_type = dtype
exp = data["df"]
df_out = DataFrame(exp)
df_out["b"] = df_out.b.astype(out_type)
df_out.set_index("a", inplace=True)
grpd = df.groupby("a")
t = getattr(grpd, method)(*data["args"])
tm.assert_frame_equal(t, df_out)
@pytest.mark.parametrize(
"i",
[
(
Timestamp("2011-01-15 12:50:28.502376"),
Timestamp("2011-01-20 12:50:28.593448"),
),
(24650000000000001, 24650000000000002),
],
)
def test_groupby_non_arithmetic_agg_int_like_precision(i):
# see gh-6620, gh-9311
df = DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])
grp_exp = {
"first": {"expected": i[0]},
"last": {"expected": i[1]},
"min": {"expected": i[0]},
"max": {"expected": i[1]},
"nth": {"expected": i[1], "args": [1]},
"count": {"expected": 2},
}
for method, data in grp_exp.items():
if "args" not in data:
data["args"] = []
grouped = df.groupby("a")
res = getattr(grouped, method)(*data["args"])
assert res.iloc[0].b == data["expected"]
@pytest.mark.parametrize(
"func, values",
[
("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
],
)
def test_idxmin_idxmax_returns_int_types(func, values):
# GH 25444
df = DataFrame(
{
"name": ["A", "A", "B", "B"],
"c_int": [1, 2, 3, 4],
"c_float": [4.02, 3.03, 2.04, 1.05],
"c_date": ["2019", "2018", "2016", "2017"],
}
)
df["c_date"] = pd.to_datetime(df["c_date"])
result = getattr(df.groupby("name"), func)()
expected = DataFrame(values, index=Index(["A", "B"], name="name"))
tm.assert_frame_equal(result, expected)
def test_idxmin_idxmax_axis1():
df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
gb = df.groupby("A")
res = gb.idxmax(axis=1)
alt = df.iloc[:, 1:].idxmax(axis=1)
indexer = res.index.get_level_values(1)
tm.assert_series_equal(alt[indexer], res.droplevel("A"))
df["E"] = pd.date_range("2016-01-01", periods=10)
gb2 = df.groupby("A")
msg = "reduction operation 'argmax' not allowed for this dtype"
with pytest.raises(TypeError, match=msg):
gb2.idxmax(axis=1)
def test_groupby_cumprod():
# GH 4095
df = DataFrame({"key": ["b"] * 10, "value": 2})
actual = df.groupby("key")["value"].cumprod()
expected = df.groupby("key")["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)
df = DataFrame({"key": ["b"] * 100, "value": 2})
actual = df.groupby("key")["value"].cumprod()
# if overflows, groupby product casts to float
# while numpy passes back invalid values
df["value"] = df["value"].astype(float)
expected = df.groupby("key")["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)
def scipy_sem(*args, **kwargs):
from scipy.stats import sem
return sem(*args, ddof=1, **kwargs)
@pytest.mark.parametrize(
"op,targop",
[
("mean", np.mean),
("median", np.median),
("std", np.std),
("var", np.var),
("sum", np.sum),
("prod", np.prod),
("min", np.min),
("max", np.max),
("first", lambda x: x.iloc[0]),
("last", lambda x: x.iloc[-1]),
("count", np.size),
pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
],
)
def test_ops_general(op, targop):
df = DataFrame(np.random.randn(1000))
labels = np.random.randint(0, 50, size=1000).astype(float)
result = getattr(df.groupby(labels), op)().astype(float)
expected = df.groupby(labels).agg(targop)
tm.assert_frame_equal(result, expected)
def test_max_nan_bug():
raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""
df = pd.read_csv(StringIO(raw), parse_dates=[0])
gb = df.groupby("Date")
r = gb[["File"]].max()
e = gb["File"].max().to_frame()
tm.assert_frame_equal(r, e)
assert not r["File"].isna().any()
def test_nlargest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
gb = a.groupby(b)
r = gb.nlargest(3)
e = Series(
[7, 5, 3, 10, 9, 6],
index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
)
tm.assert_series_equal(r, e)
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series(
[3, 2, 1, 3, 3, 2],
index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
)
tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
def test_nlargest_mi_grouper():
# see gh-21411
npr = np.random.RandomState(123456789)
dts = date_range("20180101", periods=10)
iterables = [dts, ["one", "two"]]
idx = MultiIndex.from_product(iterables, names=["first", "second"])
s = Series(npr.randn(20), index=idx)
result = s.groupby("first").nlargest(1)
exp_idx = MultiIndex.from_tuples(
[
(dts[0], dts[0], "one"),
(dts[1], dts[1], "one"),
(dts[2], dts[2], "one"),
(dts[3], dts[3], "two"),
(dts[4], dts[4], "one"),
(dts[5], dts[5], "one"),
(dts[6], dts[6], "one"),
(dts[7], dts[7], "one"),
(dts[8], dts[8], "two"),
(dts[9], dts[9], "one"),
],
names=["first", "first", "second"],
)
exp_values = [
2.2129019979039612,
1.8417114045748335,
0.858963679564603,
1.3759151378258088,
0.9430284594687134,
0.5296914208183142,
0.8318045593815487,
-0.8476703342910327,
0.3804446884133735,
-0.8028845810770998,
]
expected = Series(exp_values, index=exp_idx)
tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
def test_nsmallest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
gb = a.groupby(b)
r = gb.nsmallest(3)
e = Series(
[1, 2, 3, 0, 4, 6],
index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
)
tm.assert_series_equal(r, e)
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series(
[0, 1, 1, 0, 1, 2],
index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
)
tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
# see gh-12811
df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
g = df.groupby("A")
msg = "numpy operations are not valid with groupby"
with pytest.raises(UnsupportedFunctionCall, match=msg):
getattr(g, func)(1, 2, 3)
with pytest.raises(UnsupportedFunctionCall, match=msg):
getattr(g, func)(foo=1)
def test_cummin(numpy_dtypes_for_minmax):
dtype = numpy_dtypes_for_minmax[0]
min_val = numpy_dtypes_for_minmax[1]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
df = base_df.astype(dtype)
expected = DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
# GH 15561
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
result = df.groupby("a").b.cummin()
expected = Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)
def test_cummin_all_nan_column():
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
expected = DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(expected, result)
def test_cummax(numpy_dtypes_for_minmax):
dtype = numpy_dtypes_for_minmax[0]
max_val = numpy_dtypes_for_minmax[2]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
df = base_df.astype(dtype)
expected = DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# GH 15561
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
result = df.groupby("a").b.cummax()
expected = Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)
def test_cummax_all_nan_column():
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
expected = DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly increasing (T), strictly decreasing (F),
# abs val increasing (F), non-strictly increasing (T)
([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
[True, False, True, False],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_increasing(in_vals, out_vals):
# GH 17015
source_dict = {
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
"C": in_vals,
}
df = DataFrame(source_dict)
result = df.groupby("B").C.is_monotonic_increasing
index = Index(list("abcd"), name="B")
expected = Series(index=index, data=out_vals, name="C")
tm.assert_series_equal(result, expected)
# Also check result equal to manually taking x.is_monotonic_increasing.
expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly decreasing (T), strictly increasing (F),
# abs val decreasing (F), non-strictly increasing (T)
([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
[True, True, False, True],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
# GH 17015
source_dict = {
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
"C": in_vals,
}
df = DataFrame(source_dict)
result = df.groupby("B").C.is_monotonic_decreasing
index = Index(list("abcd"), name="B")
expected = Series(index=index, data=out_vals, name="C")
tm.assert_series_equal(result, expected)
# describe
# --------------------------------
def test_apply_describe_bug(mframe):
grouped = mframe.groupby(level="first")
grouped.describe() # it works!
def test_series_describe_multikey():
ts = tm.makeTimeSeries()
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
def test_series_describe_single():
ts = tm.makeTimeSeries()
grouped = ts.groupby(lambda x: x.month)
result = grouped.apply(lambda x: x.describe())
expected = grouped.describe().stack()
tm.assert_series_equal(result, expected)
def test_series_index_name(df):
grouped = df.loc[:, ["C"]].groupby(df["A"])
result = grouped.agg(lambda x: x.mean())
assert result.index.name == "A"
def test_frame_describe_multikey(tsframe):
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
desc_groups = []
for col in tsframe:
group = grouped[col].describe()
# GH 17464 - Remove duplicate MultiIndex levels
group_col = pd.MultiIndex(
levels=[[col], group.columns],
codes=[[0] * len(group.columns), range(len(group.columns))],
)
group = DataFrame(group.values, columns=group_col, index=group.index)
desc_groups.append(group)
expected = pd.concat(desc_groups, axis=1)
tm.assert_frame_equal(result, expected)
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
result = groupedT.describe()
expected = tsframe.describe().T
tm.assert_frame_equal(result, expected)
def test_frame_describe_tupleindex():
# GH 14848 - regression from 0.19.0 to 0.19.1
df1 = DataFrame(
{
"x": [1, 2, 3, 4, 5] * 3,
"y": [10, 20, 30, 40, 50] * 3,
"z": [100, 200, 300, 400, 500] * 3,
}
)
df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
df2 = df1.rename(columns={"k": "key"})
msg = "Names should be list-like for a MultiIndex"
with pytest.raises(ValueError, match=msg):
df1.groupby("k").describe()
with pytest.raises(ValueError, match=msg):
df2.groupby("key").describe()
def test_frame_describe_unstacked_format():
# GH 4792
prices = {
Timestamp("2011-01-06 10:59:05", tz=None): 24990,
Timestamp("2011-01-06 12:43:33", tz=None): 25499,
Timestamp("2011-01-06 12:54:09", tz=None): 25499,
}
volumes = {
Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
}
df = DataFrame({"PRICE": prices, "VOLUME": volumes})
result = df.groupby("PRICE").VOLUME.describe()
data = [
df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
]
expected = DataFrame(
data,
index=Index([24990, 25499], name="PRICE"),
columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:"
"indexing past lexsort depth may impact performance:"
"pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
def test_describe_with_duplicate_output_column_names(as_index):
# GH 35314
df = DataFrame(
{
"a": [99, 99, 99, 88, 88, 88],
"b": [1, 2, 3, 4, 5, 6],
"c": [10, 20, 30, 40, 50, 60],
},
columns=["a", "b", "b"],
)
expected = (
DataFrame.from_records(
[
("a", "count", 3.0, 3.0),
("a", "mean", 88.0, 99.0),
("a", "std", 0.0, 0.0),
("a", "min", 88.0, 99.0),
("a", "25%", 88.0, 99.0),
("a", "50%", 88.0, 99.0),
("a", "75%", 88.0, 99.0),
("a", "max", 88.0, 99.0),
("b", "count", 3.0, 3.0),
("b", "mean", 5.0, 2.0),
("b", "std", 1.0, 1.0),
("b", "min", 4.0, 1.0),
("b", "25%", 4.5, 1.5),
("b", "50%", 5.0, 2.0),
("b", "75%", 5.5, 2.5),
("b", "max", 6.0, 3.0),
("b", "count", 3.0, 3.0),
("b", "mean", 5.0, 2.0),
("b", "std", 1.0, 1.0),
("b", "min", 4.0, 1.0),
("b", "25%", 4.5, 1.5),
("b", "50%", 5.0, 2.0),
("b", "75%", 5.5, 2.5),
("b", "max", 6.0, 3.0),
],
)
.set_index([0, 1])
.T
)
expected.columns.names = [None, None]
expected.index = Index([88, 99], name="a")
if as_index:
expected = expected.drop(columns=["a"], level=0)
else:
expected = expected.reset_index(drop=True)
result = df.groupby("a", as_index=as_index).describe()
tm.assert_frame_equal(result, expected)
def test_groupby_mean_no_overflow():
# Regression test for (#22487)
df = DataFrame(
{
"user": ["A", "A", "A", "A", "A"],
"connections": [4970, 4749, 4719, 4704, 18446744073699999744],
}
)
assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840
@pytest.mark.parametrize(
"values",
[
{
"a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
"b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
},
{"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
],
)
@pytest.mark.parametrize("function", ["mean", "median", "var"])
def test_apply_to_nullable_integer_returns_float(values, function):
# https://github.com/pandas-dev/pandas/issues/32219
output = 0.5 if function == "var" else 1.5
arr = np.array([output] * 3, dtype=float)
idx = Index([1, 2, 3], dtype=object, name="a")
expected = DataFrame({"b": arr}, index=idx).astype("Float64")
groups = DataFrame(values, dtype="Int64").groupby("a")
result = getattr(groups, function)()
tm.assert_frame_equal(result, expected)
result = groups.agg(function)
tm.assert_frame_equal(result, expected)
result = groups.agg([function])
expected.columns = MultiIndex.from_tuples([("b", function)])
tm.assert_frame_equal(result, expected)
def test_groupby_sum_below_mincount_nullable_integer():
# https://github.com/pandas-dev/pandas/issues/32861
df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
grouped = df.groupby("a")
idx = Index([0, 1, 2], dtype=object, name="a")
result = grouped["b"].sum(min_count=2)
expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")
tm.assert_series_equal(result, expected)
result = grouped.sum(min_count=2)
expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx)
tm.assert_frame_equal(result, expected)