# Inzynierka/Lib/site-packages/pandas/tests/groupby/test_groupby.py
# 2023-06-02 12:51:02 +02:00
#
# 2838 lines
# 87 KiB
# Python
# Raw Blame History
#
# This file contains invisible Unicode characters
#
# This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from datetime import datetime
from decimal import Decimal
import numpy as np
import pytest
from pandas.compat import IS64
from pandas.errors import (
PerformanceWarning,
SpecificationError,
)
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Grouper,
Index,
MultiIndex,
RangeIndex,
Series,
Timedelta,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm
from pandas.core.arrays import BooleanArray
import pandas.core.common as com
from pandas.tests.groupby import get_groupby_method_args
def test_repr():
    """Check the ``repr`` of a ``Grouper`` built with both ``key`` and ``level``."""
    # GH18203
    result = repr(Grouper(key="A", level="B"))
    expected = "Grouper(key='A', level='B', axis=0, sort=False, dropna=True)"
    assert result == expected
def test_groupby_std_datetimelike():
    """Compare ``GroupBy.std`` on timedelta/datetime columns with fixed values."""
    # GH#48481
    tdi = pd.timedelta_range("1 Day", periods=10000)
    ser = Series(tdi)
    ser[::5] *= 2  # get different std for different groups

    df = ser.to_frame("A")

    df["B"] = ser + Timestamp(0)
    df["C"] = ser + Timestamp(0, tz="UTC")
    df.iloc[-1] = pd.NaT  # last group includes NaTs

    gb = df.groupby(list(range(5)) * 2000)

    result = gb.std()

    # Note: this does not _exactly_ match what we would get if we did
    #  [gb.get_group(i).std() for i in gb.groups]
    #  but it _does_ match the floating point error we get doing the
    #  same operation on int64 data xref GH#51332
    td1 = Timedelta("2887 days 11:21:02.326710176")
    td4 = Timedelta("2886 days 00:42:34.664668096")
    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
    expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic(dtype):
    """Exercise aggregate/apply/transform on a shuffled Series grouped by a callable.

    Parameters
    ----------
    dtype : str
        dtype used to build the Series under test.
    """
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3, group_keys=False)

    for _, v in grouped:
        assert len(v) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    tm.assert_series_equal(agged, grouped.mean())
    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    tm.assert_series_equal(transformed, expected)

    value_grouped = data.groupby(data)
    tm.assert_series_equal(
        value_grouped.aggregate(np.mean), agged, check_index_type=False
    )

    # complex agg: only checks that a list of functions is accepted
    grouped.aggregate([np.mean, np.std])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped.aggregate({"one": np.mean, "two": np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    msg = "Must produce aggregated value"
    # exception raised is type Exception
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Group by int8 level codes vs. their object cast, then check apply dtypes."""
    key = mframe.index.codes[0]
    grouped = mframe.groupby(key)
    result = grouped.sum()

    expected = mframe.groupby(key.astype("O")).sum()
    assert result.index.dtype == np.int8
    assert expected.index.dtype == np.int64
    tm.assert_frame_equal(result, expected, check_index_type=False)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df["value"] = range(len(df))

    def max_value(group):
        # Row of the group holding the largest "value".
        return group.loc[group["value"].idxmax()]

    applied = df.groupby("A").apply(max_value)
    result = applied.dtypes
    expected = df.dtypes
    tm.assert_series_equal(result, expected)
def test_inconsistent_return_type():
    """Check ``apply`` results when the function returns ``None`` for one group."""
    # GH5592
    # inconsistent return type
    df = DataFrame(
        {
            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
            "B": Series(np.arange(7), dtype="int64"),
            "C": date_range("20130101", periods=7),
        }
    )

    def f_0(grp):
        return grp.iloc[0]

    expected = df.groupby("A").first()[["B"]]
    result = df.groupby("A").apply(f_0)[["B"]]
    tm.assert_frame_equal(result, expected)

    def f_1(grp):
        if grp.name == "Tiger":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f_1)[["B"]]
    e = expected.copy()
    e.loc["Tiger"] = np.nan
    tm.assert_frame_equal(result, e)

    def f_2(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f_2)[["B"]]
    e = expected.copy()
    e.loc["Pony"] = np.nan
    tm.assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    def f_3(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f_3)[["C"]]
    e = df.groupby("A").first()[["C"]]
    e.loc["Pony"] = pd.NaT
    tm.assert_frame_equal(result, e)

    # scalar outputs
    def f_4(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0].loc["C"]

    result = df.groupby("A").apply(f_4)
    e = df.groupby("A").first()["C"].copy()
    e.loc["Pony"] = np.nan
    e.name = None
    tm.assert_series_equal(result, e)
def test_pass_args_kwargs(ts, tsframe):
    """Check extra positional/keyword args are forwarded by agg, apply and transform."""

    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(0.8)
    trans_expected = ts_grouped.transform(g)

    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # DataFrame
    for as_index in [True, False]:
        df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
        agg_result = df_grouped.agg(np.percentile, 80, axis=0)
        apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
        expected = df_grouped.quantile(0.8)
        tm.assert_frame_equal(apply_result, expected, check_names=False)
        tm.assert_frame_equal(agg_result, expected)

        apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
        expected_seq = df_grouped.quantile([0.4, 0.8])
        tm.assert_frame_equal(apply_result, expected_seq, check_names=False)

        agg_result = df_grouped.agg(f, q=80)
        apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
        tm.assert_frame_equal(agg_result, expected)
        tm.assert_frame_equal(apply_result, expected, check_names=False)
@pytest.mark.parametrize("as_index", [True, False])
def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
    """Check ``agg`` with extra args on a frame whose column labels repeat."""
    # go through _aggregate_frame with self.axis == 0 and duplicate columns
    tsframe.columns = ["A", "B", "A", "C"]
    gb = tsframe.groupby(lambda x: x.month, as_index=as_index)

    res = gb.agg(np.percentile, 80, axis=0)

    ex_data = {
        1: tsframe[tsframe.index.month == 1].quantile(0.8),
        2: tsframe[tsframe.index.month == 2].quantile(0.8),
    }
    expected = DataFrame(ex_data).T
    expected.index = expected.index.astype(np.int32)
    if not as_index:
        # TODO: try to get this more consistent?
        expected.index = Index(range(2))

    tm.assert_frame_equal(res, expected)
def test_len():
    """Check ``len`` of a GroupBy for callable keys and for all-NaN key columns."""
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    assert len(grouped) == len(df)

    grouped = df.groupby([lambda x: x.year, lambda x: x.month])
    expected = len({(x.year, x.month) for x in df.index})
    assert len(grouped) == expected

    # issue 11016
    df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
    assert len(df.groupby("a")) == 0
    assert len(df.groupby("b")) == 3
    assert len(df.groupby(["a", "b"])) == 3
def test_basic_regression():
    """Smoke test: ``mean`` on a Series grouped by a longer float Series runs."""
    # regression
    result = Series([1.0 * x for x in list(range(1, 10)) * 10])

    data = np.random.random(1100) * 10.0
    groupings = Series(data)

    grouped = result.groupby(groupings)
    grouped.mean()
@pytest.mark.parametrize(
    "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
    """Check ``agg`` counts per label when some grouping labels are NaN.

    Parameters
    ----------
    dtype : str
        dtype of the values Series being grouped.
    """
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=index,
    )

    # this SHOULD be an int
    grouped = values.groupby(labels)
    agged = grouped.agg(len)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    agged = grouped.agg(f)
    expected = Series([4.0, 2.0], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected)
def test_indices_concatenation_order():
    """Check ``apply`` when groups return differently indexed (or empty) frames."""
    # GH 2808

    def f1(x):
        # Empty result carries a two-level index named like the non-empty one.
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
            res = DataFrame(columns=["a"], index=multiindex)
            return res
        else:
            y = y.set_index(["b", "c"])
            return y

    def f2(x):
        # Empty result is a bare DataFrame (single-level index).
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(["b", "c"])
            return y

    def f3(x):
        # Empty result has a two-level index; non-empty result keeps its own index.
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(
                levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
            )
            res = DataFrame(columns=["a", "b"], index=multiindex)
            return res
        else:
            return y

    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    # correct result
    result1 = df.groupby("a").apply(f1)
    result2 = df2.groupby("a").apply(f1)
    tm.assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f3)
def test_attr_wrapper(ts):
    """Check Series methods/attributes reached through a SeriesGroupBy."""
    grouped = ts.groupby(lambda x: x.weekday())

    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    tm.assert_series_equal(result, expected)

    # this is pretty cool
    result = grouped.describe()
    expected = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(expected).T
    tm.assert_frame_equal(result, expected)

    # get attribute
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)
    tm.assert_series_equal(result, expected)

    # make sure raises error
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, "foo")
def test_frame_groupby(tsframe):
    """Check aggregate, transform, iteration and groups/indices on a frame GroupBy."""
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    tscopy = tsframe.copy()
    tscopy["weekday"] = [x.weekday() for x in tscopy.index]
    stragged = tscopy.groupby("weekday").aggregate(np.mean)
    tm.assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate
    transformed = grouped.transform(lambda x: x.mean())
    for _, group in grouped:
        mean = group.mean()
        for idx in group.index:
            tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)

    # iterate
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    for k, v in groups.items():
        samething = tsframe.index.take(indices[k])
        assert (samething == v).all()
def test_frame_groupby_columns(tsframe):
    """Check aggregate/transform/iteration when grouping columns (``axis=1``)."""
    mapping = {"A": 0, "B": 0, "C": 1, "D": 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform
    tf = lambda x: x - x.mean()
    groupedT = tsframe.T.groupby(mapping, axis=0)
    tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

    # iterate
    for _, v in grouped:
        assert len(v.columns) == 2
def test_frame_set_name_single(df):
    """Check the result index name is the grouping key for several agg paths."""
    grouped = df.groupby("A")

    result = grouped.mean(numeric_only=True)
    assert result.index.name == "A"

    result = df.groupby("A", as_index=False).mean(numeric_only=True)
    assert result.index.name != "A"

    result = grouped[["C", "D"]].agg(np.mean)
    assert result.index.name == "A"

    result = grouped.agg({"C": np.mean, "D": np.std})
    assert result.index.name == "A"

    result = grouped["C"].mean()
    assert result.index.name == "A"
    result = grouped["C"].agg(np.mean)
    assert result.index.name == "A"
    result = grouped["C"].agg([np.mean, np.std])
    assert result.index.name == "A"

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"foo": np.mean, "bar": np.std})
def test_multi_func(df):
    """Group by two callables (``Series.get``) and compare with grouping by labels."""
    col1 = df["A"]
    col2 = df["B"]

    grouped = df.groupby([col1.get, col2.get])
    agged = grouped.mean(numeric_only=True)
    expected = df.groupby(["A", "B"]).mean()

    # TODO groupby get drops names
    tm.assert_frame_equal(
        agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
    )

    # some "groups" with no data
    df = DataFrame(
        {
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
        },
        index=["one", "two", "three", "four", "five", "six"],
    )
    # only verify that it works for now
    grouped = df.groupby(["k1", "k2"])
    grouped.agg(np.sum)
def test_multi_key_multiple_functions(df):
    """Check a list of functions gives one column per function on a two-key GroupBy."""
    grouped = df.groupby(["A", "B"])["C"]

    agged = grouped.agg([np.mean, np.std])
    expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)})
    tm.assert_frame_equal(agged, expected)
def test_frame_multi_key_function_list():
    """Check a function list on a two-key frame GroupBy equals per-column results."""
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)
    expected = pd.concat(
        [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
        keys=["D", "E", "F"],
        axis=1,
    )
    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    tm.assert_frame_equal(agged, expected)
def test_frame_multi_key_function_list_partial_failure():
    """Check a function list raises ``TypeError`` when a string column is present."""
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"):
        grouped.agg(funcs)
@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    """Compare a two-key GroupBy reduction with a manual nested groupby.

    Parameters
    ----------
    df : DataFrame
        Fixture frame; columns "A", "B", "C" and "D" are used here.
    op : callable
        Reduction applied to both GroupBy objects and plain sub-frames.
    """
    data = df
    grouped = data.groupby(["A", "B"])

    result1 = op(grouped)

    keys = []
    values = []
    for n1, gp1 in data.groupby("A"):
        for n2, gp2 in gp1.groupby("B"):
            keys.append((n1, n2))
            values.append(op(gp2.loc[:, ["C", "D"]]))

    mi = MultiIndex.from_tuples(keys, names=["A", "B"])
    expected = pd.concat(values, axis=1).T
    expected.index = mi

    # a little bit crude
    for col in ["C", "D"]:
        result_col = op(grouped[col])
        pivoted = result1[col]
        exp = expected[col]
        tm.assert_series_equal(result_col, exp)
        tm.assert_series_equal(pivoted, exp)

    # test single series works the same
    result = data["C"].groupby([data["A"], data["B"]]).mean()
    expected = data.groupby(["A", "B"]).mean()["C"]

    tm.assert_series_equal(result, expected)
def test_as_index_select_column():
    """Check ``get_group``/``apply`` on one column selected from an as_index=False GroupBy."""
    # GH 5764
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    result = df.groupby("A", as_index=False)["B"].get_group(1)
    expected = Series([2, 4], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby("A", as_index=False, group_keys=True)["B"].apply(
        lambda x: x.cumsum()
    )
    expected = Series(
        [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
    )
    tm.assert_series_equal(result, expected)
def test_groupby_as_index_select_column_sum_empty_df():
    """Check ``sum`` on a selected column of an empty frame with as_index=False."""
    # GH 35246
    df = DataFrame(columns=Index(["A", "B", "C"], name="alpha"))
    left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)

    expected = DataFrame(columns=df.columns[:2], index=range(0))
    # GH#50744 - Columns after selection shouldn't retain names
    expected.columns.names = [None]
    tm.assert_frame_equal(left, expected)
def test_groupby_as_index_agg(df):
    """Check ``agg`` with as_index=False for single-key, multi-key and Series keys."""
    grouped = df.groupby("A", as_index=False)

    # single-key

    result = grouped[["C", "D"]].agg(np.mean)
    expected = grouped.mean(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean(numeric_only=True)
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    grouped = df.groupby("A", as_index=True)

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"Q": np.sum})

    # multi-key

    grouped = df.groupby(["A", "B"], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    expected3 = grouped["C"].sum()
    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
    result3 = grouped["C"].agg({"Q": np.sum})
    tm.assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"])
    ts = Series(np.random.randint(5, 10, 50), name="jim")

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        tm.assert_frame_equal(left, right)
def test_ops_not_as_index(reduction_func):
    """Check reductions with as_index=False match the as_index=True result reset.

    Parameters
    ----------
    reduction_func : str
        Name of the GroupBy reduction method, supplied by a fixture.
    """
    # GH 10355, 21090
    # Using as_index=False should not modify grouped column

    if reduction_func in ("corrwith", "nth", "ngroup"):
        pytest.skip(f"GH 5755: Test not applicable for {reduction_func}")

    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
    expected = getattr(df.groupby("a"), reduction_func)()
    if reduction_func == "size":
        expected = expected.rename("size")
    expected = expected.reset_index()

    if reduction_func != "size":
        # 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
        expected["a"] = expected["a"].astype(df["a"].dtype)

    g = df.groupby("a", as_index=False)

    result = getattr(g, reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g.agg(reduction_func)
    tm.assert_frame_equal(result, expected)

    result = getattr(g["b"], reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g["b"].agg(reduction_func)
    tm.assert_frame_equal(result, expected)
def test_as_index_series_return_frame(df):
    """Check one selected column with as_index=False aggregates to a DataFrame."""
    grouped = df.groupby("A", as_index=False)
    grouped2 = df.groupby(["A", "B"], as_index=False)

    result = grouped["C"].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].agg(np.sum)
    expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)

    result = grouped["C"].sum()
    expected = grouped.sum().loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].sum()
    expected2 = grouped2.sum().loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)
def test_as_index_series_column_slice_raises(df):
    """Check selecting a second column after one is selected raises ``IndexError``."""
    # GH15072
    grouped = df.groupby("A", as_index=False)
    msg = r"Column\(s\) C already selected"

    with pytest.raises(IndexError, match=msg):
        grouped["C"].__getitem__("D")
def test_groupby_as_index_cython(df):
    """Check ``mean`` with as_index=False against a manually rebuilt expected frame."""
    data = df

    # single-key
    grouped = data.groupby("A", as_index=False)
    result = grouped.mean(numeric_only=True)
    expected = data.groupby(["A"]).mean(numeric_only=True)
    expected.insert(0, "A", expected.index)
    expected.index = RangeIndex(len(expected))
    tm.assert_frame_equal(result, expected)

    # multi-key
    grouped = data.groupby(["A", "B"], as_index=False)
    result = grouped.mean()
    expected = data.groupby(["A", "B"]).mean()

    # Unzip the MultiIndex tuples into one sequence per key column.
    arrays = list(zip(*expected.index.values))
    expected.insert(0, "A", arrays[0])
    expected.insert(1, "B", arrays[1])
    expected.index = RangeIndex(len(expected))
    tm.assert_frame_equal(result, expected)
def test_groupby_as_index_series_scalar(df):
    """Check ``agg(len)`` on one column matches the frame-level result's columns."""
    grouped = df.groupby(["A", "B"], as_index=False)

    # GH #421

    result = grouped["C"].agg(len)
    expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
    tm.assert_frame_equal(result, expected)
def test_groupby_as_index_corner(df, ts):
    """Check as_index=False is rejected for a Series and for ``axis=1``."""
    msg = "as_index=False only valid with DataFrame"
    with pytest.raises(TypeError, match=msg):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    msg = "as_index=False only valid for axis=0"
    with pytest.raises(ValueError, match=msg):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)
def test_groupby_multiple_key():
    """Group by year/month/day callables on rows and on columns and compare sums."""
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    agged = grouped.sum()
    tm.assert_almost_equal(df.values, agged.values)

    grouped = df.T.groupby(
        [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1
    )

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, df.columns)
    tm.assert_almost_equal(df.T.values, agged.values)

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_almost_equal(df.T.values, agged.values)
def test_groupby_multi_corner(df):
    """Check a two-key ``mean`` works when the frame has an all-NaN column."""
    # test that having an all-NA column doesn't mess you up
    df = df.copy()
    df["bad"] = np.nan
    agged = df.groupby(["A", "B"]).mean()

    expected = df.groupby(["A", "B"]).mean()
    expected["bad"] = np.nan

    tm.assert_frame_equal(agged, expected)
def test_raises_on_nuisance(df):
    """Check mean/sum raise ``TypeError`` on columns they cannot reduce."""
    grouped = df.groupby("A")
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.agg(np.mean)
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.mean()

    df = df.loc[:, ["A", "C", "D"]]
    df["E"] = datetime.now()
    grouped = df.groupby("A")
    msg = "datetime64 type does not support sum operations"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(np.sum)
    with pytest.raises(TypeError, match=msg):
        grouped.sum()

    # won't work with axis = 1
    grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
    msg = "does not support reduction 'sum'"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))
@pytest.mark.parametrize(
    "agg_function",
    ["max", "min"],
)
def test_keep_nuisance_agg(df, agg_function):
    """Check max/min keep column "B" and match the per-group Series reduction.

    Parameters
    ----------
    df : DataFrame
        Fixture frame grouped by column "A".
    agg_function : str
        Name of the reduction method to call.
    """
    # GH 38815
    grouped = df.groupby("A")
    result = getattr(grouped, agg_function)()
    expected = result.copy()
    expected.loc["bar", "B"] = getattr(df.loc[df["A"] == "bar", "B"], agg_function)()
    expected.loc["foo", "B"] = getattr(df.loc[df["A"] == "foo", "B"], agg_function)()
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "agg_function",
    ["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
    """Check reductions either raise or match the result on a column subset.

    Parameters
    ----------
    df : DataFrame
        Fixture frame grouped by column "A".
    agg_function : str
        Name of the reduction method to call.
    numeric_only : bool
        Forwarded to the reduction.
    """
    # GH 38774, GH 38815
    grouped = df.groupby("A")

    no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
    if agg_function in no_drop_nuisance and not numeric_only:
        # Added numeric_only as part of GH#46560; these do not drop nuisance
        # columns when numeric_only is False
        klass = ValueError if agg_function in ("std", "sem") else TypeError
        msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
        with pytest.raises(klass, match=msg):
            getattr(grouped, agg_function)(numeric_only=numeric_only)
    else:
        result = getattr(grouped, agg_function)(numeric_only=numeric_only)
        if not numeric_only and agg_function == "sum":
            # sum is successful on column B
            columns = ["A", "B", "C", "D"]
        else:
            columns = ["A", "C", "D"]
        expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
            numeric_only=numeric_only
        )
        tm.assert_frame_equal(result, expected)
def test_raise_on_nuisance_python_single(df):
    """Check ``skew`` on the full frame GroupBy raises ``TypeError``."""
    # GH 38815
    grouped = df.groupby("A")
    with pytest.raises(TypeError, match="could not convert"):
        grouped.skew()
def test_raise_on_nuisance_python_multiple(three_group):
    """Check mean on a two-key GroupBy of ``three_group`` raises ``TypeError``."""
    grouped = three_group.groupby(["A", "B"])
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.agg(np.mean)
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.mean()
def test_empty_groups_corner(mframe):
    """Check aggregation on a two-key frame and on a level-grouped slice of ``mframe``."""
    # handle empty groups
    df = DataFrame(
        {
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
            "k3": ["foo", "bar"] * 3,
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
        }
    )

    grouped = df.groupby(["k1", "k2"])
    result = grouped[["v1", "v2"]].agg(np.mean)
    expected = grouped.mean(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped["A"].apply(np.mean)
    tm.assert_series_equal(agged["A"], agged_A)
    assert agged.index.name == "first"
def test_nonsense_func():
    """Check a grouping callable that adds ``str`` to an int label raises ``TypeError``."""
    df = DataFrame([0])
    msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
    with pytest.raises(TypeError, match=msg):
        df.groupby(lambda x: x + "foo")
def test_wrap_aggregated_output_multindex(mframe):
    """Check aggregation on MultiIndex columns and that user ``TypeError`` propagates."""
    df = mframe.T
    df["baz", "two"] = "peekaboo"

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    with pytest.raises(TypeError, match="Could not convert"):
        df.groupby(keys).agg(np.mean)
    agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def aggfun(ser):
        # Fail on one specific column so the error must surface from aggregate.
        if ser.name == ("foo", "one"):
            raise TypeError("Test error message")
        return ser.sum()

    with pytest.raises(TypeError, match="Test error message"):
        df.groupby(keys).aggregate(aggfun)
def test_groupby_level_apply(mframe):
    """Check the result index name matches the level used for grouping."""
    result = mframe.groupby(level=0).count()
    assert result.index.name == "first"
    result = mframe.groupby(level=1).count()
    assert result.index.name == "second"

    result = mframe["A"].groupby(level=0).count()
    assert result.index.name == "first"
def test_groupby_level_mapper(mframe):
    """Check grouping by a dict mapper plus ``level`` equals grouping by mapped arrays."""
    deleveled = mframe.reset_index()

    mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
    mapper1 = {"one": 0, "two": 0, "three": 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    mapped_level0 = np.array(
        [mapper0.get(x) for x in deleveled["first"]], dtype=np.int64
    )
    mapped_level1 = np.array(
        [mapper1.get(x) for x in deleveled["second"]], dtype=np.int64
    )
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name, expected1.index.name = "first", "second"

    tm.assert_frame_equal(result0, expected0)
    tm.assert_frame_equal(result1, expected1)
def test_groupby_level_nonmulti():
    """Check valid and invalid ``level`` arguments on a Series with a flat Index."""
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
    expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))

    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])
def test_groupby_complex():
    """Check ``sum`` over index-level groups of a complex-valued Series."""
    # GH 12902
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    result = a.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
def test_groupby_complex_numbers():
    """Check ``count`` when grouping by a complex column, unsorted and sorted."""
    # GH 17927
    df = DataFrame(
        [
            {"a": 1, "b": 1 + 1j},
            {"a": 1, "b": 1 + 2j},
            {"a": 4, "b": 1},
        ]
    )
    expected = DataFrame(
        np.array([1, 1, 1], dtype=np.int64),
        index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"),
        columns=Index(["a"], dtype="object"),
    )
    result = df.groupby("b", sort=False).count()
    tm.assert_frame_equal(result, expected)

    # Sorted by the magnitude of the complex numbers
    expected.index = Index([(1 + 0j), (1 + 1j), (1 + 2j)], name="b")
    result = df.groupby("b", sort=True).count()
    tm.assert_frame_equal(result, expected)
def test_groupby_series_indexed_differently():
    """Check grouping by a Series whose index only partly overlaps the grouped one."""
    s1 = Series(
        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
    )
    s2 = Series(
        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
    )

    grouped = s1.groupby(s2)
    agged = grouped.mean()
    # Expected: group by the key Series reindexed to s1's index, via a label lookup.
    exp = s1.groupby(s2.reindex(s1.index).get).mean()
    tm.assert_series_equal(agged, exp)
def test_groupby_with_hier_columns():
    """Check result columns/index are preserved when the frame has MultiIndex columns."""
    tuples = list(
        zip(
            *[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                ["one", "two", "one", "two", "one", "two", "one", "two"],
            ]
        )
    )
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples(
        [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
    )
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(["A", "B"]))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    df["A", "foo"] = "bar"
    result = df.groupby(level=0).mean(numeric_only=True)
    tm.assert_index_equal(result.columns, df.columns[:-1])
def test_grouping_ndarray(df):
    """Check grouping by a raw ndarray equals grouping by the unnamed Series."""
    grouped = df.groupby(df["A"].values)

    result = grouped.sum()
    expected = df.groupby(df["A"].rename(None)).sum()
    tm.assert_frame_equal(result, expected)
def test_groupby_wrong_multi_labels():
    """Check ``agg(np.mean)`` equals ``mean`` for a four-key GroupBy."""
    index = Index([0, 1, 2, 3, 4], name="index")
    data = DataFrame(
        {
            "foo": ["foo1", "foo1", "foo2", "foo1", "foo3"],
            "bar": ["bar1", "bar2", "bar2", "bar1", "bar1"],
            "baz": ["baz1", "baz1", "baz1", "baz2", "baz2"],
            "spam": ["spam2", "spam3", "spam2", "spam1", "spam1"],
            "data": [20, 30, 40, 50, 60],
        },
        index=index,
    )

    grouped = data.groupby(["foo", "bar", "baz", "spam"])

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)
def test_groupby_series_with_name(df):
    """Check named Series used as keys give their names to the result index/columns."""
    result = df.groupby(df["A"]).mean(numeric_only=True)
    result2 = df.groupby(df["A"], as_index=False).mean(numeric_only=True)
    assert result.index.name == "A"
    assert "A" in result2

    result = df.groupby([df["A"], df["B"]]).mean()
    result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
    assert result.index.names == ("A", "B")
    assert "A" in result2
    assert "B" in result2
def test_seriesgroupby_name_attr(df):
# GH 6265
result = df.groupby("A")["C"]
assert result.count().name == "C"
assert result.mean().name == "C"
testFunc = lambda x: np.sum(x) * 2
assert result.agg(testFunc).name == "C"
def test_consistency_name():
    """Counting ``B`` per ``A`` gives the same Series via either spelling (GH 12363)."""
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    from_frame_groupby = frame.groupby(["A"]).B.count()
    from_series_groupby = frame.B.groupby(frame.A).count()
    tm.assert_series_equal(from_series_groupby, from_frame_groupby)
def test_groupby_name_propagation(df):
# GH 6124
def summarize(df, name=None):
return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)
def summarize_random_name(df):
# Provide a different name for each Series. In this case, groupby
# should not attempt to propagate the Series name since they are
# inconsistent.
return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])
metrics = df.groupby("A").apply(summarize)
assert metrics.columns.name is None
metrics = df.groupby("A").apply(summarize, "metrics")
assert metrics.columns.name == "metrics"
metrics = df.groupby("A").apply(summarize_random_name)
assert metrics.columns.name is None
def test_groupby_nonstring_columns():
    """Grouping by the integer column label 0 matches grouping by that column."""
    frame = DataFrame([np.arange(10) for _ in range(10)])
    by_label = frame.groupby(0).mean()
    by_column = frame.groupby(frame[0]).mean()
    tm.assert_frame_equal(by_label, by_column)
def test_groupby_mixed_type_columns():
    """``first`` and ``sum`` work when column labels mix str and int.

    GH 13432, unorderable types in py3
    """
    frame = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))

    for reducer in ("first", "sum"):
        result = getattr(frame.groupby("A"), reducer)()
        tm.assert_frame_equal(result, expected)
def test_cython_grouper_series_bug_noncontig():
    """Median aggregation over an all-NaN column slice of a 2D array stays all NaN."""
    all_nan = np.full((100, 100), np.nan)
    # a column slice of a 2D array is the Series' backing data
    column = Series(all_nan[:, 0])
    group_codes = np.tile(range(10), 10)

    medians = column.groupby(group_codes).agg(Series.median)
    assert medians.isna().all()
def test_series_grouper_noncontig_index():
    """Aggregation that touches each group's index elements runs without error."""
    label_pool = Index(tm.rands_array(10, 100))
    # every second label, i.e. a strided slice of the pool
    values = Series(np.random.randn(50), index=label_pool[::2])
    group_codes = np.random.randint(0, 5, 50)

    def count_distinct_index_objects(group):
        # accessing the index elements causes segfault
        return len(set(map(id, group.index)))

    # it works!
    values.groupby(group_codes).agg(count_distinct_index_objects)
def test_convert_objects_leave_decimal_alone():
    """Decimal values returned from ``agg`` are kept as objects, not converted.

    Two aggregators are exercised; both return ``Decimal`` and the result must
    be an object-dtype Series whose elements are still ``Decimal``.
    """
    s = Series(range(5))
    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        assert len(x.values.base) > 0
        return Decimal(str(x.mean()))

    grouped = s.groupby(labels)

    for converter in (convert_fast, convert_force_pure):
        result = grouped.agg(converter)
        assert result.dtype == np.object_
        # The result is labelled "a".."e"; select the first element by
        # position explicitly instead of relying on the integer fallback of
        # Series.__getitem__ on a non-integer index.
        assert isinstance(result.iloc[0], Decimal)
def test_groupby_dtype_inference_empty():
    """``first`` on an empty frame keeps the int64 dtype of the value column (GH 6733)."""
    frame = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
    assert frame["x"].dtype == np.float64

    empty_keys = Index([], name="x", dtype=np.float64)
    empty_values = Series([], index=empty_keys, dtype="int64")
    expected = DataFrame({"range": empty_values})

    result = frame.groupby("x").first()
    tm.assert_frame_equal(result, expected, by_blocks=True)
def test_groupby_unit64_float_conversion():
    """``max`` of a value too large for int64 comes back unchanged.

    GH: 30859 groupby converts uint64 to floats sometimes
    """
    big_value = 16148277970000000000
    frame = DataFrame({"first": [1], "second": [1], "value": [big_value]})

    expected_index = MultiIndex.from_product([[1], [1]], names=["first", "second"])
    expected = Series([big_value], expected_index, name="value")

    result = frame.groupby(["first", "second"])["value"].max()
    tm.assert_series_equal(result, expected)
def test_groupby_list_infer_array_like(df):
result = df.groupby(list(df["A"])).mean(numeric_only=True)
expected = df.groupby(df["A"]).mean(numeric_only=True)
tm.assert_frame_equal(result, expected, check_names=False)
with pytest.raises(KeyError, match=r"^'foo'$"):
df.groupby(list(df["A"][:-1]))
# pathological case of ambiguity
df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)})
result = df.groupby(["foo", "bar"]).mean()
expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
def test_groupby_keys_same_size_as_index():
    """Grouping by a level Grouper plus a column works when len(keys) == len(index).

    GH 11185
    """
    freq = "s"
    first_stamp = Timestamp("2015-09-29T11:34:44-0700")
    stamps = date_range(start=first_stamp, periods=2, freq=freq)
    frame = DataFrame(
        [["A", 10], ["B", 15]], columns=["metric", "values"], index=stamps
    )

    keys = [Grouper(level=0, freq=freq), "metric"]
    result = frame.groupby(keys).mean()
    expected = frame.set_index([frame.index, "metric"]).astype(float)
    tm.assert_frame_equal(result, expected)
def test_groupby_one_row():
    """Grouping by a missing column raises KeyError for 1-row and 2-row frames.

    GH 11741
    """
    missing_key_pattern = r"^'Z'$"
    for n_rows in (1, 2):
        frame = DataFrame(np.random.randn(n_rows, 4), columns=list("ABCD"))
        with pytest.raises(KeyError, match=missing_key_pattern):
            frame.groupby("Z")
def test_groupby_nat_exclude():
    """Missing datetime keys are left out of groups, indices and ngroups (GH 6992)."""
    jan = Timestamp("2013-01-01")
    feb = Timestamp("2013-02-01")
    frame = DataFrame(
        {
            "values": np.random.randn(8),
            "dt": [np.nan, jan, np.nan, feb, np.nan, feb, np.nan, jan],
            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
        }
    )
    gb = frame.groupby("dt")

    expected_labels = [Index([1, 7]), Index([3, 5])]
    group_keys = sorted(gb.groups.keys())
    assert len(group_keys) == 2
    for group_key, labels in zip(group_keys, expected_labels):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(gb.groups[group_key], labels)

    # confirm obj is not filtered
    tm.assert_frame_equal(gb.grouper.groupings[0].obj, frame)
    assert gb.ngroups == 2

    expected_positions = {
        Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
        Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
    }
    for group_key in gb.indices:
        tm.assert_numpy_array_equal(
            gb.indices[group_key], expected_positions[group_key]
        )

    tm.assert_frame_equal(gb.get_group(Timestamp("2013-01-01")), frame.iloc[[1, 7]])
    tm.assert_frame_equal(gb.get_group(Timestamp("2013-02-01")), frame.iloc[[3, 5]])

    with pytest.raises(KeyError, match=r"^NaT$"):
        gb.get_group(pd.NaT)

    # frames whose key column is entirely missing produce no groups at all
    all_missing = DataFrame(
        {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
    )
    assert all_missing["nan"].dtype == "float64"
    assert all_missing["nat"].dtype == "datetime64[ns]"

    for column in ["nan", "nat"]:
        empty_gb = all_missing.groupby(column)
        assert empty_gb.groups == {}
        assert empty_gb.ngroups == 0
        assert empty_gb.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            empty_gb.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            empty_gb.get_group(pd.NaT)
def test_groupby_two_group_keys_all_nan():
    """Two all-NaN key columns yield empty ``indices`` rather than an error.

    GH #36842: Grouping over two group keys shouldn't raise an error
    """
    frame = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
    positions = frame.groupby(["a", "b"]).indices
    assert positions == {}
def test_groupby_2d_malformed():
    """Numeric-only mean keeps just the numeric columns of a column-by-column frame."""
    frame = DataFrame(index=range(2))
    # columns are attached one at a time, in this order
    for label, column in (
        ("group", ["g1", "g2"]),
        ("zeros", [0, 0]),
        ("ones", [1, 1]),
        ("label", ["l1", "l2"]),
    ):
        frame[label] = column

    means = frame.groupby(["group"]).mean(numeric_only=True)
    expected_values = np.array([[0.0, 1.0], [0.0, 1.0]])

    tm.assert_index_equal(means.columns, Index(["zeros", "ones"]))
    tm.assert_numpy_array_equal(means.values, expected_values)
def test_int32_overflow():
    """Four-key grouping of 25000 rows gives the same group count in either key order."""
    repeating = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)))
    unique = np.arange(25000)
    frame = DataFrame(
        {
            "A": unique,
            "B": repeating,
            "C": unique,
            "D": repeating,
            "E": np.random.randn(25000),
        }
    )

    forward = frame.groupby(["A", "B", "C", "D"]).sum()
    backward = frame.groupby(["D", "C", "B", "A"]).sum()
    assert len(forward) == len(backward)
def test_groupby_sort_multi():
    """Sorted multi-key grouping orders the result index by the key tuples."""

    def key_tuples(source, columns):
        # object array holding one tuple per row of the selected columns
        rows = [tuple(row) for row in source[columns].values]
        return com.asarray_tuplesafe(rows)

    frame = DataFrame(
        {
            "a": ["foo", "bar", "baz"],
            "b": [3, 2, 1],
            "c": [0, 1, 2],
            "d": np.random.randn(3),
        }
    )

    tuples = key_tuples(frame, ["a", "b", "c"])
    summed = frame.groupby(["a", "b", "c"], sort=True).sum()
    tm.assert_numpy_array_equal(summed.index.values, tuples[[1, 2, 0]])

    tuples = key_tuples(frame, ["c", "a", "b"])
    summed = frame.groupby(["c", "a", "b"], sort=True).sum()
    tm.assert_numpy_array_equal(summed.index.values, tuples)

    tuples = key_tuples(frame, ["b", "c", "a"])
    summed = frame.groupby(["b", "c", "a"], sort=True).sum()
    tm.assert_numpy_array_equal(summed.index.values, tuples[[2, 1, 0]])

    # grouping by column names must agree with grouping by the row tuples
    frame = DataFrame(
        {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)}
    )
    summed = frame.groupby(["a", "b"])["d"].sum()

    by_tuples = frame.groupby(key_tuples(frame, ["a", "b"]))["d"].sum()
    for key, value in by_tuples.items():
        assert summed[key] == value
def test_dont_clobber_name_column():
    """An identity ``apply`` leaves a column literally called "name" untouched."""
    frame = DataFrame(
        {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
    )

    def identity(group):
        return group

    roundtrip = frame.groupby("key", group_keys=False).apply(identity)
    tm.assert_frame_equal(roundtrip, frame)
def test_skip_group_keys():
    """With ``group_keys=False``, ``apply`` equals concatenating the per-group pieces."""
    tsf = tm.makeTimeDataFrame()

    def month_of(stamp):
        return stamp.month

    # DataFrame case: three smallest "A" rows of each month
    gb = tsf.groupby(month_of, group_keys=False)
    applied = gb.apply(lambda x: x.sort_values(by="A")[:3])
    expected = pd.concat([group.sort_values(by="A")[:3] for _, group in gb])
    tm.assert_frame_equal(applied, expected)

    # Series case: three smallest values of each month
    gb = tsf["A"].groupby(month_of, group_keys=False)
    applied = gb.apply(lambda x: x.sort_values()[:3])
    expected = pd.concat([group.sort_values()[:3] for _, group in gb])
    tm.assert_series_equal(applied, expected)
def test_no_nonsense_name(float_frame):
# GH #995
s = float_frame["C"].copy()
s.name = None
result = s.groupby(float_frame["A"]).agg(np.sum)
assert result.name is None
def test_multifunc_sum_bug():
    """A float column summed in a per-column ``agg`` dict stays float64 (GH #1065)."""
    frame = DataFrame(np.arange(9).reshape(3, 3))
    frame["test"] = 0
    frame["fl"] = [1.3, 1.5, 1.6]

    aggregated = frame.groupby("test").agg({"fl": "sum", 2: "size"})
    assert aggregated["fl"].dtype == np.float64
def test_handle_dict_return_value(df):
def f(group):
return {"max": group.max(), "min": group.min()}
def g(group):
return Series({"max": group.max(), "min": group.min()})
result = df.groupby("A")["C"].apply(f)
expected = df.groupby("A")["C"].apply(g)
assert isinstance(result, Series)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper):
def f(group):
assert group.name is not None
return group
def freduce(group):
assert group.name is not None
return group.sum()
def freducex(x):
return freduce(x)
grouped = df.groupby(grouper, group_keys=False)
# make sure all these work
grouped.apply(f)
grouped.aggregate(freduce)
grouped.aggregate({"C": freduce, "D": freduce})
grouped.transform(f)
grouped["C"].apply(f)
grouped["C"].aggregate(freduce)
grouped["C"].aggregate([freduce, freducex])
grouped["C"].transform(f)
def test_group_name_available_in_inference_pass():
    """``apply`` sees each group name exactly once, in order of appearance (gh-15062)."""
    frame = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})
    seen_names = []

    def record_name(group):
        seen_names.append(group.name)
        return group.copy()

    frame.groupby("a", sort=False, group_keys=False).apply(record_name)

    assert seen_names == [0, 1, 2]
def test_no_dummy_key_names(df):
# see gh-1291
result = df.groupby(df["A"].values).sum()
assert result.index.name is None
result = df.groupby([df["A"].values, df["B"].values]).sum()
assert result.index.names == (None, None)
def test_groupby_sort_multiindex_series():
    """``sort`` is honoured when grouping a Series by MultiIndex levels.

    series multiindex groupby sort argument was not being passed through
    _compress_group_index
    GH 9444
    """
    levels = [[1, 2], [1, 2]]
    names = ["a", "b"]

    source_index = MultiIndex(
        levels=levels,
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=names,
    )
    source = Series([0, 1, 2, 3, 4, 5], index=source_index)

    # first value of each (a, b) pair, in order of appearance
    first_index = MultiIndex(levels=levels, codes=[[0, 0, 1], [1, 0, 0]], names=names)
    firsts = Series([0, 2, 4], index=first_index)

    unsorted_result = source.groupby(level=["a", "b"], sort=False).first()
    tm.assert_series_equal(unsorted_result, firsts)

    sorted_result = source.groupby(level=["a", "b"], sort=True).first()
    tm.assert_series_equal(sorted_result, firsts.sort_index())
def test_groupby_reindex_inside_function():
    """An aggregator gives the same result whether or not it first indexes its input."""
    periods = 1000
    stamps = date_range(start="2012/1/1", freq="5min", periods=periods)
    frame = DataFrame(
        {"high": np.arange(periods), "low": np.arange(periods)}, index=stamps
    )

    def agg_before(func, fix=False):
        """Wrap ``func`` so that it only sees the rows before 11:00."""

        def _func(data):
            before_eleven = data.index.map(lambda x: x.hour < 11)
            subset = data.loc[before_eleven].dropna()
            if fix:
                # label lookup whose value is discarded; only the access matters
                data[data.index[0]]
            if len(subset) == 0:
                return None
            return func(subset)

        return _func

    by_day = frame.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = by_day.agg({"high": agg_before(np.max)})
    closure_good = by_day.agg({"high": agg_before(np.max, True)})

    tm.assert_frame_equal(closure_bad, closure_good)
def test_groupby_multiindex_missing_pair():
    """Summing by two index levels lists only the pairs that occur (GH9049)."""
    level_names = ["group1", "group2"]
    frame = DataFrame(
        {
            "group1": ["a", "a", "a", "b"],
            "group2": ["c", "c", "d", "c"],
            "value": [1, 1, 1, 5],
        }
    ).set_index(level_names)

    summed = frame.groupby(level=level_names, sort=True).agg("sum")

    expected_index = MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "c")], names=level_names
    )
    expected = DataFrame([[2], [1], [5]], index=expected_index, columns=["value"])

    tm.assert_frame_equal(summed, expected)
def test_groupby_multiindex_not_lexsorted():
    """Non-lexsorted MultiIndex axes group like lexsorted ones (GH 11640)."""
    # define the lexsorted version
    sorted_columns = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    lexsorted_df = DataFrame([[1, 3, 4]], columns=sorted_columns)
    assert lexsorted_df.columns._is_lexsorted()

    # define the non-lexsorted version
    long_form = DataFrame(
        columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
    )
    pivoted = long_form.pivot_table(index="a", columns=["b", "c"], values="d")
    not_lexsorted_df = pivoted.reset_index()
    assert not not_lexsorted_df.columns._is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby("a").mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby("a").mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    frame = DataFrame(
        {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
    ).set_index(["x", "y"])
    assert not frame.index._is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            # frame as built (index not lexsorted)
            gb = frame.groupby(level=level, sort=sort, group_keys=False)
            result = gb.apply(DataFrame.drop_duplicates)
            expected = frame
            tm.assert_frame_equal(expected, result)

            # same frame after sorting its index
            gb = frame.sort_index().groupby(level=level, sort=sort, group_keys=False)
            result = gb.apply(DataFrame.drop_duplicates)
            expected = frame.sort_index()
            tm.assert_frame_equal(expected, result)
def test_index_label_overlaps_location():
    """``filter`` selects by position even when index labels look like positions.

    checking we don't have any label/location confusion in the
    wake of GH5375
    """
    group_keys = list("ababb")
    kept_positions = [1, 3, 4]

    def check_filter(frame):
        # DataFrame path
        actual = frame.groupby(group_keys).filter(lambda x: len(x) > 2)
        tm.assert_frame_equal(actual, frame.iloc[kept_positions])

        # Series path, using the frame's only column
        column = frame[0]
        actual = column.groupby(group_keys).filter(lambda x: len(x) > 2)
        tm.assert_series_equal(actual, column.take(kept_positions))

    df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
    check_filter(df)

    # and again, with a generic Index of floats
    df.index = df.index.astype(float)
    check_filter(df)
def test_transform_doesnt_clobber_ints():
    """``transform("mean")`` is the same for integer and float group keys (GH 7972)."""
    base = np.arange(6)
    int_keyed = DataFrame({"a": base // 2, "b": 2.0 * base, "c": 3.0 * base})
    float_keyed = DataFrame({"a": base // 2 * 1.0, "b": 2.0 * base, "c": 3.0 * base})

    result = int_keyed.groupby("a").transform("mean")
    expected = float_keyed.groupby("a").transform("mean")
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "sort_column",
    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
    """Each group handed to ``apply`` is still sorted like the source frame.

    Test to ensure that groupby always preserves sort order of original
    object. Issue #8588 and #9651
    """
    frame = DataFrame(
        {
            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
        }
    )

    def assert_group_sorted(group):
        tm.assert_frame_equal(group, group.sort_values(by=sort_column))

    # Try sorting on different types and with different group types
    ordered = frame.sort_values(by=sort_column)
    ordered.groupby(group_column).apply(assert_group_sorted)
def test_pivot_table_values_key_error():
    """``pivot_table`` with an unknown ``values`` column raises KeyError.

    This test is designed to replicate the error in issue #14938
    """
    event_dates = date_range(datetime.today(), periods=20, freq="M").tolist()
    frame = DataFrame({"eventDate": event_dates, "thename": range(0, 20)})

    frame["year"] = frame.set_index("eventDate").index.year
    frame["month"] = frame.set_index("eventDate").index.month

    flat = frame.reset_index()
    with pytest.raises(KeyError, match="'badname'"):
        flat.pivot_table(
            index="year", columns="month", values="badname", aggfunc="count"
        )
@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
"values",
[
[True],
[0],
[0.0],
["a"],
Categorical([0]),
[to_datetime(0)],
date_range(0, 1, 1, tz="US/Eastern"),
pd.period_range("2016-01-01", periods=3, freq="D"),
pd.array([0], dtype="Int64"),
pd.array([0], dtype="Float64"),
pd.array([False], dtype="boolean"),
],
ids=[
"bool",
"int",
"float",
"str",
"cat",
"dt64",
"dt64tz",
"period",
"Int64",
"Float64",
"boolean",
],
)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
"op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(
columns, keys, values, method, op, request, using_array_manager, dropna
):
# GH8093 & GH26411
override_dtype = None
if (
isinstance(values, Categorical)
and len(keys) == 1
and op in ["idxmax", "idxmin"]
):
mark = pytest.mark.xfail(
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
)
request.node.add_marker(mark)
if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"
if isinstance(values[0], bool) and op in ("prod", "sum"):
# sum/product of bools is an integer
override_dtype = "int64"
df = DataFrame({"A": values, "B": values, "C": values}, columns=list("ABC"))
if hasattr(values, "dtype"):
# check that we did the construction right
assert (df.dtypes == values.dtype).all()
df = df.iloc[:0]
gb = df.groupby(keys, group_keys=False, dropna=dropna)[columns]
def get_result(**kwargs):
if method == "attr":
return getattr(gb, op)(**kwargs)
else:
return getattr(gb, method)(op, **kwargs)
def get_categorical_invalid_expected():
# Categorical is special without 'observed=True', we get an NaN entry
# corresponding to the unobserved group. If we passed observed=True
# to groupby, expected would just be 'df.set_index(keys)[columns]'
# as below
lev = Categorical([0], dtype=values.dtype)
if len(keys) != 1:
idx = MultiIndex.from_product([lev, lev], names=keys)
else:
# all columns are dropped, but we end up with one row
# Categorical is special without 'observed=True'
idx = Index(lev, name=keys[0])
expected = DataFrame([], columns=[], index=idx)
return expected
is_per = isinstance(df.dtypes[0], pd.PeriodDtype)
is_dt64 = df.dtypes[0].kind == "M"
is_cat = isinstance(values, Categorical)
if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
with pytest.raises(TypeError, match=msg):
get_result()
if isinstance(columns, list):
# i.e. DataframeGroupBy, not SeriesGroupBy
result = get_result(numeric_only=True)
expected = get_categorical_invalid_expected()
tm.assert_equal(result, expected)
return
if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if is_dt64 or is_cat or is_per:
# GH#41291
# datetime64 -> prod and sum are invalid
if op == "skew":
msg = "does not support reduction 'skew'"
elif is_dt64:
msg = "datetime64 type does not support"
elif is_per:
msg = "Period type does not support"
else:
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()
if not isinstance(columns, list):
# i.e. SeriesGroupBy
return
elif op == "skew":
# TODO: test the numeric_only=True case
return
else:
# i.e. op in ["prod", "sum"]:
# i.e. DataFrameGroupBy
# ops that require more than just ordered-ness
# GH#41291
result = get_result(numeric_only=True)
# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
if is_cat:
expected = get_categorical_invalid_expected()
tm.assert_equal(result, expected)
return
result = get_result()
expected = df.set_index(keys)[columns]
if override_dtype is not None:
expected = expected.astype(override_dtype)
if len(keys) == 1:
expected.index.name = keys[0]
tm.assert_equal(result, expected)
def test_empty_groupby_apply_nonunique_columns():
    """Identity ``apply`` on an empty frame with duplicate labels keeps dtypes.

    GH#44417
    """
    frame = DataFrame(np.random.randn(0, 4))
    frame[3] = frame[3].astype(np.int64)
    # relabel so that the first and last columns share the label 0
    frame.columns = [0, 1, 2, 0]

    gb = frame.groupby(frame[1], group_keys=False)
    applied = gb.apply(lambda x: x)

    assert (applied.dtypes == frame.dtypes).all()
def test_tuple_as_grouping():
    """A tuple key is looked up as a single column label.

    https://github.com/pandas-dev/pandas/issues/18314
    """
    tuple_key = ("a", "b")
    frame = DataFrame(
        {
            tuple_key: [1, 1, 1, 1],
            "a": [2, 2, 2, 2],
            "b": [2, 2, 2, 2],
            "c": [1, 1, 1, 1],
        }
    )

    # without the tuple-labelled column the key cannot be resolved
    without_tuple_column = frame[["a", "b", "c"]]
    with pytest.raises(KeyError, match=r"('a', 'b')"):
        without_tuple_column.groupby(("a", "b"))

    summed = frame.groupby(("a", "b"))["c"].sum()
    expected = Series([4], name="c", index=Index([1], name=("a", "b")))
    tm.assert_series_equal(summed, expected)
def test_tuple_correct_keyerror():
    """A missing tuple key is reported whole in the KeyError message.

    https://github.com/pandas-dev/pandas/issues/18798
    """
    columns = MultiIndex.from_product([[1, 2], [3, 4]])
    frame = DataFrame(1, index=range(3), columns=columns)
    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        frame.groupby((7, 8)).mean()
def test_groupby_agg_ohlc_non_first():
    """``ohlc`` may appear after another function in an ``agg`` list (GH 21716)."""

    def two_days():
        return date_range("2018-01-01", periods=2, freq="D", name="dti")

    frame = DataFrame(
        [[1], [1]],
        columns=Index(["foo"], name="mycols"),
        index=two_days(),
    )

    expected_columns = MultiIndex.from_tuples(
        (
            ("foo", "sum", "foo"),
            ("foo", "ohlc", "open"),
            ("foo", "ohlc", "high"),
            ("foo", "ohlc", "low"),
            ("foo", "ohlc", "close"),
        ),
        names=["mycols", None, None],
    )
    expected = DataFrame(
        [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]],
        columns=expected_columns,
        index=two_days(),
    )

    result = frame.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])

    tm.assert_frame_equal(result, expected)
def test_groupby_multiindex_nat():
    """A NaT in another level does not drop the row when grouping by level 1.

    GH 9236
    """
    index_tuples = [
        (pd.NaT, "a"),
        (datetime(2012, 1, 2), "a"),
        (datetime(2012, 1, 2), "b"),
        (datetime(2012, 1, 3), "a"),
    ]
    index = MultiIndex.from_tuples(index_tuples, names=["date", None])
    series = Series([3, 2, 2.5, 4], index=index)

    means = series.groupby(level=1).mean()
    expected = Series([3.0, 2.5], index=["a", "b"])
    tm.assert_series_equal(means, expected)
def test_groupby_empty_list_raises():
    """Grouping by a list holding one empty list raises ValueError (GH 5289)."""
    pairs = zip(range(10), range(10))
    frame = DataFrame(pairs, columns=["apple", "b"])

    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        frame.groupby([[]])
def test_groupby_multiindex_series_keys_len_equal_group_axis():
    """Level names are used as keys even when len(keys) equals len(series).

    GH 25704
    """
    source_index = MultiIndex.from_arrays(
        [["x", "x"], ["a", "b"], ["k", "k"]], names=["first", "second", "third"]
    )
    series = Series(data=[1, 2], index=source_index)

    summed = series.groupby(["first", "third"]).sum()

    expected_index = MultiIndex.from_arrays([["x"], ["k"]], names=["first", "third"])
    expected = Series([3], index=expected_index)

    tm.assert_series_equal(summed, expected)
def test_groupby_groups_in_BaseGrouper():
    """A level ``Grouper`` mixed with a level name gives the same ``groups``.

    GH 26326
    Test if DataFrame grouped with a pandas.Grouper has correct groups
    """
    index = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
    frame = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=index)

    # Grouper first
    with_grouper = frame.groupby([Grouper(level="alpha"), "beta"])
    by_names = frame.groupby(["alpha", "beta"])
    assert with_grouper.groups == by_names.groups

    # Grouper last
    with_grouper = frame.groupby(["beta", Grouper(level="alpha")])
    by_names = frame.groupby(["beta", "alpha"])
    assert with_grouper.groups == by_names.groups
@pytest.mark.parametrize("group_name", ["x", ["x"]])
def test_groupby_axis_1(group_name):
    """Column-wise grouping by axis name equals grouping the transpose (GH 27614)."""

    def check(frame):
        direct = frame.groupby(group_name, axis=1).sum()
        via_transpose = frame.T.groupby(group_name).sum().T
        tm.assert_frame_equal(direct, via_transpose)

    flat = DataFrame(
        np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20]
    )
    flat.index.name = "y"
    flat.columns.name = "x"
    check(flat)

    # test on MI column
    iterables = [["bar", "baz", "foo"], ["one", "two"]]
    mi_columns = MultiIndex.from_product(iterables=iterables, names=["x", "x1"])
    nested = DataFrame(
        np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi_columns
    )
    check(nested)
@pytest.mark.parametrize(
"op, expected",
[
(
"shift",
{
"time": [
None,
None,
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
None,
None,
]
},
),
(
"bfill",
{
"time": [
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
]
},
),
(
"ffill",
{
"time": [
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
]
},
),
],
)
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
# GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill
tz = tz_naive_fixture
data = {
"id": ["A", "B", "A", "B", "A", "B"],
"time": [
Timestamp("2019-01-01 12:00:00"),
Timestamp("2019-01-01 12:30:00"),
None,
None,
Timestamp("2019-01-01 14:00:00"),
Timestamp("2019-01-01 14:30:00"),
],
}
df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))
grouped = df.groupby("id")
result = getattr(grouped, op)()
expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
tm.assert_frame_equal(result, expected)
def test_groupby_only_none_group():
    """``transform`` returns NaN when the only group key is None.

    see GH21624
    this was crashing with "ValueError: Length of passed values is 1, index implies 0"
    """
    frame = DataFrame({"g": [None], "x": 1})

    transformed = frame.groupby("g")["x"].transform("sum")
    expected = Series([np.nan], name="x")

    tm.assert_series_equal(transformed, expected)
def test_groupby_duplicate_index():
    """Mean by level 0 collapses duplicated float index labels.

    GH#29189 the groupby call here used to raise
    """
    series = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])

    means = series.groupby(level=0).mean()

    expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
    tm.assert_series_equal(means, expected)
def test_group_on_empty_multiindex(transformation_func, request):
# GH 47787
# With one row, those are transforms so the schema should be the same
df = DataFrame(
data=[[1, Timestamp("today"), 3, 4]],
columns=["col_1", "col_2", "col_3", "col_4"],
)
df["col_3"] = df["col_3"].astype(int)
df["col_4"] = df["col_4"].astype(int)
df = df.set_index(["col_1", "col_2"])
if transformation_func == "fillna":
args = ("ffill",)
else:
args = ()
result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args)
expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
if transformation_func in ("diff", "shift"):
expected = expected.astype(int)
tm.assert_equal(result, expected)
result = (
df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func, *args)
)
expected = (
df["col_3"].groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
)
if transformation_func in ("diff", "shift"):
expected = expected.astype(int)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "idx",
    [
        Index(["a", "a"], name="foo"),
        MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
    ],
)
def test_dup_labels_output_shape(groupby_func, idx):
    """Groupby methods keep both columns when the column labels are duplicated."""
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip(f"Not applicable for {groupby_func}")

    frame = DataFrame([[1, 1]], columns=idx)
    gb = frame.groupby([0])

    method = getattr(gb, groupby_func)
    method_args = get_groupby_method_args(groupby_func, frame)
    result = method(*method_args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)
def test_groupby_crash_on_nunique(axis):
# Fix following 30253
dti = date_range("2016-01-01", periods=2, name="foo")
df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
df.columns.names = ("bar", "baz")
df.index = dti
axis_number = df._get_axis_number(axis)
if not axis_number:
df = df.T
gb = df.groupby(axis=axis_number, level=0)
result = gb.nunique()
expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
expected.columns.name = "bar"
if not axis_number:
expected = expected.T
tm.assert_frame_equal(result, expected)
if axis_number == 0:
# same thing, but empty columns
gb2 = df[[]].groupby(axis=axis_number, level=0)
exp = expected[[]]
else:
# same thing, but empty rows
gb2 = df.loc[[]].groupby(axis=axis_number, level=0)
# default for empty when we can't infer a dtype is float64
exp = expected.loc[[]].astype(np.float64)
res = gb2.nunique()
tm.assert_frame_equal(res, exp)
def test_groupby_list_level():
    """``level=[0]`` on a frame with unique row labels returns the frame (GH 9790)."""
    frame = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
    means = frame.groupby(level=[0]).mean()
    tm.assert_frame_equal(means, frame)
@pytest.mark.parametrize(
    "max_seq_items, expected",
    [
        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
        (1, "{0: [0], ...}"),
    ],
)
def test_groups_repr_truncates(max_seq_items, expected):
    """The repr of ``.groups`` is cut off at ``display.max_seq_items`` (GH 1135)."""
    frame = DataFrame(np.random.randn(5, 1))
    frame["a"] = frame.index

    with pd.option_context("display.max_seq_items", max_seq_items):
        by_column = frame.groupby("a").groups
        assert repr(by_column) == expected

        by_array = frame.groupby(np.array(frame.a)).groups
        assert repr(by_array) == expected
def test_group_on_two_row_multiindex_returns_one_tuple_key():
    """Two rows sharing both index levels form a single tuple-keyed group.

    GH 18451
    """
    frame = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
    frame = frame.set_index(["a", "b"])

    positions = frame.groupby(["a", "b"]).indices
    key = (1, 2)
    expected = {key: np.array([0, 1], dtype=np.int64)}

    assert len(positions) == 1
    assert (positions[key] == expected[key]).all()
@pytest.mark.parametrize(
    "klass, attr, value",
    [
        (DataFrame, "level", "a"),
        (DataFrame, "as_index", False),
        (DataFrame, "sort", False),
        (DataFrame, "group_keys", False),
        (DataFrame, "observed", True),
        (DataFrame, "dropna", False),
        (Series, "level", "a"),
        (Series, "as_index", False),
        (Series, "sort", False),
        (Series, "group_keys", False),
        (Series, "observed", True),
        (Series, "dropna", False),
    ],
)
def test_subsetting_columns_keeps_attrs(klass, attr, value):
    """Column selection on a groupby carries over the groupby's attributes.

    GH 9959 - When subsetting columns, don't drop attributes
    """
    frame = DataFrame({"a": [1], "b": [2], "c": [3]})
    if attr != "axis":
        frame = frame.set_index("a")

    gb = frame.groupby("a", **{attr: value})
    if klass is DataFrame:
        subset = gb[["b"]]
    else:
        subset = gb["b"]

    assert getattr(subset, attr) == getattr(gb, attr)
def test_subsetting_columns_axis_1():
    """Selecting columns from an ``axis=1`` groupby raises ValueError (GH 37725)."""
    frame = DataFrame({"A": [1], "B": [2], "C": [3]})
    gb = frame.groupby([0, 0, 1], axis=1)

    with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"):
        gb[["A", "B"]].sum()
@pytest.mark.parametrize("func", ["sum", "any", "shift"])
def test_groupby_column_index_name_lost(func):
    """The columns' name survives the groupby operation.

    GH: 29764 groupby loses index sometimes
    """
    named_columns = Index(["a"], name="idx")
    frame = DataFrame([[1]], columns=named_columns)

    gb = frame.groupby([1])
    result_columns = getattr(gb, func)().columns

    tm.assert_index_equal(result_columns, named_columns)
def test_groupby_duplicate_columns():
    """``min`` handles an object frame with a duplicated column label (GH: 31735)."""
    labels = ["A", "B", "B"]
    frame = DataFrame(
        {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
    ).astype(object)
    frame.columns = labels

    # all four rows fall into the single group 0
    minima = frame.groupby([0, 0, 0, 0]).min()

    expected = DataFrame(
        [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object
    )
    tm.assert_frame_equal(minima, expected)
def test_groupby_series_with_tuple_name():
    """Tuple-valued Series and index names are kept by ``groupby().last()``."""
    # GH 37755
    series_name, index_name = ("a", "a"), ("b", "b")

    ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=series_name)
    ser.index.name = index_name

    expected = Series([2, 4], index=[1, 2], name=series_name)
    expected.index.name = index_name

    result = ser.groupby(level=0).last()
    tm.assert_series_equal(result, expected)
@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
@pytest.mark.parametrize(
    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
)
def test_groupby_numerical_stability_sum_mean(func, values):
    """Grouped ``sum``/``mean`` keep small terms beside large cancelling ones."""
    # GH#38778
    data = [1e16, 1e16, 97, 98] + [-5e15] * 4
    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})

    grouped = df.groupby("group")
    result = getattr(grouped, func)()

    group_index = Index([1, 2], name="group")
    expected = DataFrame({"a": values, "b": values}, index=group_index)
    tm.assert_frame_equal(result, expected)
@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
def test_groupby_numerical_stability_cumsum():
    """Grouped ``cumsum`` matches the expected running totals exactly."""
    # GH#38934
    data = [1e16, 1e16, 97, 98] + [-5e15] * 4
    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})

    result = df.groupby("group").cumsum()

    # Rows alternate between group 1 and group 2.
    exp_data = [
        1e16,
        1e16,
        1e16 + 96,
        1e16 + 98,
        5e15 + 97,
        5e15 + 98,
        97.0,
        98.0,
    ]
    expected = DataFrame({"a": exp_data, "b": exp_data})
    tm.assert_frame_equal(result, expected, check_exact=True)
def test_groupby_cumsum_skipna_false():
    """Single-group ``cumsum(skipna=False)`` equals the plain frame cumsum."""
    # GH#46216 don't propagate np.nan above the diagonal
    values = np.random.randn(5, 5)
    # One NaN per column, at a different row each time.
    np.fill_diagonal(values, np.nan)

    df = DataFrame(values)
    df["A"] = 1

    res = df.groupby("A").cumsum(skipna=False)

    value_columns = list(range(5))
    expected = df[value_columns].cumsum(skipna=False)
    tm.assert_frame_equal(res, expected)
def test_groupby_cumsum_timedelta64():
    """Grouped ``cumsum`` on timedeltas handles NaT for both ``skipna`` modes."""
    # GH#46216 don't ignore is_datetimelike in libgroupby.group_cumsum
    dti = date_range("2016-01-01", periods=5)
    ser = Series(dti) - dti[0]
    ser[2] = pd.NaT

    gb = DataFrame({"A": 1, "B": ser}).groupby("A")

    # (skipna, expected values of column "B")
    cases = [
        (True, [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2]),
        (False, [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT]),
    ]
    for skipna, values in cases:
        res = gb.cumsum(numeric_only=False, skipna=skipna)
        exp = DataFrame({"B": values})
        tm.assert_frame_equal(res, exp)
def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
dups = rand_series_with_duplicate_datetimeindex
result = dups.groupby(level=0).mean()
expected = dups.groupby(dups.index).mean()
tm.assert_series_equal(result, expected)
def test_groupby_all_nan_groups_drop():
    """Grouping by an all-NaN index yields an empty int64 result."""
    # GH 15036
    s = Series([1, 2, 3], index=[np.nan] * 3)
    result = s.groupby(s.index).sum()

    empty_float_index = Index([], dtype=np.float64)
    expected = Series([], index=empty_float_index, dtype=np.int64)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
# GH 15106 & GH 41998
df = DataFrame(data=[], columns=["A", "B", "C"])
gb = df.groupby(["A", "B"], as_index=as_index)
result = gb.sum(numeric_only=numeric_only)
if as_index:
index = MultiIndex([[], []], [[], []], names=["A", "B"])
columns = ["C"] if not numeric_only else []
else:
index = RangeIndex(0)
columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
expected = DataFrame([], columns=columns, index=index)
tm.assert_frame_equal(result, expected)
def test_groupby_aggregation_non_numeric_dtype():
    """Grouped ``sum`` over a column of lists concatenates the lists."""
    # GH #43108
    rows = [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]]
    df = DataFrame(rows, columns=["MW", "v"])

    result = df.groupby(by=["MW"]).sum()

    group_index = Index(["M", "W"], dtype="object", name="MW")
    expected = DataFrame({"v": [[1, 1], [10, 20]]}, index=group_index)
    tm.assert_frame_equal(result, expected)
def test_groupby_aggregation_multi_non_numeric_dtype():
    """Grouped ``sum`` aggregates several timedelta columns at once."""
    # GH #42395
    input_days = range(1, 6)
    summed_days = range(7, 9)

    df = DataFrame(
        {
            "x": [1, 0, 1, 1, 0],
            "y": [Timedelta(n, "days") for n in input_days],
            "z": [Timedelta(n * 10, "days") for n in input_days],
        }
    )
    result = df.groupby(by=["x"]).sum()

    expected = DataFrame(
        {
            "y": [Timedelta(n, "days") for n in summed_days],
            "z": [Timedelta(n * 10, "days") for n in summed_days],
        },
        index=Index([0, 1], dtype="int64", name="x"),
    )
    tm.assert_frame_equal(result, expected)
def test_groupby_aggregation_numeric_with_non_numeric_dtype():
    """Grouped ``sum`` handles a timedelta column next to an integer column."""
    # GH #43108
    day_numbers = list(range(1, 6))
    df = DataFrame(
        {
            "x": [1, 0, 1, 1, 0],
            "y": [Timedelta(n, "days") for n in day_numbers],
            "z": day_numbers,
        }
    )
    result = df.groupby(by=["x"]).sum()

    group_index = Index([0, 1], dtype="int64", name="x")
    expected = DataFrame(
        {"y": [Timedelta(7, "days"), Timedelta(8, "days")], "z": [7, 8]},
        index=group_index,
    )
    tm.assert_frame_equal(result, expected)
def test_groupby_filtered_df_std():
    """``std`` on a groupby of a boolean-filtered frame gives the expected row."""
    # GH 16174
    records = [
        {"filter_col": keep, "groupby_col": True, "bool_col": True, "float_col": x}
        for keep, x in [(False, 10.5), (True, 20.5), (True, 30.5)]
    ]
    df = DataFrame(records)

    keep_mask = df["filter_col"] == True  # noqa:E712
    result = df[keep_mask].groupby("groupby_col").std()

    expected = DataFrame(
        [[0.0, 0.0, 7.071068]],
        columns=["filter_col", "bool_col", "float_col"],
        index=Index([True], name="groupby_col"),
    )
    tm.assert_frame_equal(result, expected)
def test_datetime_categorical_multikey_groupby_indices():
    """``indices`` maps (str, Timestamp) keys to row positions."""
    # GH 26859
    stamps = to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"])
    columns = {
        "a": Series(list("abc")),
        "b": Series(stamps, dtype="category"),
        "c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]),
    }
    df = DataFrame(columns)

    result = df.groupby(["a", "b"]).indices

    # Each (a, b) pair occurs in exactly one row.
    expected = {
        ("a", Timestamp("2018-01-01 00:00:00")): np.array([0]),
        ("b", Timestamp("2018-02-01 00:00:00")): np.array([1]),
        ("c", Timestamp("2018-03-01 00:00:00")): np.array([2]),
    }
    assert result == expected
def test_rolling_wrong_param_min_period():
    """A misspelled ``min_period`` keyword to grouped ``rolling`` raises TypeError."""
    # GH34037
    names = ["Alice"] * 5 + ["Bob"] * 5
    values = [np.nan, np.nan, 1, 2, 3] + [np.nan, 1, 2, 3, 4]
    test_df = DataFrame([names, values]).T
    test_df.columns = ["name", "val"]

    selected = test_df.groupby("name")["val"]
    result_error_msg = (
        r"__init__\(\) got an unexpected keyword argument 'min_period'"
    )
    with pytest.raises(TypeError, match=result_error_msg):
        selected.rolling(window=2, min_period=1).sum()
def test_by_column_values_with_same_starting_value():
    """Keys "Thomas" and "Thomas John" are aggregated as separate groups."""
    # GH29635
    df = DataFrame(
        {
            "Name": ["Thomas", "Thomas", "Thomas John"],
            "Credit": [1200, 1300, 900],
            "Mood": ["sad", "happy", "happy"],
        }
    )
    grouped = df.groupby(["Name"])
    result = grouped.agg({"Mood": Series.mode, "Credit": "sum"})

    expected_columns = {
        "Mood": [["happy", "sad"], "happy"],
        "Credit": [2500, 900],
        "Name": ["Thomas", "Thomas John"],
    }
    expected_result = DataFrame(expected_columns).set_index("Name")
    tm.assert_frame_equal(result, expected_result)
def test_groupby_none_in_first_mi_level():
    """A ``None`` entry in the first MultiIndex level is left out of the result."""
    # GH#47348
    level_values = [[None, 1, 0, 1], [2, 3, 2, 3]]
    mi = MultiIndex.from_arrays(level_values, names=["a", "b"])
    ser = Series(1, index=mi)

    result = ser.groupby(level=[0, 1]).sum()

    expected_index = MultiIndex.from_tuples(
        [(0.0, 2), (1.0, 3)], names=["a", "b"]
    )
    expected = Series([1, 2], expected_index)
    tm.assert_series_equal(result, expected)
def test_groupby_none_column_name():
    """A column labelled ``None`` can be used as the grouping key."""
    # GH#47348
    columns = {None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]}
    df = DataFrame(columns)

    result = df.groupby(by=[None]).sum()

    group_index = Index([1, 2], name=None)
    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=group_index)
    tm.assert_frame_equal(result, expected)
def test_single_element_list_grouping():
    """Iterating a groupby keyed by a one-element list yields 1-tuple keys."""
    # GH 42795
    df = DataFrame(
        {"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]},
        index=["x", "y"],
    )
    grouped = df.groupby(["a"])
    result = [group_key for group_key, _group in grouped]
    assert result == [(1,), (2,)]
@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
def test_groupby_avoid_casting_to_float(func):
    """Subtracting the input back from the result leaves exactly zero."""
    # GH#37493
    val = 922337203685477580
    df = DataFrame({"a": 1, "b": [val]})

    grouped = df.groupby("a")
    result = getattr(grouped, func)() - val

    if func in ("cumsum", "cumprod"):
        # Cumulative ops are expected on a fresh default index.
        expected = DataFrame({"b": [0]})
    else:
        expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)])
def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val):
# GH#37493
df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype)
result = getattr(df.groupby("a"), func)()
expected = DataFrame(
{"b": [val]},
index=Index([1], name="a", dtype=any_numeric_ea_dtype),
dtype=any_numeric_ea_dtype,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
def test_groupby_overflow(val, dtype):
    """8-bit inputs give 64-bit results for ``sum``, ``cumsum`` and ``prod``."""
    # GH#37493
    narrow = f"{dtype}8"
    wide = f"{dtype}64"
    df = DataFrame({"a": 1, "b": [val, val]}, dtype=narrow)
    # The group key keeps the narrow dtype; only the values are widened.
    group_index = Index([1], name="a", dtype=narrow)

    summed = df.groupby("a").sum()
    expected = DataFrame({"b": [val * 2]}, index=group_index, dtype=wide)
    tm.assert_frame_equal(summed, expected)

    running = df.groupby("a").cumsum()
    expected = DataFrame({"b": [val, val * 2]}, dtype=wide)
    tm.assert_frame_equal(running, expected)

    product = df.groupby("a").prod()
    expected = DataFrame({"b": [val * val]}, index=group_index, dtype=wide)
    tm.assert_frame_equal(product, expected)
@pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)])
def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val):
# GH#37493
df = DataFrame({"a": 1, "b": [1, pd.NA, 2]}, dtype=any_numeric_ea_dtype)
result = df.groupby("a").cumsum(skipna=skipna)
expected = DataFrame(
{"b": [1, pd.NA, val]},
dtype=any_numeric_ea_dtype,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "val_in, index, val_out",
    [
        (
            [1.0, 2.0, 3.0, 4.0, 5.0],
            ["foo", "foo", "bar", "baz", "blah"],
            [3.0, 4.0, 5.0, 3.0],
        ),
        (
            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
            ["foo", "foo", "bar", "baz", "blah", "blah"],
            [3.0, 4.0, 11.0, 3.0],
        ),
    ],
)
def test_groupby_index_name_in_index_content(val_in, index, val_out):
    """Grouping by the index name works when that name is also an index label."""
    # GH 48567
    named_index = Index(index, name="blah")
    series = Series(data=val_in, name="values", index=named_index)

    sorted_keys = Index(["bar", "baz", "blah", "foo"], name="blah")
    expected = Series(data=val_out, name="values", index=sorted_keys)

    # Series path
    tm.assert_series_equal(series.groupby("blah").sum(), expected)

    # DataFrame path
    frame_result = series.to_frame().groupby("blah").sum()
    tm.assert_frame_equal(frame_result, expected.to_frame())
@pytest.mark.parametrize("n", [1, 10, 32, 100, 1000])
def test_sum_of_booleans(n):
    """Summing ``n`` True values in one group gives ``n``."""
    # GH 50347
    df = DataFrame({"groupby_col": 1, "bool": [True] * n})
    # Replace the column with the result of an elementwise comparison.
    df["bool"] = df["bool"].eq(True)

    result = df.groupby("groupby_col").sum()

    group_index = Index([1], name="groupby_col")
    expected = DataFrame({"bool": [n]}, index=group_index)
    tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
@pytest.mark.parametrize("method", ["head", "tail", "nth", "first", "last"])
def test_groupby_method_drop_na(method):
    """Rows with a NaN group key are absent from the result of ``method``."""
    # GH 21755
    df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)})

    # Only ``nth`` is called with an argument.
    call_kwargs = {"n": 0} if method == "nth" else {}
    result = getattr(df.groupby("A"), method)(**call_kwargs)

    if method in ["first", "last"]:
        # Expected result is indexed by the group key.
        key_index = Series(["a", "b", "c"], name="A")
        expected = DataFrame({"B": [0, 2, 4]}).set_index(key_index)
    else:
        # Expected result keeps the original row labels and the key column.
        expected = DataFrame(
            {"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4]
        )
    tm.assert_frame_equal(result, expected)
def test_groupby_reduce_period():
    """Period data rejects additive/multiplicative ops but allows min/max."""
    # GH#51040
    pi = pd.period_range("2016-01-01", periods=100, freq="D")
    grps = list(range(10)) * 10
    ser = pi.to_series()
    gb = ser.groupby(grps)

    # Checked in the order sum, cumsum, prod, cumprod.
    for op in ("sum", "cumsum", "prod", "cumprod"):
        msg = f"Period type does not support {op} operations"
        with pytest.raises(TypeError, match=msg):
            getattr(gb, op)()

    # max is compared with the last ten entries, min with the first ten.
    for op, window in (("max", slice(-10, None)), ("min", slice(None, 10))):
        res = getattr(gb, op)()
        expected = ser[window]
        expected.index = Index(range(10), dtype=np.int_)
        tm.assert_series_equal(res, expected)
def test_obj_with_exclusions_duplicate_columns():
    """``_obj_with_exclusions`` drops only the grouping column.

    Both columns labelled ``0`` must remain.
    """
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]

    grouped = df.groupby(df[1])
    result = grouped._obj_with_exclusions

    # Positions 0, 2 and 3 are every column except the one labelled 1.
    kept_positions = [0, 2, 3]
    expected = df.take(kept_positions, axis=1)
    tm.assert_frame_equal(result, expected)