# Source listing metadata: 2170 lines, 72 KiB, Python
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    qcut,
)
import pandas._testing as tm
from pandas.api.typing import SeriesGroupBy
from pandas.tests.groupby import get_groupby_method_args


def cartesian_product_for_groupers(result, args, names, fill_value=np.nan):
    """
    Reindex ``result`` to the cartesian product of the groupers.

    The categorical nature of each grouper is preserved: a ``Categorical`` or
    ``CategoricalIndex`` grouper contributes every one of its categories (in
    category order, keeping its ``ordered`` flag); any other grouper is used
    unchanged.

    Parameters
    ----------
    result : pandas object providing ``reindex``
        The object to expand.
    args : iterable
        The groupers, one per level of the resulting index.
    names : sequence
        Level names passed to ``MultiIndex.from_product``.
    fill_value : scalar, default np.nan
        Value passed to ``reindex`` for entries missing from ``result``.

    Returns
    -------
    The reindexed object, sorted by its index.
    """

    def f(a):
        # Replace a categorical grouper by one value per category so that
        # unobserved categories also show up in the product.
        if isinstance(a, (CategoricalIndex, Categorical)):
            categories = a.categories
            a = Categorical.from_codes(
                np.arange(len(categories)), categories=categories, ordered=a.ordered
            )
        return a

    index = MultiIndex.from_product(map(f, args), names=names)
    return result.reindex(index, fill_value=fill_value).sort_index()


_results_for_groupbys_with_missing_categories = {
    # This maps the builtin groupby functions to their expected outputs for
    # missing categories when they are called on a categorical grouper with
    # observed=False. Some functions are expected to return NaN, some zero.
    # These expected values can be used across several tests (i.e. they are
    # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
    # hardcoded in one place.
    "all": np.nan,
    "any": np.nan,
    "count": 0,
    "corrwith": np.nan,
    "first": np.nan,
    "idxmax": np.nan,
    "idxmin": np.nan,
    "last": np.nan,
    "max": np.nan,
    "mean": np.nan,
    "median": np.nan,
    "min": np.nan,
    "nth": np.nan,
    "nunique": 0,
    "prod": np.nan,
    "quantile": np.nan,
    "sem": np.nan,
    "size": 0,
    "skew": np.nan,
    "std": np.nan,
    "sum": 0,
    "var": np.nan,
}


def test_apply_use_categorical_name(df):
    """Grouping by ``qcut(df.C, 4)`` names the first result index level "C"."""
    cats = qcut(df.C, 4)

    def get_stats(group):
        return {
            "min": group.min(),
            "max": group.max(),
            "count": group.count(),
            "mean": group.mean(),
        }

    result = df.groupby(cats, observed=False).D.apply(get_stats)
    assert result.index.names[0] == "C"


def test_basic(using_infer_string):  # TODO: split this test
    """
    Exercise mean/sum/transform/filter/apply/describe on categorical groupers
    with ``observed=False`` (GH 8623, GH 9921, GH 9603, GH 10460).
    """
    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
    expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
    expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
    result = gb.sum(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame(
        [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
        columns=["person_id", "person_name"],
    )
    x["person_name"] = Categorical(x.person_name)

    g = x.groupby(["person_id"], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[["person_name"]])

    result = x.drop_duplicates("person_name")
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates("person_name").iloc[0]

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name="person_id")
    dtype = "string[pyarrow_numpy]" if using_infer_string else object
    expected["person_name"] = expected["person_name"].astype(dtype)
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
    )
    msg = "using DataFrameGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.groupby(c, observed=False).transform(sum)
    expected = df[["a"]]
    tm.assert_frame_equal(result, expected)

    gbc = df.groupby(c, observed=False)
    result = gbc.transform(lambda xs: np.max(xs, axis=0))
    tm.assert_frame_equal(result, df[["a"]])

    result2 = gbc.transform(lambda xs: np.max(xs, axis=0))
    msg = "using DataFrameGroupBy.max"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result3 = gbc.transform(max)
    result4 = gbc.transform(np.maximum.reduce)
    result5 = gbc.transform(lambda xs: np.maximum.reduce(xs))
    tm.assert_frame_equal(result2, df[["a"]], check_dtype=False)
    tm.assert_frame_equal(result3, df[["a"]], check_dtype=False)
    tm.assert_frame_equal(result4, df[["a"]])
    tm.assert_frame_equal(result5, df[["a"]])

    # Filter
    tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
    )
    msg = "using DataFrameGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = df.groupby(c, observed=False).transform(sum)
    expected = df[["a"]]
    tm.assert_frame_equal(result, expected)

    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
    )

    # GH 9603
    df = DataFrame({"a": [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ["foo", "bar", "baz", "qux"]
    codes = np.random.default_rng(2).integers(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True)
    expected = expected.reindex(exp_idx)

    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(
        ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]
    )
    expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal(
        (desc_result.stack(future_stack=True).index.get_level_values(0)), exp
    )
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
    tm.assert_index_equal(
        (desc_result.stack(future_stack=True).index.get_level_values(1)), exp
    )


def test_level_get_group(observed):
    """``get_group`` on a categorical index level returns that level's rows."""
    # GH15155
    df = DataFrame(
        data=np.arange(2, 22, 2),
        index=MultiIndex(
            levels=[CategoricalIndex(["a", "b"]), range(10)],
            codes=[[0] * 5 + [1] * 5, range(10)],
            names=["Index1", "Index2"],
        ),
    )
    g = df.groupby(level=["Index1"], observed=observed)

    # expected should equal test.loc[["a"]]
    # GH15166
    expected = DataFrame(
        data=np.arange(2, 12, 2),
        index=MultiIndex(
            levels=[CategoricalIndex(["a", "b"]), range(5)],
            codes=[[0] * 5, range(5)],
            names=["Index1", "Index2"],
        ),
    )
    msg = "you will need to pass a length-1 tuple"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#25971 - warn when not passing a length-1 tuple
        result = g.get_group("a")

    tm.assert_frame_equal(result, expected)


def test_sorting_with_different_categoricals():
    """``value_counts`` on an ordered categorical sorts by category order."""
    # GH 24271
    df = DataFrame(
        {
            "group": ["A"] * 6 + ["B"] * 6,
            "dose": ["high", "med", "low"] * 4,
            "outcomes": np.arange(12.0),
        }
    )

    df.dose = Categorical(df.dose, categories=["low", "med", "high"], ordered=True)

    result = df.groupby("group")["dose"].value_counts()
    result = result.sort_index(level=0, sort_remaining=True)
    index = ["low", "med", "high", "low", "med", "high"]
    index = Categorical(index, categories=["low", "med", "high"], ordered=True)
    index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
    index = MultiIndex.from_arrays(index, names=["group", "dose"])
    expected = Series([2] * 6, index=index, name="count")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("ordered", [True, False])
def test_apply(ordered):
    """apply/mean/agg on two categorical groupers with ``observed=True``."""
    # GH 10138

    dense = Categorical(list("abc"), ordered=ordered)

    # 'b' is in the categories but not in the list
    missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
    values = np.arange(len(dense))
    df = DataFrame({"missing": missing, "dense": dense, "values": values})
    grouped = df.groupby(["missing", "dense"], observed=True)

    # missing category 'b' should still exist in the output index
    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
    expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])

    result = grouped.apply(lambda x: np.mean(x, axis=0))
    tm.assert_frame_equal(result, expected)

    result = grouped.mean()
    tm.assert_frame_equal(result, expected)

    msg = "using DataFrameGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = grouped.agg(np.mean)
    tm.assert_frame_equal(result, expected)

    # but for transform we should still get back the original index
    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
    expected = Series(1, index=idx)
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result = grouped.apply(lambda x: 1)
    tm.assert_series_equal(result, expected)


def test_observed(observed):
    """
    With ``observed=False`` the result covers the cartesian product of the
    categorical groupers; with ``observed=True`` only the observed groups.
    """
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays(
        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
    )
    expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
        )

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame(
        {"values": [1, 2, 3, 4], "C": ["foo", "bar", "foo", "bar"]}, index=exp_index
    )
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2], list("AB"), fill_value=0
        )

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        "cat": Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
        ),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(
        list("ab"), name="cat", categories=list("abc"), ordered=True
    )
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
    if not observed:
        index = CategoricalIndex(
            list("abc"), name="cat", categories=list("abc"), ordered=True
        )
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame(
        {
            "val": [10.0, 30.0, 20.0, 40.0],
            "cat": Categorical(
                ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
            ),
            "ints": [1, 2, 1, 2],
        }
    ).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
        )

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)


def test_observed_codes_remap(observed):
    """Mean over a ``pd.cut`` grouper plus a plain column, both observed modes."""
    d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]}
    df = DataFrame(d)
    values = pd.cut(df["C1"], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, "C2"], observed=observed)

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
    expected = DataFrame(
        {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
    )
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
        )

    result = groups_double_key.agg("mean")
    tm.assert_frame_equal(result, expected)


def test_observed_perf():
    """With ``observed=True`` each result level has as many values as the data."""
    # we create a cartesian product, so this is
    # non-performant if we don't use observed values
    # gh-14942
    df = DataFrame(
        {
            "cat": np.random.default_rng(2).integers(0, 255, size=30000),
            "int_id": np.random.default_rng(2).integers(0, 255, size=30000),
            "other_id": np.random.default_rng(2).integers(0, 10000, size=30000),
            "foo": 0,
        }
    )
    df["cat"] = df.cat.astype(str).astype("category")

    grouped = df.groupby(["cat", "int_id", "other_id"], observed=True)
    result = grouped.count()
    assert result.index.levels[0].nunique() == df.cat.nunique()
    assert result.index.levels[1].nunique() == df.int_id.nunique()
    assert result.index.levels[2].nunique() == df.other_id.nunique()


def test_observed_groups(observed):
    """``.groups`` lists unobserved categories only when ``observed`` is False."""
    # gh-20583
    # test that we have the appropriate groups

    cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"])
    df = DataFrame({"cat": cat, "vals": [1, 2, 3]})
    g = df.groupby("cat", observed=observed)

    result = g.groups
    if observed:
        expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")}
    else:
        expected = {
            "a": Index([0, 2], dtype="int64"),
            "b": Index([], dtype="int64"),
            "c": Index([1], dtype="int64"),
        }

    tm.assert_dict_equal(result, expected)


@pytest.mark.parametrize(
    "keys, expected_values, expected_index_levels",
    [
        ("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
        (
            ["a", "b"],
            [7, 8, 0, 0, 0, 9, 0, 0, 0],
            [CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
        ),
        (
            ["a", "a2"],
            [15, 0, 0, 0, 9, 0, 0, 0, 0],
            [
                CategoricalIndex([1, 2, 3], name="a"),
                CategoricalIndex([1, 2, 3], name="a"),
            ],
        ),
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
    """Sum with ``observed=False`` when the categorical keys are index levels."""
    # GH#49354 - ensure unobserved cats occur when grouping by index levels
    df = DataFrame(
        {
            "a": Categorical([1, 1, 2], categories=[1, 2, 3]),
            "a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
            "b": [4, 5, 6],
            "c": [7, 8, 9],
        }
    ).set_index(["a", "a2"])
    if "b" not in keys:
        # Only keep b when it is used for grouping for consistent columns in the result
        df = df.drop(columns="b")

    gb = df.groupby(keys, observed=False)
    if test_series:
        gb = gb["c"]
    result = gb.sum()

    # ``keys`` is either the one-character string "a" or a list of two keys,
    # so a length of 1 identifies the single-key case.
    if len(keys) == 1:
        index = expected_index_levels
    else:
        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
        index = MultiIndex(
            expected_index_levels,
            codes=codes,
            names=keys,
        )
    expected = DataFrame({"c": expected_values}, index=index)
    if test_series:
        expected = expected["c"]
    tm.assert_equal(result, expected)


def test_observed_groups_with_nan(observed):
    """``.groups`` has no entry for NaN in either observed mode."""
    # GH 24740
    df = DataFrame(
        {
            "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]),
            "vals": [1, 2, 3],
        }
    )
    g = df.groupby("cat", observed=observed)
    result = g.groups
    if observed:
        expected = {"a": Index([0, 2], dtype="int64")}
    else:
        expected = {
            "a": Index([0, 2], dtype="int64"),
            "b": Index([], dtype="int64"),
            "d": Index([], dtype="int64"),
        }
    tm.assert_dict_equal(result, expected)


def test_observed_nth():
    """``nth(0)`` with ``observed=False`` returns only the first row of group "a"."""
    # GH 26385
    cat = Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"])
    ser = Series([1, 2, 3])
    df = DataFrame({"cat": cat, "ser": ser})

    result = df.groupby("cat", observed=False)["ser"].nth(0)
    expected = df["ser"].iloc[[0]]
    tm.assert_series_equal(result, expected)


def test_dataframe_categorical_with_nan(observed):
    """``first()`` on a categorical grouper that contains NaN entries."""
    # GH 21151
    s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
    s2 = Series([1, 2, 3, 4])
    df = DataFrame({"s1": s1, "s2": s2})
    result = df.groupby("s1", observed=observed).first().reset_index()
    if observed:
        expected = DataFrame(
            {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
        )
    else:
        expected = DataFrame(
            {
                "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]),
                "s2": [2, np.nan, np.nan],
            }
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("observed", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    """Index labels and aggregated values must line up for every flag combination."""
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # Build a dataframe with cat having one unobserved category ('missing'),
    # and a Series with identical values
    label = Categorical(
        ["d", "a", "b", "a", "d", "b"],
        categories=["a", "b", "missing", "d"],
        ordered=ordered,
    )
    val = Series(["d", "a", "b", "a", "d", "b"])
    df = DataFrame({"label": label, "val": val})

    # aggregate on the Categorical
    result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first")

    # If ordering works, we expect index labels equal to aggregation results,
    # except for 'observed=False': label 'missing' has aggregation None
    label = Series(result.index.array, dtype="object")
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = "missing"
    if not all(label == aggr):
        msg = (
            "Labels and aggregation results not consistently sorted\n"
            f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
            f"Result:\n{result}"
        )
        assert False, msg


def test_datetime():
|
|
# GH9049: ensure backward compatibility
|
|
levels = pd.date_range("2014-01-01", periods=4)
|
|
codes = np.random.default_rng(2).integers(0, 4, size=100)
|
|
|
|
cats = Categorical.from_codes(codes, levels, ordered=True)
|
|
|
|
data = DataFrame(np.random.default_rng(2).standard_normal((100, 4)))
|
|
result = data.groupby(cats, observed=False).mean()
|
|
|
|
expected = data.groupby(np.asarray(cats), observed=False).mean()
|
|
expected = expected.reindex(levels)
|
|
expected.index = CategoricalIndex(
|
|
expected.index, categories=expected.index, ordered=True
|
|
)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
grouped = data.groupby(cats, observed=False)
|
|
desc_result = grouped.describe()
|
|
|
|
idx = cats.codes.argsort()
|
|
ord_labels = cats.take(idx)
|
|
ord_data = data.take(idx)
|
|
expected = ord_data.groupby(ord_labels, observed=False).describe()
|
|
tm.assert_frame_equal(desc_result, expected)
|
|
tm.assert_index_equal(desc_result.index, expected.index)
|
|
tm.assert_index_equal(
|
|
desc_result.index.get_level_values(0), expected.index.get_level_values(0)
|
|
)
|
|
|
|
# GH 10460
|
|
expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
|
|
exp = CategoricalIndex(expc)
|
|
tm.assert_index_equal(
|
|
(desc_result.stack(future_stack=True).index.get_level_values(0)), exp
|
|
)
|
|
exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
|
|
tm.assert_index_equal(
|
|
(desc_result.stack(future_stack=True).index.get_level_values(1)), exp
|
|
)
|
|
|
|
|
|
def test_categorical_index():
    """Grouping by a categorical index level or column gives a CategoricalIndex."""
    s = np.random.default_rng(2)
    levels = ["foo", "bar", "baz", "qux"]
    codes = s.integers(0, 4, size=20)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd"))
    df["cats"] = cats

    # with a cat index
    result = df.set_index("cats").groupby(level=0, observed=False).sum()
    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
    )
    tm.assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby("cats", observed=False).sum()
    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
    )
    tm.assert_frame_equal(result, expected)


def test_describe_categorical_columns():
|
|
# GH 11558
|
|
cats = CategoricalIndex(
|
|
["qux", "foo", "baz", "bar"],
|
|
categories=["foo", "bar", "baz", "qux"],
|
|
ordered=True,
|
|
)
|
|
df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats)
|
|
result = df.groupby([1, 2, 3, 4] * 5).describe()
|
|
|
|
tm.assert_index_equal(result.stack(future_stack=True).columns, cats)
|
|
tm.assert_categorical_equal(
|
|
result.stack(future_stack=True).columns.values, cats.values
|
|
)
|
|
|
|
|
|
def test_unstack_categorical():
    """Unstacking a categorical group level yields categorical columns."""
    # GH11558 (example is taken from the original issue)
    df = DataFrame(
        {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2}
    )
    df["medium"] = df["medium"].astype("category")

    gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack()
    result = gcat.describe()

    exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium")
    tm.assert_index_equal(result.columns, exp_columns)
    tm.assert_categorical_equal(result.columns.values, exp_columns.values)

    result = gcat["A"] + gcat["B"]
    expected = Series([6, 4], index=Index(["X", "Y"], name="artist"))
    tm.assert_series_equal(result, expected)


def test_bins_unequal_len():
    """Grouping by bins shorter than the series raises ``ValueError``."""
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        series.groupby(bins).mean()


@pytest.mark.parametrize(
    ["series", "data"],
    [
        # Group a series with length and index equal to those of the grouper.
        (Series(range(4)), {"A": [0, 3], "B": [1, 2]}),
        # Group a series with length equal to that of the grouper and index unequal to
        # that of the grouper.
        (Series(range(4)).rename(lambda idx: idx + 1), {"A": [2], "B": [0, 1]}),
        # GH44179: Group a series with length unequal to that of the grouper.
        (Series(range(7)), {"A": [0, 3], "B": [1, 2]}),
    ],
)
def test_categorical_series(series, data):
    """Aggregate to lists when grouping by a categorical Series."""
    # Group the given series by a series with categorical data type such that group A
    # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
    # the given data.
    groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False)
    result = groupby.aggregate(list)
    expected = Series(data, index=CategoricalIndex(data.keys()))
    tm.assert_series_equal(result, expected)


def test_as_index():
|
|
# GH13204
|
|
df = DataFrame(
|
|
{
|
|
"cat": Categorical([1, 2, 2], [1, 2, 3]),
|
|
"A": [10, 11, 11],
|
|
"B": [101, 102, 103],
|
|
}
|
|
)
|
|
result = df.groupby(["cat", "A"], as_index=False, observed=True).sum()
|
|
expected = DataFrame(
|
|
{
|
|
"cat": Categorical([1, 2], categories=df.cat.cat.categories),
|
|
"A": [10, 11],
|
|
"B": [101, 205],
|
|
},
|
|
columns=["cat", "A", "B"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# function grouper
|
|
f = lambda r: df.loc[r, "A"]
|
|
msg = "A grouping .* was excluded from the result"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
result = df.groupby(["cat", f], as_index=False, observed=True).sum()
|
|
expected = DataFrame(
|
|
{
|
|
"cat": Categorical([1, 2], categories=df.cat.cat.categories),
|
|
"A": [10, 22],
|
|
"B": [101, 205],
|
|
},
|
|
columns=["cat", "A", "B"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# another not in-axis grouper (conflicting names in index)
|
|
s = Series(["a", "b", "b"], name="cat")
|
|
msg = "A grouping .* was excluded from the result"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
result = df.groupby(["cat", s], as_index=False, observed=True).sum()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# is original index dropped?
|
|
group_columns = ["cat", "A"]
|
|
expected = DataFrame(
|
|
{
|
|
"cat": Categorical([1, 2], categories=df.cat.cat.categories),
|
|
"A": [10, 11],
|
|
"B": [101, 205],
|
|
},
|
|
columns=["cat", "A", "B"],
|
|
)
|
|
|
|
for name in [None, "X", "B"]:
|
|
df.index = Index(list("abc"), name=name)
|
|
result = df.groupby(group_columns, as_index=False, observed=True).sum()
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_preserve_categories():
    """Result index keeps the categories; ``sort=False`` keeps appearance order."""
    # GH-13179
    categories = list("abc")

    # ordered=True
    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
    sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
    nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
    tm.assert_index_equal(
        df.groupby("A", sort=True, observed=False).first().index, sort_index
    )
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    tm.assert_index_equal(
        df.groupby("A", sort=False, observed=False).first().index, nosort_index
    )

    # ordered=False
    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
    sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
    # GH#48749 - don't change order of categories
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
    tm.assert_index_equal(
        df.groupby("A", sort=True, observed=False).first().index, sort_index
    )
    tm.assert_index_equal(
        df.groupby("A", sort=False, observed=False).first().index, nosort_index
    )


def test_preserve_categorical_dtype():
    """The grouping column stays categorical with ``as_index`` True or False."""
    # GH13743, GH13854
    df = DataFrame(
        {
            "A": [1, 2, 1, 1, 2],
            "B": [10, 16, 22, 28, 34],
            "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
            "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
        }
    )
    # single grouper
    exp_full = DataFrame(
        {
            "A": [2.0, 1.0, np.nan],
            "B": [25.0, 20.0, np.nan],
            "C1": Categorical(list("bac"), categories=list("bac"), ordered=False),
            "C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
        }
    )
    for col in ["C1", "C2"]:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean(
            numeric_only=True
        )
        result2 = (
            df.groupby(by=col, as_index=True, observed=False)
            .mean(numeric_only=True)
            .reset_index()
        )
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)


@pytest.mark.parametrize(
    "func, values",
    [
        ("first", ["second", "first"]),
        ("last", ["fourth", "third"]),
        ("min", ["fourth", "first"]),
        ("max", ["second", "third"]),
    ],
)
def test_preserve_on_ordered_ops(func, values):
    """first/last/min/max keep the ordered categorical dtype of the values."""
    # gh-18502
    # preserve the categoricals on ops
    c = Categorical(["first", "second", "third", "fourth"], ordered=True)
    df = DataFrame({"payload": [-1, -2, -1, -2], "col": c})
    g = df.groupby("payload")
    result = getattr(g, func)()
    expected = DataFrame(
        {"payload": [-2, -1], "col": Series(values, dtype=c.dtype)}
    ).set_index("payload")
    tm.assert_frame_equal(result, expected)

    # we should also preserve categorical for SeriesGroupBy
    sgb = df.groupby("payload")["col"]
    result = getattr(sgb, func)()
    expected = expected["col"]
    tm.assert_series_equal(result, expected)


def test_categorical_no_compress():
    """Unobserved categories are kept (not compressed away) with observed=False."""
    values = Series(np.random.default_rng(2).standard_normal(9))

    # every category is observed
    dense_codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    dense_cats = Categorical.from_codes(dense_codes, [0, 1, 2], ordered=True)

    result = values.groupby(dense_cats, observed=False).mean()
    exp = values.groupby(dense_codes, observed=False).mean()
    exp.index = CategoricalIndex(
        exp.index, categories=dense_cats.categories, ordered=dense_cats.ordered
    )
    tm.assert_series_equal(result, exp)

    # category 2 is never observed
    sparse_codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    sparse_cats = Categorical.from_codes(sparse_codes, [0, 1, 2, 3], ordered=True)

    result = values.groupby(sparse_cats, observed=False).mean()
    by_code_mean = values.groupby(sparse_codes, observed=False).mean()
    exp = by_code_mean.reindex(sparse_cats.categories)
    exp.index = CategoricalIndex(
        exp.index, categories=sparse_cats.categories, ordered=sparse_cats.ordered
    )
    tm.assert_series_equal(result, exp)

    # string categories in a DataFrame column; "d" is never observed
    letter_cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    frame = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": letter_cats})

    means = frame.groupby("b", observed=False).mean()
    result = means["a"].values
    exp = np.array([1, 2, 4, np.nan])
    tm.assert_numpy_array_equal(result, exp)
|
|
|
|
|
|
def test_groupby_empty_with_category():
    """Grouping on an all-None key yields an empty result that stays categorical."""
    # GH-9614
    # test fix for when group by on None resulted in
    # coercion of dtype categorical -> float
    split_labels = Categorical(["train", "train", "test"])
    df = DataFrame({"A": [None] * 3, "B": split_labels})

    result = df.groupby("A").first()["B"]

    empty_index = Series([], dtype="object", name="A")
    empty_values = Categorical([], categories=["test", "train"])
    expected = Series(empty_values, index=empty_index, name="B")
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_sort():
    """Counts grouped by a ``pd.cut`` categorical come back in category order."""
    # https://stackoverflow.com/questions/23814368/sorting-pandas-
    #        categorical-labels-after-groupby
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    def lower_bound(label):
        # Labels look like "500 - 999"; sort on the leading number.
        return float(label.split()[0])

    df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
    labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=["value"], ascending=True)
    bin_edges = range(0, 10500, 500)
    df["value_group"] = pd.cut(df.value, bin_edges, right=False, labels=cat_labels)

    grouped = df.groupby(["value_group"], observed=False)
    res = grouped["value_group"].count()

    exp = res[sorted(res.index, key=lower_bound)]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)
|
|
|
|
|
|
@pytest.mark.parametrize("ordered", [True, False])
def test_sort2(sort, ordered):
    """``first()`` on a categorical key follows ``sort``, whatever ``ordered`` is."""
    # dataframe groupby sort was being ignored # GH 8868
    # GH#48749 - don't change order of categories
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    rows = [
        ["(7.5, 10]", 10, 10],
        ["(7.5, 10]", 8, 20],
        ["(2.5, 5]", 5, 30],
        ["(5, 7.5]", 6, 40],
        ["(2.5, 5]", 4, 50],
        ["(0, 2.5]", 1, 60],
        ["(5, 7.5]", 7, 70],
    ]
    df = DataFrame(rows, columns=["range", "foo", "bar"])
    df["range"] = Categorical(df["range"], ordered=ordered)

    result = df.groupby("range", sort=sort, observed=False).first()

    if sort:
        # groups in category order
        index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
    else:
        # groups in order of first appearance
        index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]

    expected_index = CategoricalIndex(index_values, name="range", ordered=ordered)
    expected = DataFrame(data_values, columns=["foo", "bar"], index=expected_index)

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("ordered", [True, False])
def test_sort_datetimelike(sort, ordered):
    """``first()`` on a datetime-valued categorical key follows ``sort``."""
    # GH10505
    # GH#42482 - don't sort result when sort=False, even when ordered=True

    def month_starts(months):
        # First day of each of the given months of 2011.
        return [datetime(2011, month, 1) for month in months]

    df = DataFrame(
        {
            "dt": month_starts([7, 7, 2, 5, 2, 1, 5]),
            "foo": [10, 8, 5, 6, 4, 1, 7],
            "bar": [10, 20, 30, 40, 50, 60, 70],
        },
        columns=["dt", "foo", "bar"],
    )

    # the grouping key is categorical; its orderedness is parametrized
    df["dt"] = Categorical(df["dt"], ordered=ordered)

    if sort:
        index_values = month_starts([1, 2, 5, 7])
        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
    else:
        index_values = month_starts([7, 2, 5, 1])
        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]

    expected_index = CategoricalIndex(index_values, name="dt", ordered=ordered)
    expected = DataFrame(data_values, columns=["foo", "bar"], index=expected_index)

    result = df.groupby("dt", sort=sort, observed=False).first()
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_empty_sum():
    """``sum`` over an unobserved category is 0 unless ``min_count`` forces NaN."""
    # https://github.com/pandas-dev/pandas/issues/18678
    key = Categorical(["a", "a", "b"], categories=["a", "b", "c"])
    df = DataFrame({"A": key, "B": [1, 2, 1]})
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # (keyword arguments for sum, expected values for groups a, b, c)
    cases = [
        ({}, [3, 1, 0]),  # 0 by default
        ({"min_count": 0}, [3, 1, 0]),
        ({"min_count": 1}, [3, 1, np.nan]),
        ({"min_count": 2}, [3, np.nan, np.nan]),  # min_count>1
    ]
    for kwargs, values in cases:
        result = df.groupby("A", observed=False).B.sum(**kwargs)
        expected = Series(values, expected_idx, name="B")
        tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_empty_prod():
    """``prod`` over an unobserved category is 1 unless ``min_count`` forces NaN."""
    # https://github.com/pandas-dev/pandas/issues/18678
    key = Categorical(["a", "a", "b"], categories=["a", "b", "c"])
    df = DataFrame({"A": key, "B": [1, 2, 1]})
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # (keyword arguments for prod, expected values for groups a, b, c)
    cases = [
        ({}, [2, 1, 1]),  # 1 by default
        ({"min_count": 0}, [2, 1, 1]),
        ({"min_count": 1}, [2, 1, np.nan]),
    ]
    for kwargs, values in cases:
        result = df.groupby("A", observed=False).B.prod(**kwargs)
        expected = Series(values, expected_idx, name="B")
        tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_multiindex_categorical_datetime():
    """Grouping by a string categorical and a datetime categorical together."""
    # https://github.com/pandas-dev/pandas/issues/21390
    minutes = pd.date_range("2018-06-01 00", freq="1min", periods=3)

    df = DataFrame(
        {
            "key1": Categorical(list("abcbabcba")),
            "key2": Categorical(list(minutes) * 3),
            "values": np.arange(9),
        }
    )
    result = df.groupby(["key1", "key2"], observed=False).mean()

    levels = [Categorical(["a", "b", "c"]), Categorical(minutes)]
    idx = MultiIndex.from_product(levels, names=["key1", "key2"])
    # the (c, second minute) combination never occurs, hence the NaN
    expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "as_index, expected",
    [
        (
            True,
            Series(
                index=MultiIndex.from_arrays(
                    [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"]
                ),
                data=[1, 2, 3],
                name="x",
            ),
        ),
        (
            False,
            DataFrame(
                {
                    "a": Series([1, 1, 2], dtype="category"),
                    "b": [1, 2, 2],
                    "x": [1, 2, 3],
                }
            ),
        ),
    ],
)
def test_groupby_agg_observed_true_single_column(as_index, expected):
    """Summing one selected column with observed=True, for both ``as_index`` modes."""
    # GH-23970
    df = DataFrame(
        {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]}
    )

    selected = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"]
    result = selected.sum()

    tm.assert_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT])
def test_shift(fill_value):
    """``Categorical.shift`` fills the vacated slot with a missing value."""
    categories = ["a", "b", "c", "d"]
    ct = Categorical(["a", "b", "c", "d"], categories=categories, ordered=False)

    res = ct.shift(1, fill_value=fill_value)

    expected = Categorical(
        [None, "a", "b", "c"], categories=categories, ordered=False
    )
    tm.assert_equal(res, expected)
|
|
|
|
|
|
@pytest.fixture
def df_cat(df):
    """
    Short DataFrame with two categorical columns and one integer column.

    Only the first four rows of ``df`` are kept so that not every combination
    of categories is present, which makes it useful for exercising the
    `observed` keyword of GroupBy objects.

    Parameters
    ----------
    df: DataFrame
        Longer, non-categorical DataFrame supplied by another fixture.

    Returns
    -------
    df_cat: DataFrame
    """
    subset = df.copy()[:4]  # leave out some groups
    for key in ("A", "B"):
        subset[key] = subset[key].astype("category")
    subset["C"] = Series([1, 2, 3, 4])
    return subset.drop(["D"], axis=1)
|
|
|
|
|
|
@pytest.mark.parametrize("operation", ["agg", "apply"])
def test_seriesgroupby_observed_true(df_cat, operation):
    """agg/apply with the builtin ``sum`` only return observed combinations."""
    # GH#24880
    # GH#49223 - order of results was wrong when grouping by index levels
    lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
    lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
    unsorted_expected = Series(
        data=[2, 4, 1, 3], index=MultiIndex.from_arrays([lev_a, lev_b]), name="C"
    )
    expected = unsorted_expected.sort_index()

    grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
    if operation == "apply":
        msg = "using np.sum"
    else:
        msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = getattr(grouped, operation)(sum)
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("operation", ["agg", "apply"])
@pytest.mark.parametrize("observed", [False, None])
def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
    """agg/apply with the builtin ``sum`` return every category combination."""
    # GH 24880
    # GH#49223 - order of results was wrong when grouping by index levels
    levels = [
        CategoricalIndex(["bar", "foo"], ordered=False),
        CategoricalIndex(["one", "three", "two"], ordered=False),
    ]
    index, _ = MultiIndex.from_product(levels, names=["A", "B"]).sortlevel()

    expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
    if operation == "agg":
        # for agg the unobserved groups are expected to hold 0 rather than NaN
        msg = "The 'downcast' keyword in fillna is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            expected = expected.fillna(0, downcast="infer")

    grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
    if operation == "agg":
        msg = "using SeriesGroupBy.sum"
    else:
        msg = "using np.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#53425
        result = getattr(grouped, operation)(sum)
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "observed, index, data",
    [
        (
            True,
            MultiIndex.from_arrays(
                [
                    Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
                    Index(
                        ["one", "one", "three", "three", "one", "one", "two", "two"],
                        dtype="category",
                        name="B",
                    ),
                    Index(["min", "max"] * 4),
                ]
            ),
            [2, 2, 4, 4, 1, 1, 3, 3],
        ),
        (
            False,
            MultiIndex.from_product(
                [
                    CategoricalIndex(["bar", "foo"], ordered=False),
                    CategoricalIndex(["one", "three", "two"], ordered=False),
                    Index(["min", "max"]),
                ],
                names=["A", "B", None],
            ),
            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
        ),
        (
            None,
            MultiIndex.from_product(
                [
                    CategoricalIndex(["bar", "foo"], ordered=False),
                    CategoricalIndex(["one", "three", "two"], ordered=False),
                    Index(["min", "max"]),
                ],
                names=["A", "B", None],
            ),
            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
        ),
    ],
)
def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data):
    """``apply`` with a dict-returning function adds the dict keys as a level."""
    # GH 24880

    def min_max(group):
        return {"min": group.min(), "max": group.max()}

    grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
    result = grouped.apply(min_max)

    expected = Series(data=data, index=index, name="C")
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_categorical_series_dataframe_consistent(df_cat):
    """Selecting "C" before or after ``mean`` gives the same Series."""
    # GH 20416
    keys = ["A", "B"]
    expected = df_cat.groupby(keys, observed=False)["C"].mean()
    frame_means = df_cat.groupby(keys, observed=False).mean()
    result = frame_means["C"]
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])])
|
|
def test_groupby_categorical_axis_1(code):
|
|
# GH 13420
|
|
df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]})
|
|
cat = Categorical.from_codes(code, categories=list("abc"))
|
|
msg = "DataFrame.groupby with axis=1 is deprecated"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
gb = df.groupby(cat, axis=1, observed=False)
|
|
result = gb.mean()
|
|
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
gb2 = df.T.groupby(cat, axis=0, observed=False)
|
|
expected = gb2.mean().T
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_cat_preserves_structure(observed, ordered):
    """Aggregating with ``DataFrame.sum`` then ``reset_index`` round-trips the frame."""
    # GH 28787
    names = Categorical(["Bob", "Greg"], ordered=ordered)
    df = DataFrame({"Name": names, "Item": [1, 2]}, columns=["Name", "Item"])
    expected = df.copy()

    grouped = df.groupby("Name", observed=observed)
    summed = grouped.agg(DataFrame.sum, skipna=True)
    result = summed.reset_index()

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_get_nonexistent_category():
    """A bad column lookup inside ``groupby.apply`` surfaces as a KeyError."""
    # Accessing a Category that is not in the dataframe
    df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})

    def last_row_as_frame(rows):
        # "vau" is deliberately not a column of df
        return DataFrame(
            {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
        )

    with pytest.raises(KeyError, match="'vau'"):
        df.groupby("var").apply(last_row_as_frame)
|
|
|
|
|
|
def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
    """Result length is 4 with observed=True and 16 (4 x 4 categories) otherwise."""
    # GH 17605
    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")

    categories = list("ABCD")
    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=categories),
            "cat_2": Categorical(list("AB") * 2, categories=categories),
            "value": [0.1] * 4,
        }
    )
    args = get_groupby_method_args(reduction_func, df)

    series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]

    if reduction_func == "corrwith":
        # TODO: implemented SeriesGroupBy.corrwith. See GH 32293
        assert not hasattr(series_groupby, reduction_func)
        return

    agg = getattr(series_groupby, reduction_func)

    if reduction_func in ["idxmin", "idxmax"] and not observed:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            agg(*args)
        return

    expected_length = 4 if observed else 16
    result = agg(*args)
    assert len(result) == expected_length
|
|
|
|
|
|
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
    reduction_func, request
):
    """Unobserved combinations hold the expected filler (0 or NaN) per reduction."""
    # GH 17605
    # Tests whether the unobserved categories in the result contain 0 or NaN

    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")

    if reduction_func == "corrwith":  # GH 32293
        mark = pytest.mark.xfail(
            reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
        )
        request.applymarker(mark)

    categories = list("ABC")
    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=categories),
            "cat_2": Categorical(list("AB") * 2, categories=categories),
            "value": [0.1] * 4,
        }
    )
    # every (cat_1, cat_2) pair that does not occur in df
    unobserved = [("A", "C"), ("B", "C"), ("C", "A"), ("C", "B"), ("C", "C")]
    args = get_groupby_method_args(reduction_func, df)

    series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
    agg = getattr(series_groupby, reduction_func)

    if reduction_func in ["idxmin", "idxmax"]:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            agg(*args)
        return

    result = agg(*args)

    zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]

    for idx in unobserved:
        val = result.loc[idx]
        assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

    # If we expect unobserved values to be zero, we also expect the dtype to be int.
    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
    # sums have decimals), then the zeros for the missing categories should also be
    # floats.
    if zero_or_nan == 0 and reduction_func != "sum":
        assert np.issubdtype(result.dtype, np.integer)
|
|
|
|
|
|
def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func):
    """With observed=True, unobserved category pairs are absent from the index."""
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two Categorical variables,
    # does not return the categories that are not in df when observed=True
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=True)

    args = get_groupby_method_args(reduction_func, df)
    reducer = getattr(df_grp, reduction_func)
    res = reducer(*args)

    for cat in unobserved_cats:
        assert cat not in res.index
|
|
|
|
|
|
@pytest.mark.parametrize("observed", [False, None])
def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
    reduction_func, observed
):
    """With observed=False/None, unobserved pairs appear with the filler value."""
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two Categorical variables,
    # returns the categories that are not in df when observed=False/None

    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

    args = get_groupby_method_args(reduction_func, df)
    reducer = getattr(df_grp, reduction_func)

    if reduction_func in ["idxmin", "idxmax"] and not observed:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            reducer(*args)
        return

    res = reducer(*args)

    expected = _results_for_groupbys_with_missing_categories[reduction_func]

    unobserved_rows = res.loc[unobserved_cats]
    if expected is np.nan:
        assert unobserved_rows.isnull().all().all()
    else:
        assert (unobserved_rows == expected).all().all()
|
|
|
|
|
|
def test_series_groupby_categorical_aggregation_getitem():
    """Selecting "foo" before or after ``agg("mean")`` gives the same Series."""
    # GH 8870
    df = DataFrame(
        {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
    )
    df["range"] = pd.cut(df["foo"], np.linspace(0, 20, 5))

    groups = df.groupby(["range", "baz"], as_index=True, sort=True, observed=False)
    expected = groups.agg("mean")["foo"]
    result = groups["foo"].agg("mean")
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "func, expected_values",
    [(Series.nunique, [1, 1, 2]), (Series.count, [1, 2, 2])],
)
def test_groupby_agg_categorical_columns(func, expected_values):
    """``agg`` with a Series method works on a categorical value column."""
    # 31256
    raw = DataFrame(
        {
            "id": [0, 1, 2, 3, 4],
            "groups": [0, 1, 1, 2, 2],
            "value": Categorical([0, 0, 0, 0, 1]),
        }
    )
    df = raw.set_index("id")
    result = df.groupby("groups").agg(func)

    group_index = Index([0, 1, 2], name="groups")
    expected = DataFrame({"value": expected_values}, index=group_index)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_agg_non_numeric():
    """``nunique`` on a categorical column, via ``agg`` and via the method."""
    values = Categorical(["a", "a", "b"], categories=["a", "b", "c"])
    df = DataFrame({"A": values})
    expected = DataFrame({"A": [2, 1]}, index=np.array([1, 2]))

    via_agg = df.groupby([1, 2, 1]).agg(Series.nunique)
    tm.assert_frame_equal(via_agg, expected)

    via_method = df.groupby([1, 2, 1]).nunique()
    tm.assert_frame_equal(via_method, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("func", ["first", "last"])
def test_groupby_first_returned_categorical_instead_of_dataframe(func):
    """first/last on an ordered categorical column keep the group index and dtype."""
    # GH 28641: groupby drops index, when grouping over categorical column with
    # first/last. Returned Categorical instead of DataFrame previously.
    ordered_b = Series(["b"], dtype="category").cat.as_ordered()
    df = DataFrame({"A": [1997], "B": ordered_b})

    result = getattr(df.groupby("A")["B"], func)()

    # ordered categorical dtype should be preserved
    group_index = Index([1997], name="A")
    expected = Series(["b"], index=group_index, name="B", dtype=df["B"].dtype)
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_read_only_category_no_sort():
    """Grouping with sort=False works when the categories array is read-only."""
    # GH33410
    cats = np.array([1, 2])
    cats.flags.writeable = False

    grouper = Categorical([1, 1, 2, 2], categories=Index(cats))
    df = DataFrame({"a": [1, 3, 5, 7], "b": grouper})

    result = df.groupby("b", sort=False, observed=False).mean()

    expected_index = CategoricalIndex(cats, name="b")
    expected = DataFrame(data={"a": [2.0, 6.0]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_sorted_missing_category_values():
    """``size().unstack()`` keeps category order and includes the unused "tiny"."""
    # GH 28597
    size_order = ["tiny", "small", "medium", "large"]
    sizes = [
        "small",
        "large",
        "large",
        "large",
        "medium",
        "large",
        "large",
        "medium",
    ]
    df = DataFrame(
        {"foo": sizes, "bar": ["C", "A", "A", "C", "A", "C", "A", "C"]}
    )
    as_category = df["foo"].astype("category")
    df["foo"] = as_category.cat.set_categories(size_order, ordered=True)

    # counts per (bar, foo); "tiny" never occurs, so its column is all zero
    expected = DataFrame(
        {
            "tiny": {"A": 0, "C": 0},
            "small": {"A": 0, "C": 1},
            "medium": {"A": 1, "C": 1},
            "large": {"A": 3, "C": 2},
        }
    )
    expected = expected.rename_axis("bar", axis="index")
    expected.columns = CategoricalIndex(
        ["tiny", "small", "medium", "large"],
        categories=size_order,
        ordered=True,
        name="foo",
        dtype="category",
    )

    sizes_by_group = df.groupby(["bar", "foo"], observed=False).size()
    result = sizes_by_group.unstack()

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_agg_cython_category_not_implemented_fallback():
    """``first`` on an unordered categorical column, as a method and via ``agg``."""
    # https://github.com/pandas-dev/pandas/issues/31450
    df = DataFrame({"col_num": [1, 1, 2, 3]})
    df["col_cat"] = df["col_num"].astype("category")

    series_result = df.groupby("col_num").col_cat.first()

    # ordered categorical dtype should definitely be preserved;
    #  this is unordered, so is less-clear case (if anything, it should raise)
    expected_series = Series(
        [1, 2, 3],
        index=Index([1, 2, 3], name="col_num"),
        name="col_cat",
        dtype=df["col_cat"].dtype,
    )
    tm.assert_series_equal(series_result, expected_series)

    frame_result = df.groupby("col_num").agg({"col_cat": "first"})
    tm.assert_frame_equal(frame_result, expected_series.to_frame())
|
|
|
|
|
|
def test_aggregate_categorical_with_isnan():
    """Counting missing values per group works with a categorical column present."""
    # GH 29837
    words = ["foo", "bar", "foo", "fee"]
    df = DataFrame(
        {
            "A": [1, 1, 1, 1],
            "B": [1, 2, 1, 2],
            "numerical_col": [0.1, 0.2, np.nan, 0.3],
            "object_col": words,
            "categorical_col": words,
        }
    )
    df = df.astype({"categorical_col": "category"})

    def count_missing(column):
        return column.isna().sum()

    result = df.groupby(["A", "B"]).agg(count_missing)

    index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
    expected = DataFrame(
        data={
            "numerical_col": [1, 0],
            "object_col": [0, 0],
            "categorical_col": [0, 0],
        },
        index=index,
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_categorical_transform():
|
|
# GH 29037
|
|
df = DataFrame(
|
|
{
|
|
"package_id": [1, 1, 1, 2, 2, 3],
|
|
"status": [
|
|
"Waiting",
|
|
"OnTheWay",
|
|
"Delivered",
|
|
"Waiting",
|
|
"OnTheWay",
|
|
"Waiting",
|
|
],
|
|
}
|
|
)
|
|
|
|
delivery_status_type = pd.CategoricalDtype(
|
|
categories=["Waiting", "OnTheWay", "Delivered"], ordered=True
|
|
)
|
|
df["status"] = df["status"].astype(delivery_status_type)
|
|
msg = "using SeriesGroupBy.max"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
# GH#53425
|
|
df["last_status"] = df.groupby("package_id")["status"].transform(max)
|
|
result = df.copy()
|
|
|
|
expected = DataFrame(
|
|
{
|
|
"package_id": [1, 1, 1, 2, 2, 3],
|
|
"status": [
|
|
"Waiting",
|
|
"OnTheWay",
|
|
"Delivered",
|
|
"Waiting",
|
|
"OnTheWay",
|
|
"Waiting",
|
|
],
|
|
"last_status": [
|
|
"Delivered",
|
|
"Delivered",
|
|
"Delivered",
|
|
"OnTheWay",
|
|
"OnTheWay",
|
|
"Waiting",
|
|
],
|
|
}
|
|
)
|
|
|
|
expected["status"] = expected["status"].astype(delivery_status_type)
|
|
|
|
# .transform(max) should preserve ordered categoricals
|
|
expected["last_status"] = expected["last_status"].astype(delivery_status_type)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("func", ["first", "last"])
def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
    func: str, observed: bool
):
    """SeriesGroupBy first/last when grouping on two identical categoricals."""
    # GH 34951
    cat = Categorical([0, 0, 1, 1])
    df = DataFrame({"a": cat, "b": cat, "c": [0, 1, 1, 0]})

    cat2 = Categorical([0, 1])
    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
    # the (0, 1) and (1, 0) pairs are never observed
    values_by_func = {
        "first": [0, np.nan, np.nan, 1],
        "last": [1, np.nan, np.nan, 0],
    }

    expected = Series(values_by_func[func], idx, name="c")
    if observed:
        expected = expected.dropna().astype(np.int64)

    srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
    result = getattr(srs_grp, func)()
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("func", ["first", "last"])
def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
    func: str, observed: bool
):
    """DataFrameGroupBy first/last when grouping on two identical categoricals."""
    # GH 34951
    cat = Categorical([0, 0, 1, 1])
    df = DataFrame({"a": cat, "b": cat, "c": [0, 1, 1, 0]})

    cat2 = Categorical([0, 1])
    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
    # the (0, 1) and (1, 0) pairs are never observed
    values_by_func = {
        "first": [0, np.nan, np.nan, 1],
        "last": [1, np.nan, np.nan, 0],
    }

    expected = Series(values_by_func[func], idx, name="c").to_frame()
    if observed:
        expected = expected.dropna().astype(np.int64)

    df_grp = df.groupby(["a", "b"], observed=observed)
    result = getattr(df_grp, func)()
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_categorical_indices_unused_categories():
    """``indices`` has an empty entry for an unused category with observed=False."""
    # GH#38642
    key = Categorical(["b", "b", "a"], categories=["a", "b", "c"])
    df = DataFrame({"key": key, "col": range(3)})

    grouped = df.groupby("key", sort=False, observed=False)
    result = grouped.indices

    expected = {
        "b": np.array([0, 1], dtype="intp"),
        "a": np.array([2], dtype="intp"),
        "c": np.array([], dtype="intp"),
    }
    assert result.keys() == expected.keys()
    for key_label, positions in result.items():
        tm.assert_numpy_array_equal(positions, expected[key_label])
|
|
|
|
|
|
@pytest.mark.parametrize("func", ["first", "last"])
def test_groupby_last_first_preserve_categoricaldtype(func):
    """first/last on a categorical column return a categorical Series."""
    # GH#33090
    df = DataFrame({"a": [1, 2, 3]})
    df["b"] = df["a"].astype("category")

    grouped_b = df.groupby("a")["b"]
    result = getattr(grouped_b, func)()

    group_index = Index([1, 2, 3], name="a")
    expected = Series(Categorical([1, 2, 3]), name="b", index=group_index)
    tm.assert_series_equal(expected, result)
|
|
|
|
|
|
def test_groupby_categorical_observed_nunique():
    """``nunique`` with observed=True only returns the observed key pairs."""
    # GH#45128
    df = DataFrame({"a": [1, 2], "b": [1, 2], "c": [10, 11]})
    df = df.astype(dtype={"a": "category", "b": "category"})

    counts = df.groupby(["a", "b"], observed=True).nunique()
    result = counts["c"]

    expected_index = MultiIndex.from_arrays(
        [CategoricalIndex([1, 2], name="a"), CategoricalIndex([1, 2], name="b")]
    )
    expected = Series([1, 1], index=expected_index, name="c")
    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_categorical_aggregate_functions():
    """``max`` on an ordered categorical column respects the category order."""
    # GH#37275
    dtype = pd.CategoricalDtype(categories=["small", "big"], ordered=True)
    rows = [[1, "small"], [1, "big"], [2, "small"]]
    df = DataFrame(rows, columns=["grp", "description"])
    df = df.astype({"description": dtype})

    result = df.groupby("grp")["description"].max()

    expected = Series(
        ["big", "small"],
        index=Index([1, 2], name="grp"),
        name="description",
        dtype=pd.CategoricalDtype(categories=["small", "big"], ordered=True),
    )

    tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_categorical_dropna(observed, dropna):
    """``dropna`` does not change the sum when the categorical key has no NA."""
    # GH#48645 - dropna should have no impact on the result when there are no NA values
    cat = Categorical([1, 2], categories=[1, 2, 3])
    df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]})

    result = df.groupby("x", observed=observed, dropna=dropna).sum()

    if observed:
        expected = DataFrame({"y": [3, 4]}, index=cat)
    else:
        # the unobserved category 3 shows up with a sum of 0
        full_index = CategoricalIndex([1, 2, 3], [1, 2, 3])
        expected = DataFrame({"y": [3, 4, 0]}, index=full_index)
    expected.index.name = "x"

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_reducer(
    request, as_index, sort, observed, reduction_func, index_kind, ordered
):
    """Reductions leave the order of the grouper's categories untouched."""
    # GH#48749
    if reduction_func == "corrwith" and not as_index:
        msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
        request.applymarker(pytest.mark.xfail(reason=msg))
    elif index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")

    grouper = Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})

    # group by a column, by a single index level, or by two index levels
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)

    args = get_groupby_method_args(reduction_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    reducer = getattr(gb, reduction_func)

    if reduction_func in ["idxmin", "idxmax"] and not observed:
        # idxmin and idxmax are designed to fail on empty inputs
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            reducer(*args)
        return

    op_result = reducer(*args)
    expected = Index([1, 4, 3, 2])

    if as_index:
        result = op_result.index.get_level_values("a").categories
    else:
        result = op_result["a"].cat.categories
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("index_kind", ["single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_transformer(
    as_index, sort, observed, transformation_func, index_kind, ordered
):
    # GH#48749 - a transformation must leave the category order of the
    # grouping index level(s) untouched
    category_order = [1, 4, 3, 2]
    grouper = Categorical([2, 1, 2, 3], categories=category_order, ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})

    # Both index kinds group by index level(s); "multi" duplicates the key
    keys = ["a"]
    if index_kind == "multi":
        keys.append("a2")
        df["a2"] = df["a"]
    df = df.set_index(keys)

    args = get_groupby_method_args(transformation_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)

    # Only fillna is expected to emit a warning here
    if transformation_func == "fillna":
        warn = FutureWarning
    else:
        warn = None
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=msg):
        op_result = getattr(gb, transformation_func)(*args)

    expected = Index(category_order)
    result = op_result.index.get_level_values("a").categories
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_head_tail(
    as_index, sort, observed, method, index_kind, ordered
):
    # GH#48749 - head/tail must not reorder the categories of the grouping key
    category_order = [1, 4, 3, 2]
    grouper = Categorical([2, 1, 2, 3], categories=category_order, ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})

    # "range" groups by a column, "single"/"multi" group by index level(s)
    is_range = index_kind == "range"
    keys = ["a"]
    if index_kind == "multi":
        keys.append("a2")
        df["a2"] = df["a"]
    if not is_range:
        df = df.set_index(keys)

    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, method)()

    expected = Index(category_order)
    if is_range:
        # The key was never moved into the index, so inspect the column
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered):
    # GH#48749 - apply/agg/transform with a user function must keep the
    # category order of the grouping key(s)
    is_range = index_kind == "range"
    if (is_range and method == "transform") or not (is_range or as_index):
        pytest.skip("No categories in result, nothing to test")

    category_order = [1, 4, 3, 2]
    grouper = Categorical([2, 1, 2, 3], categories=category_order, ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})

    # "range" groups by a column, "single"/"multi" group by index level(s)
    keys = ["a"]
    if index_kind == "multi":
        keys.append("a2")
        df["a2"] = df["a"]
    if not is_range:
        df = df.set_index(keys)

    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)

    # A warning is expected only for apply when grouping by a column
    if method == "apply" and is_range:
        warn = DeprecationWarning
    else:
        warn = None
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(warn, match=msg):
        op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))

    expected = Index(category_order)
    if is_range and (method == "transform" or not as_index):
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    tm.assert_index_equal(result, expected)

    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_many_categories(as_index, sort, index_kind, ordered):
    # GH#48749 - Test when the grouper has many categories
    if index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")

    # 10,000 categories in descending order, of which only three are observed
    categories = np.arange(9999, -1, -1)
    grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})

    # "range" groups by a column, "single"/"multi" group by index level(s)
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)

    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=True)
    result = gb.sum()

    # Test is setup so that data and index are the same values
    data = [3, 2, 1] if sort else [2, 1, 3]

    index = CategoricalIndex(
        data, categories=grouper.categories, ordered=ordered, name="a"
    )
    if as_index:
        expected = DataFrame({"b": data})
        if index_kind == "multi":
            expected.index = MultiIndex.from_frame(DataFrame({"a": index, "a2": index}))
        else:
            expected.index = index
    else:
        # as_index=False only gets this far with index_kind == "range"; the
        # other index kinds were skipped at the top of the test
        expected = DataFrame({"a": Series(index), "b": data})

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]])
@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]])
def test_groupby_default_depr(cat_columns, keys):
    # GH#43999 - relying on the default of observed=False warns only when at
    # least one of the grouping keys is a categorical column
    df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]})
    df[cat_columns] = df[cat_columns].astype("category")
    msg = "The default of observed=False is deprecated"

    # Wrap scalar labels before building the sets: set("a") iterates over the
    # string's characters, which only equals the label while it is one
    # character long
    cat_labels = {cat_columns} if isinstance(cat_columns, str) else set(cat_columns)
    key_labels = {keys} if isinstance(keys, str) else set(keys)
    klass = FutureWarning if cat_labels & key_labels else None

    with tm.assert_produces_warning(klass, match=msg):
        df.groupby(keys)
|
|
|
|
|
|
@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_agg_list(request, as_index, observed, reduction_func, test_series, keys):
    # GH#52760 - agg with a one-element list must agree with calling the
    # reduction directly, apart from the labelling of the result columns
    if reduction_func == "corrwith":
        if test_series:
            assert not hasattr(SeriesGroupBy, "corrwith")
            pytest.skip("corrwith not implemented for SeriesGroupBy")
        else:
            msg = "GH#32293: attempts to call SeriesGroupBy.corrwith"
            request.applymarker(pytest.mark.xfail(reason=msg))
    elif (
        reduction_func == "nunique"
        and len(keys) != 1
        and not (test_series or observed or as_index)
    ):
        msg = "GH#52848 - raises a ValueError"
        request.applymarker(pytest.mark.xfail(reason=msg))

    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}).astype(
        {"a1": "category", "a2": "category"}
    )
    if "a2" not in keys:
        df = df.drop(columns="a2")

    gb = df.groupby(by=keys, as_index=as_index, observed=observed)
    if test_series:
        gb = gb["b"]
    args = get_groupby_method_args(reduction_func, df)
    funcs = [reduction_func]

    if reduction_func in ("idxmin", "idxmax") and not observed and keys == ["a1", "a2"]:
        with pytest.raises(
            ValueError, match="empty group due to unobserved categories"
        ):
            gb.agg(funcs, *args)
        return

    result = gb.agg(funcs, *args)
    expected = getattr(gb, reduction_func)(*args)

    # Relabel the direct result so that it matches the layout agg produces
    if as_index and (test_series or reduction_func == "size"):
        expected = expected.to_frame(reduction_func)
    if not test_series:
        leading = [(label, "") for label in expected.columns[:-1]]
        expected.columns = MultiIndex.from_tuples([*leading, ("b", reduction_func)])
    elif not as_index:
        expected.columns = [*keys, reduction_func]

    tm.assert_equal(result, expected)
|