import pytest from pandas.core.dtypes.common import ( is_bool_dtype, is_numeric_dtype, is_object_dtype, is_string_dtype, ) import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests class BaseGroupbyTests(BaseExtensionTests): """Groupby-specific tests.""" def test_grouping_grouper(self, data_for_grouping): df = pd.DataFrame( {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} ) gr1 = df.groupby("A").grouper.groupings[0] gr2 = df.groupby("B").grouper.groupings[0] tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values) tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() _, uniques = pd.factorize(data_for_grouping, sort=True) if as_index: index = pd.Index(uniques, name="B") expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A") self.assert_series_equal(result, expected) else: expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]}) self.assert_frame_equal(result, expected) def test_groupby_agg_extension(self, data_for_grouping): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) expected = df.iloc[[0, 2, 4, 7]] expected = expected.set_index("A") result = df.groupby("A").agg({"B": "first"}) self.assert_frame_equal(result, expected) result = df.groupby("A").agg("first") self.assert_frame_equal(result, expected) result = df.groupby("A").first() self.assert_frame_equal(result, expected) def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation(self): # GH#41720 expected = pd.DataFrame( { "td": { 0: pd.Timedelta("0 days 01:00:00"), 1: pd.Timedelta("0 days 01:15:00"), 2: pd.Timedelta("0 days 01:15:00"), } } ) df = pd.DataFrame( { "td": pd.Series( ["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"], dtype="timedelta64[ns]", ), "grps": ["a", "a", "b"], } ) gb = df.groupby("grps") result = gb.agg(td=("td", "cumsum")) self.assert_frame_equal(result, expected) def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) index = pd.Index(index, name="B") expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A") self.assert_series_equal(result, expected) def test_groupby_extension_transform(self, data_for_grouping): valid = data_for_grouping[~data_for_grouping.isna()] df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid}) result = df.groupby("B").A.transform(len) expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") self.assert_series_equal(result, expected) def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) df.groupby("B", group_keys=False).apply(groupby_apply_op) df.groupby("B", group_keys=False).A.apply(groupby_apply_op) df.groupby("A", group_keys=False).apply(groupby_apply_op) df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("A").B.apply(lambda x: x.array) expected = pd.Series( [ df.B.iloc[[0, 1, 6]].array, df.B.iloc[[2, 3]].array, df.B.iloc[[4, 5]].array, df.B.iloc[[7]].array, ], index=pd.Index([1, 2, 3, 4], name="A"), name="B", ) self.assert_series_equal(result, expected) def test_in_numeric_groupby(self, data_for_grouping): df = pd.DataFrame( { "A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping, "C": [1, 1, 1, 1, 1, 1, 1, 1], } ) dtype = data_for_grouping.dtype if ( is_numeric_dtype(dtype) or is_bool_dtype(dtype) or dtype.name == "decimal" or is_string_dtype(dtype) or is_object_dtype(dtype) or dtype.kind == "m" # in particular duration[*][pyarrow] ): expected = pd.Index(["B", "C"]) result = df.groupby("A").sum().columns else: expected = pd.Index(["C"]) with pytest.raises(TypeError, match="does not support"): df.groupby("A").sum().columns result = df.groupby("A").sum(numeric_only=True).columns tm.assert_index_equal(result, expected)