""" test cython .agg behavior """ import numpy as np import pytest from pandas.core.dtypes.common import is_float_dtype import pandas as pd from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range import pandas._testing as tm from pandas.core.groupby.groupby import DataError @pytest.mark.parametrize( "op_name", [ "count", "sum", "std", "var", "sem", "mean", pytest.param( "median", # ignore mean of empty slice # and all-NaN marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")], ), "prod", "min", "max", ], ) def test_cythonized_aggers(op_name): data = { "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], "B": ["A", "B"] * 6, "C": np.random.randn(12), } df = DataFrame(data) df.loc[2:10:2, "C"] = np.nan op = lambda x: getattr(x, op_name)() # single column grouped = df.drop(["B"], axis=1).groupby("A") exp = {cat: op(group["C"]) for cat, group in grouped} exp = DataFrame({"C": exp}) exp.index.name = "A" result = op(grouped) tm.assert_frame_equal(result, exp) # multiple columns grouped = df.groupby(["A", "B"]) expd = {} for (cat1, cat2), group in grouped: expd.setdefault(cat1, {})[cat2] = op(group["C"]) exp = DataFrame(expd).T.stack(dropna=False) exp.index.names = ["A", "B"] exp.name = "C" result = op(grouped)["C"] if op_name in ["sum", "prod"]: tm.assert_series_equal(result, exp) def test_cython_agg_boolean(): frame = DataFrame( { "a": np.random.randint(0, 5, 50), "b": np.random.randint(0, 2, 50).astype("bool"), } ) result = frame.groupby("a")["b"].mean() expected = frame.groupby("a")["b"].agg(np.mean) tm.assert_series_equal(result, expected) def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): frame.groupby("a")["b"].mean() frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) with pytest.raises(DataError, match=msg): frame[["b"]].groupby(frame["a"]).mean() def test_cython_agg_nothing_to_agg_with_dates(): frame = DataFrame( { "a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25, "dates": pd.date_range("now", periods=50, freq="T"), } ) msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): frame.groupby("b").dates.mean() def test_cython_agg_frame_columns(): # #2113 df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) df.groupby(level=0, axis="columns").mean() df.groupby(level=0, axis="columns").mean() df.groupby(level=0, axis="columns").mean() df.groupby(level=0, axis="columns").mean() def test_cython_agg_return_dict(): # GH 16741 df = DataFrame( { "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], "B": ["one", "one", "two", "three", "two", "two", "one", "three"], "C": np.random.randn(8), "D": np.random.randn(8), } ) ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict()) expected = Series( [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}], index=Index(["bar", "foo"], name="A"), name="B", ) tm.assert_series_equal(ts, expected) def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() expected = grouped.agg(np.sum) tm.assert_series_equal(summed, expected) @pytest.mark.parametrize( "op, targop", [ ("mean", np.mean), ("median", np.median), ("var", np.var), ("add", np.sum), ("prod", np.prod), ("min", np.min), ("max", np.max), ("first", lambda x: x.iloc[0]), ("last", lambda x: x.iloc[-1]), ], ) def test__cython_agg_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) result = df.groupby(labels)._cython_agg_general(op) expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "op, targop", [ ("mean", np.mean), ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), ("var", lambda x: np.var(x, ddof=1)), ("min", np.min), ("max", np.max), ], ) def test_cython_agg_empty_buckets(op, targop, observed): df = DataFrame([11, 12, 13]) grps = range(0, 55, 5) # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. g = df.groupby(pd.cut(df[0], grps), observed=observed) result = g._cython_agg_general(op) g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) tm.assert_frame_equal(result, expected) def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = DataFrame([11, 12, 13], columns=["a"]) grps = range(0, 25, 5) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "add" ) intervals = pd.interval_range(0, 20, freq=5) expected = DataFrame( {"a": [0, 0, 36, 0]}, index=pd.CategoricalIndex(intervals, name="a", ordered=True), ) if observed: expected = expected[expected.a != 0] tm.assert_frame_equal(result, expected) # prod result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "prod" ) expected = DataFrame( {"a": [1, 1, 1716, 1]}, index=pd.CategoricalIndex(intervals, name="a", ordered=True), ) if observed: expected = expected[expected.a != 1] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("op", ["first", "last", "max", "min"]) @pytest.mark.parametrize( "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")] ) def test_cython_with_timestamp_and_nat(op, data): # https://github.com/pandas-dev/pandas/issues/19526 df = DataFrame({"a": [0, 1], "b": [data, NaT]}) index = Index([0, 1], name="a") # We will group by a and test the cython aggregations expected = DataFrame({"b": [data, NaT]}, index=index) result = df.groupby("a").aggregate(op) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "agg", [ "min", "max", "count", "sum", "prod", "var", "mean", "median", "ohlc", "cumprod", "cumsum", "shift", "any", "all", "quantile", "first", "last", "rank", "cummin", "cummax", ], ) def test_read_only_buffer_source_agg(agg): # https://github.com/pandas-dev/pandas/issues/36014 df = DataFrame( { "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0], "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], } ) df._mgr.blocks[0].values.flags.writeable = False result = df.groupby(["species"]).agg({"sepal_length": agg}) expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) tm.assert_equal(result, expected) @pytest.mark.parametrize( "op_name", [ "count", "sum", "std", "var", "sem", "mean", "median", "prod", "min", "max", ], ) def test_cython_agg_nullable_int(op_name): # ensure that the cython-based aggregations don't fail for nullable dtype # (eg https://github.com/pandas-dev/pandas/issues/37415) df = DataFrame( { "A": ["A", "B"] * 5, "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"), } ) result = getattr(df.groupby("A")["B"], op_name)() df2 = df.assign(B=df["B"].astype("float64")) expected = getattr(df2.groupby("A")["B"], op_name)() if op_name != "count": # the result is not yet consistently using Int64/Float64 dtype, # so for now just checking the values by casting to float result = result.astype("float64") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("with_na", [True, False]) @pytest.mark.parametrize( "op_name, action", [ # ("count", "always_int"), ("sum", "large_int"), # ("std", "always_float"), ("var", "always_float"), # ("sem", "always_float"), ("mean", "always_float"), ("median", "always_float"), ("prod", "large_int"), ("min", "preserve"), ("max", "preserve"), ("first", "preserve"), ("last", "preserve"), ], ) @pytest.mark.parametrize( "data", [ pd.array([1, 2, 3, 4], dtype="Int64"), pd.array([1, 2, 3, 4], dtype="Int8"), pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"), pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"), pd.array([True, True, False, False], dtype="boolean"), ], ) def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na): if with_na: data[3] = pd.NA df = DataFrame({"key": ["a", "a", "b", "b"], "col": data}) grouped = df.groupby("key") if action == "always_int": # always Int64 expected_dtype = pd.Int64Dtype() elif action == "large_int": # for any int/bool use Int64, for float preserve dtype if is_float_dtype(data.dtype): expected_dtype = data.dtype else: expected_dtype = pd.Int64Dtype() elif action == "always_float": # for any int/bool use Float64, for float preserve dtype if is_float_dtype(data.dtype): expected_dtype = data.dtype else: expected_dtype = pd.Float64Dtype() elif action == "preserve": expected_dtype = data.dtype result = getattr(grouped, op_name)() assert result["col"].dtype == expected_dtype result = grouped.aggregate(op_name) assert result["col"].dtype == expected_dtype result = getattr(grouped["col"], op_name)() assert result.dtype == expected_dtype result = grouped["col"].aggregate(op_name) assert result.dtype == expected_dtype