import numpy as np import pytest import pandas as pd import pandas._testing as tm @pytest.mark.parametrize( "dropna, tuples, outputs", [ ( True, [["A", "B"], ["B", "A"]], {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, ), ( False, [["A", "B"], ["A", np.nan], ["B", "A"]], { "c": [13.0, 12.3, 123.23], "d": [13.0, 233.0, 123.0], "e": [13.0, 12.0, 1.0], }, ), ], ) def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( dropna, tuples, outputs, nulls_fixture ): # GH 3729 this is to test that NA is in one group df_list = [ ["A", "B", 12, 12, 12], ["A", nulls_fixture, 12.3, 233.0, 12], ["B", "A", 123.23, 123, 1], ["A", "B", 1, 1, 1.0], ] df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) grouped = df.groupby(["a", "b"], dropna=dropna).sum() mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) # Since right now, by default MI will drop NA from levels when we create MI # via `from_*`, so we need to add NA for level manually afterwards. if not dropna: mi = mi.set_levels(["A", "B", np.nan], level="b") expected = pd.DataFrame(outputs, index=mi) tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( "dropna, tuples, outputs", [ ( True, [["A", "B"], ["B", "A"]], {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]}, ), ( False, [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]], { "c": [12.0, 13.3, 123.23, 1.0], "d": [12.0, 234.0, 123.0, 1.0], "e": [12.0, 13.0, 1.0, 1.0], }, ), ], ) def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( dropna, tuples, outputs, nulls_fixture, nulls_fixture2 ): # GH 3729 this is to test that NA in different groups with different representations df_list = [ ["A", "B", 12, 12, 12], ["A", nulls_fixture, 12.3, 233.0, 12], ["B", "A", 123.23, 123, 1], [nulls_fixture2, "B", 1, 1, 1.0], ["A", nulls_fixture2, 1, 1, 1.0], ] df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) grouped = df.groupby(["a", "b"], dropna=dropna).sum() mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) # Since right now, by default MI will drop NA from levels when we create MI # via `from_*`, so we need to add NA for level manually afterwards. if not dropna: mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]]) expected = pd.DataFrame(outputs, index=mi) tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( "dropna, idx, outputs", [ (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), ( False, ["A", "B", np.nan], { "b": [123.23, 13.0, 12.3], "c": [123.0, 13.0, 233.0], "d": [1.0, 13.0, 12.0], }, ), ], ) def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): # GH 3729 df_list = [ ["B", 12, 12, 12], [None, 12.3, 233.0, 12], ["A", 123.23, 123, 1], ["B", 1, 1, 1.0], ] df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( "dropna, idx, expected", [ (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), ( False, ["a", "a", "b", np.nan], pd.Series([3, 3, 3], index=["a", "b", np.nan]), ), ], ) def test_groupby_dropna_series_level(dropna, idx, expected): ser = pd.Series([1, 2, 3, 3], index=idx) result = ser.groupby(level=0, dropna=dropna).sum() tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "dropna, expected", [ (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")), ( False, pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"), ), ], ) def test_groupby_dropna_series_by(dropna, expected): ser = pd.Series( [390.0, 350.0, 30.0, 20.0], index=["Falcon", "Falcon", "Parrot", "Parrot"], name="Max Speed", ) result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dropna", (False, True)) def test_grouper_dropna_propagation(dropna): # GH 36604 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) assert gb.grouper.dropna == dropna @pytest.mark.parametrize( "dropna,df_expected,s_expected", [ pytest.param( True, pd.DataFrame({"B": [2, 2, 1]}), pd.Series(data=[2, 2, 1], name="B"), marks=pytest.mark.xfail(raises=ValueError), ), ( False, pd.DataFrame({"B": [2, 2, 1, 1]}), pd.Series(data=[2, 2, 1, 1], name="B"), ), ], ) def test_slice_groupby_then_transform(dropna, df_expected, s_expected): # GH35014 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) res = gb.transform(len) tm.assert_frame_equal(res, df_expected) gb_slice = gb[["B"]] res = gb_slice.transform(len) tm.assert_frame_equal(res, df_expected) res = gb["B"].transform(len) tm.assert_series_equal(res, s_expected) @pytest.mark.parametrize( "dropna, tuples, outputs", [ ( True, [["A", "B"], ["B", "A"]], {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, ), ( False, [["A", "B"], ["A", np.nan], ["B", "A"]], { "c": [13.0, 12.3, 123.23], "d": [12.0, 233.0, 123.0], "e": [1.0, 12.0, 1.0], }, ), ], ) def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): # GH 3729 df_list = [ ["A", "B", 12, 12, 12], ["A", None, 12.3, 233.0, 12], ["B", "A", 123.23, 123, 1], ["A", "B", 1, 1, 1.0], ] df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) agg_dict = {"c": sum, "d": max, "e": "min"} grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) # Since right now, by default MI will drop NA from levels when we create MI # via `from_*`, so we need to add NA for level manually afterwards. if not dropna: mi = mi.set_levels(["A", "B", np.nan], level="b") expected = pd.DataFrame(outputs, index=mi) tm.assert_frame_equal(grouped, expected) @pytest.mark.arm_slow @pytest.mark.parametrize( "datetime1, datetime2", [ (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")), (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")), (pd.Period("2020-01-01"), pd.Period("2020-02-01")), ], ) @pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])]) def test_groupby_dropna_datetime_like_data( dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2 ): # 3729 df = pd.DataFrame( { "values": [1, 2, 3, 4, 5, 6], "dt": [ datetime1, unique_nulls_fixture, datetime2, unique_nulls_fixture2, datetime1, datetime1, ], } ) if dropna: indexes = [datetime1, datetime2] else: indexes = [datetime1, datetime2, np.nan] grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( "dropna, data, selected_data, levels", [ pytest.param( False, {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, ["a", "b", np.nan], id="dropna_false_has_nan", ), pytest.param( True, {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0]}, None, id="dropna_true_has_nan", ), pytest.param( # no nan in "groups"; dropna=True|False should be same. False, {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, None, id="dropna_false_no_nan", ), pytest.param( # no nan in "groups"; dropna=True|False should be same. True, {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, None, id="dropna_true_no_nan", ), ], ) def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels): # GH 35889 df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) # Since right now, by default MI will drop NA from levels when we create MI # via `from_*`, so we need to add NA for level manually afterwards. if not dropna and levels: mi = mi.set_levels(levels, level="groups") expected = pd.DataFrame(selected_data, index=mi) tm.assert_frame_equal(result, expected) def test_groupby_nan_included(): # GH 35646 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} df = pd.DataFrame(data) grouped = df.groupby("group", dropna=False) result = grouped.indices dtype = np.intp expected = { "g1": np.array([0, 2], dtype=dtype), "g2": np.array([3], dtype=dtype), np.nan: np.array([1, 4], dtype=dtype), } for result_values, expected_values in zip(result.values(), expected.values()): tm.assert_numpy_array_equal(result_values, expected_values) assert np.isnan(list(result.keys())[2]) assert list(result.keys())[0:2] == ["g1", "g2"]