import numpy as np import pytest import pandas as pd from pandas import DataFrame, Series, Timestamp import pandas._testing as tm def test_filter_series(): s = Series([1, 3, 20, 5, 22, 24, 7]) expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6]) expected_even = Series([20, 22, 24], index=[2, 4, 5]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) # Test dropna=False. tm.assert_series_equal( grouped.filter(lambda x: x.mean() < 10, dropna=False), expected_odd.reindex(s.index), ) tm.assert_series_equal( grouped.filter(lambda x: x.mean() > 10, dropna=False), expected_even.reindex(s.index), ) def test_filter_single_column_df(): df = DataFrame([1, 3, 20, 5, 22, 24, 7]) expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) expected_even = DataFrame([20, 22, 24], index=[2, 4, 5]) grouper = df[0].apply(lambda x: x % 2) grouped = df.groupby(grouper) tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) # Test dropna=False. tm.assert_frame_equal( grouped.filter(lambda x: x.mean() < 10, dropna=False), expected_odd.reindex(df.index), ) tm.assert_frame_equal( grouped.filter(lambda x: x.mean() > 10, dropna=False), expected_even.reindex(df.index), ) def test_filter_multi_column_df(): df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) tm.assert_frame_equal( grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected ) def test_filter_mixed_df(): df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected) def test_filter_out_all_groups(): s = Series([1, 3, 20, 5, 22, 24, 7]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]]) def test_filter_out_no_groups(): s = Series([1, 3, 20, 5, 22, 24, 7]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) filtered = grouped.filter(lambda x: x["A"].mean() > 0) tm.assert_frame_equal(filtered, df) def test_filter_out_all_groups_in_df(): # GH12768 df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=False) expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) expected = DataFrame({"a": [], "b": []}, dtype="int64") tm.assert_frame_equal(expected, res) def test_filter_condition_raises(): def raise_if_sum_is_zero(x): if x.sum() == 0: raise ValueError else: return x.sum() > 0 s = Series([-1, 0, 1, 2]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) msg = "the filter must return a boolean result" with pytest.raises(TypeError, match=msg): grouped.filter(raise_if_sum_is_zero) def test_filter_with_axis_in_groupby(): # issue 11041 index = pd.MultiIndex.from_product([range(10), [0, 1]]) data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10) expected = data.iloc[:, 12:20] tm.assert_frame_equal(result, expected) def test_filter_bad_shapes(): df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) s = df["B"] g_df = df.groupby("B") g_s = s.groupby(s) f = lambda x: x msg = "filter function returned a DataFrame, but expected a scalar bool" with pytest.raises(TypeError, match=msg): g_df.filter(f) msg = "the filter must return a boolean result" with pytest.raises(TypeError, match=msg): g_s.filter(f) f = lambda x: x == 1 msg = "filter function returned a DataFrame, but expected a scalar bool" with pytest.raises(TypeError, match=msg): g_df.filter(f) msg = "the filter must return a boolean result" with pytest.raises(TypeError, match=msg): g_s.filter(f) f = lambda x: np.outer(x, x) msg = "can't multiply sequence by non-int of type 'str'" with pytest.raises(TypeError, match=msg): g_df.filter(f) msg = "the filter must return a boolean result" with pytest.raises(TypeError, match=msg): g_s.filter(f) def test_filter_nan_is_false(): df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) s = df["B"] g_df = df.groupby(df["B"]) g_s = s.groupby(s) f = lambda x: np.nan tm.assert_frame_equal(g_df.filter(f), df.loc[[]]) tm.assert_series_equal(g_s.filter(f), s[[]]) def test_filter_against_workaround(): np.random.seed(0) # Series of ints s = Series(np.random.randint(0, 100, 1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype("bool")] new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Series of floats s = 100 * Series(np.random.random(1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype("bool")] new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Set up DataFrame of ints, floats, strings. from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) N = 1000 random_letters = letters.take(np.random.randint(0, 26, N)) df = DataFrame( { "ints": Series(np.random.randint(0, 100, N)), "floats": N / 10 * Series(np.random.random(N)), "letters": Series(random_letters), } ) # Group by ints; filter on floats. grouped = df.groupby("ints") old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] new_way = grouped.filter(lambda x: len(x.letters) < N / 10) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. grouped = df.groupby("letters") old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): # BUG GH4447 df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) expected = DataFrame( {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, index=np.arange(2, 6), ) tm.assert_frame_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) expected = df.loc[[]] tm.assert_frame_equal(actual, expected) # Series have always worked properly, but we'll test anyway. s = df["B"] grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) expected = Series(4 * ["b"], index=np.arange(2, 6), name="B") tm.assert_series_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) expected = s[[]] tm.assert_series_equal(actual, expected) def test_filter_maintains_ordering(): # Simple case: index is sequential. #4621 df = DataFrame( {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} ) s = df["pid"] grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) # Now index is sequentially decreasing. df.index = np.arange(len(df) - 1, -1, -1) s = df["pid"] grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) # Index is shuffled. SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] df.index = df.index[SHUFFLED] s = df["pid"] grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) def test_filter_multiple_timestamp(): # GH 10114 df = DataFrame( { "A": np.arange(5, dtype="int64"), "B": ["foo", "bar", "foo", "bar", "bar"], "C": Timestamp("20130101"), } ) grouped = df.groupby(["B", "C"]) result = grouped["A"].filter(lambda x: True) tm.assert_series_equal(df["A"], result) result = grouped["A"].transform(len) expected = Series([2, 3, 2, 3, 3], name="A") tm.assert_series_equal(result, expected) result = grouped.filter(lambda x: True) tm.assert_frame_equal(df, result) result = grouped.transform("sum") expected = DataFrame({"A": [2, 8, 2, 8, 8]}) tm.assert_frame_equal(result, expected) result = grouped.transform(len) expected = DataFrame({"A": [2, 3, 2, 3, 3]}) tm.assert_frame_equal(result, expected) def test_filter_and_transform_with_non_unique_int_index(): # GH4620 index = [1, 1, 1, 2, 1, 1, 0, 1] df = DataFrame( {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, index=index, ) grouped_df = df.groupby("tag") ser = df["pid"] grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] tm.assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan tm.assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) tm.assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) tm.assert_series_equal(actual, expected) def test_filter_and_transform_with_multiple_non_unique_int_index(): # GH4620 index = [1, 1, 1, 2, 0, 0, 0, 1] df = DataFrame( {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, index=index, ) grouped_df = df.groupby("tag") ser = df["pid"] grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] tm.assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan tm.assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) tm.assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) tm.assert_series_equal(actual, expected) def test_filter_and_transform_with_non_unique_float_index(): # GH4620 index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) df = DataFrame( {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, index=index, ) grouped_df = df.groupby("tag") ser = df["pid"] grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] tm.assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan tm.assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) tm.assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) tm.assert_series_equal(actual, expected) def test_filter_and_transform_with_non_unique_timestamp_index(): # GH4620 t0 = Timestamp("2013-09-30 00:05:00") t1 = Timestamp("2013-10-30 00:05:00") t2 = Timestamp("2013-11-30 00:05:00") index = [t1, t1, t1, t2, t1, t1, t0, t1] df = DataFrame( {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, index=index, ) grouped_df = df.groupby("tag") ser = df["pid"] grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] tm.assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan tm.assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) tm.assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) tm.assert_series_equal(actual, expected) def test_filter_and_transform_with_non_unique_string_index(): # GH4620 index = list("bbbcbbab") df = DataFrame( {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, index=index, ) grouped_df = df.groupby("tag") ser = df["pid"] grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame actual = grouped_df.filter(lambda x: len(x) > 1) expected = df.iloc[expected_indexes] tm.assert_frame_equal(actual, expected) actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) expected = df.copy() expected.iloc[[0, 3, 5, 6]] = np.nan tm.assert_frame_equal(actual, expected) # Filter Series actual = grouped_ser.filter(lambda x: len(x) > 1) expected = ser.take(expected_indexes) tm.assert_series_equal(actual, expected) actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy actual = grouped_df.pid.transform(len) tm.assert_series_equal(actual, expected) def test_filter_has_access_to_grouped_cols(): df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"]) g = df.groupby("A") # previously didn't have access to col A #???? filt = g.filter(lambda x: x["A"].sum() == 2) tm.assert_frame_equal(filt, df.iloc[[0, 1]]) def test_filter_enforces_scalarness(): df = DataFrame( [ ["best", "a", "x"], ["worst", "b", "y"], ["best", "c", "x"], ["best", "d", "y"], ["worst", "d", "y"], ["worst", "d", "y"], ["best", "d", "z"], ], columns=["a", "b", "c"], ) with pytest.raises(TypeError, match="filter function returned a.*"): df.groupby("c").filter(lambda g: g["a"] == "best") def test_filter_non_bool_raises(): df = DataFrame( [ ["best", "a", 1], ["worst", "b", 1], ["best", "c", 1], ["best", "d", 1], ["worst", "d", 1], ["worst", "d", 1], ["best", "d", 1], ], columns=["a", "b", "c"], ) with pytest.raises(TypeError, match="filter function returned a.*"): df.groupby("a").filter(lambda g: g.c.mean()) def test_filter_dropna_with_empty_groups(): # GH 10780 data = Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) groupped = data.groupby(level=0) result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) tm.assert_series_equal(result_false, expected_false) result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64) tm.assert_series_equal(result_true, expected_true)