projektAI/venv/Lib/site-packages/pandas/tests/groupby/test_filters.py

598 lines
20 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame, Series, Timestamp
import pandas._testing as tm
def test_filter_series():
s = Series([1, 3, 20, 5, 22, 24, 7])
expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
expected_even = Series([20, 22, 24], index=[2, 4, 5])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
# Test dropna=False.
tm.assert_series_equal(
grouped.filter(lambda x: x.mean() < 10, dropna=False),
expected_odd.reindex(s.index),
)
tm.assert_series_equal(
grouped.filter(lambda x: x.mean() > 10, dropna=False),
expected_even.reindex(s.index),
)
def test_filter_single_column_df():
df = DataFrame([1, 3, 20, 5, 22, 24, 7])
expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
grouper = df[0].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
# Test dropna=False.
tm.assert_frame_equal(
grouped.filter(lambda x: x.mean() < 10, dropna=False),
expected_odd.reindex(df.index),
)
tm.assert_frame_equal(
grouped.filter(lambda x: x.mean() > 10, dropna=False),
expected_even.reindex(df.index),
)
def test_filter_multi_column_df():
df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
tm.assert_frame_equal(
grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
)
def test_filter_mixed_df():
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)
def test_filter_out_all_groups():
s = Series([1, 3, 20, 5, 22, 24, 7])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])
def test_filter_out_no_groups():
s = Series([1, 3, 20, 5, 22, 24, 7])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
filtered = grouped.filter(lambda x: x.mean() > 0)
tm.assert_series_equal(filtered, s)
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
filtered = grouped.filter(lambda x: x["A"].mean() > 0)
tm.assert_frame_equal(filtered, df)
def test_filter_out_all_groups_in_df():
# GH12768
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=False)
expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
tm.assert_frame_equal(expected, res)
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
expected = DataFrame({"a": [], "b": []}, dtype="int64")
tm.assert_frame_equal(expected, res)
def test_filter_condition_raises():
def raise_if_sum_is_zero(x):
if x.sum() == 0:
raise ValueError
else:
return x.sum() > 0
s = Series([-1, 0, 1, 2])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
grouped.filter(raise_if_sum_is_zero)
def test_filter_with_axis_in_groupby():
# issue 11041
index = pd.MultiIndex.from_product([range(10), [0, 1]])
data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64")
result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10)
expected = data.iloc[:, 12:20]
tm.assert_frame_equal(result, expected)
def test_filter_bad_shapes():
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
s = df["B"]
g_df = df.groupby("B")
g_s = s.groupby(s)
f = lambda x: x
msg = "filter function returned a DataFrame, but expected a scalar bool"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
f = lambda x: x == 1
msg = "filter function returned a DataFrame, but expected a scalar bool"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
f = lambda x: np.outer(x, x)
msg = "can't multiply sequence by non-int of type 'str'"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
def test_filter_nan_is_false():
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
s = df["B"]
g_df = df.groupby(df["B"])
g_s = s.groupby(s)
f = lambda x: np.nan
tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
tm.assert_series_equal(g_s.filter(f), s[[]])
def test_filter_against_workaround():
np.random.seed(0)
# Series of ints
s = Series(np.random.randint(0, 100, 1000))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
old_way = s[grouped.transform(f).astype("bool")]
new_way = grouped.filter(f)
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
# Series of floats
s = 100 * Series(np.random.random(1000))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
old_way = s[grouped.transform(f).astype("bool")]
new_way = grouped.filter(f)
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
# Set up DataFrame of ints, floats, strings.
from string import ascii_lowercase
letters = np.array(list(ascii_lowercase))
N = 1000
random_letters = letters.take(np.random.randint(0, 26, N))
df = DataFrame(
{
"ints": Series(np.random.randint(0, 100, N)),
"floats": N / 10 * Series(np.random.random(N)),
"letters": Series(random_letters),
}
)
# Group by ints; filter on floats.
grouped = df.groupby("ints")
old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
tm.assert_frame_equal(new_way, old_way)
# Group by floats (rounded); filter on strings.
grouper = df.floats.apply(lambda x: np.round(x, -1))
grouped = df.groupby(grouper)
old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
tm.assert_frame_equal(new_way, old_way)
# Group by strings; filter on ints.
grouped = df.groupby("letters")
old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
tm.assert_frame_equal(new_way, old_way)
def test_filter_using_len():
# BUG GH4447
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
grouped = df.groupby("B")
actual = grouped.filter(lambda x: len(x) > 2)
expected = DataFrame(
{"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
index=np.arange(2, 6),
)
tm.assert_frame_equal(actual, expected)
actual = grouped.filter(lambda x: len(x) > 4)
expected = df.loc[[]]
tm.assert_frame_equal(actual, expected)
# Series have always worked properly, but we'll test anyway.
s = df["B"]
grouped = s.groupby(s)
actual = grouped.filter(lambda x: len(x) > 2)
expected = Series(4 * ["b"], index=np.arange(2, 6), name="B")
tm.assert_series_equal(actual, expected)
actual = grouped.filter(lambda x: len(x) > 4)
expected = s[[]]
tm.assert_series_equal(actual, expected)
def test_filter_maintains_ordering():
# Simple case: index is sequential. #4621
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
)
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
# Now index is sequentially decreasing.
df.index = np.arange(len(df) - 1, -1, -1)
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
# Index is shuffled.
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
df.index = df.index[SHUFFLED]
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
def test_filter_multiple_timestamp():
# GH 10114
df = DataFrame(
{
"A": np.arange(5, dtype="int64"),
"B": ["foo", "bar", "foo", "bar", "bar"],
"C": Timestamp("20130101"),
}
)
grouped = df.groupby(["B", "C"])
result = grouped["A"].filter(lambda x: True)
tm.assert_series_equal(df["A"], result)
result = grouped["A"].transform(len)
expected = Series([2, 3, 2, 3, 3], name="A")
tm.assert_series_equal(result, expected)
result = grouped.filter(lambda x: True)
tm.assert_frame_equal(df, result)
result = grouped.transform("sum")
expected = DataFrame({"A": [2, 8, 2, 8, 8]})
tm.assert_frame_equal(result, expected)
result = grouped.transform(len)
expected = DataFrame({"A": [2, 3, 2, 3, 3]})
tm.assert_frame_equal(result, expected)
def test_filter_and_transform_with_non_unique_int_index():
# GH4620
index = [1, 1, 1, 2, 1, 1, 0, 1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
expected = df.copy()
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_multiple_non_unique_int_index():
# GH4620
index = [1, 1, 1, 2, 0, 0, 0, 1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
expected = df.copy()
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_float_index():
# GH4620
index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
expected = df.copy()
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_timestamp_index():
# GH4620
t0 = Timestamp("2013-09-30 00:05:00")
t1 = Timestamp("2013-10-30 00:05:00")
t2 = Timestamp("2013-11-30 00:05:00")
index = [t1, t1, t1, t2, t1, t1, t0, t1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
expected = df.copy()
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_string_index():
# GH4620
index = list("bbbcbbab")
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
expected = df.copy()
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
NA = np.nan
expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_has_access_to_grouped_cols():
df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
g = df.groupby("A")
# previously didn't have access to col A #????
filt = g.filter(lambda x: x["A"].sum() == 2)
tm.assert_frame_equal(filt, df.iloc[[0, 1]])
def test_filter_enforces_scalarness():
df = DataFrame(
[
["best", "a", "x"],
["worst", "b", "y"],
["best", "c", "x"],
["best", "d", "y"],
["worst", "d", "y"],
["worst", "d", "y"],
["best", "d", "z"],
],
columns=["a", "b", "c"],
)
with pytest.raises(TypeError, match="filter function returned a.*"):
df.groupby("c").filter(lambda g: g["a"] == "best")
def test_filter_non_bool_raises():
df = DataFrame(
[
["best", "a", 1],
["worst", "b", 1],
["best", "c", 1],
["best", "d", 1],
["worst", "d", 1],
["worst", "d", 1],
["best", "d", 1],
],
columns=["a", "b", "c"],
)
with pytest.raises(TypeError, match="filter function returned a.*"):
df.groupby("a").filter(lambda g: g.c.mean())
def test_filter_dropna_with_empty_groups():
# GH 10780
data = Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
groupped = data.groupby(level=0)
result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False)
expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
tm.assert_series_equal(result_false, expected_false)
result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True)
expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
tm.assert_series_equal(result_true, expected_true)