444 lines
10 KiB
Python
444 lines
10 KiB
Python
"""
|
|
test methods relating to generic function evaluation
|
|
the so-called white/black lists
|
|
"""
|
|
|
|
from string import ascii_lowercase
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas import DataFrame, Index, MultiIndex, Series, date_range
|
|
import pandas._testing as tm
|
|
from pandas.core.groupby.base import (
|
|
groupby_other_methods,
|
|
reduction_kernels,
|
|
transformation_kernels,
|
|
)
|
|
|
|
AGG_FUNCTIONS = [
|
|
"sum",
|
|
"prod",
|
|
"min",
|
|
"max",
|
|
"median",
|
|
"mean",
|
|
"skew",
|
|
"mad",
|
|
"std",
|
|
"var",
|
|
"sem",
|
|
]
|
|
AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"]
|
|
|
|
df_allowlist = [
|
|
"quantile",
|
|
"fillna",
|
|
"mad",
|
|
"take",
|
|
"idxmax",
|
|
"idxmin",
|
|
"tshift",
|
|
"skew",
|
|
"plot",
|
|
"hist",
|
|
"dtypes",
|
|
"corrwith",
|
|
"corr",
|
|
"cov",
|
|
"diff",
|
|
]
|
|
|
|
|
|
@pytest.fixture(params=df_allowlist)
|
|
def df_allowlist_fixture(request):
|
|
return request.param
|
|
|
|
|
|
s_allowlist = [
|
|
"quantile",
|
|
"fillna",
|
|
"mad",
|
|
"take",
|
|
"idxmax",
|
|
"idxmin",
|
|
"tshift",
|
|
"skew",
|
|
"plot",
|
|
"hist",
|
|
"dtype",
|
|
"corr",
|
|
"cov",
|
|
"diff",
|
|
"unique",
|
|
"nlargest",
|
|
"nsmallest",
|
|
"is_monotonic_increasing",
|
|
"is_monotonic_decreasing",
|
|
]
|
|
|
|
|
|
@pytest.fixture(params=s_allowlist)
|
|
def s_allowlist_fixture(request):
|
|
return request.param
|
|
|
|
|
|
@pytest.fixture
|
|
def mframe():
|
|
index = MultiIndex(
|
|
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
|
|
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
|
|
names=["first", "second"],
|
|
)
|
|
return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
|
|
|
|
|
|
@pytest.fixture
|
|
def df():
|
|
return DataFrame(
|
|
{
|
|
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
|
|
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
|
|
"C": np.random.randn(8),
|
|
"D": np.random.randn(8),
|
|
}
|
|
)
|
|
|
|
|
|
@pytest.fixture
|
|
def df_letters():
|
|
letters = np.array(list(ascii_lowercase))
|
|
N = 10
|
|
random_letters = letters.take(np.random.randint(0, 26, N))
|
|
df = DataFrame(
|
|
{
|
|
"floats": N / 10 * Series(np.random.random(N)),
|
|
"letters": Series(random_letters),
|
|
}
|
|
)
|
|
return df
|
|
|
|
|
|
@pytest.mark.parametrize("allowlist", [df_allowlist, s_allowlist])
|
|
def test_groupby_allowlist(df_letters, allowlist):
|
|
df = df_letters
|
|
if allowlist == df_allowlist:
|
|
# dataframe
|
|
obj = df_letters
|
|
else:
|
|
obj = df_letters["floats"]
|
|
|
|
gb = obj.groupby(df.letters)
|
|
|
|
assert set(allowlist) == set(gb._apply_allowlist)
|
|
|
|
|
|
def check_allowlist(obj, df, m):
|
|
# check the obj for a particular allowlist m
|
|
|
|
gb = obj.groupby(df.letters)
|
|
|
|
f = getattr(type(gb), m)
|
|
|
|
# name
|
|
try:
|
|
n = f.__name__
|
|
except AttributeError:
|
|
return
|
|
assert n == m
|
|
|
|
# qualname
|
|
try:
|
|
n = f.__qualname__
|
|
except AttributeError:
|
|
return
|
|
assert n.endswith(m)
|
|
|
|
|
|
def test_groupby_series_allowlist(df_letters, s_allowlist_fixture):
|
|
m = s_allowlist_fixture
|
|
df = df_letters
|
|
check_allowlist(df.letters, df, m)
|
|
|
|
|
|
def test_groupby_frame_allowlist(df_letters, df_allowlist_fixture):
|
|
m = df_allowlist_fixture
|
|
df = df_letters
|
|
check_allowlist(df, df, m)
|
|
|
|
|
|
@pytest.fixture
|
|
def raw_frame():
|
|
index = MultiIndex(
|
|
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
|
|
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
|
|
names=["first", "second"],
|
|
)
|
|
raw_frame = DataFrame(
|
|
np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp")
|
|
)
|
|
raw_frame.iloc[1, [1, 2]] = np.nan
|
|
raw_frame.iloc[7, [0, 1]] = np.nan
|
|
return raw_frame
|
|
|
|
|
|
@pytest.mark.parametrize("op", AGG_FUNCTIONS)
|
|
@pytest.mark.parametrize("level", [0, 1])
|
|
@pytest.mark.parametrize("axis", [0, 1])
|
|
@pytest.mark.parametrize("skipna", [True, False])
|
|
@pytest.mark.parametrize("sort", [True, False])
|
|
def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort):
|
|
# GH6944
|
|
# GH 17537
|
|
# explicitly test the allowlist methods
|
|
|
|
if axis == 0:
|
|
frame = raw_frame
|
|
else:
|
|
frame = raw_frame.T
|
|
|
|
if op in AGG_FUNCTIONS_WITH_SKIPNA:
|
|
grouped = frame.groupby(level=level, axis=axis, sort=sort)
|
|
result = getattr(grouped, op)(skipna=skipna)
|
|
expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna)
|
|
if sort:
|
|
expected = expected.sort_index(axis=axis, level=level)
|
|
tm.assert_frame_equal(result, expected)
|
|
else:
|
|
grouped = frame.groupby(level=level, axis=axis, sort=sort)
|
|
result = getattr(grouped, op)()
|
|
expected = getattr(frame, op)(level=level, axis=axis)
|
|
if sort:
|
|
expected = expected.sort_index(axis=axis, level=level)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_blocklist(df_letters):
|
|
df = df_letters
|
|
s = df_letters.floats
|
|
|
|
blocklist = [
|
|
"eval",
|
|
"query",
|
|
"abs",
|
|
"where",
|
|
"mask",
|
|
"align",
|
|
"groupby",
|
|
"clip",
|
|
"astype",
|
|
"at",
|
|
"combine",
|
|
"consolidate",
|
|
"convert_objects",
|
|
]
|
|
to_methods = [method for method in dir(df) if method.startswith("to_")]
|
|
|
|
blocklist.extend(to_methods)
|
|
|
|
for bl in blocklist:
|
|
for obj in (df, s):
|
|
gb = obj.groupby(df.letters)
|
|
|
|
# e.g., to_csv
|
|
defined_but_not_allowed = (
|
|
f"(?:^Cannot.+{repr(bl)}.+'{type(gb).__name__}'.+try "
|
|
f"using the 'apply' method$)"
|
|
)
|
|
|
|
# e.g., query, eval
|
|
not_defined = (
|
|
f"(?:^'{type(gb).__name__}' object has no attribute {repr(bl)}$)"
|
|
)
|
|
|
|
msg = f"{defined_but_not_allowed}|{not_defined}"
|
|
|
|
with pytest.raises(AttributeError, match=msg):
|
|
getattr(gb, bl)
|
|
|
|
|
|
def test_tab_completion(mframe):
|
|
grp = mframe.groupby(level="second")
|
|
results = {v for v in dir(grp) if not v.startswith("_")}
|
|
expected = {
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"agg",
|
|
"aggregate",
|
|
"apply",
|
|
"boxplot",
|
|
"filter",
|
|
"first",
|
|
"get_group",
|
|
"groups",
|
|
"hist",
|
|
"indices",
|
|
"last",
|
|
"max",
|
|
"mean",
|
|
"median",
|
|
"min",
|
|
"ngroups",
|
|
"nth",
|
|
"ohlc",
|
|
"plot",
|
|
"prod",
|
|
"size",
|
|
"std",
|
|
"sum",
|
|
"transform",
|
|
"var",
|
|
"sem",
|
|
"count",
|
|
"nunique",
|
|
"head",
|
|
"describe",
|
|
"cummax",
|
|
"quantile",
|
|
"rank",
|
|
"cumprod",
|
|
"tail",
|
|
"resample",
|
|
"cummin",
|
|
"fillna",
|
|
"cumsum",
|
|
"cumcount",
|
|
"ngroup",
|
|
"all",
|
|
"shift",
|
|
"skew",
|
|
"take",
|
|
"tshift",
|
|
"pct_change",
|
|
"any",
|
|
"mad",
|
|
"corr",
|
|
"corrwith",
|
|
"cov",
|
|
"dtypes",
|
|
"ndim",
|
|
"diff",
|
|
"idxmax",
|
|
"idxmin",
|
|
"ffill",
|
|
"bfill",
|
|
"pad",
|
|
"backfill",
|
|
"rolling",
|
|
"expanding",
|
|
"pipe",
|
|
"sample",
|
|
"ewm",
|
|
}
|
|
assert results == expected
|
|
|
|
|
|
def test_groupby_function_rename(mframe):
|
|
grp = mframe.groupby(level="second")
|
|
for name in ["sum", "prod", "min", "max", "first", "last"]:
|
|
f = getattr(grp, name)
|
|
assert f.__name__ == name
|
|
|
|
|
|
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
|
|
def test_groupby_selection_with_methods(df):
|
|
# some methods which require DatetimeIndex
|
|
rng = date_range("2014", periods=len(df))
|
|
df.index = rng
|
|
|
|
g = df.groupby(["A"])[["C"]]
|
|
g_exp = df[["C"]].groupby(df["A"])
|
|
# TODO check groupby with > 1 col ?
|
|
|
|
# methods which are called as .foo()
|
|
methods = [
|
|
"count",
|
|
"corr",
|
|
"cummax",
|
|
"cummin",
|
|
"cumprod",
|
|
"describe",
|
|
"rank",
|
|
"quantile",
|
|
"diff",
|
|
"shift",
|
|
"all",
|
|
"any",
|
|
"idxmin",
|
|
"idxmax",
|
|
"ffill",
|
|
"bfill",
|
|
"pct_change",
|
|
]
|
|
|
|
for m in methods:
|
|
res = getattr(g, m)()
|
|
exp = getattr(g_exp, m)()
|
|
|
|
# should always be frames!
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# check that the index cache is cleared
|
|
with pytest.raises(ValueError, match="Freq was not set in the index"):
|
|
# GH#35937
|
|
g.tshift()
|
|
|
|
# methods which aren't just .foo()
|
|
tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
|
|
tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
|
|
tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))
|
|
|
|
tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
|
|
tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())
|
|
|
|
tm.assert_frame_equal(
|
|
g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
|
|
)
|
|
|
|
|
|
def test_all_methods_categorized(mframe):
|
|
grp = mframe.groupby(mframe.iloc[:, 0])
|
|
names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns)
|
|
new_names = set(names)
|
|
new_names -= reduction_kernels
|
|
new_names -= transformation_kernels
|
|
new_names -= groupby_other_methods
|
|
|
|
assert not (reduction_kernels & transformation_kernels)
|
|
assert not (reduction_kernels & groupby_other_methods)
|
|
assert not (transformation_kernels & groupby_other_methods)
|
|
|
|
# new public method?
|
|
if new_names:
|
|
msg = f"""
|
|
There are uncatgeorized methods defined on the Grouper class:
|
|
{new_names}.
|
|
|
|
Was a new method recently added?
|
|
|
|
Every public method On Grouper must appear in exactly one the
|
|
following three lists defined in pandas.core.groupby.base:
|
|
- `reduction_kernels`
|
|
- `transformation_kernels`
|
|
- `groupby_other_methods`
|
|
see the comments in pandas/core/groupby/base.py for guidance on
|
|
how to fix this test.
|
|
"""
|
|
raise AssertionError(msg)
|
|
|
|
# removed a public method?
|
|
all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
|
|
print(names)
|
|
print(all_categorized)
|
|
if not (names == all_categorized):
|
|
msg = f"""
|
|
Some methods which are supposed to be on the Grouper class
|
|
are missing:
|
|
{all_categorized - names}.
|
|
|
|
They're still defined in one of the lists that live in pandas/core/groupby/base.py.
|
|
If you removed a method, you should update them
|
|
"""
|
|
raise AssertionError(msg)
|