# Only tests that raise an error and have no better location should go here.
# Tests for specific groupby methods should go in their respective
# test file.
|
|
|
|
import datetime
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas import (
|
|
Categorical,
|
|
DataFrame,
|
|
Grouper,
|
|
Series,
|
|
)
|
|
from pandas.tests.groupby import get_groupby_method_args
|
|
|
|
|
|
@pytest.fixture(
    params=[
        "a",
        ["a"],
        ["a", "b"],
        Grouper(key="a"),
        lambda x: x % 2,
        [0, 0, 0, 1, 2, 2, 2, 3, 3],
        np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]),
        dict(enumerate([0, 0, 0, 1, 2, 2, 2, 3, 3])),
        Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
        [Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
    ]
)
def by(request):
    """Fixture supplying the ``by`` argument handed to ``DataFrame.groupby``.

    Parametrized over a column label, lists of labels, a ``Grouper``, a
    callable, a plain list, an ndarray, a dict, a ``Series`` and a list of
    ``Series``; all array-likes have nine entries, matching the nine-row
    frames built by the tests in this module.
    """
    return request.param
|
|
|
|
|
|
@pytest.fixture(params=(True, False))
def groupby_series(request):
    """Boolean fixture: when True, tests select column ``"d"`` from the groupby."""
    return request.param
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_string(how, by, groupby_series, groupby_func):
    """Run each groupby method against a string-valued column ``"d"``.

    The method is invoked directly, through ``agg`` or through ``transform``
    (selected by ``how``) and must either succeed or raise the exception /
    message pair listed for it in the expectation table below.
    """
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": list("xyzwtyuio"),
        }
    )
    args = get_groupby_method_args(groupby_func, frame)
    gb = frame.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        # The column-selected groupby is expected not to offer corrwith at
        # all, so there is nothing further to call.
        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    # Marker for methods that are expected to run without raising.
    no_error = (None, "")
    expected = {
        "all": no_error,
        "any": no_error,
        "bfill": no_error,
        "corrwith": (TypeError, "Could not convert"),
        "count": no_error,
        "cumcount": no_error,
        "cummax": (
            (NotImplementedError, TypeError),
            "(function|cummax) is not (implemented|supported) for (this|object) dtype",
        ),
        "cummin": (
            (NotImplementedError, TypeError),
            "(function|cummin) is not (implemented|supported) for (this|object) dtype",
        ),
        "cumprod": (
            (NotImplementedError, TypeError),
            "(function|cumprod) is not (implemented|supported) for (this|object) dtype",
        ),
        "cumsum": (
            (NotImplementedError, TypeError),
            "(function|cumsum) is not (implemented|supported) for (this|object) dtype",
        ),
        "diff": (TypeError, "unsupported operand type"),
        "ffill": no_error,
        "fillna": no_error,
        "first": no_error,
        "idxmax": (TypeError, "'argmax' not allowed for this dtype"),
        "idxmin": (TypeError, "'argmin' not allowed for this dtype"),
        "last": no_error,
        "max": no_error,
        "mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
        "median": (TypeError, "could not convert string to float"),
        "min": no_error,
        "ngroup": no_error,
        "nunique": no_error,
        "pct_change": (TypeError, "unsupported operand type"),
        "prod": (TypeError, "can't multiply sequence by non-int of type 'str'"),
        "quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
        "rank": no_error,
        "sem": (ValueError, "could not convert string to float"),
        "shift": no_error,
        "size": no_error,
        "skew": (TypeError, "could not convert string to float"),
        "std": (ValueError, "could not convert string to float"),
        "sum": no_error,
        "var": (TypeError, "could not convert string to float"),
    }
    exc_type, pattern = expected[groupby_func]

    def invoke():
        # Call the method directly, or route its name through agg/transform.
        if how == "method":
            return getattr(gb, groupby_func)(*args)
        dispatcher = gb.agg if how == "agg" else gb.transform
        return dispatcher(groupby_func, *args)

    if exc_type is None:
        invoke()
        return

    with pytest.raises(exc_type, match=pattern):
        invoke()
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_string_udf(how, by, groupby_series):
    """A TypeError raised inside a UDF must surface from ``agg``/``transform``.

    The frame's ``"d"`` column holds strings.
    """
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": list("xyzwtyuio"),
        }
    )
    gb = frame.groupby(by=by)
    if groupby_series:
        gb = gb["d"]

    def func(x):
        # Always fails; the test only checks that this error reaches the caller.
        raise TypeError("Test error message")

    apply = getattr(gb, how)
    with pytest.raises(TypeError, match="Test error message"):
        apply(func)
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_string_np(how, by, groupby_series, groupby_func_np):
    """Pass ``np.sum`` / ``np.mean`` to ``agg``/``transform`` with a string column."""
    # GH#50749
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": list("xyzwtyuio"),
        }
    )
    gb = frame.groupby(by=by)
    if groupby_series:
        gb = gb["d"]

    # (exception type, message regex); a None type means no error is expected.
    expected = {
        np.sum: (None, ""),
        np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
    }
    exc_type, pattern = expected[groupby_func_np]

    apply = getattr(gb, how)
    if exc_type is None:
        apply(groupby_func_np)
        return

    with pytest.raises(exc_type, match=pattern):
        apply(groupby_func_np)
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_datetime(how, by, groupby_series, groupby_func):
    """Run each groupby method against a datetime-valued column ``"d"``.

    The method is invoked directly, through ``agg`` or through ``transform``
    (selected by ``how``) and must either succeed or raise the exception /
    message pair listed for it in the expectation table below.
    """
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
        }
    )
    args = get_groupby_method_args(groupby_func, frame)
    gb = frame.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        # The column-selected groupby is expected not to offer corrwith at
        # all, so there is nothing further to call.
        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    # Marker for methods that are expected to run without raising.
    no_error = (None, "")
    expected = {
        "all": no_error,
        "any": no_error,
        "bfill": no_error,
        "corrwith": (TypeError, "cannot perform __mul__ with this index type"),
        "count": no_error,
        "cumcount": no_error,
        "cummax": no_error,
        "cummin": no_error,
        "cumprod": (TypeError, "datetime64 type does not support cumprod operations"),
        "cumsum": (TypeError, "datetime64 type does not support cumsum operations"),
        "diff": no_error,
        "ffill": no_error,
        "fillna": no_error,
        "first": no_error,
        "idxmax": no_error,
        "idxmin": no_error,
        "last": no_error,
        "max": no_error,
        "mean": no_error,
        "median": no_error,
        "min": no_error,
        "ngroup": no_error,
        "nunique": no_error,
        "pct_change": (TypeError, "cannot perform __truediv__ with this index type"),
        "prod": (TypeError, "datetime64 type does not support prod"),
        "quantile": no_error,
        "rank": no_error,
        "sem": no_error,
        "shift": no_error,
        "size": no_error,
        "skew": (TypeError, r"dtype datetime64\[ns\] does not support reduction"),
        "std": no_error,
        "sum": (TypeError, "datetime64 type does not support sum operations"),
        "var": no_error,
    }
    exc_type, pattern = expected[groupby_func]

    def invoke():
        # Call the method directly, or route its name through agg/transform.
        if how == "method":
            return getattr(gb, groupby_func)(*args)
        dispatcher = gb.agg if how == "agg" else gb.transform
        return dispatcher(groupby_func, *args)

    if exc_type is None:
        invoke()
        return

    with pytest.raises(exc_type, match=pattern):
        invoke()
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_datetime_udf(how, by, groupby_series):
    """A TypeError raised inside a UDF must surface from ``agg``/``transform``.

    The frame's ``"d"`` column holds a datetime value.
    """
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
        }
    )

    gb = frame.groupby(by=by)
    if groupby_series:
        gb = gb["d"]

    def func(x):
        # Always fails; the test only checks that this error reaches the caller.
        raise TypeError("Test error message")

    apply = getattr(gb, how)
    with pytest.raises(TypeError, match="Test error message"):
        apply(func)
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_datetime_np(how, by, groupby_series, groupby_func_np):
    """Pass ``np.sum`` / ``np.mean`` to ``agg``/``transform`` with a datetime column."""
    # GH#50749
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
        }
    )
    gb = frame.groupby(by=by)
    if groupby_series:
        gb = gb["d"]

    # (exception type, message regex); a None type means no error is expected.
    expected = {
        np.sum: (TypeError, "datetime64 type does not support sum operations"),
        np.mean: (None, ""),
    }
    exc_type, pattern = expected[groupby_func_np]

    apply = getattr(gb, how)
    if exc_type is None:
        apply(groupby_func_np)
        return

    with pytest.raises(exc_type, match=pattern):
        apply(groupby_func_np)
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category(
    how, by, groupby_series, groupby_func, using_copy_on_write
):
    """Run each groupby method against an ordered categorical column ``"d"``.

    The method is invoked directly, through ``agg`` or through ``transform``
    (selected by ``how``) and must either succeed or raise the exception /
    message pair listed for it in the expectation table below.
    """
    # GH#50749
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": Categorical(
                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
        }
    )
    args = get_groupby_method_args(groupby_func, frame)
    gb = frame.groupby(by=by)

    if groupby_series:
        gb = gb["d"]

        # The column-selected groupby is expected not to offer corrwith at
        # all, so there is nothing further to call.
        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    # Marker for methods that are expected to run without raising.
    no_error = (None, "")

    if using_copy_on_write:
        # no-op with CoW
        fillna_expected = no_error
    else:
        fillna_expected = (
            TypeError,
            r"Cannot setitem on a Categorical with a new category \(0\), "
            "set the categories first",
        )

    expected = {
        "all": no_error,
        "any": no_error,
        "bfill": no_error,
        "corrwith": (
            TypeError,
            r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
        ),
        "count": no_error,
        "cumcount": no_error,
        "cummax": (
            (NotImplementedError, TypeError),
            "(category type does not support cummax operations|"
            "category dtype not supported|"
            "cummax is not supported for category dtype)",
        ),
        "cummin": (
            (NotImplementedError, TypeError),
            "(category type does not support cummin operations|"
            "category dtype not supported|"
            "cummin is not supported for category dtype)",
        ),
        "cumprod": (
            (NotImplementedError, TypeError),
            "(category type does not support cumprod operations|"
            "category dtype not supported|"
            "cumprod is not supported for category dtype)",
        ),
        "cumsum": (
            (NotImplementedError, TypeError),
            "(category type does not support cumsum operations|"
            "category dtype not supported|"
            "cumsum is not supported for category dtype)",
        ),
        "diff": (
            TypeError,
            r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
        ),
        "ffill": no_error,
        "fillna": fillna_expected,
        "first": no_error,
        "idxmax": no_error,
        "idxmin": no_error,
        "last": no_error,
        "max": no_error,
        "mean": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'mean'",
        ),
        "median": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'median'",
        ),
        "min": no_error,
        "ngroup": no_error,
        "nunique": no_error,
        "pct_change": (
            TypeError,
            r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
        ),
        "prod": (TypeError, "category type does not support prod operations"),
        "quantile": (TypeError, "No matching signature found"),
        "rank": no_error,
        "sem": (ValueError, "Cannot cast object dtype to float64"),
        "shift": no_error,
        "size": no_error,
        "skew": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'skew'",
        ),
        "std": (ValueError, "Cannot cast object dtype to float64"),
        "sum": (TypeError, "category type does not support sum operations"),
        "var": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'var'",
        ),
    }
    exc_type, pattern = expected[groupby_func]

    def invoke():
        # Call the method directly, or route its name through agg/transform.
        if how == "method":
            return getattr(gb, groupby_func)(*args)
        dispatcher = gb.agg if how == "agg" else gb.transform
        return dispatcher(groupby_func, *args)

    if exc_type is None:
        invoke()
        return

    with pytest.raises(exc_type, match=pattern):
        invoke()
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_category_udf(how, by, groupby_series):
    """A TypeError raised inside a UDF must surface from ``agg``/``transform``.

    The frame's ``"d"`` column is an ordered categorical.
    """
    # GH#50749
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": Categorical(
                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
        }
    )
    gb = frame.groupby(by=by)
    if groupby_series:
        gb = gb["d"]

    def func(x):
        # Always fails; the test only checks that this error reaches the caller.
        raise TypeError("Test error message")

    apply = getattr(gb, how)
    with pytest.raises(TypeError, match="Test error message"):
        apply(func)
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_category_np(how, by, groupby_series, groupby_func_np):
    """Pass ``np.sum`` / ``np.mean`` to ``agg``/``transform`` with a categorical column."""
    # GH#50749
    frame = DataFrame(
        {
            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": Categorical(
                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
        }
    )
    gb = frame.groupby(by=by)
    if groupby_series:
        gb = gb["d"]

    # (exception type, message regex); a None type means no error is expected.
    expected = {
        np.sum: (TypeError, "category type does not support sum operations"),
        np.mean: (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'mean'",
        ),
    }
    exc_type, pattern = expected[groupby_func_np]

    apply = getattr(gb, how)
    if exc_type is None:
        apply(groupby_func_np)
        return

    with pytest.raises(exc_type, match=pattern):
        apply(groupby_func_np)
|
|
|
|
|
|
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category_on_category(
    how, by, groupby_series, groupby_func, observed, using_copy_on_write
):
    """Run each groupby method with categorical ``"a"`` and ``"d"`` columns.

    The method is invoked directly, through ``agg`` or through ``transform``
    (selected by ``how``) and must either succeed or raise the exception /
    message pair listed for it in the expectation table below.  The
    ``idxmax``/``idxmin`` expectations depend on whether any group is empty.
    """
    # GH#50749
    frame = DataFrame(
        {
            "a": Categorical(
                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
            "c": range(9),
            "d": Categorical(
                ["a", "a", "a", "a", "b", "b", "c", "c", "c"],
                categories=["a", "b", "c", "d"],
                ordered=True,
            ),
        }
    )
    args = get_groupby_method_args(groupby_func, frame)
    gb = frame.groupby(by=by, observed=observed)

    if groupby_series:
        gb = gb["d"]

        # The column-selected groupby is expected not to offer corrwith at
        # all, so there is nothing further to call.
        if groupby_func == "corrwith":
            assert not hasattr(gb, "corrwith")
            return

    empty_groups = any(members.empty for members in gb.groups.values())

    # Marker for methods that are expected to run without raising.
    no_error = (None, "")

    if using_copy_on_write:
        # no-op with CoW
        fillna_expected = no_error
    else:
        fillna_expected = (
            TypeError,
            r"Cannot setitem on a Categorical with a new category \(0\), "
            "set the categories first",
        )

    if empty_groups:
        idxmax_expected = (ValueError, "attempt to get argmax of an empty sequence")
        idxmin_expected = (ValueError, "attempt to get argmin of an empty sequence")
    else:
        idxmax_expected = idxmin_expected = no_error

    expected = {
        "all": no_error,
        "any": no_error,
        "bfill": no_error,
        "corrwith": (
            TypeError,
            r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
        ),
        "count": no_error,
        "cumcount": no_error,
        "cummax": (
            (NotImplementedError, TypeError),
            "(cummax is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cummax operations)",
        ),
        "cummin": (
            (NotImplementedError, TypeError),
            "(cummin is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cummin operations)",
        ),
        "cumprod": (
            (NotImplementedError, TypeError),
            "(cumprod is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cumprod operations)",
        ),
        "cumsum": (
            (NotImplementedError, TypeError),
            "(cumsum is not supported for category dtype|"
            "category dtype not supported|"
            "category type does not support cumsum operations)",
        ),
        "diff": (TypeError, "unsupported operand type"),
        "ffill": no_error,
        "fillna": fillna_expected,
        "first": no_error,
        "idxmax": idxmax_expected,
        "idxmin": idxmin_expected,
        "last": no_error,
        "max": no_error,
        "mean": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'mean'",
        ),
        "median": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'median'",
        ),
        "min": no_error,
        "ngroup": no_error,
        "nunique": no_error,
        "pct_change": (TypeError, "unsupported operand type"),
        "prod": (TypeError, "category type does not support prod operations"),
        "quantile": (TypeError, ""),
        "rank": no_error,
        "sem": (ValueError, "Cannot cast object dtype to float64"),
        "shift": no_error,
        "size": no_error,
        "skew": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'skew'",
        ),
        "std": (ValueError, "Cannot cast object dtype to float64"),
        "sum": (TypeError, "category type does not support sum operations"),
        "var": (
            TypeError,
            "'Categorical' with dtype category does not support reduction 'var'",
        ),
    }
    exc_type, pattern = expected[groupby_func]

    def invoke():
        # Call the method directly, or route its name through agg/transform.
        if how == "method":
            return getattr(gb, groupby_func)(*args)
        dispatcher = gb.agg if how == "agg" else gb.transform
        return dispatcher(groupby_func, *args)

    if exc_type is None:
        invoke()
        return

    with pytest.raises(exc_type, match=pattern):
        invoke()
|
|
|
|
|
|
def test_subsetting_columns_axis_1_raises():
    """Selecting a column from an ``axis=1`` groupby must raise ValueError."""
    # GH 35443
    frame = DataFrame({"a": [1], "b": [2], "c": [3]})
    grouped = frame.groupby("a", axis=1)
    expected_msg = "Cannot subset columns when using axis=1"
    with pytest.raises(ValueError, match=expected_msg):
        grouped["b"]
|