1597 lines
49 KiB
Python
1597 lines
49 KiB
Python
import builtins
|
|
import datetime as dt
|
|
from io import StringIO
|
|
from itertools import product
|
|
from string import ascii_lowercase
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas.errors import UnsupportedFunctionCall
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
Index,
|
|
MultiIndex,
|
|
NaT,
|
|
Series,
|
|
Timestamp,
|
|
_is_numpy_dev,
|
|
date_range,
|
|
isna,
|
|
)
|
|
import pandas._testing as tm
|
|
import pandas.core.nanops as nanops
|
|
from pandas.util import _test_decorators as td
|
|
|
|
|
|
@pytest.mark.parametrize("agg_func", ["any", "all"])
|
|
@pytest.mark.parametrize("skipna", [True, False])
|
|
@pytest.mark.parametrize(
|
|
"vals",
|
|
[
|
|
["foo", "bar", "baz"],
|
|
["foo", "", ""],
|
|
["", "", ""],
|
|
[1, 2, 3],
|
|
[1, 0, 0],
|
|
[0, 0, 0],
|
|
[1.0, 2.0, 3.0],
|
|
[1.0, 0.0, 0.0],
|
|
[0.0, 0.0, 0.0],
|
|
[True, True, True],
|
|
[True, False, False],
|
|
[False, False, False],
|
|
[np.nan, np.nan, np.nan],
|
|
],
|
|
)
|
|
def test_groupby_bool_aggs(agg_func, skipna, vals):
|
|
df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})
|
|
|
|
# Figure out expectation using Python builtin
|
|
exp = getattr(builtins, agg_func)(vals)
|
|
|
|
# edge case for missing data with skipna and 'any'
|
|
if skipna and all(isna(vals)) and agg_func == "any":
|
|
exp = False
|
|
|
|
exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key"))
|
|
result = getattr(df.groupby("key"), agg_func)(skipna=skipna)
|
|
tm.assert_frame_equal(result, exp_df)
|
|
|
|
|
|
def test_max_min_non_numeric():
|
|
# #2700
|
|
aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})
|
|
|
|
result = aa.groupby("nn").max()
|
|
assert "ss" in result
|
|
|
|
result = aa.groupby("nn").max(numeric_only=False)
|
|
assert "ss" in result
|
|
|
|
result = aa.groupby("nn").min()
|
|
assert "ss" in result
|
|
|
|
result = aa.groupby("nn").min(numeric_only=False)
|
|
assert "ss" in result
|
|
|
|
|
|
def test_intercept_builtin_sum():
|
|
s = Series([1.0, 2.0, np.nan, 3.0])
|
|
grouped = s.groupby([0, 1, 2, 2])
|
|
|
|
result = grouped.agg(builtins.sum)
|
|
result2 = grouped.apply(builtins.sum)
|
|
expected = grouped.sum()
|
|
tm.assert_series_equal(result, expected)
|
|
tm.assert_series_equal(result2, expected)
|
|
|
|
|
|
# @pytest.mark.parametrize("f", [max, min, sum])
|
|
# def test_builtins_apply(f):
|
|
|
|
|
|
@pytest.mark.parametrize("f", [max, min, sum])
|
|
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key
|
|
def test_builtins_apply(keys, f):
|
|
# see gh-8155
|
|
df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"])
|
|
df["jolie"] = np.random.randn(1000)
|
|
|
|
fname = f.__name__
|
|
result = df.groupby(keys).apply(f)
|
|
ngroups = len(df.drop_duplicates(subset=keys))
|
|
|
|
assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
|
|
assert result.shape == (ngroups, 3), assert_msg
|
|
|
|
tm.assert_frame_equal(
|
|
result, # numpy's equivalent function
|
|
df.groupby(keys).apply(getattr(np, fname)),
|
|
)
|
|
|
|
if f != sum:
|
|
expected = df.groupby(keys).agg(fname).reset_index()
|
|
expected.set_index(keys, inplace=True, drop=False)
|
|
tm.assert_frame_equal(result, expected, check_dtype=False)
|
|
|
|
tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
|
|
|
|
|
|
def test_arg_passthru():
|
|
# make sure that we are passing thru kwargs
|
|
# to our agg functions
|
|
|
|
# GH3668
|
|
# GH5724
|
|
df = pd.DataFrame(
|
|
{
|
|
"group": [1, 1, 2],
|
|
"int": [1, 2, 3],
|
|
"float": [4.0, 5.0, 6.0],
|
|
"string": list("abc"),
|
|
"category_string": pd.Series(list("abc")).astype("category"),
|
|
"category_int": [7, 8, 9],
|
|
"datetime": pd.date_range("20130101", periods=3),
|
|
"datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
|
|
"timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
|
|
},
|
|
columns=[
|
|
"group",
|
|
"int",
|
|
"float",
|
|
"string",
|
|
"category_string",
|
|
"category_int",
|
|
"datetime",
|
|
"datetimetz",
|
|
"timedelta",
|
|
],
|
|
)
|
|
|
|
expected_columns_numeric = Index(["int", "float", "category_int"])
|
|
|
|
# mean / median
|
|
expected = pd.DataFrame(
|
|
{
|
|
"category_int": [7.5, 9],
|
|
"float": [4.5, 6.0],
|
|
"timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
|
|
"int": [1.5, 3],
|
|
"datetime": [
|
|
pd.Timestamp("2013-01-01 12:00:00"),
|
|
pd.Timestamp("2013-01-03 00:00:00"),
|
|
],
|
|
"datetimetz": [
|
|
pd.Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
|
|
pd.Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
|
|
],
|
|
},
|
|
index=Index([1, 2], name="group"),
|
|
columns=["int", "float", "category_int", "datetime", "datetimetz", "timedelta"],
|
|
)
|
|
|
|
for attr in ["mean", "median"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
tm.assert_index_equal(result.columns, expected_columns_numeric)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
|
|
|
# TODO: min, max *should* handle
|
|
# categorical (ordered) dtype
|
|
expected_columns = Index(
|
|
[
|
|
"int",
|
|
"float",
|
|
"string",
|
|
"category_int",
|
|
"datetime",
|
|
"datetimetz",
|
|
"timedelta",
|
|
]
|
|
)
|
|
for attr in ["min", "max"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
expected_columns = Index(
|
|
[
|
|
"int",
|
|
"float",
|
|
"string",
|
|
"category_string",
|
|
"category_int",
|
|
"datetime",
|
|
"datetimetz",
|
|
"timedelta",
|
|
]
|
|
)
|
|
for attr in ["first", "last"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
|
|
for attr in ["sum"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
tm.assert_index_equal(result.columns, expected_columns_numeric)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
expected_columns = Index(["int", "float", "category_int"])
|
|
for attr in ["prod", "cumprod"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
tm.assert_index_equal(result.columns, expected_columns_numeric)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
# like min, max, but don't include strings
|
|
expected_columns = Index(
|
|
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
|
|
)
|
|
for attr in ["cummin", "cummax"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
# GH 15561: numeric_only=False set by default like min/max
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
expected_columns = Index(["int", "float", "category_int", "timedelta"])
|
|
for attr in ["cumsum"]:
|
|
f = getattr(df.groupby("group"), attr)
|
|
result = f()
|
|
tm.assert_index_equal(result.columns, expected_columns_numeric)
|
|
|
|
result = f(numeric_only=False)
|
|
tm.assert_index_equal(result.columns, expected_columns)
|
|
|
|
|
|
def test_non_cython_api():
|
|
|
|
# GH5610
|
|
# non-cython calls should not include the grouper
|
|
|
|
df = DataFrame(
|
|
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"]
|
|
)
|
|
g = df.groupby("A")
|
|
gni = df.groupby("A", as_index=False)
|
|
|
|
# mad
|
|
expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
|
|
expected.index.name = "A"
|
|
result = g.mad()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
|
|
result = gni.mad()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# describe
|
|
expected_index = pd.Index([1, 3], name="A")
|
|
expected_col = pd.MultiIndex(
|
|
levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
|
|
codes=[[0] * 8, list(range(8))],
|
|
)
|
|
expected = pd.DataFrame(
|
|
[
|
|
[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
|
|
[0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
|
|
],
|
|
index=expected_index,
|
|
columns=expected_col,
|
|
)
|
|
result = g.describe()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
expected = pd.concat(
|
|
[
|
|
df[df.A == 1].describe().unstack().to_frame().T,
|
|
df[df.A == 3].describe().unstack().to_frame().T,
|
|
]
|
|
)
|
|
expected.index = pd.Index([0, 1])
|
|
result = gni.describe()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# any
|
|
expected = DataFrame(
|
|
[[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
|
|
)
|
|
expected.index.name = "A"
|
|
result = g.any()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# idxmax
|
|
expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
|
|
expected.index.name = "A"
|
|
result = g.idxmax()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_cython_api2():
|
|
|
|
# this takes the fast apply path
|
|
|
|
# cumsum (GH5614)
|
|
df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
|
|
expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
|
|
result = df.groupby("A").cumsum()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 5755 - cumsum is a transformer and should ignore as_index
|
|
result = df.groupby("A", as_index=False).cumsum()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 13994
|
|
result = df.groupby("A").cumsum(axis=1)
|
|
expected = df.cumsum(axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df.groupby("A").cumprod(axis=1)
|
|
expected = df.cumprod(axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_cython_median():
|
|
df = DataFrame(np.random.randn(1000))
|
|
df.values[::2] = np.nan
|
|
|
|
labels = np.random.randint(0, 50, size=1000).astype(float)
|
|
labels[::17] = np.nan
|
|
|
|
result = df.groupby(labels).median()
|
|
exp = df.groupby(labels).agg(nanops.nanmedian)
|
|
tm.assert_frame_equal(result, exp)
|
|
|
|
df = DataFrame(np.random.randn(1000, 5))
|
|
rs = df.groupby(labels).agg(np.median)
|
|
xp = df.groupby(labels).median()
|
|
tm.assert_frame_equal(rs, xp)
|
|
|
|
|
|
def test_median_empty_bins(observed):
|
|
df = pd.DataFrame(np.random.randint(0, 44, 500))
|
|
|
|
grps = range(0, 55, 5)
|
|
bins = pd.cut(df[0], grps)
|
|
|
|
result = df.groupby(bins, observed=observed).median()
|
|
expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
|
|
)
|
|
@pytest.mark.parametrize(
|
|
"method,data",
|
|
[
|
|
("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
|
|
("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
|
|
("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
|
|
("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
|
|
("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}),
|
|
("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
|
|
],
|
|
)
|
|
def test_groupby_non_arithmetic_agg_types(dtype, method, data):
|
|
# GH9311, GH6620
|
|
df = pd.DataFrame(
|
|
[{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
|
|
)
|
|
|
|
df["b"] = df.b.astype(dtype)
|
|
|
|
if "args" not in data:
|
|
data["args"] = []
|
|
|
|
if "out_type" in data:
|
|
out_type = data["out_type"]
|
|
else:
|
|
out_type = dtype
|
|
|
|
exp = data["df"]
|
|
df_out = pd.DataFrame(exp)
|
|
|
|
df_out["b"] = df_out.b.astype(out_type)
|
|
df_out.set_index("a", inplace=True)
|
|
|
|
grpd = df.groupby("a")
|
|
t = getattr(grpd, method)(*data["args"])
|
|
tm.assert_frame_equal(t, df_out)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"i",
|
|
[
|
|
(
|
|
Timestamp("2011-01-15 12:50:28.502376"),
|
|
Timestamp("2011-01-20 12:50:28.593448"),
|
|
),
|
|
(24650000000000001, 24650000000000002),
|
|
],
|
|
)
|
|
def test_groupby_non_arithmetic_agg_int_like_precision(i):
|
|
# see gh-6620, gh-9311
|
|
df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])
|
|
|
|
grp_exp = {
|
|
"first": {"expected": i[0]},
|
|
"last": {"expected": i[1]},
|
|
"min": {"expected": i[0]},
|
|
"max": {"expected": i[1]},
|
|
"nth": {"expected": i[1], "args": [1]},
|
|
"count": {"expected": 2},
|
|
}
|
|
|
|
for method, data in grp_exp.items():
|
|
if "args" not in data:
|
|
data["args"] = []
|
|
|
|
grouped = df.groupby("a")
|
|
res = getattr(grouped, method)(*data["args"])
|
|
|
|
assert res.iloc[0].b == data["expected"]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"func, values",
|
|
[
|
|
("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
|
|
("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
|
|
],
|
|
)
|
|
def test_idxmin_idxmax_returns_int_types(func, values):
|
|
# GH 25444
|
|
df = pd.DataFrame(
|
|
{
|
|
"name": ["A", "A", "B", "B"],
|
|
"c_int": [1, 2, 3, 4],
|
|
"c_float": [4.02, 3.03, 2.04, 1.05],
|
|
"c_date": ["2019", "2018", "2016", "2017"],
|
|
}
|
|
)
|
|
df["c_date"] = pd.to_datetime(df["c_date"])
|
|
|
|
result = getattr(df.groupby("name"), func)()
|
|
|
|
expected = pd.DataFrame(values, index=Index(["A", "B"], name="name"))
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_fill_consistency():
|
|
|
|
# GH9221
|
|
# pass thru keyword arguments to the generated wrapper
|
|
# are set if the passed kw is None (only)
|
|
df = DataFrame(
|
|
index=pd.MultiIndex.from_product(
|
|
[["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
|
|
),
|
|
columns=Index(["1", "2"], name="id"),
|
|
)
|
|
df["1"] = [
|
|
np.nan,
|
|
1,
|
|
np.nan,
|
|
np.nan,
|
|
11,
|
|
np.nan,
|
|
np.nan,
|
|
2,
|
|
np.nan,
|
|
np.nan,
|
|
22,
|
|
np.nan,
|
|
]
|
|
df["2"] = [
|
|
np.nan,
|
|
3,
|
|
np.nan,
|
|
np.nan,
|
|
33,
|
|
np.nan,
|
|
np.nan,
|
|
4,
|
|
np.nan,
|
|
np.nan,
|
|
44,
|
|
np.nan,
|
|
]
|
|
|
|
expected = df.groupby(level=0, axis=0).fillna(method="ffill")
|
|
result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_cumprod():
|
|
# GH 4095
|
|
df = pd.DataFrame({"key": ["b"] * 10, "value": 2})
|
|
|
|
actual = df.groupby("key")["value"].cumprod()
|
|
expected = df.groupby("key")["value"].apply(lambda x: x.cumprod())
|
|
expected.name = "value"
|
|
tm.assert_series_equal(actual, expected)
|
|
|
|
df = pd.DataFrame({"key": ["b"] * 100, "value": 2})
|
|
actual = df.groupby("key")["value"].cumprod()
|
|
# if overflows, groupby product casts to float
|
|
# while numpy passes back invalid values
|
|
df["value"] = df["value"].astype(float)
|
|
expected = df.groupby("key")["value"].apply(lambda x: x.cumprod())
|
|
expected.name = "value"
|
|
tm.assert_series_equal(actual, expected)
|
|
|
|
|
|
def scipy_sem(*args, **kwargs):
|
|
from scipy.stats import sem
|
|
|
|
return sem(*args, ddof=1, **kwargs)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"op,targop",
|
|
[
|
|
("mean", np.mean),
|
|
("median", np.median),
|
|
("std", np.std),
|
|
("var", np.var),
|
|
("sum", np.sum),
|
|
("prod", np.prod),
|
|
("min", np.min),
|
|
("max", np.max),
|
|
("first", lambda x: x.iloc[0]),
|
|
("last", lambda x: x.iloc[-1]),
|
|
("count", np.size),
|
|
pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
|
|
],
|
|
)
|
|
def test_ops_general(op, targop):
|
|
df = DataFrame(np.random.randn(1000))
|
|
labels = np.random.randint(0, 50, size=1000).astype(float)
|
|
|
|
result = getattr(df.groupby(labels), op)().astype(float)
|
|
expected = df.groupby(labels).agg(targop)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_max_nan_bug():
|
|
raw = """,Date,app,File
|
|
-04-23,2013-04-23 00:00:00,,log080001.log
|
|
-05-06,2013-05-06 00:00:00,,log.log
|
|
-05-07,2013-05-07 00:00:00,OE,xlsx"""
|
|
|
|
df = pd.read_csv(StringIO(raw), parse_dates=[0])
|
|
gb = df.groupby("Date")
|
|
r = gb[["File"]].max()
|
|
e = gb["File"].max().to_frame()
|
|
tm.assert_frame_equal(r, e)
|
|
assert not r["File"].isna().any()
|
|
|
|
|
|
def test_nlargest():
|
|
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
|
|
b = Series(list("a" * 5 + "b" * 5))
|
|
gb = a.groupby(b)
|
|
r = gb.nlargest(3)
|
|
e = Series(
|
|
[7, 5, 3, 10, 9, 6],
|
|
index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
|
|
)
|
|
tm.assert_series_equal(r, e)
|
|
|
|
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
|
|
gb = a.groupby(b)
|
|
e = Series(
|
|
[3, 2, 1, 3, 3, 2],
|
|
index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
|
|
)
|
|
tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
|
|
|
|
|
|
def test_nlargest_mi_grouper():
|
|
# see gh-21411
|
|
npr = np.random.RandomState(123456789)
|
|
|
|
dts = date_range("20180101", periods=10)
|
|
iterables = [dts, ["one", "two"]]
|
|
|
|
idx = MultiIndex.from_product(iterables, names=["first", "second"])
|
|
s = Series(npr.randn(20), index=idx)
|
|
|
|
result = s.groupby("first").nlargest(1)
|
|
|
|
exp_idx = MultiIndex.from_tuples(
|
|
[
|
|
(dts[0], dts[0], "one"),
|
|
(dts[1], dts[1], "one"),
|
|
(dts[2], dts[2], "one"),
|
|
(dts[3], dts[3], "two"),
|
|
(dts[4], dts[4], "one"),
|
|
(dts[5], dts[5], "one"),
|
|
(dts[6], dts[6], "one"),
|
|
(dts[7], dts[7], "one"),
|
|
(dts[8], dts[8], "two"),
|
|
(dts[9], dts[9], "one"),
|
|
],
|
|
names=["first", "first", "second"],
|
|
)
|
|
|
|
exp_values = [
|
|
2.2129019979039612,
|
|
1.8417114045748335,
|
|
0.858963679564603,
|
|
1.3759151378258088,
|
|
0.9430284594687134,
|
|
0.5296914208183142,
|
|
0.8318045593815487,
|
|
-0.8476703342910327,
|
|
0.3804446884133735,
|
|
-0.8028845810770998,
|
|
]
|
|
|
|
expected = Series(exp_values, index=exp_idx)
|
|
tm.assert_series_equal(result, expected, check_exact=False, check_less_precise=True)
|
|
|
|
|
|
def test_nsmallest():
|
|
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
|
|
b = Series(list("a" * 5 + "b" * 5))
|
|
gb = a.groupby(b)
|
|
r = gb.nsmallest(3)
|
|
e = Series(
|
|
[1, 2, 3, 0, 4, 6],
|
|
index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
|
|
)
|
|
tm.assert_series_equal(r, e)
|
|
|
|
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
|
|
gb = a.groupby(b)
|
|
e = Series(
|
|
[0, 1, 1, 0, 1, 2],
|
|
index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
|
|
)
|
|
tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
|
|
|
|
|
|
@pytest.mark.parametrize("func", ["mean", "var", "std", "cumprod", "cumsum"])
|
|
def test_numpy_compat(func):
|
|
# see gh-12811
|
|
df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
|
|
g = df.groupby("A")
|
|
|
|
msg = "numpy operations are not valid with groupby"
|
|
|
|
with pytest.raises(UnsupportedFunctionCall, match=msg):
|
|
getattr(g, func)(1, 2, 3)
|
|
with pytest.raises(UnsupportedFunctionCall, match=msg):
|
|
getattr(g, func)(foo=1)
|
|
|
|
|
|
@pytest.mark.xfail(
|
|
_is_numpy_dev,
|
|
reason="https://github.com/pandas-dev/pandas/issues/31992",
|
|
strict=False,
|
|
)
|
|
def test_cummin_cummax():
|
|
# GH 15048
|
|
num_types = [np.int32, np.int64, np.float32, np.float64]
|
|
num_mins = [
|
|
np.iinfo(np.int32).min,
|
|
np.iinfo(np.int64).min,
|
|
np.finfo(np.float32).min,
|
|
np.finfo(np.float64).min,
|
|
]
|
|
num_max = [
|
|
np.iinfo(np.int32).max,
|
|
np.iinfo(np.int64).max,
|
|
np.finfo(np.float32).max,
|
|
np.finfo(np.float64).max,
|
|
]
|
|
base_df = pd.DataFrame(
|
|
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
|
|
)
|
|
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
|
|
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
|
|
|
|
for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
|
|
df = base_df.astype(dtype)
|
|
|
|
# cummin
|
|
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
|
|
result = df.groupby("A").cummin()
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Test cummin w/ min value for dtype
|
|
df.loc[[2, 6], "B"] = min_val
|
|
expected.loc[[2, 3, 6, 7], "B"] = min_val
|
|
result = df.groupby("A").cummin()
|
|
tm.assert_frame_equal(result, expected)
|
|
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# cummax
|
|
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
|
|
result = df.groupby("A").cummax()
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Test cummax w/ max value for dtype
|
|
df.loc[[2, 6], "B"] = max_val
|
|
expected.loc[[2, 3, 6, 7], "B"] = max_val
|
|
result = df.groupby("A").cummax()
|
|
tm.assert_frame_equal(result, expected)
|
|
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Test nan in some values
|
|
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
|
|
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
|
|
result = base_df.groupby("A").cummin()
|
|
tm.assert_frame_equal(result, expected)
|
|
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
|
|
result = base_df.groupby("A").cummax()
|
|
tm.assert_frame_equal(result, expected)
|
|
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Test nan in entire column
|
|
base_df["B"] = np.nan
|
|
expected = pd.DataFrame({"B": [np.nan] * 8})
|
|
result = base_df.groupby("A").cummin()
|
|
tm.assert_frame_equal(expected, result)
|
|
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
|
|
tm.assert_frame_equal(expected, result)
|
|
result = base_df.groupby("A").cummax()
|
|
tm.assert_frame_equal(expected, result)
|
|
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
|
|
tm.assert_frame_equal(expected, result)
|
|
|
|
# GH 15561
|
|
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
|
|
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")
|
|
for method in ["cummax", "cummin"]:
|
|
result = getattr(df.groupby("a")["b"], method)()
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
# GH 15635
|
|
df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
|
|
result = df.groupby("a").b.cummax()
|
|
expected = pd.Series([2, 1, 2], name="b")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
|
|
result = df.groupby("a").b.cummin()
|
|
expected = pd.Series([1, 2, 1], name="b")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"in_vals, out_vals",
|
|
[
|
|
# Basics: strictly increasing (T), strictly decreasing (F),
|
|
# abs val increasing (F), non-strictly increasing (T)
|
|
([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
|
|
# Test with inf vals
|
|
(
|
|
[1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
|
|
[True, False, True, False],
|
|
),
|
|
# Test with nan vals; should always be False
|
|
(
|
|
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
|
|
[False, False, False, False],
|
|
),
|
|
],
|
|
)
|
|
def test_is_monotonic_increasing(in_vals, out_vals):
|
|
# GH 17015
|
|
source_dict = {
|
|
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
|
|
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
|
|
"C": in_vals,
|
|
}
|
|
df = pd.DataFrame(source_dict)
|
|
result = df.groupby("B").C.is_monotonic_increasing
|
|
index = Index(list("abcd"), name="B")
|
|
expected = pd.Series(index=index, data=out_vals, name="C")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# Also check result equal to manually taking x.is_monotonic_increasing.
|
|
expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"in_vals, out_vals",
|
|
[
|
|
# Basics: strictly decreasing (T), strictly increasing (F),
|
|
# abs val decreasing (F), non-strictly increasing (T)
|
|
([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
|
|
# Test with inf vals
|
|
(
|
|
[np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
|
|
[True, True, False, True],
|
|
),
|
|
# Test with nan vals; should always be False
|
|
(
|
|
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
|
|
[False, False, False, False],
|
|
),
|
|
],
|
|
)
|
|
def test_is_monotonic_decreasing(in_vals, out_vals):
|
|
# GH 17015
|
|
source_dict = {
|
|
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
|
|
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
|
|
"C": in_vals,
|
|
}
|
|
|
|
df = pd.DataFrame(source_dict)
|
|
result = df.groupby("B").C.is_monotonic_decreasing
|
|
index = Index(list("abcd"), name="B")
|
|
expected = pd.Series(index=index, data=out_vals, name="C")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
# describe
|
|
# --------------------------------
|
|
|
|
|
|
def test_apply_describe_bug(mframe):
|
|
grouped = mframe.groupby(level="first")
|
|
grouped.describe() # it works!
|
|
|
|
|
|
def test_series_describe_multikey():
|
|
ts = tm.makeTimeSeries()
|
|
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
|
|
result = grouped.describe()
|
|
tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
|
|
tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
|
|
tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
|
|
|
|
|
|
def test_series_describe_single():
|
|
ts = tm.makeTimeSeries()
|
|
grouped = ts.groupby(lambda x: x.month)
|
|
result = grouped.apply(lambda x: x.describe())
|
|
expected = grouped.describe().stack()
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_series_index_name(df):
|
|
grouped = df.loc[:, ["C"]].groupby(df["A"])
|
|
result = grouped.agg(lambda x: x.mean())
|
|
assert result.index.name == "A"
|
|
|
|
|
|
def test_frame_describe_multikey(tsframe):
|
|
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
|
|
result = grouped.describe()
|
|
desc_groups = []
|
|
for col in tsframe:
|
|
group = grouped[col].describe()
|
|
# GH 17464 - Remove duplicate MultiIndex levels
|
|
group_col = pd.MultiIndex(
|
|
levels=[[col], group.columns],
|
|
codes=[[0] * len(group.columns), range(len(group.columns))],
|
|
)
|
|
group = pd.DataFrame(group.values, columns=group_col, index=group.index)
|
|
desc_groups.append(group)
|
|
expected = pd.concat(desc_groups, axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
|
|
result = groupedT.describe()
|
|
expected = tsframe.describe().T
|
|
expected.index = pd.MultiIndex(
|
|
levels=[[0, 1], expected.index],
|
|
codes=[[0, 0, 1, 1], range(len(expected.index))],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_frame_describe_tupleindex():
|
|
|
|
# GH 14848 - regression from 0.19.0 to 0.19.1
|
|
df1 = DataFrame(
|
|
{
|
|
"x": [1, 2, 3, 4, 5] * 3,
|
|
"y": [10, 20, 30, 40, 50] * 3,
|
|
"z": [100, 200, 300, 400, 500] * 3,
|
|
}
|
|
)
|
|
df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
|
|
df2 = df1.rename(columns={"k": "key"})
|
|
msg = "Names should be list-like for a MultiIndex"
|
|
with pytest.raises(ValueError, match=msg):
|
|
df1.groupby("k").describe()
|
|
with pytest.raises(ValueError, match=msg):
|
|
df2.groupby("key").describe()
|
|
|
|
|
|
def test_frame_describe_unstacked_format():
|
|
# GH 4792
|
|
prices = {
|
|
pd.Timestamp("2011-01-06 10:59:05", tz=None): 24990,
|
|
pd.Timestamp("2011-01-06 12:43:33", tz=None): 25499,
|
|
pd.Timestamp("2011-01-06 12:54:09", tz=None): 25499,
|
|
}
|
|
volumes = {
|
|
pd.Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
|
|
pd.Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
|
|
pd.Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
|
|
}
|
|
df = pd.DataFrame({"PRICE": prices, "VOLUME": volumes})
|
|
result = df.groupby("PRICE").VOLUME.describe()
|
|
data = [
|
|
df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
|
|
df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
|
|
]
|
|
expected = pd.DataFrame(
|
|
data,
|
|
index=pd.Index([24990, 25499], name="PRICE"),
|
|
columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
# nunique
|
|
# --------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
|
|
@pytest.mark.parametrize("m", [10, 100, 1000])
|
|
@pytest.mark.parametrize("sort", [False, True])
|
|
@pytest.mark.parametrize("dropna", [False, True])
|
|
def test_series_groupby_nunique(n, m, sort, dropna):
|
|
def check_nunique(df, keys, as_index=True):
|
|
original_df = df.copy()
|
|
gr = df.groupby(keys, as_index=as_index, sort=sort)
|
|
left = gr["julie"].nunique(dropna=dropna)
|
|
|
|
gr = df.groupby(keys, as_index=as_index, sort=sort)
|
|
right = gr["julie"].apply(Series.nunique, dropna=dropna)
|
|
if not as_index:
|
|
right = right.reset_index(drop=True)
|
|
|
|
tm.assert_series_equal(left, right, check_names=False)
|
|
tm.assert_frame_equal(df, original_df)
|
|
|
|
days = date_range("2015-08-23", periods=10)
|
|
|
|
frame = DataFrame(
|
|
{
|
|
"jim": np.random.choice(list(ascii_lowercase), n),
|
|
"joe": np.random.choice(days, n),
|
|
"julie": np.random.randint(0, m, n),
|
|
}
|
|
)
|
|
|
|
check_nunique(frame, ["jim"])
|
|
check_nunique(frame, ["jim", "joe"])
|
|
|
|
frame.loc[1::17, "jim"] = None
|
|
frame.loc[3::37, "joe"] = None
|
|
frame.loc[7::19, "julie"] = None
|
|
frame.loc[8::19, "julie"] = None
|
|
frame.loc[9::19, "julie"] = None
|
|
|
|
check_nunique(frame, ["jim"])
|
|
check_nunique(frame, ["jim", "joe"])
|
|
check_nunique(frame, ["jim"], as_index=False)
|
|
check_nunique(frame, ["jim", "joe"], as_index=False)
|
|
|
|
|
|
def test_nunique():
|
|
df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
|
|
|
|
expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
|
|
result = df.groupby("A", as_index=False).nunique()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# as_index
|
|
expected.index = list("abc")
|
|
expected.index.name = "A"
|
|
result = df.groupby("A").nunique()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# with na
|
|
result = df.replace({"x": None}).groupby("A").nunique(dropna=False)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# dropna
|
|
expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
|
|
expected.index.name = "A"
|
|
result = df.replace({"x": None}).groupby("A").nunique()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_nunique_with_object():
|
|
# GH 11077
|
|
data = pd.DataFrame(
|
|
[
|
|
[100, 1, "Alice"],
|
|
[200, 2, "Bob"],
|
|
[300, 3, "Charlie"],
|
|
[-400, 4, "Dan"],
|
|
[500, 5, "Edith"],
|
|
],
|
|
columns=["amount", "id", "name"],
|
|
)
|
|
|
|
result = data.groupby(["id", "amount"])["name"].nunique()
|
|
index = MultiIndex.from_arrays([data.id, data.amount])
|
|
expected = pd.Series([1] * 5, name="name", index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_nunique_with_empty_series():
|
|
# GH 12553
|
|
data = pd.Series(name="name", dtype=object)
|
|
result = data.groupby(level=0).nunique()
|
|
expected = pd.Series(name="name", dtype="int64")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_nunique_with_timegrouper():
|
|
# GH 13453
|
|
test = pd.DataFrame(
|
|
{
|
|
"time": [
|
|
Timestamp("2016-06-28 09:35:35"),
|
|
Timestamp("2016-06-28 16:09:30"),
|
|
Timestamp("2016-06-28 16:46:28"),
|
|
],
|
|
"data": ["1", "2", "3"],
|
|
}
|
|
).set_index("time")
|
|
result = test.groupby(pd.Grouper(freq="h"))["data"].nunique()
|
|
expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(pd.Series.nunique)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"key, data, dropna, expected",
|
|
[
|
|
(
|
|
["x", "x", "x"],
|
|
[Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")],
|
|
True,
|
|
Series([1], index=pd.Index(["x"], name="key"), name="data"),
|
|
),
|
|
(
|
|
["x", "x", "x"],
|
|
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
|
|
True,
|
|
Series([1], index=pd.Index(["x"], name="key"), name="data"),
|
|
),
|
|
(
|
|
["x", "x", "x", "y", "y"],
|
|
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
|
|
False,
|
|
Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
|
|
),
|
|
(
|
|
["x", "x", "x", "x", "y"],
|
|
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
|
|
False,
|
|
Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
|
|
),
|
|
],
|
|
)
|
|
def test_nunique_with_NaT(key, data, dropna, expected):
|
|
# GH 27951
|
|
df = pd.DataFrame({"key": key, "data": data})
|
|
result = df.groupby(["key"])["data"].nunique(dropna=dropna)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_nunique_preserves_column_level_names():
|
|
# GH 23222
|
|
test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0"))
|
|
result = test.groupby([0, 0, 0]).nunique()
|
|
expected = pd.DataFrame([2], columns=test.columns)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
# count
|
|
# --------------------------------
|
|
|
|
|
|
def test_groupby_timedelta_cython_count():
|
|
df = DataFrame(
|
|
{"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")}
|
|
)
|
|
expected = Series([2, 2], index=pd.Index(["a", "b"], name="g"), name="delt")
|
|
result = df.groupby("g").delt.count()
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
|
|
def test_count():
|
|
n = 1 << 15
|
|
dr = date_range("2015-08-30", periods=n // 10, freq="T")
|
|
|
|
df = DataFrame(
|
|
{
|
|
"1st": np.random.choice(list(ascii_lowercase), n),
|
|
"2nd": np.random.randint(0, 5, n),
|
|
"3rd": np.random.randn(n).round(3),
|
|
"4th": np.random.randint(-10, 10, n),
|
|
"5th": np.random.choice(dr, n),
|
|
"6th": np.random.randn(n).round(3),
|
|
"7th": np.random.randn(n).round(3),
|
|
"8th": np.random.choice(dr, n) - np.random.choice(dr, 1),
|
|
"9th": np.random.choice(list(ascii_lowercase), n),
|
|
}
|
|
)
|
|
|
|
for col in df.columns.drop(["1st", "2nd", "4th"]):
|
|
df.loc[np.random.choice(n, n // 10), col] = np.nan
|
|
|
|
df["9th"] = df["9th"].astype("category")
|
|
|
|
for key in ["1st", "2nd", ["1st", "2nd"]]:
|
|
left = df.groupby(key).count()
|
|
right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
|
|
tm.assert_frame_equal(left, right)
|
|
|
|
|
|
def test_count_non_nulls():
|
|
# GH#5610
|
|
# count counts non-nulls
|
|
df = pd.DataFrame(
|
|
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
|
|
columns=["A", "B", "C"],
|
|
)
|
|
|
|
count_as = df.groupby("A").count()
|
|
count_not_as = df.groupby("A", as_index=False).count()
|
|
|
|
expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
|
|
expected.index.name = "A"
|
|
tm.assert_frame_equal(count_not_as, expected.reset_index())
|
|
tm.assert_frame_equal(count_as, expected)
|
|
|
|
count_B = df.groupby("A")["B"].count()
|
|
tm.assert_series_equal(count_B, expected["B"])
|
|
|
|
|
|
def test_count_object():
|
|
df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
|
|
result = df.groupby("c").a.count()
|
|
expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
|
|
result = df.groupby("c").a.count()
|
|
expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_count_cross_type():
|
|
# GH8169
|
|
vals = np.hstack(
|
|
(np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2)))
|
|
)
|
|
|
|
df = pd.DataFrame(vals, columns=["a", "b", "c", "d"])
|
|
df[df == 2] = np.nan
|
|
expected = df.groupby(["c", "d"]).count()
|
|
|
|
for t in ["float32", "object"]:
|
|
df["a"] = df["a"].astype(t)
|
|
df["b"] = df["b"].astype(t)
|
|
result = df.groupby(["c", "d"]).count()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_lower_int_prec_count():
|
|
df = DataFrame(
|
|
{
|
|
"a": np.array([0, 1, 2, 100], np.int8),
|
|
"b": np.array([1, 2, 3, 6], np.uint32),
|
|
"c": np.array([4, 5, 6, 8], np.int16),
|
|
"grp": list("ab" * 2),
|
|
}
|
|
)
|
|
result = df.groupby("grp").count()
|
|
expected = DataFrame(
|
|
{"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=pd.Index(list("ab"), name="grp")
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_count_uses_size_on_exception():
|
|
class RaisingObjectException(Exception):
|
|
pass
|
|
|
|
class RaisingObject:
|
|
def __init__(self, msg="I will raise inside Cython"):
|
|
super().__init__()
|
|
self.msg = msg
|
|
|
|
def __eq__(self, other):
|
|
# gets called in Cython to check that raising calls the method
|
|
raise RaisingObjectException(self.msg)
|
|
|
|
df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
|
|
result = df.groupby("grp").count()
|
|
expected = DataFrame({"a": [2, 2]}, index=pd.Index(list("ab"), name="grp"))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
# size
|
|
# --------------------------------
|
|
|
|
|
|
def test_size(df):
|
|
grouped = df.groupby(["A", "B"])
|
|
result = grouped.size()
|
|
for key, group in grouped:
|
|
assert result[key] == len(group)
|
|
|
|
grouped = df.groupby("A")
|
|
result = grouped.size()
|
|
for key, group in grouped:
|
|
assert result[key] == len(group)
|
|
|
|
grouped = df.groupby("B")
|
|
result = grouped.size()
|
|
for key, group in grouped:
|
|
assert result[key] == len(group)
|
|
|
|
df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("abc"))
|
|
for sort, key in product((False, True), ("a", "b", ["a", "b"])):
|
|
left = df.groupby(key, sort=sort).size()
|
|
right = df.groupby(key, sort=sort)["c"].apply(lambda a: a.shape[0])
|
|
tm.assert_series_equal(left, right, check_names=False)
|
|
|
|
# GH11699
|
|
df = DataFrame(columns=["A", "B"])
|
|
out = Series(dtype="int64", index=Index([], name="A"))
|
|
tm.assert_series_equal(df.groupby("A").size(), out)
|
|
|
|
|
|
def test_size_groupby_all_null():
|
|
# GH23050
|
|
# Assert no 'Value Error : Length of passed values is 2, index implies 0'
|
|
df = DataFrame({"A": [None, None]}) # all-null groups
|
|
result = df.groupby("A").size()
|
|
expected = Series(dtype="int64", index=Index([], name="A"))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
# quantile
|
|
# --------------------------------
|
|
@pytest.mark.parametrize(
|
|
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
|
|
)
|
|
@pytest.mark.parametrize(
|
|
"a_vals,b_vals",
|
|
[
|
|
# Ints
|
|
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
|
|
([1, 2, 3, 4], [4, 3, 2, 1]),
|
|
([1, 2, 3, 4, 5], [4, 3, 2, 1]),
|
|
# Floats
|
|
([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
|
|
# Missing data
|
|
([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
|
|
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
|
|
# Timestamps
|
|
(
|
|
list(pd.date_range("1/1/18", freq="D", periods=5)),
|
|
list(pd.date_range("1/1/18", freq="D", periods=5))[::-1],
|
|
),
|
|
# All NA
|
|
([np.nan] * 5, [np.nan] * 5),
|
|
],
|
|
)
|
|
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
|
|
def test_quantile(interpolation, a_vals, b_vals, q):
|
|
if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]:
|
|
pytest.skip(
|
|
"Unclear numpy expectation for nearest result with equidistant data"
|
|
)
|
|
|
|
a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
|
|
b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
|
|
|
|
df = DataFrame(
|
|
{"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals}
|
|
)
|
|
|
|
expected = DataFrame(
|
|
[a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
|
|
)
|
|
result = df.groupby("key").quantile(q, interpolation=interpolation)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_quantile_array():
|
|
# https://github.com/pandas-dev/pandas/issues/27526
|
|
df = pd.DataFrame({"A": [0, 1, 2, 3, 4]})
|
|
result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25])
|
|
|
|
index = pd.MultiIndex.from_product([[0, 1], [0.25]])
|
|
expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
|
|
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
|
|
|
|
result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75])
|
|
expected = pd.DataFrame(
|
|
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_quantile_array2():
|
|
# https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
|
|
df = pd.DataFrame(
|
|
np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC")
|
|
)
|
|
result = df.groupby("A").quantile([0.3, 0.7])
|
|
expected = pd.DataFrame(
|
|
{
|
|
"B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0],
|
|
"C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0],
|
|
},
|
|
index=pd.MultiIndex.from_product(
|
|
[[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
|
|
),
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_quantile_array_no_sort():
|
|
df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
|
|
result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75])
|
|
expected = pd.DataFrame(
|
|
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
|
|
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25])
|
|
expected = pd.DataFrame(
|
|
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
|
|
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_quantile_array_multiple_levels():
|
|
df = pd.DataFrame(
|
|
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
|
|
)
|
|
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
|
|
index = pd.MultiIndex.from_tuples(
|
|
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
|
|
names=["c", "d", None],
|
|
)
|
|
expected = pd.DataFrame(
|
|
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
|
|
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
|
|
@pytest.mark.parametrize("q", [[0.5, 0.6]])
|
|
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
|
|
# GH30289
|
|
nrow, ncol = frame_size
|
|
df = pd.DataFrame(
|
|
np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol)
|
|
)
|
|
|
|
idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q]
|
|
idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
|
|
list(range(len(q))) * min(nrow, 4)
|
|
]
|
|
expected_index = pd.MultiIndex(
|
|
levels=idx_levels, codes=idx_codes, names=groupby + [None]
|
|
)
|
|
expected_values = [
|
|
[float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
|
|
]
|
|
expected_columns = [x for x in range(ncol) if x not in groupby]
|
|
expected = pd.DataFrame(
|
|
expected_values, index=expected_index, columns=expected_columns
|
|
)
|
|
result = df.groupby(groupby).quantile(q)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_quantile_raises():
|
|
df = pd.DataFrame(
|
|
[["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]
|
|
)
|
|
|
|
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
|
|
df.groupby("key").quantile()
|
|
|
|
|
|
def test_quantile_out_of_bounds_q_raises():
|
|
# https://github.com/pandas-dev/pandas/issues/27470
|
|
df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6)))
|
|
g = df.groupby([0, 0, 0, 1, 1, 1])
|
|
with pytest.raises(ValueError, match="Got '50.0' instead"):
|
|
g.quantile(50)
|
|
|
|
with pytest.raises(ValueError, match="Got '-1.0' instead"):
|
|
g.quantile(-1)
|
|
|
|
|
|
def test_quantile_missing_group_values_no_segfaults():
|
|
# GH 28662
|
|
data = np.array([1.0, np.nan, 1.0])
|
|
df = pd.DataFrame(dict(key=data, val=range(3)))
|
|
|
|
# Random segfaults; would have been guaranteed in loop
|
|
grp = df.groupby("key")
|
|
for _ in range(100):
|
|
grp.quantile()
|
|
|
|
|
|
def test_quantile_missing_group_values_correct_results():
|
|
# GH 28662
|
|
data = np.array([1.0, np.nan, 3.0, np.nan])
|
|
df = pd.DataFrame(dict(key=data, val=range(4)))
|
|
|
|
result = df.groupby("key").quantile()
|
|
expected = pd.DataFrame(
|
|
[1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
# pipe
|
|
# --------------------------------
|
|
|
|
|
|
def test_pipe():
|
|
# Test the pipe method of DataFrameGroupBy.
|
|
# Issue #17871
|
|
|
|
random_state = np.random.RandomState(1234567890)
|
|
|
|
df = DataFrame(
|
|
{
|
|
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
|
|
"B": random_state.randn(8),
|
|
"C": random_state.randn(8),
|
|
}
|
|
)
|
|
|
|
def f(dfgb):
|
|
return dfgb.B.max() - dfgb.C.min().min()
|
|
|
|
def square(srs):
|
|
return srs ** 2
|
|
|
|
# Note that the transformations are
|
|
# GroupBy -> Series
|
|
# Series -> Series
|
|
# This then chains the GroupBy.pipe and the
|
|
# NDFrame.pipe methods
|
|
result = df.groupby("A").pipe(f).pipe(square)
|
|
|
|
index = Index(["bar", "foo"], dtype="object", name="A")
|
|
expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index)
|
|
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
|
|
def test_pipe_args():
|
|
# Test passing args to the pipe method of DataFrameGroupBy.
|
|
# Issue #17871
|
|
|
|
df = pd.DataFrame(
|
|
{
|
|
"group": ["A", "A", "B", "B", "C"],
|
|
"x": [1.0, 2.0, 3.0, 2.0, 5.0],
|
|
"y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
|
|
}
|
|
)
|
|
|
|
def f(dfgb, arg1):
|
|
return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby(
|
|
dfgb.grouper
|
|
)
|
|
|
|
def g(dfgb, arg2):
|
|
return dfgb.sum() / dfgb.sum().sum() + arg2
|
|
|
|
def h(df, arg3):
|
|
return df.x + df.y - arg3
|
|
|
|
result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)
|
|
|
|
# Assert the results here
|
|
index = pd.Index(["A", "B", "C"], name="group")
|
|
expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index)
|
|
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
# test SeriesGroupby.pipe
|
|
ser = pd.Series([1, 1, 2, 2, 3, 3])
|
|
result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
|
|
|
|
expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3]))
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_mean_no_overflow():
|
|
# Regression test for (#22487)
|
|
df = pd.DataFrame(
|
|
{
|
|
"user": ["A", "A", "A", "A", "A"],
|
|
"connections": [4970, 4749, 4719, 4704, 18446744073699999744],
|
|
}
|
|
)
|
|
assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"values",
|
|
[
|
|
{
|
|
"a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
|
"b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
|
|
},
|
|
{"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
|
|
],
|
|
)
|
|
@pytest.mark.parametrize("function", ["mean", "median", "var"])
|
|
def test_apply_to_nullable_integer_returns_float(values, function):
|
|
# https://github.com/pandas-dev/pandas/issues/32219
|
|
output = 0.5 if function == "var" else 1.5
|
|
arr = np.array([output] * 3, dtype=float)
|
|
idx = pd.Index([1, 2, 3], dtype=object, name="a")
|
|
expected = pd.DataFrame({"b": arr}, index=idx)
|
|
|
|
groups = pd.DataFrame(values, dtype="Int64").groupby("a")
|
|
|
|
result = getattr(groups, function)()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = groups.agg(function)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = groups.agg([function])
|
|
expected.columns = MultiIndex.from_tuples([("b", function)])
|
|
tm.assert_frame_equal(result, expected)
|