1070 lines
33 KiB
Python
1070 lines
33 KiB
Python
|
"""
|
||
|
these are systematically testing all of the args to value_counts
|
||
|
with different size combinations. This is to ensure stability of the sorting
|
||
|
and proper parameter handling
|
||
|
"""
|
||
|
|
||
|
from itertools import product
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
CategoricalIndex,
|
||
|
DataFrame,
|
||
|
Grouper,
|
||
|
Index,
|
||
|
MultiIndex,
|
||
|
Series,
|
||
|
date_range,
|
||
|
to_datetime,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
def tests_value_counts_index_names_category_column():
|
||
|
# GH44324 Missing name of index category column
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"gender": ["female"],
|
||
|
"country": ["US"],
|
||
|
}
|
||
|
)
|
||
|
df["gender"] = df["gender"].astype("category")
|
||
|
result = df.groupby("country")["gender"].value_counts()
|
||
|
|
||
|
# Construct expected, very specific multiindex
|
||
|
df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
|
||
|
df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
|
||
|
mi_expected = MultiIndex.from_frame(df_mi_expected)
|
||
|
expected = Series([1], index=mi_expected, name="count")
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
# our starting frame
|
||
|
def seed_df(seed_nans, n, m):
|
||
|
np.random.seed(1234)
|
||
|
days = date_range("2015-08-24", periods=10)
|
||
|
|
||
|
frame = DataFrame(
|
||
|
{
|
||
|
"1st": np.random.choice(list("abcd"), n),
|
||
|
"2nd": np.random.choice(days, n),
|
||
|
"3rd": np.random.randint(1, m + 1, n),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
if seed_nans:
|
||
|
# Explicitly cast to float to avoid implicit cast when setting nan
|
||
|
frame["3rd"] = frame["3rd"].astype("float")
|
||
|
frame.loc[1::11, "1st"] = np.nan
|
||
|
frame.loc[3::17, "2nd"] = np.nan
|
||
|
frame.loc[7::19, "3rd"] = np.nan
|
||
|
frame.loc[8::19, "3rd"] = np.nan
|
||
|
frame.loc[9::19, "3rd"] = np.nan
|
||
|
|
||
|
return frame
|
||
|
|
||
|
|
||
|
# create input df, keys, and the bins
|
||
|
binned = []
|
||
|
ids = []
|
||
|
for seed_nans in [True, False]:
|
||
|
for n, m in product((100, 1000), (5, 20)):
|
||
|
df = seed_df(seed_nans, n, m)
|
||
|
bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
|
||
|
keys = "1st", "2nd", ["1st", "2nd"]
|
||
|
for k, b in product(keys, bins):
|
||
|
binned.append((df, k, b, n, m))
|
||
|
ids.append(f"{k}-{n}-{m}")
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
|
||
|
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
|
||
|
@pytest.mark.parametrize("isort", [True, False])
|
||
|
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
|
||
|
@pytest.mark.parametrize("sort", [True, False])
|
||
|
@pytest.mark.parametrize("ascending", [True, False])
|
||
|
@pytest.mark.parametrize("dropna", [True, False])
|
||
|
def test_series_groupby_value_counts(
|
||
|
df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna
|
||
|
):
|
||
|
def rebuild_index(df):
|
||
|
arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
|
||
|
df.index = MultiIndex.from_arrays(arr, names=df.index.names)
|
||
|
return df
|
||
|
|
||
|
kwargs = {
|
||
|
"normalize": normalize,
|
||
|
"sort": sort,
|
||
|
"ascending": ascending,
|
||
|
"dropna": dropna,
|
||
|
"bins": bins,
|
||
|
}
|
||
|
|
||
|
gr = df.groupby(keys, sort=isort)
|
||
|
left = gr["3rd"].value_counts(**kwargs)
|
||
|
|
||
|
gr = df.groupby(keys, sort=isort)
|
||
|
right = gr["3rd"].apply(Series.value_counts, **kwargs)
|
||
|
right.index.names = right.index.names[:-1] + ["3rd"]
|
||
|
# https://github.com/pandas-dev/pandas/issues/49909
|
||
|
right = right.rename(name)
|
||
|
|
||
|
# have to sort on index because of unstable sort on values
|
||
|
left, right = map(rebuild_index, (left, right)) # xref GH9212
|
||
|
tm.assert_series_equal(left.sort_index(), right.sort_index())
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("utc", [True, False])
|
||
|
def test_series_groupby_value_counts_with_grouper(utc):
|
||
|
# GH28479
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"Timestamp": [
|
||
|
1565083561,
|
||
|
1565083561 + 86400,
|
||
|
1565083561 + 86500,
|
||
|
1565083561 + 86400 * 2,
|
||
|
1565083561 + 86400 * 3,
|
||
|
1565083561 + 86500 * 3,
|
||
|
1565083561 + 86400 * 4,
|
||
|
],
|
||
|
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
|
||
|
}
|
||
|
).drop([3])
|
||
|
|
||
|
df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
|
||
|
dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
|
||
|
|
||
|
# have to sort on index because of unstable sort on values xref GH9212
|
||
|
result = dfg["Food"].value_counts().sort_index()
|
||
|
expected = dfg["Food"].apply(Series.value_counts).sort_index()
|
||
|
expected.index.names = result.index.names
|
||
|
# https://github.com/pandas-dev/pandas/issues/49909
|
||
|
expected = expected.rename("count")
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
|
||
|
def test_series_groupby_value_counts_empty(columns):
|
||
|
# GH39172
|
||
|
df = DataFrame(columns=columns)
|
||
|
dfg = df.groupby(columns[:-1])
|
||
|
|
||
|
result = dfg[columns[-1]].value_counts()
|
||
|
expected = Series([], dtype=result.dtype, name="count")
|
||
|
expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
|
||
|
def test_series_groupby_value_counts_one_row(columns):
|
||
|
# GH42618
|
||
|
df = DataFrame(data=[range(len(columns))], columns=columns)
|
||
|
dfg = df.groupby(columns[:-1])
|
||
|
|
||
|
result = dfg[columns[-1]].value_counts()
|
||
|
expected = df.value_counts()
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_series_groupby_value_counts_on_categorical():
|
||
|
# GH38672
|
||
|
|
||
|
s = Series(Categorical(["a"], categories=["a", "b"]))
|
||
|
result = s.groupby([0]).value_counts()
|
||
|
|
||
|
expected = Series(
|
||
|
data=[1, 0],
|
||
|
index=MultiIndex.from_arrays(
|
||
|
[
|
||
|
np.array([0, 0]),
|
||
|
CategoricalIndex(
|
||
|
["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
|
||
|
),
|
||
|
]
|
||
|
),
|
||
|
name="count",
|
||
|
)
|
||
|
|
||
|
# Expected:
|
||
|
# 0 a 1
|
||
|
# b 0
|
||
|
# dtype: int64
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_series_groupby_value_counts_no_sort():
|
||
|
# GH#50482
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"gender": ["male", "male", "female", "male", "female", "male"],
|
||
|
"education": ["low", "medium", "high", "low", "high", "low"],
|
||
|
"country": ["US", "FR", "US", "FR", "FR", "FR"],
|
||
|
}
|
||
|
)
|
||
|
gb = df.groupby(["country", "gender"], sort=False)["education"]
|
||
|
result = gb.value_counts(sort=False)
|
||
|
index = MultiIndex(
|
||
|
levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
|
||
|
codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
|
||
|
names=["country", "gender", "education"],
|
||
|
)
|
||
|
expected = Series([1, 1, 1, 2, 1], index=index, name="count")
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def education_df():
|
||
|
return DataFrame(
|
||
|
{
|
||
|
"gender": ["male", "male", "female", "male", "female", "male"],
|
||
|
"education": ["low", "medium", "high", "low", "high", "low"],
|
||
|
"country": ["US", "FR", "US", "FR", "FR", "FR"],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
|
||
|
def test_axis(education_df):
|
||
|
gp = education_df.groupby("country", axis=1)
|
||
|
with pytest.raises(NotImplementedError, match="axis"):
|
||
|
gp.value_counts()
|
||
|
|
||
|
|
||
|
def test_bad_subset(education_df):
|
||
|
gp = education_df.groupby("country")
|
||
|
with pytest.raises(ValueError, match="subset"):
|
||
|
gp.value_counts(subset=["country"])
|
||
|
|
||
|
|
||
|
def test_basic(education_df):
|
||
|
# gh43564
|
||
|
result = education_df.groupby("country")[["gender", "education"]].value_counts(
|
||
|
normalize=True
|
||
|
)
|
||
|
expected = Series(
|
||
|
data=[0.5, 0.25, 0.25, 0.5, 0.5],
|
||
|
index=MultiIndex.from_tuples(
|
||
|
[
|
||
|
("FR", "male", "low"),
|
||
|
("FR", "female", "high"),
|
||
|
("FR", "male", "medium"),
|
||
|
("US", "female", "high"),
|
||
|
("US", "male", "low"),
|
||
|
],
|
||
|
names=["country", "gender", "education"],
|
||
|
),
|
||
|
name="proportion",
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def _frame_value_counts(df, keys, normalize, sort, ascending):
|
||
|
return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
|
||
|
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
|
||
|
@pytest.mark.parametrize(
|
||
|
"sort, ascending",
|
||
|
[
|
||
|
(False, None),
|
||
|
(True, True),
|
||
|
(True, False),
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize("as_index", [True, False])
|
||
|
@pytest.mark.parametrize("frame", [True, False])
|
||
|
def test_against_frame_and_seriesgroupby(
|
||
|
education_df, groupby, normalize, name, sort, ascending, as_index, frame
|
||
|
):
|
||
|
# test all parameters:
|
||
|
# - Use column, array or function as by= parameter
|
||
|
# - Whether or not to normalize
|
||
|
# - Whether or not to sort and how
|
||
|
# - Whether or not to use the groupby as an index
|
||
|
# - 3-way compare against:
|
||
|
# - apply with :meth:`~DataFrame.value_counts`
|
||
|
# - `~SeriesGroupBy.value_counts`
|
||
|
by = {
|
||
|
"column": "country",
|
||
|
"array": education_df["country"].values,
|
||
|
"function": lambda x: education_df["country"][x] == "US",
|
||
|
}[groupby]
|
||
|
|
||
|
gp = education_df.groupby(by=by, as_index=as_index)
|
||
|
result = gp[["gender", "education"]].value_counts(
|
||
|
normalize=normalize, sort=sort, ascending=ascending
|
||
|
)
|
||
|
if frame:
|
||
|
# compare against apply with DataFrame value_counts
|
||
|
expected = gp.apply(
|
||
|
_frame_value_counts, ["gender", "education"], normalize, sort, ascending
|
||
|
)
|
||
|
|
||
|
if as_index:
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
name = "proportion" if normalize else "count"
|
||
|
expected = expected.reset_index().rename({0: name}, axis=1)
|
||
|
if groupby == "column":
|
||
|
expected = expected.rename({"level_0": "country"}, axis=1)
|
||
|
expected["country"] = np.where(expected["country"], "US", "FR")
|
||
|
elif groupby == "function":
|
||
|
expected["level_0"] = expected["level_0"] == 1
|
||
|
else:
|
||
|
expected["level_0"] = np.where(expected["level_0"], "US", "FR")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
else:
|
||
|
# compare against SeriesGroupBy value_counts
|
||
|
education_df["both"] = education_df["gender"] + "-" + education_df["education"]
|
||
|
expected = gp["both"].value_counts(
|
||
|
normalize=normalize, sort=sort, ascending=ascending
|
||
|
)
|
||
|
expected.name = name
|
||
|
if as_index:
|
||
|
index_frame = expected.index.to_frame(index=False)
|
||
|
index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
|
||
|
index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
|
||
|
del index_frame["both"]
|
||
|
index_frame = index_frame.rename({0: None}, axis=1)
|
||
|
expected.index = MultiIndex.from_frame(index_frame)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
|
||
|
expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
|
||
|
del expected["both"]
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("normalize", [True, False])
|
||
|
@pytest.mark.parametrize(
|
||
|
"sort, ascending, expected_rows, expected_count, expected_group_size",
|
||
|
[
|
||
|
(False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
|
||
|
(True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
|
||
|
(True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
|
||
|
],
|
||
|
)
|
||
|
def test_compound(
|
||
|
education_df,
|
||
|
normalize,
|
||
|
sort,
|
||
|
ascending,
|
||
|
expected_rows,
|
||
|
expected_count,
|
||
|
expected_group_size,
|
||
|
):
|
||
|
# Multiple groupby keys and as_index=False
|
||
|
gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
|
||
|
result = gp["education"].value_counts(
|
||
|
normalize=normalize, sort=sort, ascending=ascending
|
||
|
)
|
||
|
expected = DataFrame()
|
||
|
for column in ["country", "gender", "education"]:
|
||
|
expected[column] = [education_df[column][row] for row in expected_rows]
|
||
|
if normalize:
|
||
|
expected["proportion"] = expected_count
|
||
|
expected["proportion"] /= expected_group_size
|
||
|
else:
|
||
|
expected["count"] = expected_count
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def animals_df():
|
||
|
return DataFrame(
|
||
|
{"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
|
||
|
index=["falcon", "dog", "cat", "ant"],
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"sort, ascending, normalize, name, expected_data, expected_index",
|
||
|
[
|
||
|
(False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
|
||
|
(True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
|
||
|
(True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
|
||
|
(
|
||
|
True,
|
||
|
False,
|
||
|
True,
|
||
|
"proportion",
|
||
|
[0.5, 0.25, 0.25],
|
||
|
[(1, 1, 1), (4, 2, 6), (0, 2, 0)],
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_data_frame_value_counts(
|
||
|
animals_df, sort, ascending, normalize, name, expected_data, expected_index
|
||
|
):
|
||
|
# 3-way compare with :meth:`~DataFrame.value_counts`
|
||
|
# Tests from frame/methods/test_value_counts.py
|
||
|
result_frame = animals_df.value_counts(
|
||
|
sort=sort, ascending=ascending, normalize=normalize
|
||
|
)
|
||
|
expected = Series(
|
||
|
data=expected_data,
|
||
|
index=MultiIndex.from_arrays(
|
||
|
expected_index, names=["key", "num_legs", "num_wings"]
|
||
|
),
|
||
|
name=name,
|
||
|
)
|
||
|
tm.assert_series_equal(result_frame, expected)
|
||
|
|
||
|
result_frame_groupby = animals_df.groupby("key").value_counts(
|
||
|
sort=sort, ascending=ascending, normalize=normalize
|
||
|
)
|
||
|
|
||
|
tm.assert_series_equal(result_frame_groupby, expected)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def nulls_df():
|
||
|
n = np.nan
|
||
|
return DataFrame(
|
||
|
{
|
||
|
"A": [1, 1, n, 4, n, 6, 6, 6, 6],
|
||
|
"B": [1, 1, 3, n, n, 6, 6, 6, 6],
|
||
|
"C": [1, 2, 3, 4, 5, 6, n, 8, n],
|
||
|
"D": [1, 2, 3, 4, 5, 6, 7, n, n],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"group_dropna, count_dropna, expected_rows, expected_values",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
False,
|
||
|
[0, 1, 3, 5, 7, 6, 8, 2, 4],
|
||
|
[0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
|
||
|
),
|
||
|
(False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
|
||
|
(True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
|
||
|
(True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
|
||
|
],
|
||
|
)
|
||
|
def test_dropna_combinations(
|
||
|
nulls_df, group_dropna, count_dropna, expected_rows, expected_values
|
||
|
):
|
||
|
gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
|
||
|
result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)
|
||
|
columns = DataFrame()
|
||
|
for column in nulls_df.columns:
|
||
|
columns[column] = [nulls_df[column][row] for row in expected_rows]
|
||
|
index = MultiIndex.from_frame(columns)
|
||
|
expected = Series(data=expected_values, index=index, name="proportion")
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.fixture
|
||
|
def names_with_nulls_df(nulls_fixture):
|
||
|
return DataFrame(
|
||
|
{
|
||
|
"key": [1, 1, 1, 1],
|
||
|
"first_name": ["John", "Anne", "John", "Beth"],
|
||
|
"middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
|
||
|
},
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"dropna, expected_data, expected_index",
|
||
|
[
|
||
|
(
|
||
|
True,
|
||
|
[1, 1],
|
||
|
MultiIndex.from_arrays(
|
||
|
[(1, 1), ("Beth", "John"), ("Louise", "Smith")],
|
||
|
names=["key", "first_name", "middle_name"],
|
||
|
),
|
||
|
),
|
||
|
(
|
||
|
False,
|
||
|
[1, 1, 1, 1],
|
||
|
MultiIndex(
|
||
|
levels=[
|
||
|
Index([1]),
|
||
|
Index(["Anne", "Beth", "John"]),
|
||
|
Index(["Louise", "Smith", np.nan]),
|
||
|
],
|
||
|
codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
|
||
|
names=["key", "first_name", "middle_name"],
|
||
|
),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")])
|
||
|
def test_data_frame_value_counts_dropna(
|
||
|
names_with_nulls_df, dropna, normalize, name, expected_data, expected_index
|
||
|
):
|
||
|
# GH 41334
|
||
|
# 3-way compare with :meth:`~DataFrame.value_counts`
|
||
|
# Tests with nulls from frame/methods/test_value_counts.py
|
||
|
result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
|
||
|
expected = Series(
|
||
|
data=expected_data,
|
||
|
index=expected_index,
|
||
|
name=name,
|
||
|
)
|
||
|
if normalize:
|
||
|
expected /= float(len(expected_data))
|
||
|
|
||
|
tm.assert_series_equal(result_frame, expected)
|
||
|
|
||
|
result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(
|
||
|
dropna=dropna, normalize=normalize
|
||
|
)
|
||
|
|
||
|
tm.assert_series_equal(result_frame_groupby, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("as_index", [False, True])
|
||
|
@pytest.mark.parametrize("observed", [False, True])
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, name, expected_data",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
"count",
|
||
|
np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
"proportion",
|
||
|
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_categorical_single_grouper_with_only_observed_categories(
|
||
|
education_df, as_index, observed, normalize, name, expected_data
|
||
|
):
|
||
|
# Test single categorical grouper with only observed grouping categories
|
||
|
# when non-groupers are also categorical
|
||
|
|
||
|
gp = education_df.astype("category").groupby(
|
||
|
"country", as_index=as_index, observed=observed
|
||
|
)
|
||
|
result = gp.value_counts(normalize=normalize)
|
||
|
|
||
|
expected_index = MultiIndex.from_tuples(
|
||
|
[
|
||
|
("FR", "male", "low"),
|
||
|
("FR", "female", "high"),
|
||
|
("FR", "male", "medium"),
|
||
|
("FR", "female", "low"),
|
||
|
("FR", "female", "medium"),
|
||
|
("FR", "male", "high"),
|
||
|
("US", "female", "high"),
|
||
|
("US", "male", "low"),
|
||
|
("US", "female", "low"),
|
||
|
("US", "female", "medium"),
|
||
|
("US", "male", "high"),
|
||
|
("US", "male", "medium"),
|
||
|
],
|
||
|
names=["country", "gender", "education"],
|
||
|
)
|
||
|
|
||
|
expected_series = Series(
|
||
|
data=expected_data,
|
||
|
index=expected_index,
|
||
|
name=name,
|
||
|
)
|
||
|
for i in range(3):
|
||
|
expected_series.index = expected_series.index.set_levels(
|
||
|
CategoricalIndex(expected_series.index.levels[i]), level=i
|
||
|
)
|
||
|
|
||
|
if as_index:
|
||
|
tm.assert_series_equal(result, expected_series)
|
||
|
else:
|
||
|
expected = expected_series.reset_index(
|
||
|
name="proportion" if normalize else "count"
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def assert_categorical_single_grouper(
|
||
|
education_df, as_index, observed, expected_index, normalize, name, expected_data
|
||
|
):
|
||
|
# Test single categorical grouper when non-groupers are also categorical
|
||
|
education_df = education_df.copy().astype("category")
|
||
|
|
||
|
# Add non-observed grouping categories
|
||
|
education_df["country"] = education_df["country"].cat.add_categories(["ASIA"])
|
||
|
|
||
|
gp = education_df.groupby("country", as_index=as_index, observed=observed)
|
||
|
result = gp.value_counts(normalize=normalize)
|
||
|
|
||
|
expected_series = Series(
|
||
|
data=expected_data,
|
||
|
index=MultiIndex.from_tuples(
|
||
|
expected_index,
|
||
|
names=["country", "gender", "education"],
|
||
|
),
|
||
|
name=name,
|
||
|
)
|
||
|
for i in range(3):
|
||
|
index_level = CategoricalIndex(expected_series.index.levels[i])
|
||
|
if i == 0:
|
||
|
index_level = index_level.set_categories(
|
||
|
education_df["country"].cat.categories
|
||
|
)
|
||
|
expected_series.index = expected_series.index.set_levels(index_level, level=i)
|
||
|
|
||
|
if as_index:
|
||
|
tm.assert_series_equal(result, expected_series)
|
||
|
else:
|
||
|
expected = expected_series.reset_index(name=name)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("as_index", [True, False])
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, name, expected_data",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
"count",
|
||
|
np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
"proportion",
|
||
|
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_categorical_single_grouper_observed_true(
|
||
|
education_df, as_index, normalize, name, expected_data
|
||
|
):
|
||
|
# GH#46357
|
||
|
|
||
|
expected_index = [
|
||
|
("FR", "male", "low"),
|
||
|
("FR", "female", "high"),
|
||
|
("FR", "male", "medium"),
|
||
|
("FR", "female", "low"),
|
||
|
("FR", "female", "medium"),
|
||
|
("FR", "male", "high"),
|
||
|
("US", "female", "high"),
|
||
|
("US", "male", "low"),
|
||
|
("US", "female", "low"),
|
||
|
("US", "female", "medium"),
|
||
|
("US", "male", "high"),
|
||
|
("US", "male", "medium"),
|
||
|
]
|
||
|
|
||
|
assert_categorical_single_grouper(
|
||
|
education_df=education_df,
|
||
|
as_index=as_index,
|
||
|
observed=True,
|
||
|
expected_index=expected_index,
|
||
|
normalize=normalize,
|
||
|
name=name,
|
||
|
expected_data=expected_data,
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("as_index", [True, False])
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, name, expected_data",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
"count",
|
||
|
np.array(
|
||
|
[2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
|
||
|
),
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
"proportion",
|
||
|
np.array(
|
||
|
[
|
||
|
0.5,
|
||
|
0.25,
|
||
|
0.25,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.5,
|
||
|
0.5,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
0.0,
|
||
|
]
|
||
|
),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_categorical_single_grouper_observed_false(
|
||
|
education_df, as_index, normalize, name, expected_data
|
||
|
):
|
||
|
# GH#46357
|
||
|
|
||
|
expected_index = [
|
||
|
("FR", "male", "low"),
|
||
|
("FR", "female", "high"),
|
||
|
("FR", "male", "medium"),
|
||
|
("FR", "female", "low"),
|
||
|
("FR", "male", "high"),
|
||
|
("FR", "female", "medium"),
|
||
|
("US", "female", "high"),
|
||
|
("US", "male", "low"),
|
||
|
("US", "male", "medium"),
|
||
|
("US", "male", "high"),
|
||
|
("US", "female", "medium"),
|
||
|
("US", "female", "low"),
|
||
|
("ASIA", "male", "low"),
|
||
|
("ASIA", "male", "high"),
|
||
|
("ASIA", "female", "medium"),
|
||
|
("ASIA", "female", "low"),
|
||
|
("ASIA", "female", "high"),
|
||
|
("ASIA", "male", "medium"),
|
||
|
]
|
||
|
|
||
|
assert_categorical_single_grouper(
|
||
|
education_df=education_df,
|
||
|
as_index=as_index,
|
||
|
observed=False,
|
||
|
expected_index=expected_index,
|
||
|
normalize=normalize,
|
||
|
name=name,
|
||
|
expected_data=expected_data,
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("as_index", [True, False])
|
||
|
@pytest.mark.parametrize(
|
||
|
"observed, expected_index",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
[
|
||
|
("FR", "high", "female"),
|
||
|
("FR", "high", "male"),
|
||
|
("FR", "low", "male"),
|
||
|
("FR", "low", "female"),
|
||
|
("FR", "medium", "male"),
|
||
|
("FR", "medium", "female"),
|
||
|
("US", "high", "female"),
|
||
|
("US", "high", "male"),
|
||
|
("US", "low", "male"),
|
||
|
("US", "low", "female"),
|
||
|
("US", "medium", "female"),
|
||
|
("US", "medium", "male"),
|
||
|
],
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
[
|
||
|
("FR", "high", "female"),
|
||
|
("FR", "low", "male"),
|
||
|
("FR", "medium", "male"),
|
||
|
("US", "high", "female"),
|
||
|
("US", "low", "male"),
|
||
|
],
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, name, expected_data",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
"count",
|
||
|
np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64),
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
"proportion",
|
||
|
# NaN values corresponds to non-observed groups
|
||
|
np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_categorical_multiple_groupers(
|
||
|
education_df, as_index, observed, expected_index, normalize, name, expected_data
|
||
|
):
|
||
|
# GH#46357
|
||
|
|
||
|
# Test multiple categorical groupers when non-groupers are non-categorical
|
||
|
education_df = education_df.copy()
|
||
|
education_df["country"] = education_df["country"].astype("category")
|
||
|
education_df["education"] = education_df["education"].astype("category")
|
||
|
|
||
|
gp = education_df.groupby(
|
||
|
["country", "education"], as_index=as_index, observed=observed
|
||
|
)
|
||
|
result = gp.value_counts(normalize=normalize)
|
||
|
|
||
|
expected_series = Series(
|
||
|
data=expected_data[expected_data > 0.0] if observed else expected_data,
|
||
|
index=MultiIndex.from_tuples(
|
||
|
expected_index,
|
||
|
names=["country", "education", "gender"],
|
||
|
),
|
||
|
name=name,
|
||
|
)
|
||
|
for i in range(2):
|
||
|
expected_series.index = expected_series.index.set_levels(
|
||
|
CategoricalIndex(expected_series.index.levels[i]), level=i
|
||
|
)
|
||
|
|
||
|
if as_index:
|
||
|
tm.assert_series_equal(result, expected_series)
|
||
|
else:
|
||
|
expected = expected_series.reset_index(
|
||
|
name="proportion" if normalize else "count"
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("as_index", [False, True])
|
||
|
@pytest.mark.parametrize("observed", [False, True])
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, name, expected_data",
|
||
|
[
|
||
|
(
|
||
|
False,
|
||
|
"count",
|
||
|
np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
|
||
|
),
|
||
|
(
|
||
|
True,
|
||
|
"proportion",
|
||
|
# NaN values corresponds to non-observed groups
|
||
|
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
|
||
|
),
|
||
|
],
|
||
|
)
|
||
|
def test_categorical_non_groupers(
|
||
|
education_df, as_index, observed, normalize, name, expected_data
|
||
|
):
|
||
|
# GH#46357 Test non-observed categories are included in the result,
|
||
|
# regardless of `observed`
|
||
|
education_df = education_df.copy()
|
||
|
education_df["gender"] = education_df["gender"].astype("category")
|
||
|
education_df["education"] = education_df["education"].astype("category")
|
||
|
|
||
|
gp = education_df.groupby("country", as_index=as_index, observed=observed)
|
||
|
result = gp.value_counts(normalize=normalize)
|
||
|
|
||
|
expected_index = [
|
||
|
("FR", "male", "low"),
|
||
|
("FR", "female", "high"),
|
||
|
("FR", "male", "medium"),
|
||
|
("FR", "female", "low"),
|
||
|
("FR", "female", "medium"),
|
||
|
("FR", "male", "high"),
|
||
|
("US", "female", "high"),
|
||
|
("US", "male", "low"),
|
||
|
("US", "female", "low"),
|
||
|
("US", "female", "medium"),
|
||
|
("US", "male", "high"),
|
||
|
("US", "male", "medium"),
|
||
|
]
|
||
|
expected_series = Series(
|
||
|
data=expected_data,
|
||
|
index=MultiIndex.from_tuples(
|
||
|
expected_index,
|
||
|
names=["country", "gender", "education"],
|
||
|
),
|
||
|
name=name,
|
||
|
)
|
||
|
for i in range(1, 3):
|
||
|
expected_series.index = expected_series.index.set_levels(
|
||
|
CategoricalIndex(expected_series.index.levels[i]), level=i
|
||
|
)
|
||
|
|
||
|
if as_index:
|
||
|
tm.assert_series_equal(result, expected_series)
|
||
|
else:
|
||
|
expected = expected_series.reset_index(
|
||
|
name="proportion" if normalize else "count"
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, expected_label, expected_values",
|
||
|
[
|
||
|
(False, "count", [1, 1, 1]),
|
||
|
(True, "proportion", [0.5, 0.5, 1.0]),
|
||
|
],
|
||
|
)
|
||
|
def test_mixed_groupings(normalize, expected_label, expected_values):
|
||
|
# Test multiple groupings
|
||
|
df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
|
||
|
gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False)
|
||
|
result = gp.value_counts(sort=True, normalize=normalize)
|
||
|
expected = DataFrame(
|
||
|
{
|
||
|
"level_0": np.array([4, 4, 5], dtype=np.int_),
|
||
|
"A": [1, 1, 2],
|
||
|
"level_2": [8, 8, 7],
|
||
|
"B": [1, 3, 2],
|
||
|
expected_label: expected_values,
|
||
|
}
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"test, columns, expected_names",
|
||
|
[
|
||
|
("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
|
||
|
("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize("as_index", [False, True])
|
||
|
def test_column_label_duplicates(test, columns, expected_names, as_index):
|
||
|
# GH 44992
|
||
|
# Test for duplicate input column labels and generated duplicate labels
|
||
|
df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
|
||
|
expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
|
||
|
keys = ["a", np.array([0, 1], dtype=np.int64), "d"]
|
||
|
result = df.groupby(keys, as_index=as_index).value_counts()
|
||
|
if as_index:
|
||
|
expected = Series(
|
||
|
data=(1, 1),
|
||
|
index=MultiIndex.from_tuples(
|
||
|
expected_data,
|
||
|
names=expected_names,
|
||
|
),
|
||
|
name="count",
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
else:
|
||
|
expected_data = [list(row) + [1] for row in expected_data]
|
||
|
expected_columns = list(expected_names)
|
||
|
expected_columns[1] = "level_1"
|
||
|
expected_columns.append("count")
|
||
|
expected = DataFrame(expected_data, columns=expected_columns)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"normalize, expected_label",
|
||
|
[
|
||
|
(False, "count"),
|
||
|
(True, "proportion"),
|
||
|
],
|
||
|
)
|
||
|
def test_result_label_duplicates(normalize, expected_label):
|
||
|
# Test for result column label duplicating an input column label
|
||
|
gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
|
||
|
"a", as_index=False
|
||
|
)
|
||
|
msg = f"Column label '{expected_label}' is duplicate of result column"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
gb.value_counts(normalize=normalize)
|
||
|
|
||
|
|
||
|
def test_ambiguous_grouping():
|
||
|
# Test that groupby is not confused by groupings length equal to row count
|
||
|
df = DataFrame({"a": [1, 1]})
|
||
|
gb = df.groupby(np.array([1, 1], dtype=np.int64))
|
||
|
result = gb.value_counts()
|
||
|
expected = Series(
|
||
|
[2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count"
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_subset_overlaps_gb_key_raises():
|
||
|
# GH 46383
|
||
|
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
|
||
|
msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("c1").value_counts(subset=["c1"])
|
||
|
|
||
|
|
||
|
def test_subset_doesnt_exist_in_frame():
|
||
|
# GH 46383
|
||
|
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
|
||
|
msg = "Keys {'c3'} in subset do not exist in the DataFrame."
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("c1").value_counts(subset=["c3"])
|
||
|
|
||
|
|
||
|
def test_subset():
|
||
|
# GH 46383
|
||
|
df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
|
||
|
result = df.groupby(level=0).value_counts(subset=["c2"])
|
||
|
expected = Series(
|
||
|
[1, 2],
|
||
|
index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]),
|
||
|
name="count",
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_subset_duplicate_columns():
|
||
|
# GH 46383
|
||
|
df = DataFrame(
|
||
|
[["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]],
|
||
|
index=[0, 1, 1],
|
||
|
columns=["c1", "c2", "c2"],
|
||
|
)
|
||
|
result = df.groupby(level=0).value_counts(subset=["c2"])
|
||
|
expected = Series(
|
||
|
[1, 2],
|
||
|
index=MultiIndex.from_arrays(
|
||
|
[[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
|
||
|
),
|
||
|
name="count",
|
||
|
)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("utc", [True, False])
|
||
|
def test_value_counts_time_grouper(utc):
|
||
|
# GH#50486
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"Timestamp": [
|
||
|
1565083561,
|
||
|
1565083561 + 86400,
|
||
|
1565083561 + 86500,
|
||
|
1565083561 + 86400 * 2,
|
||
|
1565083561 + 86400 * 3,
|
||
|
1565083561 + 86500 * 3,
|
||
|
1565083561 + 86400 * 4,
|
||
|
],
|
||
|
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
|
||
|
}
|
||
|
).drop([3])
|
||
|
|
||
|
df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
|
||
|
gb = df.groupby(Grouper(freq="1D", key="Datetime"))
|
||
|
result = gb.value_counts()
|
||
|
dates = to_datetime(
|
||
|
["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
|
||
|
)
|
||
|
timestamps = df["Timestamp"].unique()
|
||
|
index = MultiIndex(
|
||
|
levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
|
||
|
codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
|
||
|
names=["Datetime", "Timestamp", "Food"],
|
||
|
)
|
||
|
expected = Series(1, index=index, name="count")
|
||
|
tm.assert_series_equal(result, expected)
|