Inzynierka/Lib/site-packages/pandas/tests/groupby/test_value_counts.py

1070 lines
33 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
"""
These tests systematically exercise all of the arguments to value_counts
with different size combinations, to ensure stability of the sorting
and proper parameter handling.
"""
from itertools import product
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Grouper,
Index,
MultiIndex,
Series,
date_range,
to_datetime,
)
import pandas._testing as tm
def tests_value_counts_index_names_category_column():
    """Check that a categorical value column keeps its name in the result index.

    Groups a one-row frame by "country" and counts the categorical "gender"
    column; the resulting MultiIndex must be named ("country", "gender").
    """
    # GH44324 Missing name of index category column
    df = DataFrame(
        {
            "gender": ["female"],
            "country": ["US"],
        }
    )
    df["gender"] = df["gender"].astype("category")
    result = df.groupby("country")["gender"].value_counts()

    # Construct expected, very specific multiindex
    df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
    df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
    mi_expected = MultiIndex.from_frame(df_mi_expected)
    expected = Series([1], index=mi_expected, name="count")

    tm.assert_series_equal(result, expected)
# our starting frame
def seed_df(seed_nans, n, m):
    """Build the deterministic random frame used by the parametrized tests.

    Parameters
    ----------
    seed_nans : bool
        When True, column "3rd" is cast to float and NaN is written into
        strided rows of all three columns.
    n : int
        Number of rows to generate.
    m : int
        Inclusive upper bound of the random integers stored in "3rd".

    Returns
    -------
    DataFrame
        Frame with columns "1st" (letters a-d), "2nd" (dates drawn from a
        10-day range) and "3rd" (integers in [1, m]).
    """
    # Re-seed on every call so each generated frame is reproducible.
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    frame = DataFrame(
        {
            "1st": np.random.choice(list("abcd"), n),
            "2nd": np.random.choice(days, n),
            "3rd": np.random.randint(1, m + 1, n),
        }
    )

    if seed_nans:
        # Explicitly cast to float to avoid implicit cast when setting nan
        frame["3rd"] = frame["3rd"].astype("float")
        frame.loc[1::11, "1st"] = np.nan
        frame.loc[3::17, "2nd"] = np.nan
        frame.loc[7::19, "3rd"] = np.nan
        frame.loc[8::19, "3rd"] = np.nan
        frame.loc[9::19, "3rd"] = np.nan

    # Returned on both paths; callers index the frame immediately.
    return frame
# create input df, keys, and the bins
# Each entry of ``binned`` is a (frame, groupby keys, bins, n, m) tuple and
# ``ids`` holds the matching pytest id; both feed the parametrization of
# test_series_groupby_value_counts.
binned = []
ids = []
for seed_nans, n, m in product([True, False], (100, 1000), (5, 20)):
    df = seed_df(seed_nans, n, m)
    # Either no binning, or width-2 bins covering at least the range 0..5.
    bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
    keys = "1st", "2nd", ["1st", "2nd"]
    for k, b in product(keys, bins):
        binned.append((df, k, b, n, m))
        ids.append(f"{k}-{n}-{m}")
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
    df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna
):
    """Compare SeriesGroupBy.value_counts with apply(Series.value_counts).

    Both results are computed on column "3rd" with identical keyword
    arguments and must be equal once sorted by index.
    """

    def rebuild_index(df):
        # Recreate the MultiIndex from its level values so both sides carry
        # an index built the same way before comparison.
        arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
        df.index = MultiIndex.from_arrays(arr, names=df.index.names)
        return df

    kwargs = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "dropna": dropna,
        "bins": bins,
    }

    gr = df.groupby(keys, sort=isort)
    left = gr["3rd"].value_counts(**kwargs)

    gr = df.groupby(keys, sort=isort)
    right = gr["3rd"].apply(Series.value_counts, **kwargs)
    # apply leaves the innermost level unnamed; align it with value_counts.
    right.index.names = right.index.names[:-1] + ["3rd"]
    # https://github.com/pandas-dev/pandas/issues/49909
    right = right.rename(name)

    # have to sort on index because of unstable sort on values
    left, right = map(rebuild_index, (left, right))  # xref GH9212
    tm.assert_series_equal(left.sort_index(), right.sort_index())
@pytest.mark.parametrize("utc", [True, False])
def test_series_groupby_value_counts_with_grouper(utc):
    """Check SeriesGroupBy.value_counts when grouping with a daily Grouper.

    The result must match apply(Series.value_counts) on the same groupby,
    for both tz-aware (utc=True) and tz-naive timestamps.
    """
    # GH28479
    df = DataFrame(
        {
            "Timestamp": [
                1565083561,
                1565083561 + 86400,
                1565083561 + 86500,
                1565083561 + 86400 * 2,
                1565083561 + 86400 * 3,
                1565083561 + 86500 * 3,
                1565083561 + 86400 * 4,
            ],
            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
        }
    ).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
    dfg = df.groupby(Grouper(freq="1D", key="Datetime"))

    # have to sort on index because of unstable sort on values xref GH9212
    result = dfg["Food"].value_counts().sort_index()
    expected = dfg["Food"].apply(Series.value_counts).sort_index()
    expected.index.names = result.index.names
    # https://github.com/pandas-dev/pandas/issues/49909
    expected = expected.rename("count")

    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
    """Check value_counts on an empty frame yields an empty, named result.

    All but the last column are grouping keys; the last column is counted.
    """
    # GH39172
    df = DataFrame(columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()
    expected = Series([], dtype=result.dtype, name="count")
    # One empty level per input column, named after the columns.
    expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)

    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
    """Check value_counts on a single-row frame matches DataFrame.value_counts.

    All but the last column are grouping keys; the last column is counted.
    """
    # GH42618
    df = DataFrame(data=[range(len(columns))], columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()
    expected = df.value_counts()

    tm.assert_series_equal(result, expected)
def test_series_groupby_value_counts_on_categorical():
    """Check that unobserved categories appear with a zero count.

    A one-element categorical Series with categories "a" and "b" is grouped
    into a single group; the expected result lists both categories.
    """
    # GH38672
    s = Series(Categorical(["a"], categories=["a", "b"]))
    result = s.groupby([0]).value_counts()

    expected = Series(
        data=[1, 0],
        index=MultiIndex.from_arrays(
            [
                np.array([0, 0]),
                CategoricalIndex(
                    ["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
                ),
            ]
        ),
        name="count",
    )

    # Expected:
    # 0  a    1
    #    b    0
    # dtype: int64

    tm.assert_series_equal(result, expected)
def test_series_groupby_value_counts_no_sort():
    """Check result order when neither the groupby nor value_counts sorts.

    With sort=False on both calls, the expected index follows the order in
    which the key/value combinations first appear in the frame.
    """
    # GH#50482
    df = DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
    gb = df.groupby(["country", "gender"], sort=False)["education"]
    result = gb.value_counts(sort=False)
    index = MultiIndex(
        levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
        codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
        names=["country", "gender", "education"],
    )
    expected = Series([1, 1, 1, 2, 1], index=index, name="count")
    tm.assert_series_equal(result, expected)
@pytest.fixture
def education_df():
    """Return a six-row frame with "gender", "education" and "country" columns."""
    return DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
def test_axis(education_df):
    """Check value_counts raises NotImplementedError for an axis=1 groupby."""
    gp = education_df.groupby("country", axis=1)
    with pytest.raises(NotImplementedError, match="axis"):
        gp.value_counts()
def test_bad_subset(education_df):
    """Check value_counts raises ValueError when subset names the groupby key."""
    gp = education_df.groupby("country")
    with pytest.raises(ValueError, match="subset"):
        gp.value_counts(subset=["country"])
def test_basic(education_df):
    """Check normalized DataFrameGroupBy.value_counts on two selected columns.

    Proportions are expected per "country" group.
    """
    # gh43564
    result = education_df.groupby("country")[["gender", "education"]].value_counts(
        normalize=True
    )
    expected = Series(
        data=[0.5, 0.25, 0.25, 0.5, 0.5],
        index=MultiIndex.from_tuples(
            [
                ("FR", "male", "low"),
                ("FR", "female", "high"),
                ("FR", "male", "medium"),
                ("US", "female", "high"),
                ("US", "male", "low"),
            ],
            names=["country", "gender", "education"],
        ),
        name="proportion",
    )
    tm.assert_series_equal(result, expected)
def _frame_value_counts(df, keys, normalize, sort, ascending):
    """Return ``df[keys].value_counts`` with the given options.

    Module-level helper so it can be passed to ``GroupBy.apply`` with
    positional arguments.
    """
    return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize(
    "sort, ascending",
    [
        (False, None),
        (True, True),
        (True, False),
    ],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
    education_df, groupby, normalize, name, sort, ascending, as_index, frame
):
    """Cross-check DataFrameGroupBy.value_counts against two other code paths.

    When ``frame`` is True the reference is ``apply`` with
    ``DataFrame.value_counts``; otherwise it is ``SeriesGroupBy.value_counts``
    on a combined "gender-education" column.
    """
    # test all parameters:
    # - Use column, array or function as by= parameter
    # - Whether or not to normalize
    # - Whether or not to sort and how
    # - Whether or not to use the groupby as an index
    # - 3-way compare against:
    #   - apply with :meth:`~DataFrame.value_counts`
    #   - `~SeriesGroupBy.value_counts`
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    if frame:
        # compare against apply with DataFrame value_counts
        expected = gp.apply(
            _frame_value_counts, ["gender", "education"], normalize, sort, ascending
        )

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            # ``name`` is parametrized together with ``normalize``, so it is
            # already "proportion"/"count" as required here.
            expected = expected.reset_index().rename({0: name}, axis=1)
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts
        education_df["both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(
            normalize=normalize, sort=sort, ascending=ascending
        )
        expected.name = name
        if as_index:
            # Split the combined key back into separate index levels.
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
            index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
            del index_frame["both"]
            index_frame = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame)
            tm.assert_series_equal(result, expected)
        else:
            # Split the combined key back into separate columns.
            expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
            del expected["both"]
            tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
    [
        (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
        (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
        (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
    ],
)
def test_compound(
    education_df,
    normalize,
    sort,
    ascending,
    expected_rows,
    expected_count,
    expected_group_size,
):
    """Check value_counts with two groupby keys and as_index=False.

    ``expected_rows`` picks the rows of ``education_df`` whose values make up
    the expected output, in order; proportions are derived as
    ``expected_count / expected_group_size``.
    """
    # Multiple groupby keys and as_index=False
    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = gp["education"].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    expected = DataFrame()
    for column in ["country", "gender", "education"]:
        expected[column] = [education_df[column][row] for row in expected_rows]
    if normalize:
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size
    else:
        expected["count"] = expected_count
    tm.assert_frame_equal(result, expected)
@pytest.fixture
def animals_df():
    """Return a four-row frame of animals with a constant "key" column."""
    return DataFrame(
        {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )
@pytest.mark.parametrize(
    "sort, ascending, normalize, name, expected_data, expected_index",
    [
        (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
        (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
        (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
        (
            True,
            False,
            True,
            "proportion",
            [0.5, 0.25, 0.25],
            [(1, 1, 1), (4, 2, 6), (0, 2, 0)],
        ),
    ],
)
def test_data_frame_value_counts(
    animals_df, sort, ascending, normalize, name, expected_data, expected_index
):
    """Check DataFrame.value_counts and groupby("key").value_counts agree.

    ``expected_index`` is given as one tuple per index level
    (key, num_legs, num_wings), not one tuple per row.
    """
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests from frame/methods/test_value_counts.py
    result_frame = animals_df.value_counts(
        sort=sort, ascending=ascending, normalize=normalize
    )
    expected = Series(
        data=expected_data,
        index=MultiIndex.from_arrays(
            expected_index, names=["key", "num_legs", "num_wings"]
        ),
        name=name,
    )
    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = animals_df.groupby("key").value_counts(
        sort=sort, ascending=ascending, normalize=normalize
    )

    tm.assert_series_equal(result_frame_groupby, expected)
@pytest.fixture
def nulls_df():
    """Return a nine-row frame with NaN scattered across columns A-D."""
    n = np.nan
    return DataFrame(
        {
            "A": [1, 1, n, 4, n, 6, 6, 6, 6],
            "B": [1, 1, 3, n, n, 6, 6, 6, 6],
            "C": [1, 2, 3, 4, 5, 6, n, 8, n],
            "D": [1, 2, 3, 4, 5, 6, 7, n, n],
        }
    )
@pytest.mark.parametrize(
    "group_dropna, count_dropna, expected_rows, expected_values",
    [
        (
            False,
            False,
            [0, 1, 3, 5, 7, 6, 8, 2, 4],
            [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
        ),
        (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
        (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
        (True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
    ],
)
def test_dropna_combinations(
    nulls_df, group_dropna, count_dropna, expected_rows, expected_values
):
    """Check every combination of groupby ``dropna`` and value_counts ``dropna``.

    ``expected_rows`` lists the rows of ``nulls_df`` whose values form the
    expected index, in result order.
    """
    gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)
    columns = DataFrame()
    for column in nulls_df.columns:
        columns[column] = [nulls_df[column][row] for row in expected_rows]
    index = MultiIndex.from_frame(columns)
    expected = Series(data=expected_values, index=index, name="proportion")
    tm.assert_series_equal(result, expected)
@pytest.fixture
def names_with_nulls_df(nulls_fixture):
    """Return a four-row names frame with two null middle names.

    ``nulls_fixture`` is defined outside this file; presumably it supplies
    one null-like value per parametrization -- verify against conftest.
    """
    return DataFrame(
        {
            "key": [1, 1, 1, 1],
            "first_name": ["John", "Anne", "John", "Beth"],
            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )
@pytest.mark.parametrize(
    "dropna, expected_data, expected_index",
    [
        (
            True,
            [1, 1],
            MultiIndex.from_arrays(
                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
                names=["key", "first_name", "middle_name"],
            ),
        ),
        (
            False,
            [1, 1, 1, 1],
            MultiIndex(
                levels=[
                    Index([1]),
                    Index(["Anne", "Beth", "John"]),
                    Index(["Louise", "Smith", np.nan]),
                ],
                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
                names=["key", "first_name", "middle_name"],
            ),
        ),
    ],
)
@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")])
def test_data_frame_value_counts_dropna(
    names_with_nulls_df, dropna, normalize, name, expected_data, expected_index
):
    """Check ``dropna`` handling for frame and groupby value_counts alike.

    Both DataFrame.value_counts and groupby("key").value_counts must equal
    the same expected Series.
    """
    # GH 41334
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests with nulls from frame/methods/test_value_counts.py
    result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
    expected = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    if normalize:
        # Every expected count is 1, so each proportion is 1 / number of rows.
        expected /= float(len(expected_data))

    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(
        dropna=dropna, normalize=normalize
    )

    tm.assert_series_equal(result_frame_groupby, expected)
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_with_only_observed_categories(
    education_df, as_index, observed, normalize, name, expected_data
):
    """Check a categorical grouper whose categories are all observed.

    The same expected data is used for both values of ``observed``.
    """
    # Test single categorical grouper with only observed grouping categories
    # when non-groupers are also categorical

    gp = education_df.astype("category").groupby(
        "country", as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_index = MultiIndex.from_tuples(
        [
            ("FR", "male", "low"),
            ("FR", "female", "high"),
            ("FR", "male", "medium"),
            ("FR", "female", "low"),
            ("FR", "female", "medium"),
            ("FR", "male", "high"),
            ("US", "female", "high"),
            ("US", "male", "low"),
            ("US", "female", "low"),
            ("US", "female", "medium"),
            ("US", "male", "high"),
            ("US", "male", "medium"),
        ],
        names=["country", "gender", "education"],
    )

    expected_series = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    # All three columns are categorical, so every index level must be too.
    for i in range(3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # ``name`` is parametrized with ``normalize``; no need to re-derive it.
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
def assert_categorical_single_grouper(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    """Assert value_counts output for a single categorical grouper.

    The frame is copied and cast to category, and an unobserved category
    "ASIA" is added to "country" before grouping on it.

    Parameters
    ----------
    education_df : DataFrame
        Input frame with "gender", "education" and "country" columns.
    as_index, observed : bool
        Forwarded to ``DataFrame.groupby``.
    expected_index : list of tuple
        Expected (country, gender, education) entries, in result order.
    normalize : bool
        Forwarded to ``value_counts``.
    name : str
        Expected name of the result Series / value column.
    expected_data : array-like
        Expected values aligned with ``expected_index``.

    Raises
    ------
    AssertionError
        If the result does not equal the expected Series/DataFrame.
    """
    # Test single categorical grouper when non-groupers are also categorical
    education_df = education_df.copy().astype("category")

    # Add non-observed grouping categories
    education_df["country"] = education_df["country"].cat.add_categories(["ASIA"])

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    for i in range(3):
        index_level = CategoricalIndex(expected_series.index.levels[i])
        if i == 0:
            # The grouper level carries the full category set, "ASIA" included.
            index_level = index_level.set_categories(
                education_df["country"].cat.categories
            )
        expected_series.index = expected_series.index.set_levels(index_level, level=i)

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_observed_true(
    education_df, as_index, normalize, name, expected_data
):
    """Check a single categorical grouper with observed=True.

    The unobserved "ASIA" group added by the helper is not expected in the
    result index.
    """
    # GH#46357

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=True,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array(
                [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
            ),
        ),
        (
            True,
            "proportion",
            np.array(
                [
                    0.5,
                    0.25,
                    0.25,
                    0.0,
                    0.0,
                    0.0,
                    0.5,
                    0.5,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                ]
            ),
        ),
    ],
)
def test_categorical_single_grouper_observed_false(
    education_df, as_index, normalize, name, expected_data
):
    """Check a single categorical grouper with observed=False.

    The unobserved "ASIA" group added by the helper is expected in the
    result with zero counts/proportions.
    """
    # GH#46357

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "male", "high"),
        ("FR", "female", "medium"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "male", "medium"),
        ("US", "male", "high"),
        ("US", "female", "medium"),
        ("US", "female", "low"),
        ("ASIA", "male", "low"),
        ("ASIA", "male", "high"),
        ("ASIA", "female", "medium"),
        ("ASIA", "female", "low"),
        ("ASIA", "female", "high"),
        ("ASIA", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=False,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "observed, expected_index",
    [
        (
            False,
            [
                ("FR", "high", "female"),
                ("FR", "high", "male"),
                ("FR", "low", "male"),
                ("FR", "low", "female"),
                ("FR", "medium", "male"),
                ("FR", "medium", "female"),
                ("US", "high", "female"),
                ("US", "high", "male"),
                ("US", "low", "male"),
                ("US", "low", "female"),
                ("US", "medium", "female"),
                ("US", "medium", "male"),
            ],
        ),
        (
            True,
            [
                ("FR", "high", "female"),
                ("FR", "low", "male"),
                ("FR", "medium", "male"),
                ("US", "high", "female"),
                ("US", "low", "male"),
            ],
        ),
    ],
)
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # 0.0 entries are combinations with no rows; they are filtered
            # out in the test body when observed=True
            np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_multiple_groupers(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    """Check value_counts with two categorical groupers.

    "country" and "education" are categorical groupers; "gender" is the
    non-categorical counted column.
    """
    # GH#46357

    # Test multiple categorical groupers when non-groupers are non-categorical
    education_df = education_df.copy()
    education_df["country"] = education_df["country"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby(
        ["country", "education"], as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_series = Series(
        # With observed=True only the non-zero entries remain.
        data=expected_data[expected_data > 0.0] if observed else expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "education", "gender"],
        ),
        name=name,
    )
    # Only the two grouper levels are categorical.
    for i in range(2):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # ``name`` is parametrized with ``normalize``; no need to re-derive it.
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # 0.0 entries are category combinations with no rows
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_non_groupers(
    education_df, as_index, observed, normalize, name, expected_data
):
    """Check categorical counted columns with a non-categorical grouper.

    The same expected data is used for both values of ``observed``.
    """
    # GH#46357 Test non-observed categories are included in the result,
    # regardless of `observed`
    education_df = education_df.copy()
    education_df["gender"] = education_df["gender"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]
    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    # Only the counted columns (levels 1 and 2) are categorical.
    for i in range(1, 3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        # ``name`` is parametrized with ``normalize``; no need to re-derive it.
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "normalize, expected_label, expected_values",
    [
        (False, "count", [1, 1, 1]),
        (True, "proportion", [0.5, 0.5, 1.0]),
    ],
)
def test_mixed_groupings(normalize, expected_label, expected_values):
    """Check value_counts when grouping by a list, a column and a function.

    With as_index=False the unnamed groupings are expected to appear as
    "level_0" and "level_2" columns.
    """
    # Test multiple groupings
    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False)
    result = gp.value_counts(sort=True, normalize=normalize)
    expected = DataFrame(
        {
            "level_0": np.array([4, 4, 5], dtype=np.int_),
            "A": [1, 1, 2],
            "level_2": [8, 8, 7],
            "B": [1, 3, 2],
            expected_label: expected_values,
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "test, columns, expected_names",
    [
        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
    ],
)
@pytest.mark.parametrize("as_index", [False, True])
def test_column_label_duplicates(test, columns, expected_names, as_index):
    """Check value_counts with duplicate column labels.

    Covers a repeated input label ("b") and an input label ("level_1") that
    collides with the label generated for the unnamed array grouping.
    """
    # GH 44992
    # Test for duplicate input column labels and generated duplicate labels
    df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
    expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
    keys = ["a", np.array([0, 1], dtype=np.int64), "d"]
    result = df.groupby(keys, as_index=as_index).value_counts()
    if as_index:
        expected = Series(
            data=(1, 1),
            index=MultiIndex.from_tuples(
                expected_data,
                names=expected_names,
            ),
            name="count",
        )
        tm.assert_series_equal(result, expected)
    else:
        # Each row gains a trailing count of 1; the unnamed grouping becomes
        # the "level_1" column.
        expected_data = [list(row) + [1] for row in expected_data]
        expected_columns = list(expected_names)
        expected_columns[1] = "level_1"
        expected_columns.append("count")
        expected = DataFrame(expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "normalize, expected_label",
    [
        (False, "count"),
        (True, "proportion"),
    ],
)
def test_result_label_duplicates(normalize, expected_label):
    """Check value_counts raises when an input column is named like the result."""
    # Test for result column label duplicating an input column label
    gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
        "a", as_index=False
    )
    msg = f"Column label '{expected_label}' is duplicate of result column"
    with pytest.raises(ValueError, match=msg):
        gb.value_counts(normalize=normalize)
def test_ambiguous_grouping():
    """Check grouping by an array whose length equals the row count."""
    # Test that groupby is not confused by groupings length equal to row count
    df = DataFrame({"a": [1, 1]})
    gb = df.groupby(np.array([1, 1], dtype=np.int64))
    result = gb.value_counts()
    expected = Series(
        [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count"
    )
    tm.assert_series_equal(result, expected)
def test_subset_overlaps_gb_key_raises():
    """Check value_counts raises when ``subset`` contains the groupby key."""
    # GH 46383
    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
    msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
    with pytest.raises(ValueError, match=msg):
        df.groupby("c1").value_counts(subset=["c1"])
def test_subset_doesnt_exist_in_frame():
    """Check value_counts raises when ``subset`` names a missing column."""
    # GH 46383
    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
    msg = "Keys {'c3'} in subset do not exist in the DataFrame."
    with pytest.raises(ValueError, match=msg):
        df.groupby("c1").value_counts(subset=["c3"])
def test_subset():
    """Check ``subset`` restricts the counted columns when grouping by index."""
    # GH 46383
    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
    result = df.groupby(level=0).value_counts(subset=["c2"])
    expected = Series(
        [1, 2],
        index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]),
        name="count",
    )
    tm.assert_series_equal(result, expected)
def test_subset_duplicate_columns():
    """Check ``subset`` selects every column sharing a duplicated label."""
    # GH 46383
    df = DataFrame(
        [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]],
        index=[0, 1, 1],
        columns=["c1", "c2", "c2"],
    )
    result = df.groupby(level=0).value_counts(subset=["c2"])
    expected = Series(
        [1, 2],
        index=MultiIndex.from_arrays(
            [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
        ),
        name="count",
    )
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("utc", [True, False])
def test_value_counts_time_grouper(utc):
    """Check DataFrameGroupBy.value_counts when grouping with a daily Grouper.

    Every (day, Timestamp, Food) combination is expected exactly once, for
    both tz-aware (utc=True) and tz-naive timestamps.
    """
    # GH#50486
    df = DataFrame(
        {
            "Timestamp": [
                1565083561,
                1565083561 + 86400,
                1565083561 + 86500,
                1565083561 + 86400 * 2,
                1565083561 + 86400 * 3,
                1565083561 + 86500 * 3,
                1565083561 + 86400 * 4,
            ],
            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
        }
    ).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
    result = gb.value_counts()
    dates = to_datetime(
        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
    )
    timestamps = df["Timestamp"].unique()
    index = MultiIndex(
        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
        names=["Datetime", "Timestamp", "Food"],
    )
    expected = Series(1, index=index, name="count")
    tm.assert_series_equal(result, expected)