projektAI/venv/Lib/site-packages/pandas/tests/frame/indexing/test_categorical.py
2021-06-06 22:13:05 +02:00

389 lines
14 KiB
Python

import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import Categorical, DataFrame, Index, Series
import pandas._testing as tm
class TestDataFrameIndexingCategorical:
def test_assignment(self):
# assignment
df = DataFrame(
{"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")}
)
labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
df = df.sort_values(by=["value"], ascending=True)
s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
d = s.values
df["D"] = d
str(df)
result = df.dtypes
expected = Series(
[np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)],
index=["value", "D"],
)
tm.assert_series_equal(result, expected)
df["E"] = s
str(df)
result = df.dtypes
expected = Series(
[
np.dtype("int32"),
CategoricalDtype(categories=labels, ordered=False),
CategoricalDtype(categories=labels, ordered=False),
],
index=["value", "D", "E"],
)
tm.assert_series_equal(result, expected)
result1 = df["D"]
result2 = df["E"]
tm.assert_categorical_equal(result1._mgr._block.values, d)
# sorting
s.name = "E"
tm.assert_series_equal(result2.sort_index(), s.sort_index())
cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])
df = DataFrame(Series(cat))
def test_assigning_ops(self):
# systematically test the assigning operations:
# for all slicing ops:
# for value in categories and value not in categories:
# - assign a single value -> exp_single_cats_value
# - assign a complete row (mixed values) -> exp_single_row
# assign multiple rows (mixed values) (-> array) -> exp_multi_row
# assign a part of a column with dtype == categorical ->
# exp_parts_cats_col
# assign a part of a column with dtype != categorical ->
# exp_parts_cats_col
cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"])
idx = Index(["h", "i", "j", "k", "l", "m", "n"])
values = [1, 1, 1, 1, 1, 1, 1]
orig = DataFrame({"cats": cats, "values": values}, index=idx)
# the expected values
# changed single row
cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
idx1 = Index(["h", "i", "j", "k", "l", "m", "n"])
values1 = [1, 1, 2, 1, 1, 1, 1]
exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1)
# changed multiple rows
cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
values2 = [1, 1, 2, 2, 1, 1, 1]
exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2)
# changed part of the cats column
cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
idx3 = Index(["h", "i", "j", "k", "l", "m", "n"])
values3 = [1, 1, 1, 1, 1, 1, 1]
exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3)
# changed single value in cats col
cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
idx4 = Index(["h", "i", "j", "k", "l", "m", "n"])
values4 = [1, 1, 1, 1, 1, 1, 1]
exp_single_cats_value = DataFrame(
{"cats": cats4, "values": values4}, index=idx4
)
# iloc
# ###############
# - assign a single value -> exp_single_cats_value
df = orig.copy()
df.iloc[2, 0] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
df = orig.copy()
df.iloc[df.index == "j", 0] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
# - assign a single value not in the current categories set
msg1 = (
"Cannot setitem on a Categorical with a new category, "
"set the categories first"
)
msg2 = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.iloc[2, 0] = "c"
# - assign a complete row (mixed values) -> exp_single_row
df = orig.copy()
df.iloc[2, :] = ["b", 2]
tm.assert_frame_equal(df, exp_single_row)
# - assign a complete row (mixed values) not in categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.iloc[2, :] = ["c", 2]
# - assign multiple rows (mixed values) -> exp_multi_row
df = orig.copy()
df.iloc[2:4, :] = [["b", 2], ["b", 2]]
tm.assert_frame_equal(df, exp_multi_row)
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.iloc[2:4, :] = [["c", 2], ["c", 2]]
# assign a part of a column with dtype == categorical ->
# exp_parts_cats_col
df = orig.copy()
df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_frame_equal(df, exp_parts_cats_col)
with pytest.raises(ValueError, match=msg2):
# different categories -> not sure if this should fail or pass
df = orig.copy()
df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc"))
with pytest.raises(ValueError, match=msg2):
# different values
df = orig.copy()
df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc"))
# assign a part of a column with dtype != categorical ->
# exp_parts_cats_col
df = orig.copy()
df.iloc[2:4, 0] = ["b", "b"]
tm.assert_frame_equal(df, exp_parts_cats_col)
with pytest.raises(ValueError, match=msg1):
df.iloc[2:4, 0] = ["c", "c"]
# loc
# ##############
# - assign a single value -> exp_single_cats_value
df = orig.copy()
df.loc["j", "cats"] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
df = orig.copy()
df.loc[df.index == "j", "cats"] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
# - assign a single value not in the current categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.loc["j", "cats"] = "c"
# - assign a complete row (mixed values) -> exp_single_row
df = orig.copy()
df.loc["j", :] = ["b", 2]
tm.assert_frame_equal(df, exp_single_row)
# - assign a complete row (mixed values) not in categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.loc["j", :] = ["c", 2]
# - assign multiple rows (mixed values) -> exp_multi_row
df = orig.copy()
df.loc["j":"k", :] = [["b", 2], ["b", 2]]
tm.assert_frame_equal(df, exp_multi_row)
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.loc["j":"k", :] = [["c", 2], ["c", 2]]
# assign a part of a column with dtype == categorical ->
# exp_parts_cats_col
df = orig.copy()
df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_frame_equal(df, exp_parts_cats_col)
with pytest.raises(ValueError, match=msg2):
# different categories -> not sure if this should fail or pass
df = orig.copy()
df.loc["j":"k", "cats"] = Categorical(
["b", "b"], categories=["a", "b", "c"]
)
with pytest.raises(ValueError, match=msg2):
# different values
df = orig.copy()
df.loc["j":"k", "cats"] = Categorical(
["c", "c"], categories=["a", "b", "c"]
)
# assign a part of a column with dtype != categorical ->
# exp_parts_cats_col
df = orig.copy()
df.loc["j":"k", "cats"] = ["b", "b"]
tm.assert_frame_equal(df, exp_parts_cats_col)
with pytest.raises(ValueError, match=msg1):
df.loc["j":"k", "cats"] = ["c", "c"]
# loc
# ##############
# - assign a single value -> exp_single_cats_value
df = orig.copy()
df.loc["j", df.columns[0]] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
df = orig.copy()
df.loc[df.index == "j", df.columns[0]] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
# - assign a single value not in the current categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.loc["j", df.columns[0]] = "c"
# - assign a complete row (mixed values) -> exp_single_row
df = orig.copy()
df.loc["j", :] = ["b", 2]
tm.assert_frame_equal(df, exp_single_row)
# - assign a complete row (mixed values) not in categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.loc["j", :] = ["c", 2]
# - assign multiple rows (mixed values) -> exp_multi_row
df = orig.copy()
df.loc["j":"k", :] = [["b", 2], ["b", 2]]
tm.assert_frame_equal(df, exp_multi_row)
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.loc["j":"k", :] = [["c", 2], ["c", 2]]
# assign a part of a column with dtype == categorical ->
# exp_parts_cats_col
df = orig.copy()
df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_frame_equal(df, exp_parts_cats_col)
with pytest.raises(ValueError, match=msg2):
# different categories -> not sure if this should fail or pass
df = orig.copy()
df.loc["j":"k", df.columns[0]] = Categorical(
["b", "b"], categories=["a", "b", "c"]
)
with pytest.raises(ValueError, match=msg2):
# different values
df = orig.copy()
df.loc["j":"k", df.columns[0]] = Categorical(
["c", "c"], categories=["a", "b", "c"]
)
# assign a part of a column with dtype != categorical ->
# exp_parts_cats_col
df = orig.copy()
df.loc["j":"k", df.columns[0]] = ["b", "b"]
tm.assert_frame_equal(df, exp_parts_cats_col)
with pytest.raises(ValueError, match=msg1):
df.loc["j":"k", df.columns[0]] = ["c", "c"]
# iat
df = orig.copy()
df.iat[2, 0] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
# - assign a single value not in the current categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.iat[2, 0] = "c"
# at
# - assign a single value -> exp_single_cats_value
df = orig.copy()
df.at["j", "cats"] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
# - assign a single value not in the current categories set
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.at["j", "cats"] = "c"
# fancy indexing
catsf = Categorical(
["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]
)
idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
valuesf = [1, 1, 3, 3, 1, 1, 1]
df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
exp_fancy = exp_multi_row.copy()
return_value = exp_fancy["cats"].cat.set_categories(
["a", "b", "c"], inplace=True
)
assert return_value is None
df[df["cats"] == "c"] = ["b", 2]
# category c is kept in .categories
tm.assert_frame_equal(df, exp_fancy)
# set_value
df = orig.copy()
df.at["j", "cats"] = "b"
tm.assert_frame_equal(df, exp_single_cats_value)
with pytest.raises(ValueError, match=msg1):
df = orig.copy()
df.at["j", "cats"] = "c"
# Assigning a Category to parts of a int/... column uses the values of
# the Categorical
df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_frame_equal(df, exp)
def test_loc_setitem_single_row_categorical(self):
# GH 25495
df = DataFrame({"Alpha": ["a"], "Numeric": [0]})
categories = Categorical(df["Alpha"], categories=["a", "b", "c"])
df.loc[:, "Alpha"] = categories
result = df["Alpha"]
expected = Series(categories, index=df.index, name="Alpha")
tm.assert_series_equal(result, expected)
def test_loc_indexing_preserves_index_category_dtype(self):
# GH 15166
df = DataFrame(
data=np.arange(2, 22, 2),
index=pd.MultiIndex(
levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
codes=[[0] * 5 + [1] * 5, range(10)],
names=["Index1", "Index2"],
),
)
expected = pd.CategoricalIndex(
["a", "b"],
categories=["a", "b"],
ordered=False,
name="Index1",
dtype="category",
)
result = df.index.levels[0]
tm.assert_index_equal(result, expected)
result = df.loc[["a"]].index.levels[0]
tm.assert_index_equal(result, expected)