import numpy as np import pytest from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import Categorical, DataFrame, Index, Series import pandas._testing as tm class TestDataFrameIndexingCategorical: def test_assignment(self): # assignment df = DataFrame( {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} ) labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) df = df.sort_values(by=["value"], ascending=True) s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) d = s.values df["D"] = d str(df) result = df.dtypes expected = Series( [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], index=["value", "D"], ) tm.assert_series_equal(result, expected) df["E"] = s str(df) result = df.dtypes expected = Series( [ np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False), CategoricalDtype(categories=labels, ordered=False), ], index=["value", "D", "E"], ) tm.assert_series_equal(result, expected) result1 = df["D"] result2 = df["E"] tm.assert_categorical_equal(result1._mgr._block.values, d) # sorting s.name = "E" tm.assert_series_equal(result2.sort_index(), s.sort_index()) cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) df = DataFrame(Series(cat)) def test_assigning_ops(self): # systematically test the assigning operations: # for all slicing ops: # for value in categories and value not in categories: # - assign a single value -> exp_single_cats_value # - assign a complete row (mixed values) -> exp_single_row # assign multiple rows (mixed values) (-> array) -> exp_multi_row # assign a part of a column with dtype == categorical -> # exp_parts_cats_col # assign a part of a column with dtype != categorical -> # exp_parts_cats_col cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = DataFrame({"cats": cats, "values": values}, index=idx) # the expected values # changed single row cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) values1 = [1, 1, 2, 1, 1, 1, 1] exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) # changed multiple rows cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) # changed part of the cats column cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) values3 = [1, 1, 1, 1, 1, 1, 1] exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) # changed single value in cats col cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) values4 = [1, 1, 1, 1, 1, 1, 1] exp_single_cats_value = DataFrame( {"cats": cats4, "values": values4}, index=idx4 ) # iloc # ############### # - assign a single value -> exp_single_cats_value df = orig.copy() df.iloc[2, 0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() df.iloc[df.index == "j", 0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set msg1 = ( "Cannot setitem on a Categorical with a new category, " "set the categories first" ) msg2 = "Cannot set a Categorical with another, without identical categories" with pytest.raises(ValueError, match=msg1): df = orig.copy() df.iloc[2, 0] = "c" # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() df.iloc[2, :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.iloc[2, :] = ["c", 2] # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() df.iloc[2:4, :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) with pytest.raises(ValueError, match=msg1): df = orig.copy() df.iloc[2:4, :] = [["c", 2], ["c", 2]] # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError, match=msg2): # different categories -> not sure if this should fail or pass df = orig.copy() df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) with pytest.raises(ValueError, match=msg2): # different values df = orig.copy() df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() df.iloc[2:4, 0] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError, match=msg1): df.iloc[2:4, 0] = ["c", "c"] # loc # ############## # - assign a single value -> exp_single_cats_value df = orig.copy() df.loc["j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() df.loc[df.index == "j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.loc["j", "cats"] = "c" # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() df.loc["j", :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.loc["j", :] = ["c", 2] # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() df.loc["j":"k", :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) with pytest.raises(ValueError, match=msg1): df = orig.copy() df.loc["j":"k", :] = [["c", 2], ["c", 2]] # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError, match=msg2): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", "cats"] = Categorical( ["b", "b"], categories=["a", "b", "c"] ) with pytest.raises(ValueError, match=msg2): # different values df = orig.copy() df.loc["j":"k", "cats"] = Categorical( ["c", "c"], categories=["a", "b", "c"] ) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", "cats"] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError, match=msg1): df.loc["j":"k", "cats"] = ["c", "c"] # loc # ############## # - assign a single value -> exp_single_cats_value df = orig.copy() df.loc["j", df.columns[0]] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() df.loc[df.index == "j", df.columns[0]] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.loc["j", df.columns[0]] = "c" # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() df.loc["j", :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.loc["j", :] = ["c", 2] # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() df.loc["j":"k", :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) with pytest.raises(ValueError, match=msg1): df = orig.copy() df.loc["j":"k", :] = [["c", 2], ["c", 2]] # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError, match=msg2): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical( ["b", "b"], categories=["a", "b", "c"] ) with pytest.raises(ValueError, match=msg2): # different values df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical( ["c", "c"], categories=["a", "b", "c"] ) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", df.columns[0]] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError, match=msg1): df.loc["j":"k", df.columns[0]] = ["c", "c"] # iat df = orig.copy() df.iat[2, 0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.iat[2, 0] = "c" # at # - assign a single value -> exp_single_cats_value df = orig.copy() df.at["j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError, match=msg1): df = orig.copy() df.at["j", "cats"] = "c" # fancy indexing catsf = Categorical( ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] ) idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) exp_fancy = exp_multi_row.copy() return_value = exp_fancy["cats"].cat.set_categories( ["a", "b", "c"], inplace=True ) assert return_value is None df[df["cats"] == "c"] = ["b", 2] # category c is kept in .categories tm.assert_frame_equal(df, exp_fancy) # set_value df = orig.copy() df.at["j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) with pytest.raises(ValueError, match=msg1): df = orig.copy() df.at["j", "cats"] = "c" # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) def test_loc_setitem_single_row_categorical(self): # GH 25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) df.loc[:, "Alpha"] = categories result = df["Alpha"] expected = Series(categories, index=df.index, name="Alpha") tm.assert_series_equal(result, expected) def test_loc_indexing_preserves_index_category_dtype(self): # GH 15166 df = DataFrame( data=np.arange(2, 22, 2), index=pd.MultiIndex( levels=[pd.CategoricalIndex(["a", "b"]), range(10)], codes=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"], ), ) expected = pd.CategoricalIndex( ["a", "b"], categories=["a", "b"], ordered=False, name="Index1", dtype="category", ) result = df.index.levels[0] tm.assert_index_equal(result, expected) result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected)