import numpy as np import pytest from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp import pandas._testing as tm class TestCategoricalDtypes: def test_is_dtype_equal_deprecated(self): # GH#37545 c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) with tm.assert_produces_warning(FutureWarning): c1.is_dtype_equal(c1) def test_categories_match_up_to_permutation(self): # test dtype comparisons between cats c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False) c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) assert c1._categories_match_up_to_permutation(c1) assert c2._categories_match_up_to_permutation(c2) assert c3._categories_match_up_to_permutation(c3) assert c1._categories_match_up_to_permutation(c2) assert not c1._categories_match_up_to_permutation(c3) assert not c1._categories_match_up_to_permutation(Index(list("aabca"))) assert not c1._categories_match_up_to_permutation(c1.astype(object)) assert c1._categories_match_up_to_permutation(CategoricalIndex(c1)) assert c1._categories_match_up_to_permutation( CategoricalIndex(c1, categories=list("cab")) ) assert not c1._categories_match_up_to_permutation( CategoricalIndex(c1, ordered=True) ) # GH 16659 s1 = Series(c1) s2 = Series(c2) s3 = Series(c3) assert c1._categories_match_up_to_permutation(s1) assert c2._categories_match_up_to_permutation(s2) assert c3._categories_match_up_to_permutation(s3) assert c1._categories_match_up_to_permutation(s2) assert not c1._categories_match_up_to_permutation(s3) assert not c1._categories_match_up_to_permutation(s1.astype(object)) def test_set_dtype_same(self): c = Categorical(["a", "b", "c"]) result = c._set_dtype(CategoricalDtype(["a", "b", "c"])) tm.assert_categorical_equal(result, c) def test_set_dtype_new_categories(self): c = Categorical(["a", "b", "c"]) result = c._set_dtype(CategoricalDtype(list("abcd"))) tm.assert_numpy_array_equal(result.codes, c.codes) tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) @pytest.mark.parametrize( "values, categories, new_categories", [ # No NaNs, same cats, same order (["a", "b", "a"], ["a", "b"], ["a", "b"]), # No NaNs, same cats, different order (["a", "b", "a"], ["a", "b"], ["b", "a"]), # Same, unsorted (["b", "a", "a"], ["a", "b"], ["a", "b"]), # No NaNs, same cats, different order (["b", "a", "a"], ["a", "b"], ["b", "a"]), # NaNs (["a", "b", "c"], ["a", "b"], ["a", "b"]), (["a", "b", "c"], ["a", "b"], ["b", "a"]), (["b", "a", "c"], ["a", "b"], ["a", "b"]), (["b", "a", "c"], ["a", "b"], ["a", "b"]), # Introduce NaNs (["a", "b", "c"], ["a", "b"], ["a"]), (["a", "b", "c"], ["a", "b"], ["b"]), (["b", "a", "c"], ["a", "b"], ["a"]), (["b", "a", "c"], ["a", "b"], ["a"]), # No overlap (["a", "b", "c"], ["a", "b"], ["d", "e"]), ], ) @pytest.mark.parametrize("ordered", [True, False]) def test_set_dtype_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) result = c._set_dtype(expected.dtype) tm.assert_categorical_equal(result, expected) def test_set_dtype_no_overlap(self): c = Categorical(["a", "b", "c"], ["d", "e"]) result = c._set_dtype(CategoricalDtype(["a", "b"])) expected = Categorical([None, None, None], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) def test_codes_dtypes(self): # GH 8453 result = Categorical(["foo", "bar", "baz"]) assert result.codes.dtype == "int8" result = Categorical([f"foo{i:05d}" for i in range(400)]) assert result.codes.dtype == "int16" result = Categorical([f"foo{i:05d}" for i in range(40000)]) assert result.codes.dtype == "int32" # adding cats result = Categorical(["foo", "bar", "baz"]) assert result.codes.dtype == "int8" result = result.add_categories([f"foo{i:05d}" for i in range(400)]) assert result.codes.dtype == "int16" # removing cats result = result.remove_categories([f"foo{i:05d}" for i in range(300)]) assert result.codes.dtype == "int8" @pytest.mark.parametrize("ordered", [True, False]) def test_astype(self, ordered): # string cat = Categorical(list("abbaaccc"), ordered=ordered) result = cat.astype(object) expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) msg = r"Cannot cast object dtype to " with pytest.raises(ValueError, match=msg): cat.astype(float) # numeric cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered) result = cat.astype(object) expected = np.array(cat, dtype=object) tm.assert_numpy_array_equal(result, expected) result = cat.astype(int) expected = np.array(cat, dtype="int") tm.assert_numpy_array_equal(result, expected) result = cat.astype(float) expected = np.array(cat, dtype=float) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype_ordered", [True, False]) @pytest.mark.parametrize("cat_ordered", [True, False]) def test_astype_category(self, dtype_ordered, cat_ordered): # GH 10696/18593 data = list("abcaacbab") cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = cat.astype(dtype) expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered) tm.assert_categorical_equal(result, expected) # non-standard categories dtype = CategoricalDtype(list("adc"), dtype_ordered) result = cat.astype(dtype) expected = Categorical(data, dtype=dtype) tm.assert_categorical_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once result = cat.astype("category") expected = cat tm.assert_categorical_equal(result, expected) def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) assert isinstance(list(cat)[0], int) assert isinstance(cat.tolist()[0], int) def test_iter_python_types_datetime(self): cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")]) assert isinstance(list(cat)[0], Timestamp) assert isinstance(cat.tolist()[0], Timestamp)