430 lines
15 KiB
Python
430 lines
15 KiB
Python
import numpy as np
|
|
import pytest
|
|
|
|
from pandas._libs import index as libindex
|
|
|
|
import pandas as pd
|
|
from pandas import Categorical
|
|
import pandas._testing as tm
|
|
from pandas.core.indexes.api import CategoricalIndex, Index
|
|
|
|
from ..common import Base
|
|
|
|
|
|
class TestCategoricalIndex(Base):
|
|
_holder = CategoricalIndex
|
|
|
|
@pytest.fixture
|
|
def index(self, request):
|
|
return tm.makeCategoricalIndex(100)
|
|
|
|
def create_index(self, categories=None, ordered=False):
|
|
if categories is None:
|
|
categories = list("cab")
|
|
return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered)
|
|
|
|
def test_can_hold_identifiers(self):
|
|
idx = self.create_index(categories=list("abcd"))
|
|
key = idx[0]
|
|
assert idx._can_hold_identifiers_and_holds_name(key) is True
|
|
|
|
def test_append(self):
|
|
|
|
ci = self.create_index()
|
|
categories = ci.categories
|
|
|
|
# append cats with the same categories
|
|
result = ci[:3].append(ci[3:])
|
|
tm.assert_index_equal(result, ci, exact=True)
|
|
|
|
foos = [ci[:1], ci[1:3], ci[3:]]
|
|
result = foos[0].append(foos[1:])
|
|
tm.assert_index_equal(result, ci, exact=True)
|
|
|
|
# empty
|
|
result = ci.append([])
|
|
tm.assert_index_equal(result, ci, exact=True)
|
|
|
|
# appending with different categories or reordered is not ok
|
|
msg = "all inputs must be Index"
|
|
with pytest.raises(TypeError, match=msg):
|
|
ci.append(ci.values.set_categories(list("abcd")))
|
|
with pytest.raises(TypeError, match=msg):
|
|
ci.append(ci.values.reorder_categories(list("abc")))
|
|
|
|
# with objects
|
|
result = ci.append(Index(["c", "a"]))
|
|
expected = CategoricalIndex(list("aabbcaca"), categories=categories)
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
# invalid objects -> cast to object via concat_compat
|
|
result = ci.append(Index(["a", "d"]))
|
|
expected = Index(["a", "a", "b", "b", "c", "a", "a", "d"])
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
# GH14298 - if base object is not categorical -> coerce to object
|
|
result = Index(["c", "a"]).append(ci)
|
|
expected = Index(list("caaabbca"))
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
def test_append_to_another(self):
|
|
# hits Index._concat
|
|
fst = Index(["a", "b"])
|
|
snd = CategoricalIndex(["d", "e"])
|
|
result = fst.append(snd)
|
|
expected = Index(["a", "b", "d", "e"])
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_insert(self):
|
|
|
|
ci = self.create_index()
|
|
categories = ci.categories
|
|
|
|
# test 0th element
|
|
result = ci.insert(0, "a")
|
|
expected = CategoricalIndex(list("aaabbca"), categories=categories)
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
# test Nth element that follows Python list behavior
|
|
result = ci.insert(-1, "a")
|
|
expected = CategoricalIndex(list("aabbcaa"), categories=categories)
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
# test empty
|
|
result = CategoricalIndex(categories=categories).insert(0, "a")
|
|
expected = CategoricalIndex(["a"], categories=categories)
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
# invalid
|
|
msg = "'fill_value=d' is not present in this Categorical's categories"
|
|
with pytest.raises(TypeError, match=msg):
|
|
ci.insert(0, "d")
|
|
|
|
# GH 18295 (test missing)
|
|
expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"])
|
|
for na in (np.nan, pd.NaT, None):
|
|
result = CategoricalIndex(list("aabcb")).insert(1, na)
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_insert_na_mismatched_dtype(self):
|
|
ci = CategoricalIndex([0, 1, 1])
|
|
msg = "'fill_value=NaT' is not present in this Categorical's categories"
|
|
with pytest.raises(TypeError, match=msg):
|
|
ci.insert(0, pd.NaT)
|
|
|
|
def test_delete(self):
|
|
|
|
ci = self.create_index()
|
|
categories = ci.categories
|
|
|
|
result = ci.delete(0)
|
|
expected = CategoricalIndex(list("abbca"), categories=categories)
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
result = ci.delete(-1)
|
|
expected = CategoricalIndex(list("aabbc"), categories=categories)
|
|
tm.assert_index_equal(result, expected, exact=True)
|
|
|
|
with tm.external_error_raised((IndexError, ValueError)):
|
|
# Either depending on NumPy version
|
|
ci.delete(10)
|
|
|
|
@pytest.mark.parametrize(
|
|
"data, non_lexsorted_data",
|
|
[[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]],
|
|
)
|
|
def test_is_monotonic(self, data, non_lexsorted_data):
|
|
c = CategoricalIndex(data)
|
|
assert c.is_monotonic_increasing is True
|
|
assert c.is_monotonic_decreasing is False
|
|
|
|
c = CategoricalIndex(data, ordered=True)
|
|
assert c.is_monotonic_increasing is True
|
|
assert c.is_monotonic_decreasing is False
|
|
|
|
c = CategoricalIndex(data, categories=reversed(data))
|
|
assert c.is_monotonic_increasing is False
|
|
assert c.is_monotonic_decreasing is True
|
|
|
|
c = CategoricalIndex(data, categories=reversed(data), ordered=True)
|
|
assert c.is_monotonic_increasing is False
|
|
assert c.is_monotonic_decreasing is True
|
|
|
|
# test when data is neither monotonic increasing nor decreasing
|
|
reordered_data = [data[0], data[2], data[1]]
|
|
c = CategoricalIndex(reordered_data, categories=reversed(data))
|
|
assert c.is_monotonic_increasing is False
|
|
assert c.is_monotonic_decreasing is False
|
|
|
|
# non lexsorted categories
|
|
categories = non_lexsorted_data
|
|
|
|
c = CategoricalIndex(categories[:2], categories=categories)
|
|
assert c.is_monotonic_increasing is True
|
|
assert c.is_monotonic_decreasing is False
|
|
|
|
c = CategoricalIndex(categories[1:3], categories=categories)
|
|
assert c.is_monotonic_increasing is True
|
|
assert c.is_monotonic_decreasing is False
|
|
|
|
def test_has_duplicates(self):
|
|
idx = CategoricalIndex([0, 0, 0], name="foo")
|
|
assert idx.is_unique is False
|
|
assert idx.has_duplicates is True
|
|
|
|
idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo")
|
|
assert idx.is_unique is False
|
|
assert idx.has_duplicates is True
|
|
|
|
idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo")
|
|
assert idx.is_unique is True
|
|
assert idx.has_duplicates is False
|
|
|
|
@pytest.mark.parametrize(
|
|
"data, categories, expected",
|
|
[
|
|
(
|
|
[1, 1, 1],
|
|
[1, 2, 3],
|
|
{
|
|
"first": np.array([False, True, True]),
|
|
"last": np.array([True, True, False]),
|
|
False: np.array([True, True, True]),
|
|
},
|
|
),
|
|
(
|
|
[1, 1, 1],
|
|
list("abc"),
|
|
{
|
|
"first": np.array([False, True, True]),
|
|
"last": np.array([True, True, False]),
|
|
False: np.array([True, True, True]),
|
|
},
|
|
),
|
|
(
|
|
[2, "a", "b"],
|
|
list("abc"),
|
|
{
|
|
"first": np.zeros(shape=(3), dtype=np.bool_),
|
|
"last": np.zeros(shape=(3), dtype=np.bool_),
|
|
False: np.zeros(shape=(3), dtype=np.bool_),
|
|
},
|
|
),
|
|
(
|
|
list("abb"),
|
|
list("abc"),
|
|
{
|
|
"first": np.array([False, False, True]),
|
|
"last": np.array([False, True, False]),
|
|
False: np.array([False, True, True]),
|
|
},
|
|
),
|
|
],
|
|
)
|
|
def test_drop_duplicates(self, data, categories, expected):
|
|
|
|
idx = CategoricalIndex(data, categories=categories, name="foo")
|
|
for keep, e in expected.items():
|
|
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e)
|
|
e = idx[~e]
|
|
result = idx.drop_duplicates(keep=keep)
|
|
tm.assert_index_equal(result, e)
|
|
|
|
@pytest.mark.parametrize(
|
|
"data, categories, expected_data, expected_categories",
|
|
[
|
|
([1, 1, 1], [1, 2, 3], [1], [1]),
|
|
([1, 1, 1], list("abc"), [np.nan], []),
|
|
([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
|
|
([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
|
|
],
|
|
)
|
|
def test_unique(self, data, categories, expected_data, expected_categories):
|
|
|
|
idx = CategoricalIndex(data, categories=categories)
|
|
expected = CategoricalIndex(expected_data, categories=expected_categories)
|
|
tm.assert_index_equal(idx.unique(), expected)
|
|
|
|
def test_repr_roundtrip(self):
|
|
|
|
ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
|
|
str(ci)
|
|
tm.assert_index_equal(eval(repr(ci)), ci, exact=True)
|
|
|
|
# formatting
|
|
str(ci)
|
|
|
|
# long format
|
|
# this is not reprable
|
|
ci = CategoricalIndex(np.random.randint(0, 5, size=100))
|
|
str(ci)
|
|
|
|
def test_isin(self):
|
|
|
|
ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"])
|
|
tm.assert_numpy_array_equal(
|
|
ci.isin(["c"]), np.array([False, False, False, True, False, False])
|
|
)
|
|
tm.assert_numpy_array_equal(
|
|
ci.isin(["c", "a", "b"]), np.array([True] * 5 + [False])
|
|
)
|
|
tm.assert_numpy_array_equal(
|
|
ci.isin(["c", "a", "b", np.nan]), np.array([True] * 6)
|
|
)
|
|
|
|
# mismatched categorical -> coerced to ndarray so doesn't matter
|
|
result = ci.isin(ci.set_categories(list("abcdefghi")))
|
|
expected = np.array([True] * 6)
|
|
tm.assert_numpy_array_equal(result, expected)
|
|
|
|
result = ci.isin(ci.set_categories(list("defghi")))
|
|
expected = np.array([False] * 5 + [True])
|
|
tm.assert_numpy_array_equal(result, expected)
|
|
|
|
def test_identical(self):
|
|
|
|
ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
|
|
ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True)
|
|
assert ci1.identical(ci1)
|
|
assert ci1.identical(ci1.copy())
|
|
assert not ci1.identical(ci2)
|
|
|
|
def test_ensure_copied_data(self, index):
|
|
# gh-12309: Check the "copy" argument of each
|
|
# Index.__new__ is honored.
|
|
#
|
|
# Must be tested separately from other indexes because
|
|
# self.values is not an ndarray.
|
|
# GH#29918 Index.base has been removed
|
|
# FIXME: is this test still meaningful?
|
|
_base = lambda ar: ar if getattr(ar, "base", None) is None else ar.base
|
|
|
|
result = CategoricalIndex(index.values, copy=True)
|
|
tm.assert_index_equal(index, result)
|
|
assert _base(index.values) is not _base(result.values)
|
|
|
|
result = CategoricalIndex(index.values, copy=False)
|
|
assert _base(index.values) is _base(result.values)
|
|
|
|
def test_frame_repr(self):
|
|
df = pd.DataFrame({"A": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"]))
|
|
result = repr(df)
|
|
expected = " A\na 1\nb 2\nc 3"
|
|
assert result == expected
|
|
|
|
def test_reindex_base(self):
|
|
# See test_reindex.py
|
|
pass
|
|
|
|
def test_map_str(self):
|
|
# See test_map.py
|
|
pass
|
|
|
|
|
|
class TestCategoricalIndex2:
|
|
# Tests that are not overriding a test in Base
|
|
|
|
def test_format_different_scalar_lengths(self):
|
|
# GH35439
|
|
idx = CategoricalIndex(["aaaaaaaaa", "b"])
|
|
expected = ["aaaaaaaaa", "b"]
|
|
assert idx.format() == expected
|
|
|
|
@pytest.mark.parametrize(
|
|
"dtype, engine_type",
|
|
[
|
|
(np.int8, libindex.Int8Engine),
|
|
(np.int16, libindex.Int16Engine),
|
|
(np.int32, libindex.Int32Engine),
|
|
(np.int64, libindex.Int64Engine),
|
|
],
|
|
)
|
|
def test_engine_type(self, dtype, engine_type):
|
|
if dtype != np.int64:
|
|
# num. of uniques required to push CategoricalIndex.codes to a
|
|
# dtype (128 categories required for .codes dtype to be int16 etc.)
|
|
num_uniques = {np.int8: 1, np.int16: 128, np.int32: 32768}[dtype]
|
|
ci = CategoricalIndex(range(num_uniques))
|
|
else:
|
|
# having 2**32 - 2**31 categories would be very memory-intensive,
|
|
# so we cheat a bit with the dtype
|
|
ci = CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1)
|
|
ci.values._codes = ci.values._codes.astype("int64")
|
|
assert np.issubdtype(ci.codes.dtype, dtype)
|
|
assert isinstance(ci._engine, engine_type)
|
|
|
|
@pytest.mark.parametrize(
|
|
"func,op_name",
|
|
[
|
|
(lambda idx: idx - idx, "__sub__"),
|
|
(lambda idx: idx + idx, "__add__"),
|
|
(lambda idx: idx - ["a", "b"], "__sub__"),
|
|
(lambda idx: idx + ["a", "b"], "__add__"),
|
|
(lambda idx: ["a", "b"] - idx, "__rsub__"),
|
|
(lambda idx: ["a", "b"] + idx, "__radd__"),
|
|
],
|
|
)
|
|
def test_disallow_addsub_ops(self, func, op_name):
|
|
# GH 10039
|
|
# set ops (+/-) raise TypeError
|
|
idx = Index(Categorical(["a", "b"]))
|
|
cat_or_list = "'(Categorical|list)' and '(Categorical|list)'"
|
|
msg = "|".join(
|
|
[
|
|
f"cannot perform {op_name} with this index type: CategoricalIndex",
|
|
"can only concatenate list",
|
|
rf"unsupported operand type\(s\) for [\+-]: {cat_or_list}",
|
|
]
|
|
)
|
|
with pytest.raises(TypeError, match=msg):
|
|
func(idx)
|
|
|
|
def test_method_delegation(self):
|
|
|
|
ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"))
|
|
result = ci.set_categories(list("cab"))
|
|
tm.assert_index_equal(
|
|
result, CategoricalIndex(list("aabbca"), categories=list("cab"))
|
|
)
|
|
|
|
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
|
|
result = ci.rename_categories(list("efg"))
|
|
tm.assert_index_equal(
|
|
result, CategoricalIndex(list("ffggef"), categories=list("efg"))
|
|
)
|
|
|
|
# GH18862 (let rename_categories take callables)
|
|
result = ci.rename_categories(lambda x: x.upper())
|
|
tm.assert_index_equal(
|
|
result, CategoricalIndex(list("AABBCA"), categories=list("CAB"))
|
|
)
|
|
|
|
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
|
|
result = ci.add_categories(["d"])
|
|
tm.assert_index_equal(
|
|
result, CategoricalIndex(list("aabbca"), categories=list("cabd"))
|
|
)
|
|
|
|
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
|
|
result = ci.remove_categories(["c"])
|
|
tm.assert_index_equal(
|
|
result,
|
|
CategoricalIndex(list("aabb") + [np.nan] + ["a"], categories=list("ab")),
|
|
)
|
|
|
|
ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"))
|
|
result = ci.as_unordered()
|
|
tm.assert_index_equal(result, ci)
|
|
|
|
ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"))
|
|
result = ci.as_ordered()
|
|
tm.assert_index_equal(
|
|
result,
|
|
CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=True),
|
|
)
|
|
|
|
# invalid
|
|
msg = "cannot use inplace with CategoricalIndex"
|
|
with pytest.raises(ValueError, match=msg):
|
|
ci.set_categories(list("cab"), inplace=True)
|