# Source-viewer metadata (not part of the module): 217 lines, 7.6 KiB, Python
# Imports for the Categorical missing-value tests.
# Grouping follows the order used in the original file: standard library,
# third-party test/array libraries, pandas internals, then the public pandas
# namespace and its testing helpers.
import collections

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Index,
    Series,
    isna,
)
import pandas._testing as tm


class TestCategoricalMissing:
|
|
def test_isna(self):
|
|
exp = np.array([False, False, True])
|
|
cat = Categorical(["a", "b", np.nan])
|
|
res = cat.isna()
|
|
|
|
tm.assert_numpy_array_equal(res, exp)
|
|
|
|
def test_na_flags_int_categories(self):
|
|
# #1457
|
|
|
|
categories = list(range(10))
|
|
labels = np.random.default_rng(2).integers(0, 10, 20)
|
|
labels[::5] = -1
|
|
|
|
cat = Categorical(labels, categories)
|
|
repr(cat)
|
|
|
|
tm.assert_numpy_array_equal(isna(cat), labels == -1)
|
|
|
|
def test_nan_handling(self):
|
|
# Nans are represented as -1 in codes
|
|
c = Categorical(["a", "b", np.nan, "a"])
|
|
tm.assert_index_equal(c.categories, Index(["a", "b"]))
|
|
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
|
|
c[1] = np.nan
|
|
tm.assert_index_equal(c.categories, Index(["a", "b"]))
|
|
tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8))
|
|
|
|
# Adding nan to categories should make assigned nan point to the
|
|
# category!
|
|
c = Categorical(["a", "b", np.nan, "a"])
|
|
tm.assert_index_equal(c.categories, Index(["a", "b"]))
|
|
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
|
|
|
|
def test_set_dtype_nans(self):
|
|
c = Categorical(["a", "b", np.nan])
|
|
result = c._set_dtype(CategoricalDtype(["a", "c"]))
|
|
tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))
|
|
|
|
def test_set_item_nan(self):
|
|
cat = Categorical([1, 2, 3])
|
|
cat[1] = np.nan
|
|
|
|
exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
|
|
tm.assert_categorical_equal(cat, exp)
|
|
|
|
@pytest.mark.parametrize(
|
|
"fillna_kwargs, msg",
|
|
[
|
|
(
|
|
{"value": 1, "method": "ffill"},
|
|
"Cannot specify both 'value' and 'method'.",
|
|
),
|
|
({}, "Must specify a fill 'value' or 'method'."),
|
|
({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
|
|
(
|
|
{"value": Series([1, 2, 3, 4, "a"])},
|
|
"Cannot setitem on a Categorical with a new category",
|
|
),
|
|
],
|
|
)
|
|
def test_fillna_raises(self, fillna_kwargs, msg):
|
|
# https://github.com/pandas-dev/pandas/issues/19682
|
|
# https://github.com/pandas-dev/pandas/issues/13628
|
|
cat = Categorical([1, 2, 3, None, None])
|
|
|
|
if len(fillna_kwargs) == 1 and "value" in fillna_kwargs:
|
|
err = TypeError
|
|
else:
|
|
err = ValueError
|
|
|
|
with pytest.raises(err, match=msg):
|
|
cat.fillna(**fillna_kwargs)
|
|
|
|
@pytest.mark.parametrize("named", [True, False])
|
|
def test_fillna_iterable_category(self, named):
|
|
# https://github.com/pandas-dev/pandas/issues/21097
|
|
if named:
|
|
Point = collections.namedtuple("Point", "x y")
|
|
else:
|
|
Point = lambda *args: args # tuple
|
|
cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
|
|
result = cat.fillna(Point(0, 0))
|
|
expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
|
|
|
|
tm.assert_categorical_equal(result, expected)
|
|
|
|
# Case where the Point is not among our categories; we want ValueError,
|
|
# not NotImplementedError GH#41914
|
|
cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
|
|
msg = "Cannot setitem on a Categorical with a new category"
|
|
with pytest.raises(TypeError, match=msg):
|
|
cat.fillna(Point(0, 0))
|
|
|
|
def test_fillna_array(self):
|
|
# accept Categorical or ndarray value if it holds appropriate values
|
|
cat = Categorical(["A", "B", "C", None, None])
|
|
|
|
other = cat.fillna("C")
|
|
result = cat.fillna(other)
|
|
tm.assert_categorical_equal(result, other)
|
|
assert isna(cat[-1]) # didn't modify original inplace
|
|
|
|
other = np.array(["A", "B", "C", "B", "A"])
|
|
result = cat.fillna(other)
|
|
expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
|
|
tm.assert_categorical_equal(result, expected)
|
|
assert isna(cat[-1]) # didn't modify original inplace
|
|
|
|
@pytest.mark.parametrize(
|
|
"values, expected",
|
|
[
|
|
([1, 2, 3], np.array([False, False, False])),
|
|
([1, 2, np.nan], np.array([False, False, True])),
|
|
([1, 2, np.inf], np.array([False, False, True])),
|
|
([1, 2, pd.NA], np.array([False, False, True])),
|
|
],
|
|
)
|
|
def test_use_inf_as_na(self, values, expected):
|
|
# https://github.com/pandas-dev/pandas/issues/33594
|
|
msg = "use_inf_as_na option is deprecated"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
with pd.option_context("mode.use_inf_as_na", True):
|
|
cat = Categorical(values)
|
|
result = cat.isna()
|
|
tm.assert_numpy_array_equal(result, expected)
|
|
|
|
result = Series(cat).isna()
|
|
expected = Series(expected)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = DataFrame(cat).isna()
|
|
expected = DataFrame(expected)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"values, expected",
|
|
[
|
|
([1, 2, 3], np.array([False, False, False])),
|
|
([1, 2, np.nan], np.array([False, False, True])),
|
|
([1, 2, np.inf], np.array([False, False, True])),
|
|
([1, 2, pd.NA], np.array([False, False, True])),
|
|
],
|
|
)
|
|
def test_use_inf_as_na_outside_context(self, values, expected):
|
|
# https://github.com/pandas-dev/pandas/issues/33594
|
|
# Using isna directly for Categorical will fail in general here
|
|
cat = Categorical(values)
|
|
|
|
msg = "use_inf_as_na option is deprecated"
|
|
with tm.assert_produces_warning(FutureWarning, match=msg):
|
|
with pd.option_context("mode.use_inf_as_na", True):
|
|
result = isna(cat)
|
|
tm.assert_numpy_array_equal(result, expected)
|
|
|
|
result = isna(Series(cat))
|
|
expected = Series(expected)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = isna(DataFrame(cat))
|
|
expected = DataFrame(expected)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"a1, a2, categories",
|
|
[
|
|
(["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
|
|
([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
|
|
],
|
|
)
|
|
def test_compare_categorical_with_missing(self, a1, a2, categories):
|
|
# GH 28384
|
|
cat_type = CategoricalDtype(categories)
|
|
|
|
# !=
|
|
result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type)
|
|
expected = Series(a1) != Series(a2)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# ==
|
|
result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type)
|
|
expected = Series(a1) == Series(a2)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"na_value, dtype",
|
|
[
|
|
(pd.NaT, "datetime64[ns]"),
|
|
(None, "float64"),
|
|
(np.nan, "float64"),
|
|
(pd.NA, "float64"),
|
|
],
|
|
)
|
|
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
|
|
# GH#44900
|
|
result = Categorical([na_value, na_value])
|
|
tm.assert_index_equal(result.categories, Index([], dtype=dtype))
|