import re

import numpy as np
import pytest
import pytz

from pandas.core.dtypes.base import registry
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_categorical,
    is_categorical_dtype,
    is_datetime64_any_dtype,
    is_datetime64_dtype,
    is_datetime64_ns_dtype,
    is_datetime64tz_dtype,
    is_dtype_equal,
    is_interval_dtype,
    is_period_dtype,
    is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
    IntervalDtype,
    PeriodDtype,
)

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DatetimeIndex,
    IntervalIndex,
    Series,
    date_range,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray, SparseDtype


class Base:
    """
    Checks shared by every dtype test class below.

    Subclasses provide the ``dtype`` fixture that these tests consume.
    """

    def test_hash(self, dtype):
        """The dtype can be hashed without raising."""
        hash(dtype)

    def test_equality_invalid(self, dtype):
        """The dtype is not equal to an unrelated string or to np.int64."""
        assert not dtype == "foo"
        assert not is_dtype_equal(dtype, np.int64)

    def test_numpy_informed(self, dtype):
        """np.dtype rejects the dtype; comparisons with np.str_ are False."""
        # npdev 2020-02-02 changed from "data type not understood" to
        #  "Cannot interpret 'foo' as a data type"
        msg = "|".join(
            ["data type not understood", "Cannot interpret '.*' as a data type"]
        )
        with pytest.raises(TypeError, match=msg):
            np.dtype(dtype)

        # exercise __eq__ from both sides
        assert not dtype == np.str_
        assert not np.str_ == dtype

    def test_pickle(self, dtype):
        """A pickle round trip gives back an equal dtype and leaves the cache empty."""
        # make sure our cache is NOT pickled

        # clear the cache
        type(dtype).reset_cache()
        assert not len(dtype._cache)

        # force back to the cache
        result = tm.round_trip_pickle(dtype)
        if not isinstance(dtype, PeriodDtype):
            # Because PeriodDtype has a cython class as a base class,
            #  it has different pickle semantics, and its cache is re-populated
            #  on un-pickling.
            assert not len(dtype._cache)
        assert result == dtype


class TestCategoricalDtype(Base):
    """Tests for CategoricalDtype construction, equality and introspection."""

    @pytest.fixture
    def dtype(self):
        """
        Class level fixture of dtype for TestCategoricalDtype
        """
        return CategoricalDtype()

    def test_hash_vs_equality(self, dtype):
        """Two default CategoricalDtype instances are equal and hash alike."""
        dtype2 = CategoricalDtype()
        assert dtype == dtype2
        assert dtype2 == dtype
        assert hash(dtype) == hash(dtype2)

    def test_equality(self, dtype):
        """The dtype equals the "category" alias but not an arbitrary string."""
        assert is_dtype_equal(dtype, "category")
        assert is_dtype_equal(dtype, CategoricalDtype())
        assert not is_dtype_equal(dtype, "foo")

    def test_construction_from_string(self, dtype):
        """construct_from_string accepts "category" and rejects other strings."""
        result = CategoricalDtype.construct_from_string("category")
        assert is_dtype_equal(dtype, result)
        msg = "Cannot construct a 'CategoricalDtype' from 'foo'"
        with pytest.raises(TypeError, match=msg):
            CategoricalDtype.construct_from_string("foo")

    def test_constructor_invalid(self):
        """A non list-like ``categories`` argument raises TypeError."""
        msg = "Parameter 'categories' must be list-like"
        with pytest.raises(TypeError, match=msg):
            CategoricalDtype("category")

    # Class-level objects shared by the parametrized tests below; they must be
    # defined here because the decorators are evaluated in the class body.
    dtype1 = CategoricalDtype(["a", "b"], ordered=True)
    dtype2 = CategoricalDtype(["x", "y"], ordered=False)
    c = Categorical([0, 1], dtype=dtype1, fastpath=True)

    @pytest.mark.parametrize(
        "values, categories, ordered, dtype, expected",
        [
            [None, None, None, None, CategoricalDtype()],
            [None, ["a", "b"], True, None, dtype1],
            [c, None, None, dtype2, dtype2],
            [c, ["x", "y"], False, None, dtype2],
        ],
    )
    def test_from_values_or_dtype(self, values, categories, ordered, dtype, expected):
        """_from_values_or_dtype returns the expected dtype for each combination."""
        result = CategoricalDtype._from_values_or_dtype(
            values, categories, ordered, dtype
        )
        assert result == expected

    @pytest.mark.parametrize(
        "values, categories, ordered, dtype",
        [
            [None, ["a", "b"], True, dtype2],
            [None, ["a", "b"], None, dtype2],
            [None, None, True, dtype2],
        ],
    )
    def test_from_values_or_dtype_raises(self, values, categories, ordered, dtype):
        """Passing ``dtype`` together with categories/ordered raises ValueError."""
        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
        with pytest.raises(ValueError, match=msg):
            CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype)

    def test_from_values_or_dtype_invalid_dtype(self):
        """A ``dtype`` argument of ``object`` raises ValueError."""
        # The doubled "not" is what the assertion matches against; do not "fix" it.
        msg = "Cannot not construct CategoricalDtype from "
        with pytest.raises(ValueError, match=msg):
            CategoricalDtype._from_values_or_dtype(None, None, None, object)

    def test_is_dtype(self, dtype):
        """is_dtype accepts instances and the "category" alias only."""
        assert CategoricalDtype.is_dtype(dtype)
        assert CategoricalDtype.is_dtype("category")
        assert CategoricalDtype.is_dtype(CategoricalDtype())
        assert not CategoricalDtype.is_dtype("foo")
        assert not CategoricalDtype.is_dtype(np.float64)

    def test_basic(self, dtype):
        """is_categorical_dtype / is_categorical recognise categorical data."""
        assert is_categorical_dtype(dtype)

        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])

        s = Series(factor, name="A")

        # dtypes
        assert is_categorical_dtype(s.dtype)
        assert is_categorical_dtype(s)
        assert not is_categorical_dtype(np.dtype("float64"))

        with tm.assert_produces_warning(FutureWarning):
            # GH#33385 deprecated
            assert is_categorical(s.dtype)
            assert is_categorical(s)
            assert not is_categorical(np.dtype("float64"))
            assert not is_categorical(1.0)

    def test_tuple_categories(self):
        """Tuples are accepted as category values."""
        categories = [(1, "a"), (2, "b"), (3, "c")]
        result = CategoricalDtype(categories)
        assert all(result.categories == categories)

    @pytest.mark.parametrize(
        "categories, expected",
        [
            ([True, False], True),
            ([True, False, None], True),
            ([True, False, "a", "b'"], False),
            ([0, 1], False),
        ],
    )
    def test_is_boolean(self, categories, expected):
        """_is_boolean and is_bool_dtype agree with ``expected`` for each input."""
        cat = Categorical(categories)
        assert cat.dtype._is_boolean is expected
        assert is_bool_dtype(cat) is expected
        assert is_bool_dtype(cat.dtype) is expected

    def test_dtype_specific_categorical_dtype(self):
        """Categories built from an empty DatetimeIndex report datetime64[ns]."""
        expected = "datetime64[ns]"
        result = str(Categorical(DatetimeIndex([])).categories.dtype)
        assert result == expected

    def test_not_string(self):
        """CategoricalDtype is not reported as a string dtype."""
        # though CategoricalDtype has object kind, it cannot be string
        assert not is_string_dtype(CategoricalDtype())

    def test_repr_range_categories(self):
        """repr shows range-backed categories as ``range(0, 3)``."""
        rng = pd.Index(range(3))
        dtype = CategoricalDtype(categories=rng, ordered=False)
        result = repr(dtype)

        expected = "CategoricalDtype(categories=range(0, 3), ordered=False)"
        assert result == expected


class TestDatetimeTZDtype(Base):
    """Tests for DatetimeTZDtype construction, parsing and equality."""

    @pytest.fixture
    def dtype(self):
        """
        Class level fixture of dtype for TestDatetimeTZDtype
        """
        return DatetimeTZDtype("ns", "US/Eastern")

    def test_alias_to_unit_raises(self):
        """Passing a full dtype alias as ``unit`` raises ValueError."""
        # 23990
        with pytest.raises(ValueError, match="Passing a dtype alias"):
            DatetimeTZDtype("datetime64[ns, US/Central]")

    def test_alias_to_unit_bad_alias_raises(self):
        """An unparseable alias or unknown timezone as ``unit`` raises TypeError."""
        # 23990
        # An empty ``match`` pattern matches any message and so verified
        # nothing; assert on the message prefix instead.
        # NOTE(review): presumably the constructor reports these through the
        # same message that test_construct_from_string_invalid_raises checks
        # for construct_from_string -- confirm against the implementation.
        msg = "Cannot construct a 'DatetimeTZDtype'"
        with pytest.raises(TypeError, match=msg):
            DatetimeTZDtype("this is a bad string")
        with pytest.raises(TypeError, match=msg):
            DatetimeTZDtype("datetime64[ns, US/NotATZ]")

    def test_hash_vs_equality(self, dtype):
        """Equal dtypes hash alike; a different timezone differs in both."""
        # make sure that we satisfy is semantics
        dtype2 = DatetimeTZDtype("ns", "US/Eastern")
        dtype3 = DatetimeTZDtype(dtype2)
        assert dtype == dtype2
        assert dtype2 == dtype
        assert dtype3 == dtype
        assert hash(dtype) == hash(dtype2)
        assert hash(dtype) == hash(dtype3)

        dtype4 = DatetimeTZDtype("ns", "US/Central")
        assert dtype2 != dtype4
        assert hash(dtype2) != hash(dtype4)

    def test_construction(self):
        """A unit other than "ns" raises ValueError."""
        msg = "DatetimeTZDtype only supports ns units"
        with pytest.raises(ValueError, match=msg):
            DatetimeTZDtype("ms", "US/Eastern")

    def test_subclass(self):
        """Dtypes with different timezones share the same Python type."""
        a = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]")
        b = DatetimeTZDtype.construct_from_string("datetime64[ns, CET]")

        assert issubclass(type(a), type(a))
        assert issubclass(type(a), type(b))

    def test_compat(self, dtype):
        """The tz-aware, any and ns checks accept the dtype; the naive check does not."""
        assert is_datetime64tz_dtype(dtype)
        assert is_datetime64tz_dtype("datetime64[ns, US/Eastern]")
        assert is_datetime64_any_dtype(dtype)
        assert is_datetime64_any_dtype("datetime64[ns, US/Eastern]")
        assert is_datetime64_ns_dtype(dtype)
        assert is_datetime64_ns_dtype("datetime64[ns, US/Eastern]")
        assert not is_datetime64_dtype(dtype)
        assert not is_datetime64_dtype("datetime64[ns, US/Eastern]")

    def test_construction_from_string(self, dtype):
        """construct_from_string parses the canonical alias."""
        result = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]")
        assert is_dtype_equal(dtype, result)

    @pytest.mark.parametrize(
        "string",
        [
            "foo",
            "datetime64[ns, notatz]",
            # non-nano unit
            "datetime64[ps, UTC]",
            # dateutil str that returns None from gettz
            "datetime64[ns, dateutil/invalid]",
        ],
    )
    def test_construct_from_string_invalid_raises(self, string):
        """Invalid alias strings raise TypeError naming the offending string."""
        msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
        # escape: the strings contain regex metacharacters such as "[" and "]"
        with pytest.raises(TypeError, match=re.escape(msg)):
            DatetimeTZDtype.construct_from_string(string)

    def test_construct_from_string_wrong_type_raises(self):
        """A non-string argument to construct_from_string raises TypeError."""
        msg = "'construct_from_string' expects a string, got "
        with pytest.raises(TypeError, match=msg):
            DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"])

    def test_is_dtype(self, dtype):
        """is_dtype accepts instances and tz-aware aliases only."""
        assert not DatetimeTZDtype.is_dtype(None)
        assert DatetimeTZDtype.is_dtype(dtype)
        assert DatetimeTZDtype.is_dtype("datetime64[ns, US/Eastern]")
        assert DatetimeTZDtype.is_dtype("M8[ns, US/Eastern]")
        assert not DatetimeTZDtype.is_dtype("foo")
        assert DatetimeTZDtype.is_dtype(DatetimeTZDtype("ns", "US/Pacific"))
        assert not DatetimeTZDtype.is_dtype(np.float64)

    def test_equality(self, dtype):
        """Equality holds for matching aliases and fails across timezones."""
        assert is_dtype_equal(dtype, "datetime64[ns, US/Eastern]")
        assert is_dtype_equal(dtype, "M8[ns, US/Eastern]")
        assert is_dtype_equal(dtype, DatetimeTZDtype("ns", "US/Eastern"))
        assert not is_dtype_equal(dtype, "foo")
        assert not is_dtype_equal(dtype, DatetimeTZDtype("ns", "CET"))
        assert not is_dtype_equal(
            DatetimeTZDtype("ns", "US/Eastern"), DatetimeTZDtype("ns", "US/Pacific")
        )

        # numpy compat
        assert is_dtype_equal(np.dtype("M8[ns]"), "datetime64[ns]")

        assert dtype == "M8[ns, US/Eastern]"

    def test_basic(self, dtype):
        """is_datetime64tz_dtype recognises tz-aware dtypes and Series."""
        assert is_datetime64tz_dtype(dtype)

        dr = date_range("20130101", periods=3, tz="US/Eastern")
        s = Series(dr, name="A")

        # dtypes
        assert is_datetime64tz_dtype(s.dtype)
        assert is_datetime64tz_dtype(s)
        assert not is_datetime64tz_dtype(np.dtype("float64"))
        assert not is_datetime64tz_dtype(1.0)

    def test_dst(self):
        """Series built from January and August ranges share one dtype."""
        dr1 = date_range("2013-01-01", periods=3, tz="US/Eastern")
        s1 = Series(dr1, name="A")
        assert is_datetime64tz_dtype(s1)

        dr2 = date_range("2013-08-01", periods=3, tz="US/Eastern")
        s2 = Series(dr2, name="A")
        assert is_datetime64tz_dtype(s2)
        assert s1.dtype == s2.dtype

    @pytest.mark.parametrize("tz", ["UTC", "US/Eastern"])
    @pytest.mark.parametrize("constructor", ["M8", "datetime64"])
    def test_parser(self, tz, constructor):
        """Both the "M8" and "datetime64" spellings parse to the same dtype."""
        # pr #11245
        dtz_str = f"{constructor}[ns, {tz}]"
        result = DatetimeTZDtype.construct_from_string(dtz_str)
        expected = DatetimeTZDtype("ns", tz)
        assert result == expected

    def test_empty(self):
        """Constructing without a timezone raises TypeError."""
        with pytest.raises(TypeError, match="A 'tz' is required."):
            DatetimeTZDtype()

    def test_tz_standardize(self):
        """tz objects taken from an index or one of its elements compare equal to the pytz zone."""
        # GH 24713
        tz = pytz.timezone("US/Eastern")
        dr = date_range("2013-01-01", periods=3, tz="US/Eastern")
        dtype = DatetimeTZDtype("ns", dr.tz)
        assert dtype.tz == tz
        dtype = DatetimeTZDtype("ns", dr[0].tz)
        assert dtype.tz == tz


class TestPeriodDtype(Base):
    """Tests for PeriodDtype construction, identity caching and equality."""

    @pytest.fixture
    def dtype(self):
        """
        Class level fixture of dtype for TestPeriodDtype
        """
        return PeriodDtype("D")

    def test_hash_vs_equality(self, dtype):
        """Equal PeriodDtypes are the same object and hash alike."""
        # make sure that we satisfy is semantics
        dtype2 = PeriodDtype("D")
        dtype3 = PeriodDtype(dtype2)
        assert dtype == dtype2
        assert dtype2 == dtype
        assert dtype3 == dtype
        assert dtype is dtype2
        assert dtype2 is dtype
        assert dtype3 is dtype
        assert hash(dtype) == hash(dtype2)
        assert hash(dtype) == hash(dtype3)

    def test_construction(self):
        """Frequency strings, with or without the period[...] wrapper, parse alike."""
        with pytest.raises(ValueError, match="Invalid frequency: xx"):
            PeriodDtype("xx")

        for s in ["period[D]", "Period[D]", "D"]:
            dt = PeriodDtype(s)
            assert dt.freq == pd.tseries.offsets.Day()
            assert is_period_dtype(dt)

        for s in ["period[3D]", "Period[3D]", "3D"]:
            dt = PeriodDtype(s)
            assert dt.freq == pd.tseries.offsets.Day(3)
            assert is_period_dtype(dt)

        # "1D2H" and "26H" must resolve to the same 26-hour frequency
        for s in [
            "period[26H]",
            "Period[26H]",
            "26H",
            "period[1D2H]",
            "Period[1D2H]",
            "1D2H",
        ]:
            dt = PeriodDtype(s)
            assert dt.freq == pd.tseries.offsets.Hour(26)
            assert is_period_dtype(dt)

    def test_subclass(self):
        """Dtypes with different frequencies share the same Python type."""
        a = PeriodDtype("period[D]")
        b = PeriodDtype("period[3D]")

        assert issubclass(type(a), type(a))
        assert issubclass(type(a), type(b))

    def test_identity(self):
        """Equivalent frequency spellings yield the identical dtype object."""
        assert PeriodDtype("period[D]") == PeriodDtype("period[D]")
        assert PeriodDtype("period[D]") is PeriodDtype("period[D]")

        assert PeriodDtype("period[3D]") == PeriodDtype("period[3D]")
        assert PeriodDtype("period[3D]") is PeriodDtype("period[3D]")

        assert PeriodDtype("period[1S1U]") == PeriodDtype("period[1000001U]")
        assert PeriodDtype("period[1S1U]") is PeriodDtype("period[1000001U]")

    def test_compat(self, dtype):
        """Period dtypes are not reported as datetime64 dtypes."""
        assert not is_datetime64_ns_dtype(dtype)
        assert not is_datetime64_ns_dtype("period[D]")
        assert not is_datetime64_dtype(dtype)
        assert not is_datetime64_dtype("period[D]")

    def test_construction_from_string(self, dtype):
        """Both the constructor and construct_from_string parse "period[D]"."""
        result = PeriodDtype("period[D]")
        assert is_dtype_equal(dtype, result)
        result = PeriodDtype.construct_from_string("period[D]")
        assert is_dtype_equal(dtype, result)

        with pytest.raises(TypeError, match="list"):
            PeriodDtype.construct_from_string([1, 2, 3])

    @pytest.mark.parametrize(
        "string",
        [
            "foo",
            "period[foo]",
            "foo[D]",
            "datetime64[ns]",
            "datetime64[ns, US/Eastern]",
        ],
    )
    def test_construct_dtype_from_string_invalid_raises(self, string):
        """Invalid strings raise TypeError naming the offending string."""
        msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
        # escape: the strings contain regex metacharacters such as "[" and "]"
        with pytest.raises(TypeError, match=re.escape(msg)):
            PeriodDtype.construct_from_string(string)

    def test_is_dtype(self, dtype):
        """is_dtype accepts instances and period[...] aliases, not bare freqs."""
        assert PeriodDtype.is_dtype(dtype)
        assert PeriodDtype.is_dtype("period[D]")
        assert PeriodDtype.is_dtype("period[3D]")
        assert PeriodDtype.is_dtype(PeriodDtype("3D"))
        assert PeriodDtype.is_dtype("period[U]")
        assert PeriodDtype.is_dtype("period[S]")
        assert PeriodDtype.is_dtype(PeriodDtype("U"))
        assert PeriodDtype.is_dtype(PeriodDtype("S"))

        assert not PeriodDtype.is_dtype("D")
        assert not PeriodDtype.is_dtype("3D")
        assert not PeriodDtype.is_dtype("U")
        assert not PeriodDtype.is_dtype("S")
        assert not PeriodDtype.is_dtype("foo")
        assert not PeriodDtype.is_dtype(np.object_)
        assert not PeriodDtype.is_dtype(np.int64)
        assert not PeriodDtype.is_dtype(np.float64)

    def test_equality(self, dtype):
        """Equality holds for matching frequencies and fails otherwise."""
        assert is_dtype_equal(dtype, "period[D]")
        assert is_dtype_equal(dtype, PeriodDtype("D"))
        assert is_dtype_equal(PeriodDtype("D"), PeriodDtype("D"))

        assert not is_dtype_equal(dtype, "D")
        assert not is_dtype_equal(PeriodDtype("D"), PeriodDtype("2D"))

    def test_basic(self, dtype):
        """is_period_dtype recognises period dtypes, indexes and Series."""
        assert is_period_dtype(dtype)

        pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H")

        assert is_period_dtype(pidx.dtype)
        assert is_period_dtype(pidx)

        s = Series(pidx, name="A")

        assert is_period_dtype(s.dtype)
        assert is_period_dtype(s)

        assert not is_period_dtype(np.dtype("float64"))
        assert not is_period_dtype(1.0)

    def test_empty(self):
        """A PeriodDtype built without a frequency cannot be stringified."""
        dt = PeriodDtype()
        msg = "object has no attribute 'freqstr'"
        with pytest.raises(AttributeError, match=msg):
            str(dt)

    def test_not_string(self):
        """PeriodDtype is not reported as a string dtype."""
        # though PeriodDtype has object kind, it cannot be string
        assert not is_string_dtype(PeriodDtype("D"))


class TestIntervalDtype(Base):
    """Tests for IntervalDtype construction and identity caching."""

    @pytest.fixture
    def dtype(self):
        """
        Class level fixture of dtype for TestIntervalDtype
        """
        return IntervalDtype("int64")

    def test_hash_vs_equality(self, dtype):
        """Equal IntervalDtypes are the same object and hash alike."""
        # make sure that we satisfy is semantics
        dtype2 = IntervalDtype("int64")
        dtype3 = IntervalDtype(dtype2)
        assert dtype == dtype2
        assert dtype2 == dtype
        assert dtype3 == dtype
        assert dtype is dtype2
        assert dtype2 is dtype3
        assert dtype3 is dtype
        assert hash(dtype) == hash(dtype2)
        assert hash(dtype) == hash(dtype3)

        # same checks for the generic (subtype-less) interval dtype
        dtype1 = IntervalDtype("interval")
        dtype2 = IntervalDtype(dtype1)
        dtype3 = IntervalDtype("interval")
        assert dtype2 == dtype1
        assert dtype2 == dtype2
        assert dtype2 == dtype3
        assert dtype2 is dtype1
        assert dtype2 is dtype2
        assert dtype2 is dtype3
        assert hash(dtype2) == hash(dtype1)
        assert hash(dtype2) == hash(dtype2)
        assert hash(dtype2) == hash(dtype3)

    @pytest.mark.parametrize(
        "subtype", ["interval[int64]", "Interval[int64]", "int64", np.dtype("int64")]
    )
    def test_construction(self, subtype):
        """Each int64 spelling yields an interval dtype with an int64 subtype."""
        i = IntervalDtype(subtype)
        assert i.subtype == np.dtype("int64")
        assert is_interval_dtype(i)

    @pytest.mark.parametrize("subtype", [None, "interval", "Interval"])
    def test_construction_generic(self, subtype):
        """None and the bare "interval" alias yield a dtype with no subtype."""
        # generic
        i = IntervalDtype(subtype)
        assert i.subtype is None
        assert is_interval_dtype(i)

    # NOTE(review): the parametrized test that follows test_construction_generic
    # was cut off in the reviewed text (its "subtype" parameter list begins
    # CategoricalDtype(list("abc"), False), CategoricalDtype(list("wxyz"), True),
    # object, str, ...). Restore it from the full file before merging.