projektAI/venv/Lib/site-packages/pandas/core/dtypes/dtypes.py

1174 lines
37 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
"""
Define extension dtypes.
"""
import re
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
MutableMapping,
Optional,
Tuple,
Type,
Union,
cast,
)
import numpy as np
import pytz
from pandas._libs.interval import Interval
from pandas._libs.tslibs import NaT, Period, Timestamp, dtypes, timezones, to_offset
from pandas._libs.tslibs.offsets import BaseOffset
from pandas._typing import DtypeObj, Ordered
from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass
from pandas.core.dtypes.inference import is_bool, is_list_like
if TYPE_CHECKING:
import pyarrow
from pandas import Categorical
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
# Alias for the builtin ``str``. The dtype classes in this module bind a class
# attribute that is itself called ``str`` (e.g. ``str = "|O08"``), so
# annotations written inside those class bodies use ``str_type`` to refer to
# the builtin type unambiguously.
str_type = str
class PandasExtensionDtype(ExtensionDtype):
    """
    Common base for the dtypes defined in this module.

    It duck-types a ``np.dtype`` by exposing the same attribute names, which
    makes it suitable for holding a custom dtype.

    THIS IS NOT A REAL NUMPY DTYPE
    """

    type: Any
    kind: Any
    # ``type`` and ``kind`` are annotated as Any only because mypy seems to
    # have a problem dealing with multiple inheritance from
    # PandasExtensionDtype and ExtensionDtype's @properties in the subclasses
    # below. Those subclasses give both attributes explicit types.
    subdtype = None
    str: str_type
    num = 100
    shape: Tuple[int, ...] = ()
    itemsize = 8
    base = None
    isbuiltin = 0
    isnative = 0
    _cache: Dict[str_type, "PandasExtensionDtype"] = {}

    def __str__(self) -> str_type:
        """
        Return the dtype's ``name`` as its string form.
        """
        return self.name

    def __repr__(self) -> str_type:
        """
        Return the same text as ``str(self)``.
        """
        return str(self)

    def __hash__(self) -> int:
        # Hashing is left to the concrete dtypes.
        raise NotImplementedError("sub-classes should implement an __hash__ method")

    def __getstate__(self) -> Dict[str_type, Any]:
        """
        Build the pickle state from the attributes named in ``_metadata``.

        Only those attributes are collected, so the cache is never pickled.
        Attributes that are missing on the instance are stored as None.
        """
        state: Dict[str_type, Any] = {}
        for attr_name in self._metadata:
            state[attr_name] = getattr(self, attr_name, None)
        return state

    @classmethod
    def reset_cache(cls) -> None:
        """
        Replace this class's cache with a new, empty dict.
        """
        cls._cache = {}
class CategoricalDtypeType(type):
    """
    The object exposed as ``CategoricalDtype.type``.

    A bare subclass of ``type`` that adds no behaviour of its own; this
    metaclass determines subclass ability.
    """
@register_extension_dtype
class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
    """
    Type for categorical data with the categories and orderedness.

    Parameters
    ----------
    categories : sequence, optional
        Must be unique, and must not contain any nulls.
        The categories are stored in an Index,
        and if an index is provided the dtype of that index will be used.
    ordered : bool or None, default False
        Whether or not this categorical is treated as a ordered categorical.
        None can be used to maintain the ordered value of existing categoricals when
        used in operations that combine categoricals, e.g. astype, and will resolve to
        False if there is no existing ordered to maintain.

    Attributes
    ----------
    categories
    ordered

    Methods
    -------
    None

    See Also
    --------
    Categorical : Represent a categorical variable in classic R / S-plus fashion.

    Notes
    -----
    This class is useful for specifying the type of a ``Categorical``
    independent of the values. See :ref:`categorical.categoricaldtype`
    for more.

    Examples
    --------
    >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
    >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
    0      a
    1      b
    2      a
    3    NaN
    dtype: category
    Categories (2, object): ['b' < 'a']

    An empty CategoricalDtype with a specific dtype can be created
    by providing an empty index. As follows,

    >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype
    dtype('<M8[ns]')
    """

    # TODO: Document public vs. private API
    name = "category"
    type: Type[CategoricalDtypeType] = CategoricalDtypeType
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    _metadata = ("categories", "ordered")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __init__(self, categories=None, ordered: Ordered = False):
        # Public construction always runs the full validation path.
        self._finalize(categories, ordered, fastpath=False)

    @classmethod
    def _from_fastpath(
        cls, categories=None, ordered: Optional[bool] = None
    ) -> "CategoricalDtype":
        """
        Build an instance without ``__init__``, finalizing with ``fastpath=True``
        so that the nan and uniqueness checks on ``categories`` are skipped.
        """
        self = cls.__new__(cls)
        self._finalize(categories, ordered, fastpath=True)
        return self

    @classmethod
    def _from_categorical_dtype(
        cls, dtype: "CategoricalDtype", categories=None, ordered: Ordered = None
    ) -> "CategoricalDtype":
        """
        Return ``dtype`` with ``categories`` / ``ordered`` overridden where given.

        When both overrides are None the very same ``dtype`` object is
        returned; otherwise a new dtype is built, taking any override that is
        None from ``dtype``.
        """
        if categories is ordered is None:
            return dtype
        if categories is None:
            categories = dtype.categories
        if ordered is None:
            ordered = dtype.ordered
        return cls(categories, ordered)

    @classmethod
    def _from_values_or_dtype(
        cls,
        values=None,
        categories=None,
        ordered: Optional[bool] = None,
        dtype: Optional["CategoricalDtype"] = None,
    ) -> "CategoricalDtype":
        """
        Construct dtype from the input parameters used in :class:`Categorical`.

        This constructor method specifically does not do the factorization
        step, if that is needed to find the categories. This constructor may
        therefore return ``CategoricalDtype(categories=None, ordered=None)``,
        which may not be useful. Additional steps may therefore have to be
        taken to create the final dtype.

        The return dtype is specified from the inputs in this prioritized
        order:
        1. if dtype is a CategoricalDtype, return dtype
        2. if dtype is the string 'category', create a CategoricalDtype from
           the supplied categories and ordered parameters, and return that.
        3. if values is a categorical, use value.dtype, but override it with
           categories and ordered if either/both of those are not None.
        4. if dtype is None and values is not a categorical, construct the
           dtype from categories and ordered, even if either of those is None.

        Parameters
        ----------
        values : list-like, optional
            The list-like must be 1-dimensional.
        categories : list-like, optional
            Categories for the CategoricalDtype.
        ordered : bool, optional
            Designating if the categories are ordered.
        dtype : CategoricalDtype or the string "category", optional
            If ``CategoricalDtype``, cannot be used together with
            `categories` or `ordered`.

        Returns
        -------
        CategoricalDtype

        Raises
        ------
        ValueError
            If ``dtype`` is a string other than "category", if a non-string
            ``dtype`` is combined with `categories` or `ordered`, or if
            ``dtype`` is neither a string nor a CategoricalDtype.

        Examples
        --------
        >>> pd.CategoricalDtype._from_values_or_dtype()
        CategoricalDtype(categories=None, ordered=None)
        >>> pd.CategoricalDtype._from_values_or_dtype(
        ...     categories=['a', 'b'], ordered=True
        ... )
        CategoricalDtype(categories=['a', 'b'], ordered=True)
        >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True)
        >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False)
        >>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True)
        >>> pd.CategoricalDtype._from_values_or_dtype(
        ...     c, ['x', 'y'], ordered=True, dtype=dtype2
        ... )
        Traceback (most recent call last):
            ...
        ValueError: Cannot specify `categories` or `ordered` together with
        `dtype`.

        The supplied dtype takes precedence over values' dtype:

        >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
        CategoricalDtype(categories=['x', 'y'], ordered=False)
        """
        if dtype is not None:
            # The dtype argument takes precedence over values.dtype (if any)
            if isinstance(dtype, str):
                if dtype == "category":
                    dtype = CategoricalDtype(categories, ordered)
                else:
                    raise ValueError(f"Unknown dtype {repr(dtype)}")
            elif categories is not None or ordered is not None:
                raise ValueError(
                    "Cannot specify `categories` or `ordered` together with `dtype`."
                )
            elif not isinstance(dtype, CategoricalDtype):
                # Message text kept verbatim; callers may match on it.
                raise ValueError(f"Cannot not construct CategoricalDtype from {dtype}")
        elif cls.is_dtype(values):
            # If no "dtype" was passed, use the one from "values", but honor
            # the "ordered" and "categories" arguments
            dtype = values.dtype._from_categorical_dtype(
                values.dtype, categories, ordered
            )
        else:
            # If dtype=None and values is not categorical, create a new dtype.
            # Note: This could potentially have categories=None and
            # ordered=None.
            dtype = CategoricalDtype(categories, ordered)

        return dtype

    @classmethod
    def construct_from_string(cls, string: str_type) -> "CategoricalDtype":
        """
        Construct a CategoricalDtype from a string.

        Parameters
        ----------
        string : str
            Must be the string "category" in order to be successfully constructed.

        Returns
        -------
        CategoricalDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a CategoricalDtype cannot be constructed from the input.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string != cls.name:
            raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")

        # need ordered=None to ensure that operations specifying dtype="category" don't
        # override the ordered value for existing categoricals
        return cls(ordered=None)

    def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
        """
        Validate ``ordered`` and ``categories`` (each only when not None) and
        store them on the private ``_ordered`` / ``_categories`` attributes.
        """
        if ordered is not None:
            self.validate_ordered(ordered)

        if categories is not None:
            categories = self.validate_categories(categories, fastpath=fastpath)

        self._categories = categories
        self._ordered = ordered

    def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._categories = state.pop("categories", None)
        self._ordered = state.pop("ordered", False)

    def __hash__(self) -> int:
        # _hash_categories returns a uint64, so use the negative
        # space for when we have unknown categories to avoid a conflict
        if self.categories is None:
            # NOTE(review): CPython's ``hash()`` is understood to report -2
            # when ``__hash__`` returns -1, so both branches probably hash
            # alike. That is harmless given __eq__ rule 3, but worth confirming.
            if self.ordered:
                return -1
            else:
                return -2
        # We *do* want to include the real self.ordered here
        return int(self._hash_categories(self.categories, self.ordered))

    def __eq__(self, other: Any) -> bool:
        """
        Rules for CDT equality:
        1) Any CDT is equal to the string 'category'
        2) Any CDT is equal to itself
        3) Any CDT is equal to a CDT with categories=None regardless of ordered
        4) A CDT with ordered=True is only equal to another CDT with
           ordered=True and identical categories in the same order
        5) A CDT with ordered={False, None} is only equal to another CDT with
           ordered={False, None} and identical categories, but same order is
           not required. There is no distinction between False/None.
        6) Any other comparison returns False
        """
        if isinstance(other, str):
            return other == self.name
        elif other is self:
            return True
        elif not (hasattr(other, "ordered") and hasattr(other, "categories")):
            return False
        elif self.categories is None or other.categories is None:
            # We're forced into a suboptimal corner thanks to math and
            # backwards compatibility. We require that `CDT(...) == 'category'`
            # for all CDTs **including** `CDT(None, ...)`. Therefore, *all*
            # CDT(., .) = CDT(None, False) and *all*
            # CDT(., .) = CDT(None, True).
            return True
        elif self.ordered or other.ordered:
            # At least one has ordered=True; equal if both have ordered=True
            # and the same values for categories in the same order.
            return (self.ordered == other.ordered) and self.categories.equals(
                other.categories
            )
        else:
            # Neither has ordered=True; equal if both have the same categories,
            # but same order is not necessary. There is no distinction between
            # ordered=False and ordered=None: CDT(., False) and CDT(., None)
            # will be equal if they have the same categories.
            left = self.categories
            right = other.categories

            # GH#36280 the ordering of checks here is for performance
            if not left.dtype == right.dtype:
                return False

            if len(left) != len(right):
                return False

            if self.categories.equals(other.categories):
                # Check and see if they happen to be identical categories
                return True

            if left.dtype != object:
                # Faster than calculating hash
                indexer = left.get_indexer(right)
                # Because left and right have the same length and are unique,
                # `indexer` not having any -1s implies that there is a
                # bijection between `left` and `right`.
                # ``.all()`` yields a numpy bool; convert so that every
                # branch of __eq__ returns a builtin bool as annotated.
                return bool((indexer != -1).all())

            # With object-dtype we need a comparison that identifies
            # e.g. int(2) as distinct from float(2)
            return hash(self) == hash(other)

    def __repr__(self) -> str_type:
        if self.categories is None:
            data = "None"
        else:
            data = self.categories._format_data(name=type(self).__name__)
            if data is None:
                # self.categories is RangeIndex
                data = str(self.categories._range)
            data = data.rstrip(", ")
        return f"CategoricalDtype(categories={data}, ordered={self.ordered})"

    @staticmethod
    def _hash_categories(categories, ordered: Ordered = True) -> int:
        """
        Hash ``categories``, folding in their positions when ``ordered`` is truthy.
        """
        from pandas.core.util.hashing import (
            combine_hash_arrays,
            hash_array,
            hash_tuples,
        )

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all are. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            categories = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(categories)
        else:
            if categories.dtype == "O":
                if len({type(x) for x in categories}) != 1:
                    # TODO: hash_array doesn't handle mixed types. It casts
                    # everything to a str first, which means we treat
                    # {'1', '2'} the same as {'1', 2}
                    # find a better solution
                    hashed = hash((tuple(categories), ordered))
                    return hashed

            if DatetimeTZDtype.is_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.astype("datetime64[ns]")

            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            # Stack a position array under the hashes so that order matters.
            cat_array = np.vstack(
                [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
            )
        else:
            cat_array = [cat_array]
        hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
        return np.bitwise_xor.reduce(hashed)

    @classmethod
    def construct_array_type(cls) -> Type["Categorical"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas import Categorical

        return Categorical

    @staticmethod
    def validate_ordered(ordered: Ordered) -> None:
        """
        Validates that we have a valid ordered parameter. If
        it is not a boolean, a TypeError will be raised.

        Parameters
        ----------
        ordered : object
            The parameter to be verified.

        Raises
        ------
        TypeError
            If 'ordered' is not a boolean.
        """
        if not is_bool(ordered):
            raise TypeError("'ordered' must either be 'True' or 'False'")

    @staticmethod
    def validate_categories(categories, fastpath: bool = False):
        """
        Validates that we have good categories

        Parameters
        ----------
        categories : array-like
        fastpath : bool
            Whether to skip nan and uniqueness checks

        Returns
        -------
        categories : Index

        Raises
        ------
        TypeError
            If ``fastpath`` is False and ``categories`` is not list-like.
        ValueError
            If ``fastpath`` is False and ``categories`` contains nulls or
            duplicates.
        """
        from pandas.core.indexes.base import Index

        if not fastpath and not is_list_like(categories):
            raise TypeError(
                f"Parameter 'categories' must be list-like, was {repr(categories)}"
            )
        elif not isinstance(categories, ABCIndexClass):
            categories = Index(categories, tupleize_cols=False)

        if not fastpath:
            if categories.hasnans:
                raise ValueError("Categorical categories cannot be null")

            if not categories.is_unique:
                raise ValueError("Categorical categories must be unique")

        if isinstance(categories, ABCCategoricalIndex):
            # Unwrap a CategoricalIndex to the Index of its categories.
            categories = categories.categories

        return categories

    def update_dtype(
        self, dtype: Union[str_type, "CategoricalDtype"]
    ) -> "CategoricalDtype":
        """
        Returns a CategoricalDtype with categories and ordered taken from dtype
        if specified, otherwise falling back to self if unspecified

        Parameters
        ----------
        dtype : CategoricalDtype

        Returns
        -------
        new_dtype : CategoricalDtype

        Raises
        ------
        ValueError
            If ``dtype`` is not the string "category" and ``is_dtype`` rejects it.
        """
        if isinstance(dtype, str) and dtype == "category":
            # dtype='category' should not change anything
            return self
        elif not self.is_dtype(dtype):
            raise ValueError(
                "a CategoricalDtype must be passed to perform an update, "
                f"got {repr(dtype)}"
            )
        else:
            # from here on, dtype is a CategoricalDtype
            dtype = cast(CategoricalDtype, dtype)

        # update categories/ordered unless they've been explicitly passed as None
        new_categories = (
            dtype.categories if dtype.categories is not None else self.categories
        )
        new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered

        return CategoricalDtype(new_categories, new_ordered)

    @property
    def categories(self):
        """
        An ``Index`` containing the unique categories allowed.
        """
        return self._categories

    @property
    def ordered(self) -> Ordered:
        """
        Whether the categories have an ordered relationship.
        """
        return self._ordered

    @property
    def _is_boolean(self) -> bool:
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self.categories)

    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        """
        Find a dtype that can hold all of ``dtypes``; may return None.
        """
        from pandas.core.arrays.sparse import SparseDtype

        # check if we have all categorical dtype with identical categories
        if all(isinstance(x, CategoricalDtype) for x in dtypes):
            first = dtypes[0]
            if all(first == other for other in dtypes[1:]):
                return first

        # special case non-initialized categorical
        # TODO we should figure out the expected return value in general
        non_init_cats = [
            isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes
        ]
        if all(non_init_cats):
            return self
        elif any(non_init_cats):
            return None

        # categorical is aware of Sparse -> extract sparse subdtypes
        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
        # extract the categories' dtype
        non_cat_dtypes = [
            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
        ]
        # TODO should categorical always give an answer?
        from pandas.core.dtypes.cast import find_common_type

        return find_common_type(non_cat_dtypes)
@register_extension_dtype
class DatetimeTZDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for timezone-aware datetime data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    unit : str, default "ns"
        The precision of the datetime data. Currently limited
        to ``"ns"``.
    tz : str, int, or datetime.tzinfo
        The timezone.

    Attributes
    ----------
    unit
    tz

    Methods
    -------
    None

    Raises
    ------
    pytz.UnknownTimeZoneError
        When the requested timezone cannot be found.

    Examples
    --------
    >>> pd.DatetimeTZDtype(tz='UTC')
    datetime64[ns, UTC]

    >>> pd.DatetimeTZDtype(tz='dateutil/US/Central')
    datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
    """

    type: Type[Timestamp] = Timestamp
    kind: str_type = "M"
    str = "|M8[ns]"
    num = 101
    base = np.dtype("M8[ns]")
    na_value = NaT
    _metadata = ("unit", "tz")
    _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None):
        if isinstance(unit, DatetimeTZDtype):
            # Copy both fields from an existing dtype passed as ``unit``.
            # error: "str" has no attribute "tz"
            unit, tz = unit.unit, unit.tz  # type: ignore[attr-defined]

        if unit != "ns":
            if not (isinstance(unit, str) and tz is None):
                raise ValueError("DatetimeTZDtype only supports ns units")
            # Maybe a full alias such as "datetime64[ns, tz]". Parse it (which
            # raises for an unparsable string) purely to name the timezone in
            # the error below; aliases are rejected here either way.
            parsed = type(self).construct_from_string(unit)
            raise ValueError(
                f"Passing a dtype alias like 'datetime64[ns, {parsed.tz}]' "
                "to DatetimeTZDtype is no longer supported. Use "
                "'DatetimeTZDtype.construct_from_string()' instead."
            )

        if tz:
            tz = timezones.tz_standardize(timezones.maybe_get_tz(tz))
        elif tz is not None:
            # Falsy but not None: not an acceptable timezone specification.
            raise pytz.UnknownTimeZoneError(tz)
        if tz is None:
            raise TypeError("A 'tz' is required.")

        self._unit = unit
        self._tz = tz

    @property
    def unit(self) -> str_type:
        """
        The precision of the datetime data.
        """
        return self._unit

    @property
    def tz(self):
        """
        The timezone.
        """
        return self._tz

    @classmethod
    def construct_array_type(cls) -> Type["DatetimeArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import DatetimeArray

        return DatetimeArray

    @classmethod
    def construct_from_string(cls, string: str_type) -> "DatetimeTZDtype":
        """
        Construct a DatetimeTZDtype from a string.

        Parameters
        ----------
        string : str
            The string alias for this DatetimeTZDtype.
            Should be formatted like ``datetime64[ns, <tz>]``,
            where ``<tz>`` is the timezone name.

        Raises
        ------
        TypeError
            If ``string`` is not a str, does not match the alias pattern, or
            names a unit / timezone the constructor rejects.

        Examples
        --------
        >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
        datetime64[ns, UTC]
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
        found = cls._match.match(string)
        if found is None:
            raise TypeError(msg)

        fields = found.groupdict()
        try:
            return cls(unit=fields["unit"], tz=fields["tz"])
        except (KeyError, TypeError, ValueError) as err:
            # KeyError if maybe_get_tz tries and fails to get a
            #  pytz timezone (actually pytz.UnknownTimeZoneError).
            # TypeError if we pass a nonsense tz;
            # ValueError if we pass a unit other than "ns"
            raise TypeError(msg) from err

    def __str__(self) -> str_type:
        return f"datetime64[{self.unit}, {self.tz}]"

    @property
    def name(self) -> str_type:
        """A string representation of the dtype."""
        return str(self)

    def __hash__(self) -> int:
        # make myself hashable
        # TODO: update this.
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            # Accept the short "M8[...]" spelling by expanding its prefix.
            if other.startswith("M8["):
                other = "datetime64[" + other[3:]
            return other == self.name

        if not isinstance(other, DatetimeTZDtype):
            return False
        # Timezones are compared through their string forms.
        return self.unit == other.unit and str(self.tz) == str(other.tz)

    def __setstate__(self, state) -> None:
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._tz = state["tz"]
        self._unit = state["unit"]
@register_extension_dtype
class PeriodDtype(dtypes.PeriodDtypeBase, PandasExtensionDtype):
    """
    An ExtensionDtype for Period data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    freq : str or DateOffset
        The frequency of this PeriodDtype.

    Attributes
    ----------
    freq

    Methods
    -------
    None

    Examples
    --------
    >>> pd.PeriodDtype(freq='D')
    period[D]

    >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
    period[M]
    """

    type: Type[Period] = Period
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    num = 102
    _metadata = ("freq",)
    _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __new__(cls, freq=None):
        """
        Parameters
        ----------
        freq : frequency
        """
        if isinstance(freq, PeriodDtype):
            # Already a PeriodDtype: hand it back unchanged.
            return freq

        if freq is None:
            # empty constructor for pickle compat
            # -10_000 corresponds to PeriodDtypeCode.UNDEFINED
            blank = dtypes.PeriodDtypeBase.__new__(cls, -10_000)
            blank._freq = None
            return blank

        if not isinstance(freq, BaseOffset):
            freq = cls._parse_dtype_strict(freq)

        # One instance per frequency string is kept in the class cache.
        cache_key = freq.freqstr
        if cache_key in cls._cache:
            return cls._cache[cache_key]

        fresh = dtypes.PeriodDtypeBase.__new__(cls, freq._period_dtype_code)
        fresh._freq = freq
        cls._cache[cache_key] = fresh
        return fresh

    def __reduce__(self):
        return type(self), (self.freq,)

    @property
    def freq(self):
        """
        The frequency object of this PeriodDtype.
        """
        return self._freq

    @classmethod
    def _parse_dtype_strict(cls, freq):
        """
        Turn a frequency string (bare or wrapped as ``period[...]``) into an
        offset, raising ValueError when no offset can be produced.
        """
        if isinstance(freq, str):
            if freq.startswith(("period[", "Period[")):
                wrapped = cls._match.search(freq)
                if wrapped is not None:
                    freq = wrapped.group("freq")

            offset = to_offset(freq)
            if offset is not None:
                return offset

        raise ValueError("could not construct PeriodDtype")

    @classmethod
    def construct_from_string(cls, string: str_type) -> "PeriodDtype":
        """
        Strict construction from a string, raise a TypeError if not
        possible
        """
        is_text = isinstance(string, str)
        # do not parse string like U as period[U]
        # avoid tuple to be regarded as freq
        has_period_prefix = is_text and string.startswith(("period[", "Period["))

        if has_period_prefix or isinstance(string, BaseOffset):
            try:
                return cls(freq=string)
            except ValueError:
                # Fall through so the failure surfaces as the TypeError below.
                pass

        if is_text:
            msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
        else:
            msg = f"'construct_from_string' expects a string, got {type(string)}"
        raise TypeError(msg)

    def __str__(self) -> str_type:
        return self.name

    @property
    def name(self) -> str_type:
        return f"period[{self.freq.freqstr}]"

    @property
    def na_value(self):
        return NaT

    def __hash__(self) -> int:
        # make myself hashable
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, str):
            return isinstance(other, PeriodDtype) and self.freq == other.freq
        # Strings match on the name or on its title-cased form.
        return other == self.name or other == self.name.title()

    def __ne__(self, other: Any) -> bool:
        return not self.__eq__(other)

    def __setstate__(self, state):
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._freq = state["freq"]

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Return a boolean if we if the passed type is an actual dtype that we
        can match (via string or type)
        """
        if not isinstance(dtype, str):
            return super().is_dtype(dtype)

        # PeriodDtype can be instantiated from freq string like "U",
        # but doesn't regard freq str like "U" as dtype.
        if not dtype.startswith(("period[", "Period[")):
            return False

        try:
            return cls._parse_dtype_strict(dtype) is not None
        except ValueError:
            return False

    @classmethod
    def construct_array_type(cls) -> Type["PeriodArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import PeriodArray

        return PeriodArray

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "PeriodArray":
        """
        Construct PeriodArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays import PeriodArray
        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        # A plain Array is handled as a single chunk.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        converted = []
        for chunk in chunks:
            values, mask = pyarrow_array_to_numpy_and_mask(chunk, dtype="int64")
            period_arr = PeriodArray(values.copy(), freq=self.freq, copy=False)
            # Positions where the mask is False become NaT.
            period_arr[~mask] = NaT
            converted.append(period_arr)

        return PeriodArray._concat_same_type(converted)
@register_extension_dtype
class IntervalDtype(PandasExtensionDtype):
    """
    An ExtensionDtype for Interval data.

    **This is not an actual numpy dtype**, but a duck type.

    Parameters
    ----------
    subtype : str, np.dtype
        The dtype of the Interval bounds.

    Attributes
    ----------
    subtype

    Methods
    -------
    None

    Examples
    --------
    >>> pd.IntervalDtype(subtype='int64')
    interval[int64]
    """

    name = "interval"
    kind: str_type = "O"
    str = "|O08"
    base = np.dtype("O")
    num = 103
    _metadata = ("subtype",)
    _match = re.compile(r"(I|i)nterval\[(?P<subtype>.+)\]")
    _cache: Dict[str_type, PandasExtensionDtype] = {}

    def __new__(cls, subtype=None):
        from pandas.core.dtypes.common import is_string_dtype, pandas_dtype

        if isinstance(subtype, IntervalDtype):
            # Already an IntervalDtype: hand it back unchanged.
            return subtype

        if subtype is None:
            # we are called as an empty constructor
            # generally for pickle compat
            blank = object.__new__(cls)
            blank._subtype = None
            return blank

        if isinstance(subtype, str) and subtype.lower() == "interval":
            # The bare alias carries no subtype.
            subtype = None
        else:
            if isinstance(subtype, str):
                wrapped = cls._match.search(subtype)
                if wrapped is not None:
                    subtype = wrapped.group("subtype")

            try:
                subtype = pandas_dtype(subtype)
            except TypeError as err:
                raise TypeError("could not construct IntervalDtype") from err

        if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype):
            # GH 19016
            raise TypeError(
                "category, object, and string subtypes are not supported "
                "for IntervalDtype"
            )

        # One instance per subtype string is kept in the class cache.
        cache_key = str(subtype)
        if cache_key in cls._cache:
            return cls._cache[cache_key]

        fresh = object.__new__(cls)
        fresh._subtype = subtype
        cls._cache[cache_key] = fresh
        return fresh

    @property
    def subtype(self):
        """
        The dtype of the Interval bounds.
        """
        return self._subtype

    @classmethod
    def construct_array_type(cls) -> Type["IntervalArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays import IntervalArray

        return IntervalArray

    @classmethod
    def construct_from_string(cls, string):
        """
        attempt to construct this type from a string, raise a TypeError
        if its not possible
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        is_bare_alias = string.lower() == "interval"
        if is_bare_alias or cls._match.search(string) is not None:
            return cls(string)

        raise TypeError(
            f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n"
            "Incorrectly formatted string passed to constructor. "
            "Valid formats include Interval or Interval[dtype] "
            "where dtype is numeric, datetime, or timedelta"
        )

    @property
    def type(self):
        return Interval

    def __str__(self) -> str_type:
        if self.subtype is None:
            return "interval"
        return f"interval[{self.subtype}]"

    def __hash__(self) -> int:
        # make myself hashable
        return hash(str(self))

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            # Case-insensitive match on either the bare name or the full form.
            return other.lower() in (self.name.lower(), str(self).lower())

        if not isinstance(other, IntervalDtype):
            return False

        if self.subtype is None or other.subtype is None:
            # None should match any subtype
            return True

        from pandas.core.dtypes.common import is_dtype_equal

        return is_dtype_equal(self.subtype, other.subtype)

    def __setstate__(self, state):
        # for pickle compat. __getstate__ is defined in the
        # PandasExtensionDtype superclass and uses the public properties to
        # pickle -> need to set the settable private ones here (see GH26067)
        self._subtype = state["subtype"]

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Return a boolean if we if the passed type is an actual dtype that we
        can match (via string or type)
        """
        if not isinstance(dtype, str):
            return super().is_dtype(dtype)

        if not dtype.lower().startswith("interval"):
            return False

        try:
            return cls.construct_from_string(dtype) is not None
        except (ValueError, TypeError):
            return False

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "IntervalArray":
        """
        Construct IntervalArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays import IntervalArray

        # A plain Array is handled as a single chunk.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        converted = []
        for chunk in chunks:
            storage = chunk.storage
            left = np.asarray(storage.field("left"), dtype=self.subtype)
            right = np.asarray(storage.field("right"), dtype=self.subtype)
            # ``closed`` is read from the type of the object passed in, not
            # from the individual chunk.
            converted.append(
                IntervalArray.from_arrays(left, right, closed=array.type.closed)
            )

        return IntervalArray._concat_same_type(converted)