projektAI/venv/Lib/site-packages/pandas/core/arrays/sparse/dtype.py
2021-06-06 22:13:05 +02:00

386 lines
12 KiB
Python

"""Sparse Dtype"""
import re
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type
import warnings
import numpy as np
from pandas._typing import Dtype, DtypeObj
from pandas.errors import PerformanceWarning
from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
is_extension_array_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype
if TYPE_CHECKING:
from pandas.core.arrays.sparse.array import SparseArray
@register_extension_dtype
class SparseDtype(ExtensionDtype):
"""
Dtype for data stored in :class:`SparseArray`.
This dtype implements the pandas ExtensionDtype interface.
.. versionadded:: 0.24.0
Parameters
----------
dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
The dtype of the underlying array storing the non-fill value values.
fill_value : scalar, optional
The scalar value not stored in the SparseArray. By default, this
depends on `dtype`.
=========== ==========
dtype na_value
=========== ==========
float ``np.nan``
int ``0``
bool ``False``
datetime64 ``pd.NaT``
timedelta64 ``pd.NaT``
=========== ==========
The default value may be overridden by specifying a `fill_value`.
Attributes
----------
None
Methods
-------
None
"""
# We include `_is_na_fill_value` in the metadata to avoid hash collisions
# between SparseDtype(float, 0.0) and SparseDtype(float, nan).
# Without is_na_fill_value in the comparison, those would be equal since
# hash(nan) is (sometimes?) 0.
_metadata = ("_dtype", "_fill_value", "_is_na_fill_value")
def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None):
if isinstance(dtype, type(self)):
if fill_value is None:
fill_value = dtype.fill_value
dtype = dtype.subtype
dtype = pandas_dtype(dtype)
if is_string_dtype(dtype):
dtype = np.dtype("object")
if fill_value is None:
fill_value = na_value_for_dtype(dtype)
if not is_scalar(fill_value):
raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead")
self._dtype = dtype
self._fill_value = fill_value
def __hash__(self):
# Python3 doesn't inherit __hash__ when a base class overrides
# __eq__, so we explicitly do it here.
return super().__hash__()
def __eq__(self, other: Any) -> bool:
# We have to override __eq__ to handle NA values in _metadata.
# The base class does simple == checks, which fail for NA.
if isinstance(other, str):
try:
other = self.construct_from_string(other)
except TypeError:
return False
if isinstance(other, type(self)):
subtype = self.subtype == other.subtype
if self._is_na_fill_value:
# this case is complicated by two things:
# SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
# SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
# i.e. we want to treat any floating-point NaN as equal, but
# not a floating-point NaN and a datetime NaT.
fill_value = (
other._is_na_fill_value
and isinstance(self.fill_value, type(other.fill_value))
or isinstance(other.fill_value, type(self.fill_value))
)
else:
fill_value = self.fill_value == other.fill_value
return subtype and fill_value
return False
@property
def fill_value(self):
"""
The fill value of the array.
Converting the SparseArray to a dense ndarray will fill the
array with this value.
.. warning::
It's possible to end up with a SparseArray that has ``fill_value``
values in ``sp_values``. This can occur, for example, when setting
``SparseArray.fill_value`` directly.
"""
return self._fill_value
@property
def _is_na_fill_value(self):
return isna(self.fill_value)
@property
def _is_numeric(self) -> bool:
return not is_object_dtype(self.subtype)
@property
def _is_boolean(self) -> bool:
return is_bool_dtype(self.subtype)
@property
def kind(self):
"""
The sparse kind. Either 'integer', or 'block'.
"""
return self.subtype.kind
@property
def type(self):
return self.subtype.type
@property
def subtype(self):
return self._dtype
@property
def name(self):
return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"
def __repr__(self) -> str:
return self.name
@classmethod
def construct_array_type(cls) -> Type["SparseArray"]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
from pandas.core.arrays.sparse.array import SparseArray
return SparseArray
@classmethod
def construct_from_string(cls, string: str) -> "SparseDtype":
"""
Construct a SparseDtype from a string form.
Parameters
----------
string : str
Can take the following forms.
string dtype
================ ============================
'int' SparseDtype[np.int64, 0]
'Sparse' SparseDtype[np.float64, nan]
'Sparse[int]' SparseDtype[np.int64, 0]
'Sparse[int, 0]' SparseDtype[np.int64, 0]
================ ============================
It is not possible to specify non-default fill values
with a string. An argument like ``'Sparse[int, 1]'``
will raise a ``TypeError`` because the default fill value
for integers is 0.
Returns
-------
SparseDtype
"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
msg = f"Cannot construct a 'SparseDtype' from '{string}'"
if string.startswith("Sparse"):
try:
sub_type, has_fill_value = cls._parse_subtype(string)
except ValueError as err:
raise TypeError(msg) from err
else:
result = SparseDtype(sub_type)
msg = (
f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
"looks like the fill_value in the string is not "
"the default for the dtype. Non-default fill_values "
"are not supported. Use the 'SparseDtype()' "
"constructor instead."
)
if has_fill_value and str(result) != string:
raise TypeError(msg)
return result
else:
raise TypeError(msg)
@staticmethod
def _parse_subtype(dtype: str) -> Tuple[str, bool]:
"""
Parse a string to get the subtype
Parameters
----------
dtype : str
A string like
* Sparse[subtype]
* Sparse[subtype, fill_value]
Returns
-------
subtype : str
Raises
------
ValueError
When the subtype cannot be extracted.
"""
xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
m = xpr.match(dtype)
has_fill_value = False
if m:
subtype = m.groupdict()["subtype"]
has_fill_value = bool(m.groupdict()["fill_value"])
elif dtype == "Sparse":
subtype = "float64"
else:
raise ValueError(f"Cannot parse {dtype}")
return subtype, has_fill_value
@classmethod
def is_dtype(cls, dtype: object) -> bool:
dtype = getattr(dtype, "dtype", dtype)
if isinstance(dtype, str) and dtype.startswith("Sparse"):
sub_type, _ = cls._parse_subtype(dtype)
dtype = np.dtype(sub_type)
elif isinstance(dtype, cls):
return True
return isinstance(dtype, np.dtype) or dtype == "Sparse"
def update_dtype(self, dtype):
"""
Convert the SparseDtype to a new dtype.
This takes care of converting the ``fill_value``.
Parameters
----------
dtype : Union[str, numpy.dtype, SparseDtype]
The new dtype to use.
* For a SparseDtype, it is simply returned
* For a NumPy dtype (or str), the current fill value
is converted to the new dtype, and a SparseDtype
with `dtype` and the new fill value is returned.
Returns
-------
SparseDtype
A new SparseDtype with the correct `dtype` and fill value
for that `dtype`.
Raises
------
ValueError
When the current fill value cannot be converted to the
new `dtype` (e.g. trying to convert ``np.nan`` to an
integer dtype).
Examples
--------
>>> SparseDtype(int, 0).update_dtype(float)
Sparse[float64, 0.0]
>>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
Sparse[float64, nan]
"""
cls = type(self)
dtype = pandas_dtype(dtype)
if not isinstance(dtype, cls):
if is_extension_array_dtype(dtype):
raise TypeError("sparse arrays of extension dtypes not supported")
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
dtype = cls(dtype, fill_value=fill_value)
return dtype
@property
def _subtype_with_str(self):
"""
Whether the SparseDtype's subtype should be considered ``str``.
Typically, pandas will store string data in an object-dtype array.
When converting values to a dtype, e.g. in ``.astype``, we need to
be more specific, we need the actual underlying type.
Returns
-------
>>> SparseDtype(int, 1)._subtype_with_str
dtype('int64')
>>> SparseDtype(object, 1)._subtype_with_str
dtype('O')
>>> dtype = SparseDtype(str, '')
>>> dtype.subtype
dtype('O')
>>> dtype._subtype_with_str
<class 'str'>
"""
if isinstance(self.fill_value, str):
return type(self.fill_value)
return self.subtype
def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatibtle extension dtypes
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
):
return None
fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
fill_value = fill_values[0]
# np.nan isn't a singleton, so we may end up with multiple
# NaNs here, so we ignore tha all NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
warnings.warn(
"Concatenating sparse arrays with multiple fill "
f"values: '{fill_values}'. Picking the first and "
"converting the rest.",
PerformanceWarning,
stacklevel=6,
)
np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)