import numbers
|
|
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
|
|
import warnings
|
|
|
|
import numpy as np
|
|
|
|
from pandas._libs import lib, missing as libmissing
|
|
from pandas._typing import ArrayLike, DtypeObj
|
|
from pandas.compat.numpy import function as nv
|
|
from pandas.util._decorators import cache_readonly
|
|
|
|
from pandas.core.dtypes.cast import astype_nansafe
|
|
from pandas.core.dtypes.common import (
|
|
is_bool_dtype,
|
|
is_datetime64_dtype,
|
|
is_float_dtype,
|
|
is_integer_dtype,
|
|
is_list_like,
|
|
is_object_dtype,
|
|
pandas_dtype,
|
|
)
|
|
from pandas.core.dtypes.dtypes import register_extension_dtype
|
|
from pandas.core.dtypes.missing import isna
|
|
|
|
from pandas.core import ops
|
|
from pandas.core.ops import invalid_comparison
|
|
from pandas.core.tools.numeric import to_numeric
|
|
|
|
from .masked import BaseMaskedDtype
|
|
from .numeric import NumericArray
|
|
|
|
if TYPE_CHECKING:
|
|
import pyarrow
|
|
|
|
|
|
class FloatingDtype(BaseMaskedDtype):
    """
    An ExtensionDtype to hold a single size of floating dtype.

    These specific implementations are subclasses of the non-public
    FloatingDtype. For example we have Float32Dtype to represent float32.

    The attributes name & type are set when these subclasses are created.
    """

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @property
    def _is_numeric(self) -> bool:
        # Floating dtypes are always numeric.
        return True

    @classmethod
    def construct_array_type(cls) -> Type["FloatingArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatingArray

    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        # For now we only know how to combine with other floating dtypes;
        # bail out as soon as anything else is present.
        if any(not isinstance(t, FloatingDtype) for t in dtypes):
            return None
        numpy_dtypes = [t.numpy_dtype for t in dtypes]  # type: ignore[union-attr]
        common = np.find_common_type(numpy_dtypes, [])
        if np.issubdtype(common, np.floating):
            return FLOAT_STR_TO_DTYPE[str(common)]
        return None

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "FloatingArray":
        """
        Construct FloatingArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type):
            array = array.cast(pyarrow_type)

        # Normalize so that Array and ChunkedArray share one code path.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        results = []
        for chunk in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(chunk, dtype=self.type)
            # The helper's mask is inverted here; FloatingArray expects
            # True == missing (presumably the helper returns True == valid).
            results.append(FloatingArray(data.copy(), ~mask, copy=False))

        return FloatingArray._concat_same_type(results)
|
|
|
|
|
|
def coerce_to_array(
    values, dtype=None, mask=None, copy: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : float dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    TypeError
        If ``values`` cannot be converted to a floating dtype, or the
        inputs are not 1-dimensional.
    ValueError
        If an invalid ``dtype`` is specified.
    """
    # if values is floating numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_float_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith("Float"):
            # Avoid DeprecationWarning from NumPy about np.dtype("Float64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), FloatingDtype):
            try:
                dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, FloatingArray):
        # Fast path: reuse the existing data/mask pair.
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    # Use asarray for the no-copy path: np.array(..., copy=False) raises
    # under NumPy 2.0 whenever a copy is actually required (e.g. list input).
    values = np.array(values, copy=True) if copy else np.asarray(values)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            values = np.full(len(values), np.nan)
        elif inferred_type not in [
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ]:
            raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

    elif is_bool_dtype(values) and is_float_dtype(dtype):
        # astype always copies for a bool -> float cast, matching the
        # original np.array(..., dtype=float) behavior.
        values = values.astype(float)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")
    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("float64")
    else:
        dtype = dtype.type

    # Fill masked entries with NaN before casting so the cast does not
    # choke on e.g. None in an object array.
    # TODO should this be a safe cast?
    if mask.any():
        values = values.copy()
        values[mask] = np.nan
    values = values.astype(dtype, copy=False)  # , casting="safe")

    return values, mask
|
|
|
|
|
|
class FloatingArray(NumericArray):
    """
    Array of floating (optional missing) values.

    .. versionadded:: 1.2.0

    .. warning::

       FloatingArray is currently experimental, and its API or internal
       implementation may change without warning. Especially the behaviour
       regarding NaN (distinct from NA missing values) is subject to change.

    We represent a FloatingArray with 2 numpy arrays:

    - data: contains a numpy float array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct a FloatingArray from generic array-like input, use
    :func:`pandas.array` with one of the float dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d float-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    FloatingArray

    Examples
    --------
    Create a FloatingArray with :func:`pandas.array`:

    >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([0.1, None, 0.3], dtype="Float32")
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 0.0

    @cache_readonly
    def dtype(self) -> FloatingDtype:
        # Map the underlying numpy dtype name ("float32"/"float64") to the
        # corresponding nullable extension dtype singleton.
        return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
            raise TypeError(
                "values should be floating numpy array. Use "
                "the 'pd.array' function instead"
            )
        super().__init__(values, mask, copy=copy)

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype=None, copy: bool = False
    ) -> "FloatingArray":
        # Coerce arbitrary 1D list-like input into (data, mask) arrays.
        values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
        return FloatingArray(values, mask)

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype=None, copy: bool = False
    ) -> "FloatingArray":
        # Parse strings to numbers first, then defer to _from_sequence.
        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
        # For FloatingArray inputs, we apply the ufunc to ._data
        # and mask the result.
        if method == "reduce":
            # Not clear how to handle missing values in reductions. Raise.
            raise NotImplementedError("The 'reduce' method is not supported.")
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (FloatingArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # Combine the masks of all FloatingArray inputs and unwrap them to
        # their plain ndarray data before applying the ufunc.
        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, FloatingArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.

            # TODO
            if is_float_dtype(x.dtype):
                m = mask.copy()
                return FloatingArray(x, m)
            else:
                x[mask] = np.nan
                return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if isinstance(result, tuple):
            # BUG FIX: the reconstructed tuple was previously built but not
            # returned, so multi-output ufuncs silently returned None.
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
        # Coerce `value` using this array's own dtype as the target.
        return coerce_to_array(value, dtype=self.dtype)

    def astype(self, dtype, copy: bool = True) -> ArrayLike:
        """
        Cast to a NumPy array or ExtensionArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        ndarray or ExtensionArray
            NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with
            'dtype' for its dtype.

        Raises
        ------
        TypeError
            if incompatible type with a FloatingDtype, equivalent of same_kind
            casting
        """
        from pandas.core.arrays.string_ import StringArray, StringDtype

        dtype = pandas_dtype(dtype)

        # if the dtype is exactly the same, we can fastpath
        if self.dtype == dtype:
            # return the same object for copy=False
            return self.copy() if copy else self
        # if we are astyping to another nullable masked dtype, we can fastpath
        if isinstance(dtype, BaseMaskedDtype):
            # TODO deal with NaNs
            data = self._data.astype(dtype.numpy_dtype, copy=copy)
            # mask is copied depending on whether the data was copied, and
            # not directly depending on the `copy` keyword
            mask = self._mask if data is self._data else self._mask.copy()
            return dtype.construct_array_type()(data, mask, copy=False)
        elif isinstance(dtype, StringDtype):
            return StringArray._from_sequence(self, copy=False)

        # coerce
        if is_float_dtype(dtype):
            # In astype, we consider dtype=float to also mean na_value=np.nan
            kwargs = {"na_value": np.nan}
        elif is_datetime64_dtype(dtype):
            kwargs = {"na_value": np.datetime64("NaT")}
        else:
            kwargs = {}

        data = self.to_numpy(dtype=dtype, **kwargs)
        return astype_nansafe(data, dtype, copy=False)

    def _values_for_argsort(self) -> np.ndarray:
        # Sort on the raw data; masked positions are handled by the caller.
        return self._data

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray, IntegerArray

        mask = None

        if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)):
            # Unwrap masked arrays so numpy sees plain ndarrays.
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")

        if other is libmissing.NA:
            # numpy does not handle pd.NA well as "other" scalar (it returns
            # a scalar False instead of an array)
            # This may be fixed by NA.__array_ufunc__. Revisit this check
            # once that's implemented.
            result = np.zeros(self._data.shape, dtype="bool")
            mask = np.ones(self._data.shape, dtype="bool")
        else:
            with warnings.catch_warnings():
                # numpy may show a FutureWarning:
                #     elementwise comparison failed; returning scalar instead,
                #     but in the future will perform elementwise comparison
                # before returning NotImplemented. We fall back to the correct
                # behavior today, so that should be fine to ignore.
                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                with np.errstate(all="ignore"):
                    method = getattr(self._data, f"__{op.__name__}__")
                    result = method(other)

                if result is NotImplemented:
                    result = invalid_comparison(self._data, other, op)

        # nans propagate
        if mask is None:
            mask = self._mask.copy()
        else:
            mask = self._mask | mask

        return BooleanArray(result, mask)

    def sum(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_sum((), kwargs)
        return super()._reduce("sum", skipna=skipna, min_count=min_count)

    def prod(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_prod((), kwargs)
        return super()._reduce("prod", skipna=skipna, min_count=min_count)

    def min(self, *, skipna=True, **kwargs):
        nv.validate_min((), kwargs)
        return super()._reduce("min", skipna=skipna)

    def max(self, *, skipna=True, **kwargs):
        nv.validate_max((), kwargs)
        return super()._reduce("max", skipna=skipna)

    def _maybe_mask_result(self, result, mask, other, op_name: str):
        """
        Wrap an op result and its mask back into a FloatingArray.

        Parameters
        ----------
        result : array-like
        mask : array-like bool
        other : scalar or array-like
        op_name : str
        """
        # TODO are there cases we don't end up with float?
        return type(self)(result, mask, copy=False)
|
|
|
|
|
|
# Shared docstring template for the concrete dtype subclasses below;
# ``{dtype}`` is filled in (e.g. "float32") via str.format when each
# subclass sets its __doc__.
_dtype_docstring = """
An ExtensionDtype for {dtype} data.

This dtype uses ``pd.NA`` as missing value indicator.

Attributes
----------
None

Methods
-------
None
"""

# create the concrete Dtype subclasses
|
|
|
|
|
|
@register_extension_dtype
class Float32Dtype(FloatingDtype):
    # Concrete single-precision nullable float dtype; ``name`` and ``type``
    # drive the generic machinery on FloatingDtype.
    name = "Float32"
    type = np.float32
    __doc__ = _dtype_docstring.format(dtype="float32")
|
|
|
|
|
|
@register_extension_dtype
class Float64Dtype(FloatingDtype):
    # Concrete double-precision nullable float dtype; ``name`` and ``type``
    # drive the generic machinery on FloatingDtype.
    name = "Float64"
    type = np.float64
    __doc__ = _dtype_docstring.format(dtype="float64")
|
|
|
|
|
|
# Lookup from numpy float dtype name (e.g. "float32") to the corresponding
# nullable extension dtype instance.
FLOAT_STR_TO_DTYPE = {
    str(np.dtype(dt.type)): dt for dt in (Float32Dtype(), Float64Dtype())
}
|