projektAI/venv/Lib/site-packages/pandas/core/arrays/floating.py
2021-06-06 22:13:05 +02:00

516 lines
16 KiB
Python

import numbers
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
import warnings
import numpy as np
from pandas._libs import lib, missing as libmissing
from pandas._typing import ArrayLike, DtypeObj
from pandas.compat.numpy import function as nv
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna
from pandas.core import ops
from pandas.core.ops import invalid_comparison
from pandas.core.tools.numeric import to_numeric
from .masked import BaseMaskedDtype
from .numeric import NumericArray
if TYPE_CHECKING:
import pyarrow
class FloatingDtype(BaseMaskedDtype):
    """
    An ExtensionDtype to hold a single size of floating dtype.

    These specific implementations are subclasses of the non-public
    FloatingDtype. For example we have Float32Dtype to represent float32.

    The attributes name & type are set when these subclasses are created.
    """

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @property
    def _is_numeric(self) -> bool:
        # Masked float dtypes always hold numeric data.
        return True

    @classmethod
    def construct_array_type(cls) -> Type["FloatingArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatingArray

    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        # for now only handle other floating types
        if any(not isinstance(t, FloatingDtype) for t in dtypes):
            return None
        common = np.find_common_type(
            [t.numpy_dtype for t in dtypes], []  # type: ignore[union-attr]
        )
        if np.issubdtype(common, np.floating):
            return FLOAT_STR_TO_DTYPE[str(common)]
        return None

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "FloatingArray":
        """
        Construct FloatingArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        target_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(target_type):
            array = array.cast(target_type)

        # Normalize both input kinds to a list of chunks so one loop handles them.
        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        pieces = []
        for chunk in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(chunk, dtype=self.type)
            # pyarrow's validity mask is True-for-valid; ours is True-for-missing.
            pieces.append(FloatingArray(data.copy(), ~mask, copy=False))
        return FloatingArray._concat_same_type(pieces)
def coerce_to_array(
    values, dtype=None, mask=None, copy: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : float dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` does not map to a supported masked float dtype.
    TypeError
        If ``values`` cannot be coerced to floats, or values/mask are not 1D.
    """
    # if values is floating numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_float_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith("Float"):
            # Avoid DeprecationWarning from NumPy about np.dtype("Float64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        # Normalize anything that is not already a FloatingDtype instance
        # (numpy dtype, string, type) to the matching masked dtype singleton.
        if not issubclass(type(dtype), FloatingDtype):
            try:
                dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, FloatingArray):
        # Fast path: reuse the existing data/mask pair.
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            values = np.empty(len(values))
            values.fill(np.nan)
        elif inferred_type not in [
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ]:
            raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

    elif is_bool_dtype(values) and is_float_dtype(dtype):
        values = np.array(values, dtype=float, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)
        # BUG FIX: a caller-provided mask was previously returned as-is even
        # when copy=True, so the result aliased the caller's array and later
        # in-place edits would leak back. Honor `copy` for the mask as well.
        if copy:
            mask = mask.copy()

    if not values.ndim == 1:
        raise TypeError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("float64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    # TODO should this be a safe cast?
    if mask.any():
        # Copy before writing NaN into the masked slots so the input is
        # never mutated.
        values = values.copy()
        values[mask] = np.nan
        values = values.astype(dtype, copy=False)  # , casting="safe")
    else:
        values = values.astype(dtype, copy=False)  # , casting="safe")

    return values, mask
class FloatingArray(NumericArray):
    """
    Array of floating (optional missing) values.

    .. versionadded:: 1.2.0

    .. warning::

       FloatingArray is currently experimental, and its API or internal
       implementation may change without warning. Especially the behaviour
       regarding NaN (distinct from NA missing values) is subject to change.

    We represent a FloatingArray with 2 numpy arrays:

    - data: contains a numpy float array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an FloatingArray from generic array-like input, use
    :func:`pandas.array` with one of the float dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d float-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    FloatingArray

    Examples
    --------
    Create an FloatingArray with :func:`pandas.array`:

    >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([0.1, None, 0.3], dtype="Float32")
    <FloatingArray>
    [0.1, <NA>, 0.3]
    Length: 3, dtype: Float32
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 0.0

    @cache_readonly
    def dtype(self) -> FloatingDtype:
        # Map the backing numpy dtype to the matching masked-dtype singleton.
        return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
            raise TypeError(
                "values should be floating numpy array. Use "
                "the 'pd.array' function instead"
            )
        super().__init__(values, mask, copy=copy)

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype=None, copy: bool = False
    ) -> "FloatingArray":
        values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
        return FloatingArray(values, mask)

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype=None, copy: bool = False
    ) -> "FloatingArray":
        # Parse the strings into numbers first, then defer to _from_sequence.
        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
        # For FloatingArray inputs, we apply the ufunc to ._data
        # and mask the result.
        if method == "reduce":
            # Not clear how to handle missing values in reductions. Raise.
            raise NotImplementedError("The 'reduce' method is not supported.")
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (FloatingArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # Combine the masks of all FloatingArray inputs and unwrap them to
        # plain ndarrays before applying the ufunc.
        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, FloatingArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.
            # TODO
            if is_float_dtype(x.dtype):
                m = mask.copy()
                return FloatingArray(x, m)
            else:
                x[mask] = np.nan
            return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if isinstance(result, tuple):
            # BUG FIX: the re-wrapped outputs were built but never returned,
            # so multi-output ufuncs (e.g. np.modf) silently yielded None.
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
        return coerce_to_array(value, dtype=self.dtype)

    def astype(self, dtype, copy: bool = True) -> ArrayLike:
        """
        Cast to a NumPy array or ExtensionArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        ndarray or ExtensionArray
            NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with
            'dtype' for its dtype.

        Raises
        ------
        TypeError
            if incompatible type with an FloatingDtype, equivalent of same_kind
            casting
        """
        from pandas.core.arrays.string_ import StringArray, StringDtype

        dtype = pandas_dtype(dtype)

        # if the dtype is exactly the same, we can fastpath
        if self.dtype == dtype:
            # return the same object for copy=False
            return self.copy() if copy else self
        # if we are astyping to another nullable masked dtype, we can fastpath
        if isinstance(dtype, BaseMaskedDtype):
            # TODO deal with NaNs
            data = self._data.astype(dtype.numpy_dtype, copy=copy)
            # mask is copied depending on whether the data was copied, and
            # not directly depending on the `copy` keyword
            mask = self._mask if data is self._data else self._mask.copy()
            return dtype.construct_array_type()(data, mask, copy=False)
        elif isinstance(dtype, StringDtype):
            return StringArray._from_sequence(self, copy=False)

        # coerce
        if is_float_dtype(dtype):
            # In astype, we consider dtype=float to also mean na_value=np.nan
            kwargs = {"na_value": np.nan}
        elif is_datetime64_dtype(dtype):
            kwargs = {"na_value": np.datetime64("NaT")}
        else:
            kwargs = {}

        data = self.to_numpy(dtype=dtype, **kwargs)
        return astype_nansafe(data, dtype, copy=False)

    def _values_for_argsort(self) -> np.ndarray:
        # Sort on the raw float data; NaN in masked slots sorts last anyway.
        return self._data

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray, IntegerArray

        mask = None

        if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)):
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")

        if other is libmissing.NA:
            # numpy does not handle pd.NA well as "other" scalar (it returns
            # a scalar False instead of an array)
            # This may be fixed by NA.__array_ufunc__. Revisit this check
            # once that's implemented.
            result = np.zeros(self._data.shape, dtype="bool")
            mask = np.ones(self._data.shape, dtype="bool")
        else:
            with warnings.catch_warnings():
                # numpy may show a FutureWarning:
                # elementwise comparison failed; returning scalar instead,
                # but in the future will perform elementwise comparison
                # before returning NotImplemented. We fall back to the correct
                # behavior today, so that should be fine to ignore.
                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                with np.errstate(all="ignore"):
                    method = getattr(self._data, f"__{op.__name__}__")
                    result = method(other)

                if result is NotImplemented:
                    result = invalid_comparison(self._data, other, op)

        # nans propagate
        if mask is None:
            mask = self._mask.copy()
        else:
            mask = self._mask | mask

        return BooleanArray(result, mask)

    def sum(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_sum((), kwargs)
        return super()._reduce("sum", skipna=skipna, min_count=min_count)

    def prod(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_prod((), kwargs)
        return super()._reduce("prod", skipna=skipna, min_count=min_count)

    def min(self, *, skipna=True, **kwargs):
        nv.validate_min((), kwargs)
        return super()._reduce("min", skipna=skipna)

    def max(self, *, skipna=True, **kwargs):
        nv.validate_max((), kwargs)
        return super()._reduce("max", skipna=skipna)

    def _maybe_mask_result(self, result, mask, other, op_name: str):
        """
        Parameters
        ----------
        result : array-like
        mask : array-like bool
        other : scalar or array-like
        op_name : str
        """
        # TODO are there cases we don't end up with float?
        # if we have a float operand we are by-definition
        # a float result
        # or our op is a divide
        # if (is_float_dtype(other) or is_float(other)) or (
        #     op_name in ["rtruediv", "truediv"]
        # ):
        #     result[mask] = np.nan
        #     return result

        return type(self)(result, mask, copy=False)
# Shared docstring template for the concrete dtype classes below;
# `{dtype}` is substituted via str.format when each class sets __doc__.
_dtype_docstring = """
An ExtensionDtype for {dtype} data.
This dtype uses ``pd.NA`` as missing value indicator.
Attributes
----------
None
Methods
-------
None
"""
# create the Dtype
@register_extension_dtype
class Float32Dtype(FloatingDtype):
    # Nullable float32 dtype; registered with pandas' extension-dtype
    # registry so the string alias "Float32" resolves to it.
    type = np.float32
    name = "Float32"
    __doc__ = _dtype_docstring.format(dtype="float32")
@register_extension_dtype
class Float64Dtype(FloatingDtype):
    # Nullable float64 dtype; registered with pandas' extension-dtype
    # registry so the string alias "Float64" resolves to it.
    type = np.float64
    name = "Float64"
    __doc__ = _dtype_docstring.format(dtype="float64")
# Lookup from numpy float dtype name to the masked-dtype singleton; used by
# FloatingArray.dtype and coerce_to_array above.
FLOAT_STR_TO_DTYPE = {
    "float32": Float32Dtype(),
    "float64": Float64Dtype(),
}