292 lines
9.0 KiB
Python
292 lines
9.0 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
import numbers
|
||
|
from typing import (
|
||
|
TYPE_CHECKING,
|
||
|
Any,
|
||
|
Callable,
|
||
|
Mapping,
|
||
|
TypeVar,
|
||
|
)
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from pandas._libs import (
|
||
|
lib,
|
||
|
missing as libmissing,
|
||
|
)
|
||
|
from pandas._typing import (
|
||
|
Dtype,
|
||
|
DtypeObj,
|
||
|
npt,
|
||
|
)
|
||
|
from pandas.errors import AbstractMethodError
|
||
|
from pandas.util._decorators import cache_readonly
|
||
|
|
||
|
from pandas.core.dtypes.common import (
|
||
|
is_bool_dtype,
|
||
|
is_float_dtype,
|
||
|
is_integer_dtype,
|
||
|
is_object_dtype,
|
||
|
is_string_dtype,
|
||
|
pandas_dtype,
|
||
|
)
|
||
|
|
||
|
from pandas.core.arrays.masked import (
|
||
|
BaseMaskedArray,
|
||
|
BaseMaskedDtype,
|
||
|
)
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
import pyarrow
|
||
|
|
||
|
|
||
|
# Type variable bound to NumericArray; used below so that alternate
# constructors annotated with ``cls: type[T]`` are typed as returning ``T``.
T = TypeVar("T", bound="NumericArray")
|
||
|
|
||
|
|
||
|
class NumericDtype(BaseMaskedDtype):
    """
    Common dtype base for the masked numeric arrays.

    Subclasses are expected to provide ``_default_np_dtype``, ``_checker``,
    ``_str_to_dtype_mapping`` and ``_safe_cast``.
    """

    _default_np_dtype: np.dtype
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        """Whether the dtype kind is ``"i"``."""
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        """Whether the dtype kind is ``"u"``."""
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.

        Raises
        ------
        TypeError
            If the arrow type differs from this dtype and is not an integer,
            unsigned integer or float type.
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        array_class = self.construct_array_type()
        expected_type = pyarrow.from_numpy_dtype(self.type)

        if not array.type.equals(expected_type):
            # test_from_arrow_type_error raise for string, but allow
            #  through itemsize conversion GH#31896
            source_dtype = pandas_dtype(array.type.to_pandas_dtype())
            if source_dtype.kind not in ("i", "u", "f"):
                # Could allow "c" or potentially disallow float<->int conversion,
                #  but at the moment we specifically test that uint<->int works
                raise TypeError(
                    f"Expected array of {self} type, got {array.type} instead"
                )
            array = array.cast(expected_type)

        # A plain Array is handled as a single chunk; otherwise we were given
        # a pyarrow.ChunkedArray.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        def _chunk_to_masked(chunk):
            data, chunk_mask = pyarrow_array_to_numpy_and_mask(
                chunk, dtype=self.numpy_dtype
            )
            # The mask returned by the helper is inverted before being handed
            # to the array class.
            return array_class(data.copy(), ~chunk_mask, copy=False)

        pieces = [_chunk_to_masked(chunk) for chunk in chunks]

        if len(pieces) == 1:
            # avoid additional copy in _concat_same_type
            return pieces[0]
        if pieces:
            return array_class._concat_same_type(pieces)
        # No chunks at all: build an empty array of the right dtype.
        return array_class(
            np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
        )

    @classmethod
    def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]:
        """Return the numpy-dtype-name -> dtype mapping; subclasses must override."""
        raise AbstractMethodError(cls)

    @classmethod
    def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
        """
        Convert a string representation or a numpy dtype to NumericDtype.

        Raises
        ------
        ValueError
            If the numpy dtype name is not a key of ``_str_to_dtype_mapping()``.
        """
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt", "Float")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if isinstance(dtype, NumericDtype):
            return dtype

        known = cls._str_to_dtype_mapping()
        try:
            return known[str(np.dtype(dtype))]
        except KeyError as err:
            raise ValueError(f"invalid dtype specified {dtype}") from err

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        Subclasses must override.
        """
        raise AbstractMethodError(cls)
|
||
|
|
||
|
|
||
|
def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
|
||
|
checker = dtype_cls._checker
|
||
|
|
||
|
inferred_type = None
|
||
|
|
||
|
if dtype is None and hasattr(values, "dtype"):
|
||
|
if checker(values.dtype):
|
||
|
dtype = values.dtype
|
||
|
|
||
|
if dtype is not None:
|
||
|
dtype = dtype_cls._standardize_dtype(dtype)
|
||
|
|
||
|
cls = dtype_cls.construct_array_type()
|
||
|
if isinstance(values, cls):
|
||
|
values, mask = values._data, values._mask
|
||
|
if dtype is not None:
|
||
|
values = values.astype(dtype.numpy_dtype, copy=False)
|
||
|
|
||
|
if copy:
|
||
|
values = values.copy()
|
||
|
mask = mask.copy()
|
||
|
return values, mask, dtype, inferred_type
|
||
|
|
||
|
original = values
|
||
|
values = np.array(values, copy=copy)
|
||
|
inferred_type = None
|
||
|
if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
|
||
|
inferred_type = lib.infer_dtype(values, skipna=True)
|
||
|
if inferred_type == "boolean" and dtype is None:
|
||
|
name = dtype_cls.__name__.strip("_")
|
||
|
raise TypeError(f"{values.dtype} cannot be converted to {name}")
|
||
|
|
||
|
elif is_bool_dtype(values) and checker(dtype):
|
||
|
values = np.array(values, dtype=default_dtype, copy=copy)
|
||
|
|
||
|
elif not (is_integer_dtype(values) or is_float_dtype(values)):
|
||
|
name = dtype_cls.__name__.strip("_")
|
||
|
raise TypeError(f"{values.dtype} cannot be converted to {name}")
|
||
|
|
||
|
if values.ndim != 1:
|
||
|
raise TypeError("values must be a 1D list-like")
|
||
|
|
||
|
if mask is None:
|
||
|
if is_integer_dtype(values):
|
||
|
# fastpath
|
||
|
mask = np.zeros(len(values), dtype=np.bool_)
|
||
|
else:
|
||
|
mask = libmissing.is_numeric_na(values)
|
||
|
else:
|
||
|
assert len(mask) == len(values)
|
||
|
|
||
|
if mask.ndim != 1:
|
||
|
raise TypeError("mask must be a 1D list-like")
|
||
|
|
||
|
# infer dtype if needed
|
||
|
if dtype is None:
|
||
|
dtype = default_dtype
|
||
|
else:
|
||
|
dtype = dtype.type
|
||
|
|
||
|
if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
|
||
|
if mask.all():
|
||
|
values = np.ones(values.shape, dtype=dtype)
|
||
|
else:
|
||
|
idx = np.nanargmax(values)
|
||
|
if int(values[idx]) != original[idx]:
|
||
|
# We have ints that lost precision during the cast.
|
||
|
inferred_type = lib.infer_dtype(original, skipna=True)
|
||
|
if (
|
||
|
inferred_type not in ["floating", "mixed-integer-float"]
|
||
|
and not mask.any()
|
||
|
):
|
||
|
values = np.array(original, dtype=dtype, copy=False)
|
||
|
else:
|
||
|
values = np.array(original, dtype="object", copy=False)
|
||
|
|
||
|
# we copy as need to coerce here
|
||
|
if mask.any():
|
||
|
values = values.copy()
|
||
|
values[mask] = cls._internal_fill_value
|
||
|
if inferred_type in ("string", "unicode"):
|
||
|
# casts from str are always safe since they raise
|
||
|
# a ValueError if the str cannot be parsed into a float
|
||
|
values = values.astype(dtype, copy=copy)
|
||
|
else:
|
||
|
values = dtype_cls._safe_cast(values, dtype, copy=False)
|
||
|
|
||
|
return values, mask, dtype, inferred_type
|
||
|
|
||
|
|
||
|
class NumericArray(BaseMaskedArray):
    """
    Base class for IntegerArray and FloatingArray.
    """

    # Dtype class paired with this array class; set by concrete subclasses.
    _dtype_cls: type[NumericDtype]

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        """
        Validate ``values`` and delegate construction to the parent class.

        Parameters
        ----------
        values : np.ndarray
            Data array whose dtype must be accepted by ``_dtype_cls._checker``.
        mask : np.ndarray of bool
            Passed through to the parent constructor.
        copy : bool, default False
            Passed through to the parent constructor.

        Raises
        ------
        TypeError
            If ``values`` is not a numpy array of an accepted dtype, or if its
            dtype is ``np.float16``.
        """
        checker = self._dtype_cls._checker
        if not (isinstance(values, np.ndarray) and checker(values.dtype)):
            # NOTE(review): ``kind`` looks to be an instance-level (cached)
            # property on the dtype, so reading it from the *class* does not
            # yield a string and the old ``_dtype_cls.kind == "f"`` test could
            # never be true. The class-level default numpy dtype carries the
            # same information and is safe to read from the class.
            descr = (
                "floating"
                if self._dtype_cls._default_np_dtype.kind == "f"
                else "integer"
            )
            raise TypeError(
                f"values should be {descr} numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.dtype == np.float16:
            # If we don't raise here, then accessing self.dtype would raise
            raise TypeError("FloatingArray does not support np.float16 dtype.")

        super().__init__(values, mask, copy=copy)

    @cache_readonly
    def dtype(self) -> NumericDtype:
        """Dtype looked up from the name of the underlying numpy dtype."""
        mapping = self._dtype_cls._str_to_dtype_mapping()
        return mapping[str(self._data.dtype)]

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Convert ``value`` to a ``(values, mask)`` pair of ndarrays.

        Delegates to ``_coerce_to_data_and_mask`` with this class's dtype
        class and its default numpy dtype, starting from no mask.
        """
        dtype_cls = cls._dtype_cls
        default_dtype = dtype_cls._default_np_dtype
        mask = None
        values, mask, _, _ = _coerce_to_data_and_mask(
            value, mask, dtype, copy, dtype_cls, default_dtype
        )
        return values, mask

    @classmethod
    def _from_sequence_of_strings(
        cls: type[T], strings, *, dtype: Dtype | None = None, copy: bool = False
    ) -> T:
        """
        Build an array from strings by parsing them with ``to_numeric``.

        Parsing errors are raised (``errors="raise"``); the parsed scalars are
        then passed to ``_from_sequence`` together with ``dtype`` and ``copy``.
        """
        # Imported here rather than at module level, as in the original code.
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)
|