from __future__ import annotations

import numbers
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Mapping,
    TypeVar,
)

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._typing import (
    Dtype,
    DtypeObj,
    npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core.arrays.masked import (
    BaseMaskedArray,
    BaseMaskedDtype,
)

if TYPE_CHECKING:
    import pyarrow


T = TypeVar("T", bound="NumericArray")


class NumericDtype(BaseMaskedDtype):
    """
    Common base for the masked numeric dtypes.

    Subclasses supply ``_default_np_dtype``, ``_checker``,
    ``_str_to_dtype_mapping`` and ``_safe_cast``.
    """

    # numpy dtype used when no dtype is requested or can be inferred
    _default_np_dtype: np.dtype
    # predicate deciding whether a dtype belongs to this family
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        """Whether ``self.kind`` is ``"i"``."""
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        """Whether ``self.kind`` is ``"u"``."""
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.

        Parameters
        ----------
        array : pyarrow.Array or pyarrow.ChunkedArray
            Input whose type is either the pyarrow equivalent of this dtype
            or another integer/unsigned/float type, which is cast first.

        Returns
        -------
        BaseMaskedArray
            An instance of ``self.construct_array_type()``.

        Raises
        ------
        TypeError
            If the arrow type maps to a pandas dtype whose kind is not one
            of ``"i"``, ``"u"`` or ``"f"``.
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        array_class = self.construct_array_type()

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type):
            # test_from_arrow_type_error raise for string, but allow
            #  through itemsize conversion GH#31896
            rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
            if rt_dtype.kind not in ["i", "u", "f"]:
                # Could allow "c" or potentially disallow float<->int conversion,
                #  but at the moment we specifically test that uint<->int works
                raise TypeError(
                    f"Expected array of {self} type, got {array.type} instead"
                )

            array = array.cast(pyarrow_type)

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        results = []
        for arr in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.numpy_dtype)
            # the helper's mask is inverted before being handed to the array
            # class; the data buffer is copied explicitly, hence copy=False
            num_arr = array_class(data.copy(), ~mask, copy=False)
            results.append(num_arr)

        if not results:
            # no chunks at all: build an empty array of the right dtype
            return array_class(
                np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
            )
        elif len(results) == 1:
            # avoid additional copy in _concat_same_type
            return results[0]
        else:
            return array_class._concat_same_type(results)

    @classmethod
    def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]:
        """
        Return the mapping from numpy dtype names to dtype instances.

        Must be implemented by subclasses.
        """
        raise AbstractMethodError(cls)

    @classmethod
    def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
        """
        Convert a string representation or a numpy dtype to NumericDtype.

        Raises
        ------
        ValueError
            If the numpy dtype name is not a key of ``_str_to_dtype_mapping``.
        """
        if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not isinstance(dtype, NumericDtype):
            mapping = cls._str_to_dtype_mapping()
            try:
                dtype = mapping[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err
        return dtype

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.

        Must be implemented by subclasses.
        """
        raise AbstractMethodError(cls)


def _as_ndarray(values, *, dtype=None, copy: bool = False) -> np.ndarray:
    """
    Convert ``values`` to an ndarray, copying only when asked or required.

    ``np.array(..., copy=False)`` raises ``ValueError`` on NumPy >= 2.0
    whenever a copy cannot be avoided (e.g. list input or a dtype change).
    ``np.asarray`` keeps the "copy only if needed" behaviour on every NumPy
    version, so it is used for the ``copy=False`` case.
    """
    if copy:
        return np.array(values, dtype=dtype, copy=True)
    return np.asarray(values, dtype=dtype)


def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
    """
    Coerce ``values`` (and optionally ``mask``) to a data/mask ndarray pair.

    Parameters
    ----------
    values : list-like
        1-dimensional input, or an instance of the array type returned by
        ``dtype_cls.construct_array_type()``.
    mask : np.ndarray of bool or None
        Existing mask; when None it is computed from ``values``.
    dtype : NumericDtype, str, np.dtype or None
        Requested dtype. When None it is taken from ``values.dtype`` if
        ``dtype_cls._checker`` accepts it, else ``default_dtype`` is used.
    copy : bool
        Whether to copy the input data.
    dtype_cls : type[NumericDtype]
        Dtype class providing ``_checker``, ``_standardize_dtype``,
        ``construct_array_type`` and ``_safe_cast``.
    default_dtype : np.dtype
        Fallback numpy dtype.

    Returns
    -------
    tuple
        ``(values, mask, dtype, inferred_type)``. When ``values`` already is
        an instance of the target array type, ``dtype`` is the standardized
        dtype (or None) and ``inferred_type`` is None. Otherwise ``dtype`` is
        ``default_dtype`` or ``dtype.type``, and ``inferred_type`` is the
        result of ``lib.infer_dtype`` when it was computed, else None.

    Raises
    ------
    TypeError
        If the values cannot be converted to this dtype family, or if
        ``values`` or ``mask`` is not 1-dimensional.
    """
    checker = dtype_cls._checker

    inferred_type = None

    if dtype is None and hasattr(values, "dtype"):
        if checker(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        dtype = dtype_cls._standardize_dtype(dtype)

    cls = dtype_cls.construct_array_type()
    if isinstance(values, cls):
        # already a masked array of the right class: reuse its buffers
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask, dtype, inferred_type

    # keep the untouched input: it is consulted again further down to detect
    # integers that lost precision when numpy converted them to float
    original = values
    values = _as_ndarray(values, copy=copy)
    if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "boolean" and dtype is None:
            name = dtype_cls.__name__.strip("_")
            raise TypeError(f"{values.dtype} cannot be converted to {name}")

    elif is_bool_dtype(values) and checker(dtype):
        # bool data with an explicit dtype of this family
        values = _as_ndarray(values, dtype=default_dtype, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        name = dtype_cls.__name__.strip("_")
        raise TypeError(f"{values.dtype} cannot be converted to {name}")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        if is_integer_dtype(values):
            # fastpath
            mask = np.zeros(len(values), dtype=np.bool_)
        else:
            mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = default_dtype
    else:
        dtype = dtype.type

    if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
        if mask.all():
            # everything is masked, so the payload is irrelevant
            values = np.ones(values.shape, dtype=dtype)
        else:
            idx = np.nanargmax(values)
            if int(values[idx]) != original[idx]:
                # We have ints that lost precision during the cast.
                inferred_type = lib.infer_dtype(original, skipna=True)
                if (
                    inferred_type not in ["floating", "mixed-integer-float"]
                    and not mask.any()
                ):
                    values = _as_ndarray(original, dtype=dtype)
                else:
                    values = _as_ndarray(original, dtype="object")

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = cls._internal_fill_value
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into a float
        values = values.astype(dtype, copy=copy)
    else:
        values = dtype_cls._safe_cast(values, dtype, copy=False)

    return values, mask, dtype, inferred_type


class NumericArray(BaseMaskedArray):
    """
    Base class for IntegerArray and FloatingArray.
    """

    # dtype class of the concrete array; set by subclasses
    _dtype_cls: type[NumericDtype]

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        """
        Validate ``values`` and delegate to the parent constructor.

        Raises
        ------
        TypeError
            If ``values`` is not an ndarray whose dtype is accepted by
            ``_dtype_cls._checker``, or if its dtype is ``np.float16``.
        """
        checker = self._dtype_cls._checker
        if not (isinstance(values, np.ndarray) and checker(values.dtype)):
            descr = (
                "floating"
                if self._dtype_cls.kind == "f"  # type: ignore[comparison-overlap]
                else "integer"
            )
            raise TypeError(
                f"values should be {descr} numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.dtype == np.float16:
            # If we don't raise here, then accessing self.dtype would raise
            raise TypeError("FloatingArray does not support np.float16 dtype.")

        super().__init__(values, mask, copy=copy)

    @cache_readonly
    def dtype(self) -> NumericDtype:
        """Dtype instance looked up by the name of the data's numpy dtype."""
        mapping = self._dtype_cls._str_to_dtype_mapping()
        return mapping[str(self._data.dtype)]

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Coerce ``value`` to a ``(values, mask)`` pair of ndarrays.

        Thin wrapper around ``_coerce_to_data_and_mask`` that starts without
        a mask and drops the dtype and inferred type it returns.
        """
        dtype_cls = cls._dtype_cls
        default_dtype = dtype_cls._default_np_dtype
        mask = None
        values, mask, _, _ = _coerce_to_data_and_mask(
            value, mask, dtype, copy, dtype_cls, default_dtype
        )
        return values, mask

    @classmethod
    def _from_sequence_of_strings(
        cls: type[T], strings, *, dtype: Dtype | None = None, copy: bool = False
    ) -> T:
        """
        Build an array from strings by parsing them with ``to_numeric``.

        ``errors="raise"`` is passed to ``to_numeric``, so parsing errors are
        not suppressed here.
        """
        # imported locally rather than at module level
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)