# (scraped-page metadata removed: "387 lines | 12 KiB | Python" — not part of the source)
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
    is_integer,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops
from pandas.core.algorithms import factorize_array, take
from pandas.core.array_algos import masked_reductions
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
    from pandas import Series


BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
class BaseMaskedDtype(ExtensionDtype):
    """
    Base class for dtypes backing BaseMaskedArray subclasses.
    """

    # Subclasses must supply the dtype's display name and the scalar numpy
    # type it wraps (e.g. ``np.int64`` for Int64Dtype).
    name: str
    base = None
    type: Type

    # Missing values are uniformly represented by the pd.NA singleton.
    na_value = libmissing.NA

    @cache_readonly
    def numpy_dtype(self) -> np.dtype:
        """Return the numpy dtype object corresponding to ``self.type``."""
        return np.dtype(self.type)

    @cache_readonly
    def kind(self) -> str:
        """Return the one-character numpy kind code of the underlying dtype."""
        return self.numpy_dtype.kind

    @cache_readonly
    def itemsize(self) -> int:
        """Return the width in bytes of one element of the underlying dtype."""
        return self.numpy_dtype.itemsize

    @classmethod
    def construct_array_type(cls) -> Type[BaseMaskedArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise NotImplementedError
class BaseMaskedArray(OpsMixin, ExtensionArray):
    """
    Base class for masked arrays (which use _data and _mask to store the data).

    numpy based

    Notes
    -----
    ``_data`` holds the raw values (with arbitrary fill values at masked
    positions) and ``_mask`` is a boolean ndarray that is True where the
    value is missing.
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value: Scalar

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        # values is supposed to already be validated in the subclass;
        # here we only validate the mask and the dimensionality.
        if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_):
            raise TypeError(
                "mask should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.ndim != 1:
            raise ValueError("values must be a 1D array")
        if mask.ndim != 1:
            raise ValueError("mask must be a 1D array")

        if copy:
            values = values.copy()
            mask = mask.copy()

        self._data = values
        self._mask = mask

    @property
    def dtype(self) -> BaseMaskedDtype:
        """The dtype of this array; concrete subclasses must implement this."""
        raise AbstractMethodError(self)

    def __getitem__(
        self, item: Union[int, slice, np.ndarray]
    ) -> Union[BaseMaskedArray, Any]:
        # Scalar lookup: return pd.NA for masked positions, otherwise the
        # raw stored value.
        if is_integer(item):
            if self._mask[item]:
                return self.dtype.na_value
            return self._data[item]

        # Slice/array lookup: validate the indexer, then index data and mask
        # in parallel and rewrap as the same array type.
        item = check_array_indexer(self, item)

        return type(self)(self._data[item], self._mask[item])

    def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
        """Coerce ``values`` to a (data, mask) ndarray pair; subclass hook."""
        raise AbstractMethodError(self)

    def __setitem__(self, key, value) -> None:
        # Coerce scalars through the array path (wrap in a list), then
        # unwrap so the numpy assignment below receives a scalar again.
        _is_scalar = is_scalar(value)
        if _is_scalar:
            value = [value]
        value, mask = self._coerce_to_array(value)

        if _is_scalar:
            value = value[0]
            mask = mask[0]

        key = check_array_indexer(self, key)
        # Assign data and mask in lockstep so missingness stays consistent.
        self._data[key] = value
        self._mask[key] = mask

    def __iter__(self):
        # Yield pd.NA for masked positions, raw values otherwise.
        for i in range(len(self)):
            if self._mask[i]:
                yield self.dtype.na_value
            else:
                yield self._data[i]

    def __len__(self) -> int:
        return len(self._data)

    def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        # Invert the data; missingness is unchanged (mask is copied so the
        # result does not share mask storage with self).
        return type(self)(~self._data, self._mask.copy())

    def to_numpy(
        self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default
    ) -> np.ndarray:
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is a not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
            a copy is made, even if not strictly necessary. This is typically
            only possible when no missing values are present and `dtype`
            is the equivalent numpy dtype.
        na_value : scalar, optional
            Scalar missing value indicator to use in numpy array. Defaults
            to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, <NA>], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, <NA>]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            # pd.NA only fits in object/string arrays; anything else needs
            # an explicit na_value from the caller.
            if (
                not is_object_dtype(dtype)
                and not is_string_dtype(dtype)
                and na_value is libmissing.NA
            ):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype."
                )
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data

    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us

    def __array__(self, dtype=None) -> np.ndarray:
        """
        the array interface, return my values
        We return an object array here to preserve our scalar values
        """
        return self.to_numpy(dtype=dtype)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        return pa.array(self._data, mask=self._mask, type=type)

    @property
    def _hasna(self) -> bool:
        # Note: this is expensive right now! The hope is that we can
        # make this faster by having an optional mask, but not have to change
        # source code using it..
        return self._mask.any()

    def isna(self) -> np.ndarray:
        # NOTE(review): returns the underlying mask itself, not a copy —
        # mutating the result mutates this array's missingness.
        return self._mask

    @property
    def _na_value(self):
        """The scalar used to represent missing values (pd.NA)."""
        return self.dtype.na_value

    @property
    def nbytes(self) -> int:
        """Total bytes consumed by the data and mask buffers."""
        return self._data.nbytes + self._mask.nbytes

    @classmethod
    def _concat_same_type(
        cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT]
    ) -> BaseMaskedArrayT:
        # Concatenate data and masks independently; all inputs share cls.
        data = np.concatenate([x._data for x in to_concat])
        mask = np.concatenate([x._mask for x in to_concat])
        return cls(data, mask)

    def take(
        self: BaseMaskedArrayT,
        indexer,
        *,
        allow_fill: bool = False,
        fill_value: Optional[Scalar] = None,
    ) -> BaseMaskedArrayT:
        # we always fill with 1 internally
        # to avoid upcasting
        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
        result = take(
            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
        )

        # -1 indexer entries become masked (fill_value=True) by default
        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

        # if we are filling
        # we only fill where the indexer is null
        # not existing missing values
        # TODO(jreback) what if we have a non-na float as a fill value?
        if allow_fill and notna(fill_value):
            fill_mask = np.asarray(indexer) == -1
            result[fill_mask] = fill_value
            # unmask the positions we just filled with a concrete value
            mask = mask ^ fill_mask

        return type(self)(result, mask, copy=False)

    def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        """Return a deep copy (both data and mask buffers are copied)."""
        data, mask = self._data, self._mask
        data = data.copy()
        mask = mask.copy()
        return type(self)(data, mask, copy=False)

    @doc(ExtensionArray.factorize)
    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
        arr = self._data
        mask = self._mask

        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)

        # the hashtables don't handle all different types of bits
        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
        # uniques never contains NA, so its mask is all-False
        uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
        return codes, uniques

    def value_counts(self, dropna: bool = True) -> "Series":
        """
        Returns a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import Index, Series
        from pandas.arrays import IntegerArray

        # compute counts on the data with no nans
        data = self._data[~self._mask]
        value_counts = Index(data).value_counts()

        # TODO(extension)
        # if we have allow Index to hold an ExtensionArray
        # this is easier
        index = value_counts.index._values.astype(object)

        # if we want nans, count the mask
        if dropna:
            counts = value_counts._values
        else:
            # append one extra slot holding the count of missing values
            counts = np.empty(len(value_counts) + 1, dtype="int64")
            counts[:-1] = value_counts
            counts[-1] = self._mask.sum()

            index = Index(
                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
                dtype=object,
            )

        # counts themselves are never missing -> all-False mask
        mask = np.zeros(len(counts), dtype="bool")
        counts = IntegerArray(counts, mask)

        return Series(counts, index=index)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Dispatch a named reduction ("sum", "min", "mean", ...) over the array.

        sum/prod/min/max go through mask-aware masked_reductions; everything
        else falls back to the nan-based nanops implementations.
        """
        data = self._data
        mask = self._mask

        if name in {"sum", "prod", "min", "max"}:
            op = getattr(masked_reductions, name)
            return op(data, mask, skipna=skipna, **kwargs)

        # coerce to a nan-aware float if needed
        # (we explicitly use NaN within reductions)
        if self._hasna:
            data = self.to_numpy("float64", na_value=np.nan)

        op = getattr(nanops, "nan" + name)
        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

        # NOTE(review): assumes ``result`` is a float-compatible scalar;
        # np.isnan would raise on e.g. object results — confirm nanops
        # contracts for all dispatched names.
        if np.isnan(result):
            return libmissing.NA

        return result