2021-06-06 22:13:05 +02:00

723 lines
22 KiB

import numbers
from typing import TYPE_CHECKING, List, Tuple, Type, Union
import warnings

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import ArrayLike
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.missing import isna

from pandas.core import ops

from .masked import BaseMaskedArray, BaseMaskedDtype

if TYPE_CHECKING:
    import pyarrow
@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
    """
    Extension dtype for boolean data.

    .. versionadded:: 1.0.0

    .. warning::

       BooleanDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Examples
    --------
    >>> pd.BooleanDtype()
    BooleanDtype
    """

    name = "boolean"

    # mypy: overriding the inherited attribute with a property is reported as
    # an incompatible override, hence the ``type: ignore`` below.
    @property
    def type(self) -> Type:  # type: ignore[override]
        """Scalar type of the array elements (``numpy.bool_``)."""
        return np.bool_

    @property
    def kind(self) -> str:
        """NumPy-style kind code; ``"b"`` for boolean."""
        return "b"

    @property
    def numpy_dtype(self) -> np.dtype:
        """NumPy dtype used for the underlying data array."""
        return np.dtype("bool")

    @classmethod
    def construct_array_type(cls) -> Type["BooleanArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return BooleanArray

    def __repr__(self) -> str:
        return "BooleanDtype"

    @property
    def _is_boolean(self) -> bool:
        return True

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "BooleanArray":
        """
        Construct BooleanArray from pyarrow Array/ChunkedArray.

        Each chunk is converted separately and the pieces are concatenated.
        """
        # Imported lazily so that pyarrow stays an optional dependency.
        import pyarrow

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        # TODO should optimize this without going through object array
        results = [BooleanArray._from_sequence(np.array(arr)) for arr in chunks]

        return BooleanArray._concat_same_type(results)
def coerce_to_array(
    values, mask=None, copy: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    TypeError
        If the values are not bool-like (numeric input must consist of
        0/1 and missing values only).
    ValueError
        If a mask is passed together with a BooleanArray, or if the
        resulting values or mask are not 1-dimensional.
    """
    if isinstance(values, BooleanArray):
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        values, mask = values._data, values._mask
        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    # Mask derived from missing entries found in ``values`` itself.
    mask_values = None
    if isinstance(values, np.ndarray) and values.dtype == np.bool_:
        if copy:
            values = values.copy()
    elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype):
        mask_values = isna(values)

        values_bool = np.zeros(len(values), dtype=bool)
        values_bool[~mask_values] = values[~mask_values].astype(bool)

        # Round-trip through bool: anything other than 0/1 changes value.
        if not np.all(
            values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
        ):
            raise TypeError("Need to pass bool-like values")

        values = values_bool
    else:
        values_object = np.asarray(values, dtype=object)

        inferred_dtype = lib.infer_dtype(values_object, skipna=True)
        integer_like = ("floating", "integer", "mixed-integer-float")
        if inferred_dtype not in ("boolean", "empty") + integer_like:
            raise TypeError("Need to pass bool-like values")

        mask_values = isna(values_object)
        values = np.zeros(len(values), dtype=bool)
        values[~mask_values] = values_object[~mask_values].astype(bool)

        # if the values were integer-like, validate it were actually 0/1's
        if (inferred_dtype in integer_like) and not (
            np.all(
                values[~mask_values].astype(float)
                == values_object[~mask_values].astype(float)
            )
        ):
            raise TypeError("Need to pass bool-like values")

    if mask is None and mask_values is None:
        mask = np.zeros(len(values), dtype=bool)
    elif mask is None:
        mask = mask_values
    else:
        if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
            if mask_values is not None:
                # ``|`` allocates a new array, so no explicit copy is needed.
                mask = mask | mask_values
            else:
                if copy:
                    mask = mask.copy()
        else:
            mask = np.array(mask, dtype=bool)
            if mask_values is not None:
                mask = mask | mask_values

    if values.ndim != 1:
        raise ValueError("values must be a 1D list-like")
    if mask.ndim != 1:
        raise ValueError("mask must be a 1D list-like")

    return values, mask
class BooleanArray(BaseMaskedArray):
    """
    Array of boolean (True/False) data with missing values.

    This is a pandas Extension array for boolean data, under the hood
    represented by 2 numpy arrays: a boolean array with the data and
    a boolean array with the mask (True indicating missing).

    BooleanArray implements Kleene logic (sometimes called three-value
    logic) for logical operations. See :ref:`boolean.kleene` for more.

    To construct a BooleanArray from generic array-like input, use
    :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
    below).

    .. versionadded:: 1.0.0

    .. warning::

       BooleanArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d boolean-dtype array with the data.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values (True
        indicates missing).
    copy : bool, default False
        Whether to copy the `values` and `mask` arrays.

    Returns
    -------
    BooleanArray

    Examples
    --------
    Create a BooleanArray with :func:`pandas.array`:

    >>> pd.array([True, False, None], dtype="boolean")
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = False

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
            raise TypeError(
                "values should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        self._dtype = BooleanDtype()
        super().__init__(values, mask, copy=copy)

    @property
    def dtype(self) -> BooleanDtype:
        """The BooleanDtype instance created for this array."""
        return self._dtype

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype=None, copy: bool = False
    ) -> "BooleanArray":
        """Build a BooleanArray from a sequence of bool-like scalars."""
        if dtype:
            assert dtype == "boolean"
        values, mask = coerce_to_array(scalars, copy=copy)
        return BooleanArray(values, mask)

    @classmethod
    def _from_sequence_of_strings(
        cls, strings: List[str], *, dtype=None, copy: bool = False
    ) -> "BooleanArray":
        """
        Build a BooleanArray from strings such as ``"True"`` or ``"0"``.

        Missing entries are passed through unchanged; any other
        unrecognised string raises ``ValueError``.
        """

        def map_string(s):
            if isna(s):
                return s
            elif s in ["True", "TRUE", "true", "1", "1.0"]:
                return True
            elif s in ["False", "FALSE", "false", "0", "0.0"]:
                return False
            else:
                raise ValueError(f"{s} cannot be cast to bool")

        scalars = [map_string(x) for x in strings]
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

    def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
        """
        Apply a NumPy ufunc to the underlying data and re-mask the result.

        Returns ``NotImplemented`` when an input or ``out`` argument is of
        an unhandled type, and raises ``NotImplementedError`` for
        ``method == "reduce"``.
        """
        # For BooleanArray inputs, we apply the ufunc to ._data
        # and mask the result.
        if method == "reduce":
            # Not clear how to handle missing values in reductions. Raise.
            raise NotImplementedError("The 'reduce' method is not supported.")
        out = kwargs.get("out", ())
        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (BooleanArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # Union of the masks of all BooleanArray inputs; raw data goes to numpy.
        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, BooleanArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.
            if is_bool_dtype(x.dtype):
                m = mask.copy()
                return BooleanArray(x, m)
            else:
                x[mask] = np.nan
            return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if isinstance(result, tuple):
            # Multi-output ufuncs: the tuple must actually be returned
            # (previously it was built and discarded, yielding None).
            return tuple(reconstruct(x) for x in result)
        return reconstruct(result)

    def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
        """Coerce `value` to a ``(values, mask)`` pair of numpy arrays."""
        return coerce_to_array(value)

    def astype(self, dtype, copy: bool = True) -> ArrayLike:
        """
        Cast to a NumPy array or ExtensionArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        ndarray or ExtensionArray
            NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.

        Raises
        ------
        TypeError
            if incompatible type with a BooleanDtype, equivalent of same_kind
            casting
        ValueError
            If missing values are present and the target is a NumPy bool
            or integer dtype.
        """
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)

        if isinstance(dtype, BooleanDtype):
            if not copy:
                return self
            values, mask = coerce_to_array(self, copy=True)
            return BooleanArray(values, mask, copy=False)
        elif isinstance(dtype, StringDtype):
            return dtype.construct_array_type()._from_sequence(self, copy=False)

        if is_bool_dtype(dtype):
            # astype_nansafe converts np.nan to True
            if self._hasna:
                raise ValueError("cannot convert float NaN to bool")
            else:
                return self._data.astype(dtype, copy=copy)
        if is_extension_array_dtype(dtype) and is_integer_dtype(dtype):
            from pandas.core.arrays import IntegerArray

            return IntegerArray(
                self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
            )
        # for integer, error if there are missing values
        if is_integer_dtype(dtype) and self._hasna:
            raise ValueError("cannot convert NA to integer")
        # for float dtype, ensure we use np.nan before casting (numpy cannot
        # deal with pd.NA)
        na_value = self._na_value
        if is_float_dtype(dtype):
            na_value = np.nan
        # coerce
        return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)

    def _values_for_argsort(self) -> np.ndarray:
        """
        Return values for sorting.

        Returns
        -------
        ndarray
            The transformed values should maintain the ordering between values
            within the array.

        See Also
        --------
        ExtensionArray.argsort : Return the indices that would sort this array.
        """
        data = self._data.copy()
        # NOTE: ``_data`` is a bool array, so NumPy stores -1 as True here.
        data[self._mask] = -1
        return data

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is True.

        Returns False unless there is at least one element that is True.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is True, otherwise NA will be returned
            if there are NA's present.
        **kwargs : any, default None
            Additional keywords have no effect but might be accepted for
            compatibility with NumPy.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        numpy.any : Numpy version of this method.
        BooleanArray.all : Return whether all elements are True.

        Examples
        --------
        The result indicates whether any element is True (and by default
        skips NAs):

        >>> pd.array([True, False, True]).any()
        True
        >>> pd.array([True, False, pd.NA]).any()
        True
        >>> pd.array([False, False, pd.NA]).any()
        False
        >>> pd.array([], dtype="boolean").any()
        False
        >>> pd.array([pd.NA], dtype="boolean").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA]).any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA]).any(skipna=False)
        <NA>
        """
        kwargs.pop("axis", None)
        nv.validate_any((), kwargs)

        # Treat masked slots as False so they cannot make the result True.
        values = self._data.copy()
        np.putmask(values, self._mask, False)
        result = values.any()
        if skipna:
            return result
        if result or len(self) == 0 or not self._mask.any():
            return result
        return self.dtype.na_value

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are True.

        Returns True unless there is at least one element that is False.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is False, otherwise NA will be returned
            if there are NA's present.
        **kwargs : any, default None
            Additional keywords have no effect but might be accepted for
            compatibility with NumPy.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        numpy.all : Numpy version of this method.
        BooleanArray.any : Return whether any element is True.

        Examples
        --------
        The result indicates whether all elements are True (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA]).all()
        True
        >>> pd.array([True, False, pd.NA]).all()
        False
        >>> pd.array([], dtype="boolean").all()
        True
        >>> pd.array([pd.NA], dtype="boolean").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA]).all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA]).all(skipna=False)
        False
        """
        kwargs.pop("axis", None)
        nv.validate_all((), kwargs)

        # Treat masked slots as True so they cannot make the result False.
        values = self._data.copy()
        np.putmask(values, self._mask, True)
        result = values.all()
        if skipna:
            return result
        if not result or len(self) == 0 or not self._mask.any():
            return result
        return self.dtype.na_value

    def _logical_method(self, other, op):
        """
        Apply a logical operator (or/and/xor or a reflected variant).

        `other` may be a BooleanArray, a 1-d list-like, a bool or
        ``pandas.NA``; the result is computed by the ``ops.kleene_*``
        helpers and returned as a new BooleanArray.
        """
        assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
        other_is_booleanarray = isinstance(other, BooleanArray)
        other_is_scalar = lib.is_scalar(other)
        mask = None

        if other_is_booleanarray:
            other, mask = other._data, other._mask
        elif is_list_like(other):
            other = np.asarray(other, dtype="bool")
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            other, mask = coerce_to_array(other, copy=False)
        elif isinstance(other, np.bool_):
            other = other.item()

        if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
            raise TypeError(
                "'other' should be pandas.NA or a bool. "
                f"Got {type(other).__name__} instead."
            )

        if not other_is_scalar and len(self) != len(other):
            raise ValueError("Lengths must match to compare")

        if op.__name__ in {"or_", "ror_"}:
            result, mask = ops.kleene_or(self._data, other, self._mask, mask)
        elif op.__name__ in {"and_", "rand_"}:
            result, mask = ops.kleene_and(self._data, other, self._mask, mask)
        elif op.__name__ in {"xor", "rxor"}:
            result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

        return BooleanArray(result, mask)

    def _cmp_method(self, other, op):
        """
        Apply a comparison operator elementwise, propagating missing values.

        Returns ``NotImplemented`` for IntegerArray / FloatingArray operands
        so that those types handle the comparison themselves.
        """
        from pandas.arrays import FloatingArray, IntegerArray

        if isinstance(other, (IntegerArray, FloatingArray)):
            return NotImplemented

        mask = None

        if isinstance(other, BooleanArray):
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        if other is libmissing.NA:
            # numpy does not handle pd.NA well as "other" scalar (it returns
            # a scalar False instead of an array)
            result = np.zeros_like(self._data)
            mask = np.ones_like(self._data)
        else:
            # numpy will show a DeprecationWarning on invalid elementwise
            # comparisons, this will raise in the future
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                with np.errstate(all="ignore"):
                    result = op(self._data, other)

            # nans propagate
            if mask is None:
                mask = self._mask.copy()
            else:
                mask = self._mask | mask

        return BooleanArray(result, mask, copy=False)

    def _arith_method(self, other, op):
        """
        Apply an arithmetic operator elementwise, propagating missing values.

        The result container is chosen by ``_maybe_mask_result``. For
        ``divmod`` a 2-tuple (quotient, remainder) is returned.
        """
        mask = None
        op_name = op.__name__

        if isinstance(other, BooleanArray):
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            if len(self) != len(other):
                raise ValueError("Lengths must match")

        # nans propagate
        if mask is None:
            # Work on a copy: the in-place ``|=`` below must not flip the
            # operand's own mask, and the result must not alias it.
            mask = self._mask.copy()
            if other is libmissing.NA:
                mask |= True
        else:
            mask = self._mask | mask

        if other is libmissing.NA:
            # if other is NA, the result will be all NA and we can't run the
            # actual op, so we need to choose the resulting dtype manually
            if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}:
                dtype = "int8"
            else:
                dtype = "bool"
            result = np.zeros(len(self._data), dtype=dtype)
        else:
            if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
                # Avoid DeprecationWarning: In future, it will be an error
                # for 'np.bool_' scalars to be interpreted as an index
                other = bool(other)

            with np.errstate(all="ignore"):
                result = op(self._data, other)

        # divmod returns a tuple
        if op_name == "divmod":
            div, mod = result
            return (
                self._maybe_mask_result(div, mask, other, "floordiv"),
                self._maybe_mask_result(mod, mask, other, "mod"),
            )

        return self._maybe_mask_result(result, mask, other, op_name)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """Dispatch ``any``/``all`` to the methods above; defer the rest."""
        if name in {"any", "all"}:
            return getattr(self, name)(skipna=skipna, **kwargs)

        return super()._reduce(name, skipna=skipna, **kwargs)

    def _maybe_mask_result(self, result, mask, other, op_name: str):
        """
        Wrap a raw operation result in the matching masked array type.

        Parameters
        ----------
        result : array-like
        mask : array-like bool
        other : scalar or array-like
        op_name : str
        """
        # if we have a float operand we are by-definition
        # a float result
        # or our op is a divide
        if (is_float_dtype(other) or is_float(other)) or (
            op_name in ["rtruediv", "truediv"]
        ):
            from pandas.core.arrays import FloatingArray

            return FloatingArray(result, mask, copy=False)

        elif is_bool_dtype(result):
            return BooleanArray(result, mask, copy=False)

        elif is_integer_dtype(result):
            from pandas.core.arrays import IntegerArray

            return IntegerArray(result, mask, copy=False)
        else:
            result[mask] = np.nan
            return result