Inzynierka/Lib/site-packages/pandas/core/internals/blocks.py

from __future__ import annotations
from functools import wraps
import re
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterable,
Sequence,
cast,
final,
)
import numpy as np
from pandas._config import using_copy_on_write
from pandas._libs import (
internals as libinternals,
lib,
writers,
)
from pandas._libs.internals import (
BlockPlacement,
BlockValuesRefs,
)
from pandas._libs.missing import NA
from pandas._libs.tslibs import IncompatibleFrequency
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
F,
FillnaOptions,
IgnoreRaise,
QuantileInterpolation,
Shape,
npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.astype import (
astype_array_safe,
astype_is_view,
)
from pandas.core.dtypes.cast import (
LossySetitemError,
can_hold_element,
find_result_type,
maybe_downcast_to_dtype,
np_can_hold_element,
)
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_1d_only_ea_obj,
is_dtype_equal,
is_interval_dtype,
is_list_like,
is_sparse,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
PandasDtype,
PeriodDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCIndex,
ABCPandasArray,
ABCSeries,
)
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
na_value_for_dtype,
)
from pandas.core import missing
import pandas.core.algorithms as algos
from pandas.core.array_algos.putmask import (
extract_bool_array,
putmask_inplace,
putmask_without_repeat,
setitem_datetimelike_compat,
validate_putmask,
)
from pandas.core.array_algos.quantile import quantile_compat
from pandas.core.array_algos.replace import (
compare_or_regex_search,
replace_regex,
should_use_regex,
)
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
Categorical,
DatetimeArray,
ExtensionArray,
IntervalArray,
PandasArray,
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.computation import expressions
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
)
from pandas.core.indexers import check_setitem_lengths
if TYPE_CHECKING:
from pandas.core.api import Index
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
# comparison is faster than is_object_dtype
_dtype_obj = np.dtype("object")
def maybe_split(meth: F) -> F:
"""
If we have a multi-column block, split and operate block-wise. Otherwise
use the original method.
"""
@wraps(meth)
def newfunc(self, *args, **kwargs) -> list[Block]:
if self.ndim == 1 or self.shape[0] == 1:
return meth(self, *args, **kwargs)
else:
# Split and operate column-by-column
return self.split_and_operate(meth, *args, **kwargs)
return cast(F, newfunc)
class Block(PandasObject):
"""
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
data structure.

Index-ignorant; let the container take care of that.
"""
values: np.ndarray | ExtensionArray
ndim: int
refs: BlockValuesRefs
__init__: Callable
__slots__ = ()
is_numeric = False
is_object = False
is_extension = False
_can_consolidate = True
_validate_ndim = True
@final
@cache_readonly
def _consolidate_key(self):
return self._can_consolidate, self.dtype.name
@final
@cache_readonly
def _can_hold_na(self) -> bool:
"""
Can we store NA values in this Block?
"""
dtype = self.dtype
if isinstance(dtype, np.dtype):
return dtype.kind not in ["b", "i", "u"]
return dtype._can_hold_na
@final
@property
def is_bool(self) -> bool:
"""
We can be bool if a) we are bool dtype or b) object dtype with bool objects.
"""
return self.values.dtype == np.dtype(bool)
@final
def external_values(self):
return external_values(self.values)
@final
@cache_readonly
def fill_value(self):
# Used in reindex_indexer
return na_value_for_dtype(self.dtype, compat=False)
@final
def _standardize_fill_value(self, value):
# if we are passed a scalar None, convert it here
if self.dtype != _dtype_obj and is_valid_na_for_dtype(value, self.dtype):
value = self.fill_value
return value
@property
def mgr_locs(self) -> BlockPlacement:
return self._mgr_locs
@mgr_locs.setter
def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None:
self._mgr_locs = new_mgr_locs
@final
def make_block(
self, values, placement=None, refs: BlockValuesRefs | None = None
) -> Block:
"""
Create a new block, with type inference, propagating any values that
are not specified.
"""
if placement is None:
placement = self._mgr_locs
if self.is_extension:
values = ensure_block_shape(values, ndim=self.ndim)
# TODO: perf by not going through new_block
# We assume maybe_coerce_values has already been called
return new_block(values, placement=placement, ndim=self.ndim, refs=refs)
@final
def make_block_same_class(
self,
values,
placement: BlockPlacement | None = None,
refs: BlockValuesRefs | None = None,
) -> Block:
"""Wrap given values in a block of same type as self."""
# Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet
# relied on it, as of 2.0 the caller is responsible for this.
if placement is None:
placement = self._mgr_locs
# We assume maybe_coerce_values has already been called
return type(self)(values, placement=placement, ndim=self.ndim, refs=refs)
@final
def __repr__(self) -> str:
# don't want to print out all of the items here
name = type(self).__name__
if self.ndim == 1:
result = f"{name}: {len(self)} dtype: {self.dtype}"
else:
shape = " x ".join([str(s) for s in self.shape])
result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}"
return result
@final
def __len__(self) -> int:
return len(self.values)
@final
def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block:
"""
Perform __getitem__-like, return result as block.
Only supports slices that preserve dimensionality.
"""
# Note: the only place where we are called with ndarray[intp]
# is from internals.concat, and we can verify that never happens
# with 1-column blocks, i.e. never for ExtensionBlock.
new_mgr_locs = self._mgr_locs[slicer]
new_values = self._slice(slicer)
refs = self.refs if isinstance(slicer, slice) else None
return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs)
@final
def getitem_block_columns(
self, slicer: slice, new_mgr_locs: BlockPlacement
) -> Block:
"""
Perform __getitem__-like, return result as block.
Only supports slices that preserve dimensionality.
"""
new_values = self._slice(slicer)
if new_values.ndim != self.values.ndim:
raise ValueError("Only same dim slicing is allowed")
return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs)
@final
def _can_hold_element(self, element: Any) -> bool:
"""require the same dtype as ourselves"""
element = extract_array(element, extract_numpy=True)
return can_hold_element(self.values, element)
@final
def should_store(self, value: ArrayLike) -> bool:
"""
Should we set self.values[indexer] = value inplace or do we need to cast?
Parameters
----------
value : np.ndarray or ExtensionArray
Returns
-------
bool
"""
# faster equivalent to is_dtype_equal(value.dtype, self.dtype)
try:
return value.dtype == self.dtype
except TypeError:
return False
# ---------------------------------------------------------------------
# Apply/Reduce and Helpers
@final
def apply(self, func, **kwargs) -> list[Block]:
"""
apply the function to my values; return a block if we are not
one
"""
result = func(self.values, **kwargs)
return self._split_op_result(result)
@final
def reduce(self, func) -> list[Block]:
# We will apply the function and reshape the result into a single-row
# Block with the same mgr_locs; squeezing will be done at a higher level
assert self.ndim == 2
result = func(self.values)
if self.values.ndim == 1:
# TODO(EA2D): special case not needed with 2D EAs
res_values = np.array([[result]])
else:
res_values = result.reshape(-1, 1)
nb = self.make_block(res_values)
return [nb]
@final
def _split_op_result(self, result: ArrayLike) -> list[Block]:
# See also: split_and_operate
if result.ndim > 1 and isinstance(result.dtype, ExtensionDtype):
# TODO(EA2D): unnecessary with 2D EAs
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
nbs = []
for i, loc in enumerate(self._mgr_locs):
if not is_1d_only_ea_obj(result):
vals = result[i : i + 1]
else:
vals = result[i]
block = self.make_block(values=vals, placement=loc)
nbs.append(block)
return nbs
nb = self.make_block(result)
return [nb]
@final
def _split(self) -> list[Block]:
"""
Split a block into a list of single-column blocks.
"""
assert self.ndim == 2
new_blocks = []
for i, ref_loc in enumerate(self._mgr_locs):
vals = self.values[slice(i, i + 1)]
bp = BlockPlacement(ref_loc)
nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs)
new_blocks.append(nb)
return new_blocks
@final
def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
"""
Split the block and apply func column-by-column.
Parameters
----------
func : Block method
*args
**kwargs
Returns
-------
List[Block]
"""
assert self.ndim == 2 and self.shape[0] != 1
res_blocks = []
for nb in self._split():
rbs = func(nb, *args, **kwargs)
res_blocks.extend(rbs)
return res_blocks
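# Illustrative sketch (not part of pandas itself): how a consolidated
# multi-column block splits into single-column views. Assumes a pandas
# 2.0-era build; ``_mgr`` and ``_split`` are internal and may change.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
#   >>> (blk,) = df._mgr.blocks            # one consolidated 2x2 float64 block
#   >>> [nb.shape for nb in blk._split()]  # two single-column blocks, data shared
#   [(1, 2), (1, 2)]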
# ---------------------------------------------------------------------
# Up/Down-casting
@final
def coerce_to_target_dtype(self, other) -> Block:
"""
Coerce the current block to a dtype compatible with ``other``; we will
return a block (possibly of object dtype) and not raise.

We can also safely try to coerce to the same dtype and will receive
the same block.
"""
new_dtype = find_result_type(self.values, other)
return self.astype(new_dtype, copy=False)
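# A hedged example of the promotion this enables, shown through the public
# API (outputs assume pandas 2.0-era behavior): find_result_type picks the
# common dtype, falling back to object when there is none.
#
#   >>> import pandas as pd
#   >>> ser = pd.Series([1, 2, 3])           # int64 block
#   >>> ser.where(ser > 1, 0.5).dtype        # promoted to the common dtype
#   dtype('float64')
#   >>> ser.where(ser > 1, "x").dtype        # no common dtype -> object
#   dtype('O')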
@final
def _maybe_downcast(
self, blocks: list[Block], downcast=None, using_cow: bool = False
) -> list[Block]:
if downcast is False:
return blocks
if self.dtype == _dtype_obj:
# TODO: does it matter that self.dtype might not match blocks[i].dtype?
# GH#44241 We downcast regardless of the argument;
# respecting 'downcast=None' may be worthwhile at some point,
# but ATM it breaks too much existing code.
# split and convert the blocks
return extend_blocks(
[blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
)
if downcast is None:
return blocks
return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks])
@final
@maybe_split
def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]:
"""
downcast specialized to 2D case post-validation.
Refactored to allow use of maybe_split.
"""
new_values = maybe_downcast_to_dtype(self.values, dtype=dtype)
refs = self.refs if using_cow and new_values is self.values else None
return [self.make_block(new_values, refs=refs)]
def convert(
self,
*,
copy: bool = True,
using_cow: bool = False,
) -> list[Block]:
"""
Attempt to coerce any object types to better types; return a copy of
the block (if copy = True). By definition we are not an ObjectBlock
here!
"""
if not copy and using_cow:
return [self.copy(deep=False)]
return [self.copy()] if copy else [self]
# ---------------------------------------------------------------------
# Array-Like Methods
@cache_readonly
def dtype(self) -> DtypeObj:
return self.values.dtype
@final
def astype(
self,
dtype: DtypeObj,
copy: bool = False,
errors: IgnoreRaise = "raise",
using_cow: bool = False,
) -> Block:
"""
Coerce to the new dtype.
Parameters
----------
dtype : np.dtype or ExtensionDtype
copy : bool, default False
copy if indicated
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
using_cow: bool, default False
Whether copy-on-write logic is in use.
Returns
-------
Block
"""
values = self.values
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
new_values = maybe_coerce_values(new_values)
refs = None
if using_cow and astype_is_view(values.dtype, new_values.dtype):
refs = self.refs
newb = self.make_block(new_values, refs=refs)
if newb.shape != self.shape:
raise TypeError(
f"cannot set astype for copy = [{copy}] for dtype "
f"({self.dtype.name} [{self.shape}]) to different shape "
f"({newb.dtype.name} [{newb.shape}])"
)
return newb
@final
def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block:
"""convert to our native types format"""
result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs)
return self.make_block(result)
@final
def copy(self, deep: bool = True) -> Block:
"""copy constructor"""
values = self.values
refs: BlockValuesRefs | None
if deep:
values = values.copy()
refs = None
else:
refs = self.refs
return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs)
# ---------------------------------------------------------------------
# Replace
@final
def replace(
self,
to_replace,
value,
inplace: bool = False,
# mask may be pre-computed if we're called from replace_list
mask: npt.NDArray[np.bool_] | None = None,
using_cow: bool = False,
) -> list[Block]:
"""
Replace the to_replace value with value; it is possible to create new
blocks here. This is just a call to putmask.
"""
# Note: the checks we do in NDFrame.replace ensure we never get
# here with listlike to_replace or value, as those cases
# go through replace_list
values = self.values
if isinstance(values, Categorical):
# TODO: avoid special-casing
# GH49404
if using_cow and (self.refs.has_reference() or not inplace):
blk = self.copy()
elif using_cow:
blk = self.copy(deep=False)
else:
blk = self if inplace else self.copy()
values = cast(Categorical, blk.values)
values._replace(to_replace=to_replace, value=value, inplace=True)
return [blk]
if not self._can_hold_element(to_replace):
# We cannot hold `to_replace`, so we know immediately that
# replacing it is a no-op.
# Note: If to_replace were a list, NDFrame.replace would call
# replace_list instead of replace.
if using_cow:
return [self.copy(deep=False)]
else:
return [self] if inplace else [self.copy()]
if mask is None:
mask = missing.mask_missing(values, to_replace)
if not mask.any():
# Note: we get here with test_replace_extension_other incorrectly
# bc _can_hold_element is incorrect.
if using_cow:
return [self.copy(deep=False)]
else:
return [self] if inplace else [self.copy()]
elif self._can_hold_element(value):
# TODO(CoW): Maybe split here as well into columns where mask has True
# and rest?
if using_cow:
if inplace:
blk = self.copy(deep=self.refs.has_reference())
else:
blk = self.copy()
else:
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
if not (self.is_object and value is None):
# if the user *explicitly* gave None, we keep None, otherwise
# may downcast to NaN
blocks = blk.convert(copy=False, using_cow=using_cow)
else:
blocks = [blk]
return blocks
elif self.ndim == 1 or self.shape[0] == 1:
if value is None or value is NA:
blk = self.astype(np.dtype(object))
else:
blk = self.coerce_to_target_dtype(value)
return blk.replace(
to_replace=to_replace,
value=value,
inplace=True,
mask=mask,
)
else:
# split so that we only upcast where necessary
blocks = []
for i, nb in enumerate(self._split()):
blocks.extend(
type(self).replace(
nb,
to_replace=to_replace,
value=value,
inplace=True,
mask=mask[i : i + 1],
using_cow=using_cow,
)
)
return blocks
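# A minimal sketch of the paths above, via the public API (outputs are
# assumptions based on pandas 2.0-era behavior):
#
#   >>> import pandas as pd
#   >>> ser = pd.Series([1, 2, 3])
#   >>> ser.replace(9, 0.5).dtype      # 9 not present: all-False mask, no-op
#   dtype('int64')
#   >>> ser.replace(2, 2.5).dtype      # int64 can't hold 2.5: upcast, retry
#   dtype('float64')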
@final
def _replace_regex(
self,
to_replace,
value,
inplace: bool = False,
mask=None,
using_cow: bool = False,
) -> list[Block]:
"""
Replace elements by the given value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
mask : array-like of bool, optional
True indicates the corresponding element is ignored.
using_cow: bool, default False
Whether copy-on-write is enabled.
Returns
-------
List[Block]
"""
if not self._can_hold_element(to_replace):
# i.e. only ObjectBlock, but could in principle include a
# String ExtensionBlock
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
rx = re.compile(to_replace)
if using_cow:
if inplace and not self.refs.has_reference():
refs = self.refs
new_values = self.values
else:
refs = None
new_values = self.values.copy()
else:
refs = None
new_values = self.values if inplace else self.values.copy()
replace_regex(new_values, rx, value, mask)
block = self.make_block(new_values, refs=refs)
return block.convert(copy=False, using_cow=using_cow)
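# Hedged example of the regex path (only object-dtype blocks reach it):
#
#   >>> import pandas as pd
#   >>> pd.Series(["foo", "bar"]).replace(r"^f.*", "X", regex=True).tolist()
#   ['X', 'bar']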
@final
def replace_list(
self,
src_list: Iterable[Any],
dest_list: Sequence[Any],
inplace: bool = False,
regex: bool = False,
using_cow: bool = False,
) -> list[Block]:
"""
See BlockManager.replace_list docstring.
"""
values = self.values
if isinstance(values, Categorical):
# TODO: avoid special-casing
# GH49404
if using_cow and inplace:
blk = self.copy(deep=self.refs.has_reference())
else:
blk = self if inplace else self.copy()
values = cast(Categorical, blk.values)
values._replace(to_replace=src_list, value=dest_list, inplace=True)
return [blk]
# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
]
if not len(pairs):
if using_cow:
return [self.copy(deep=False)]
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]
src_len = len(pairs) - 1
if is_string_dtype(values.dtype):
# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
na_mask = ~isna(values)
masks: Iterable[npt.NDArray[np.bool_]] = (
extract_bool_array(
cast(
ArrayLike,
compare_or_regex_search(
values, s[0], regex=regex, mask=na_mask
),
)
)
for s in pairs
)
else:
# GH#38086 faster if we know we don't need to check for regex
masks = (missing.mask_missing(values, s[0]) for s in pairs)
# Materialize if inplace = True, since the masks can change
# as we replace
if inplace:
masks = list(masks)
if using_cow and inplace:
# Don't set up refs here, otherwise we will think that we have
# references when we check again later
rb = [self]
else:
rb = [self if inplace else self.copy()]
for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
convert = i == src_len # only convert once at the end
new_rb: list[Block] = []
# GH-39338: _replace_coerce can split a block into
# single-column blocks, so track the index so we know
# where to index into the mask
for blk_num, blk in enumerate(rb):
if len(rb) == 1:
m = mask
else:
mib = mask
assert not isinstance(mib, bool)
m = mib[blk_num : blk_num + 1]
# error: Argument "mask" to "_replace_coerce" of "Block" has
# incompatible type "Union[ExtensionArray, ndarray[Any, Any], bool]";
# expected "ndarray[Any, dtype[bool_]]"
result = blk._replace_coerce(
to_replace=src,
value=dest,
mask=m,
inplace=inplace,
regex=regex,
using_cow=using_cow,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
result = extend_blocks(
[
b.convert(copy=True and not using_cow, using_cow=using_cow)
for b in result
]
)
new_rb.extend(result)
rb = new_rb
return rb
@final
def _replace_coerce(
self,
to_replace,
value,
mask: npt.NDArray[np.bool_],
inplace: bool = True,
regex: bool = False,
using_cow: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
mask : np.ndarray[bool]
True indicates the corresponding element is ignored.
inplace : bool, default True
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
Returns
-------
List[Block]
"""
if should_use_regex(regex, to_replace):
return self._replace_regex(
to_replace,
value,
inplace=inplace,
mask=mask,
)
else:
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
has_ref = self.refs.has_reference()
nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow)
if (nb is self or using_cow) and not inplace:
nb = nb.copy()
elif inplace and has_ref and nb.refs.has_reference():
# no copy in astype and we had refs before
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
return [nb]
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
using_cow=using_cow,
)
# ---------------------------------------------------------------------
# 2D Methods - Shared by NumpyBlock and NDArrayBackedExtensionBlock
# but not ExtensionBlock
def _maybe_squeeze_arg(self, arg: np.ndarray) -> np.ndarray:
"""
For compatibility with 1D-only ExtensionArrays.
"""
return arg
def _unwrap_setitem_indexer(self, indexer):
"""
For compatibility with 1D-only ExtensionArrays.
"""
return indexer
# NB: this cannot be made cache_readonly because in mgr.set_values we pin
# new .values that can have different shape GH#42631
@property
def shape(self) -> Shape:
return self.values.shape
def iget(self, i: int | tuple[int, int] | tuple[slice, int]) -> np.ndarray:
# In the case where we have a tuple[slice, int], the slice will always
# be slice(None)
# Note: only reached with self.ndim == 2
# Invalid index type "Union[int, Tuple[int, int], Tuple[slice, int]]"
# for "Union[ndarray[Any, Any], ExtensionArray]"; expected type
# "Union[int, integer[Any]]"
return self.values[i] # type: ignore[index]
def _slice(
self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp]
) -> ArrayLike:
"""return a slice of my values"""
return self.values[slicer]
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
"""
Modify block values in-place with new item value.
If copy=True, first copy the underlying values in place before modifying
(for Copy-on-Write).
Notes
-----
`set_inplace` never creates a new array or new Block, whereas `setitem`
_may_ create a new array and always creates a new Block.
Caller is responsible for checking values.dtype == self.dtype.
"""
if copy:
self.values = self.values.copy()
self.values[locs] = values
def take_nd(
self,
indexer: npt.NDArray[np.intp],
axis: AxisInt,
new_mgr_locs: BlockPlacement | None = None,
fill_value=lib.no_default,
) -> Block:
"""
Take values according to indexer and return them as a block.
"""
values = self.values
if fill_value is lib.no_default:
fill_value = self.fill_value
allow_fill = False
else:
allow_fill = True
# Note: algos.take_nd has upcast logic similar to coerce_to_target_dtype
new_values = algos.take_nd(
values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value
)
# Called from three places in managers, all of which satisfy
# these assertions
if isinstance(self, ExtensionBlock):
# NB: in this case, the 'axis' kwarg will be ignored in the
# algos.take_nd call above.
assert not (self.ndim == 1 and new_mgr_locs is None)
assert not (axis == 0 and new_mgr_locs is None)
if new_mgr_locs is None:
new_mgr_locs = self._mgr_locs
if not is_dtype_equal(new_values.dtype, self.dtype):
return self.make_block(new_values, new_mgr_locs)
else:
return self.make_block_same_class(new_values, new_mgr_locs)
def _unstack(
self,
unstacker,
fill_value,
new_placement: npt.NDArray[np.intp],
needs_masking: npt.NDArray[np.bool_],
):
"""
Return a list of unstacked blocks of self
Parameters
----------
unstacker : reshape._Unstacker
fill_value : int
Only used in ExtensionBlock._unstack
new_placement : np.ndarray[np.intp]
needs_masking : np.ndarray[bool]
Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array-like of bool
The mask of columns of `blocks` we should keep.
"""
new_values, mask = unstacker.get_new_values(
self.values.T, fill_value=fill_value
)
mask = mask.any(0)
# TODO: in all tests we have mask.all(); can we rely on that?
# Note: these next two lines ensure that
# mask.sum() == sum(len(nb.mgr_locs) for nb in blocks)
# which the calling function needs in order to pass verify_integrity=False
# to the BlockManager constructor
new_values = new_values.T[mask]
new_placement = new_placement[mask]
bp = BlockPlacement(new_placement)
blocks = [new_block_2d(new_values, placement=bp)]
return blocks, mask
# ---------------------------------------------------------------------
def setitem(self, indexer, value, using_cow: bool = False) -> Block:
"""
Attempt self.values[indexer] = value, possibly creating a new array.
Parameters
----------
indexer : tuple, list-like, array-like, slice, int
The subset of self.values to set
value : object
The value being set
using_cow: bool, default False
Signaling if CoW is used.
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
value = self._standardize_fill_value(value)
values = cast(np.ndarray, self.values)
if self.ndim == 2:
values = values.T
# length checking
check_setitem_lengths(indexer, value, values)
value = extract_array(value, extract_numpy=True)
try:
casted = np_can_hold_element(values.dtype, value)
except LossySetitemError:
# current dtype cannot store value, coerce to common dtype
nb = self.coerce_to_target_dtype(value)
return nb.setitem(indexer, value)
else:
if self.dtype == _dtype_obj:
# TODO: avoid having to construct values[indexer]
vi = values[indexer]
if lib.is_list_like(vi):
# checking lib.is_scalar here fails on
# test_iloc_setitem_custom_object
casted = setitem_datetimelike_compat(values, len(vi), casted)
if using_cow and self.refs.has_reference():
values = values.copy()
self = self.make_block_same_class(
values.T if values.ndim == 2 else values
)
if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1:
# NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
casted = casted[0, ...]
values[indexer] = casted
return self
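# Sketch of the LossySetitemError fallback above, through the public
# positional setter (assumes pandas 2.0-era behavior, where the upcast
# is still silent):
#
#   >>> import pandas as pd
#   >>> ser = pd.Series([1, 2, 3])     # int64 block
#   >>> ser.iloc[0] = 1.5              # np_can_hold_element fails
#   >>> ser.dtype                      # block coerced, setitem retried
#   dtype('float64')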
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
"""
Putmask the data to the block; it is possible that we may create a
new dtype of block.

Return the resulting block(s).
Parameters
----------
mask : np.ndarray[bool], SparseArray[bool], or BooleanArray
new : a ndarray/object
using_cow: bool, default False
Returns
-------
List[Block]
"""
orig_mask = mask
values = cast(np.ndarray, self.values)
mask, noop = validate_putmask(values.T, mask)
assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame))
if new is lib.no_default:
new = self.fill_value
new = self._standardize_fill_value(new)
new = extract_array(new, extract_numpy=True)
if noop:
if using_cow:
return [self.copy(deep=False)]
return [self]
try:
casted = np_can_hold_element(values.dtype, new)
if using_cow and self.refs.has_reference():
# Do this here to avoid copying twice
values = values.copy()
self = self.make_block_same_class(values)
putmask_without_repeat(values.T, mask, casted)
if using_cow:
return [self.copy(deep=False)]
return [self]
except LossySetitemError:
if self.ndim == 1 or self.shape[0] == 1:
# no need to split columns
if not is_list_like(new):
# using just new[indexer] can't save us the need to cast
return self.coerce_to_target_dtype(new).putmask(mask, new)
else:
indexer = mask.nonzero()[0]
nb = self.setitem(indexer, new[indexer], using_cow=using_cow)
return [nb]
else:
is_array = isinstance(new, np.ndarray)
res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
n = new
if is_array:
# we have a different value per-column
n = new[:, i : i + 1]
submask = orig_mask[:, i : i + 1]
rbs = nb.putmask(submask, n, using_cow=using_cow)
res_blocks.extend(rbs)
return res_blocks
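# The per-column split above is visible from the public API: masked
# setting on a consolidated two-column int block upcasts only the column
# whose submask actually needs the incompatible value (a sketch, assuming
# pandas 2.0-era behavior):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})   # one 2x2 int64 block
#   >>> df[df > 2] = 9.5                                # boolean-mask setitem
#   >>> df.dtypes
#   a      int64
#   b    float64
#   dtype: object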
def where(
self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False
) -> list[Block]:
"""
evaluate the block; return result block(s) from the result
Parameters
----------
other : a ndarray/object
cond : np.ndarray[bool], SparseArray[bool], or BooleanArray
_downcast : str or None, default "infer"
Private because we only specify it when calling from fillna.
Returns
-------
List[Block]
"""
assert cond.ndim == self.ndim
assert not isinstance(other, (ABCIndex, ABCSeries, ABCDataFrame))
transpose = self.ndim == 2
cond = extract_bool_array(cond)
# EABlocks override where
values = cast(np.ndarray, self.values)
orig_other = other
if transpose:
values = values.T
icond, noop = validate_putmask(values, ~cond)
if noop:
# GH-39595: Always return a copy; short-circuit up/downcasting
if using_cow:
return [self.copy(deep=False)]
return [self.copy()]
if other is lib.no_default:
other = self.fill_value
other = self._standardize_fill_value(other)
try:
# try/except here is equivalent to a self._can_hold_element check,
# but this gets us back 'casted' which we will re-use below;
# without using 'casted', expressions.where may do unwanted upcasts.
casted = np_can_hold_element(values.dtype, other)
except (ValueError, TypeError, LossySetitemError):
# we cannot coerce, return a compat dtype
if self.ndim == 1 or self.shape[0] == 1:
# no need to split columns
block = self.coerce_to_target_dtype(other)
blocks = block.where(orig_other, cond, using_cow=using_cow)
return self._maybe_downcast(
blocks, downcast=_downcast, using_cow=using_cow
)
else:
# since _maybe_downcast would split blocks anyway, we
# can avoid some potential upcast/downcast by splitting
# on the front end.
is_array = isinstance(other, (np.ndarray, ExtensionArray))
res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
oth = other
if is_array:
# we have a different value per-column
oth = other[:, i : i + 1]
submask = cond[:, i : i + 1]
rbs = nb.where(
oth, submask, _downcast=_downcast, using_cow=using_cow
)
res_blocks.extend(rbs)
return res_blocks
else:
other = casted
alt = setitem_datetimelike_compat(values, icond.sum(), other)
if alt is not other:
if is_list_like(other) and len(other) < len(values):
# call np.where with other to get the appropriate ValueError
np.where(~icond, values, other)
raise NotImplementedError(
"This should not be reached; call to np.where above is "
"expected to raise ValueError. Please report a bug at "
"github.com/pandas-dev/pandas"
)
result = values.copy()
np.putmask(result, icond, alt)
else:
# By the time we get here, we should have all Series/Index
# args extracted to ndarray
if (
is_list_like(other)
and not isinstance(other, np.ndarray)
and len(other) == self.shape[-1]
):
# If we don't do this broadcasting here, then expressions.where
# will broadcast a 1D other to be row-like instead of
# column-like.
other = np.array(other).reshape(values.shape)
# If lengths don't match (or len(other)==1), we will raise
# inside expressions.where, see test_series_where
# Note: expressions.where may upcast.
result = expressions.where(~icond, values, other)
# The np_can_hold_element check _should_ ensure that we always
# have result.dtype == self.dtype here.
if transpose:
result = result.T
return [self.make_block(result)]
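# Hedged example of the np_can_hold_element fast path (pandas 2.0-era
# behavior): an ``other`` the dtype can hold avoids any upcast.
#
#   >>> import pandas as pd
#   >>> pd.Series([1, 2, 3]).where(lambda s: s > 1, 0).dtype   # 0 fits int64
#   dtype('int64')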
def fillna(
self,
value,
limit: int | None = None,
inplace: bool = False,
downcast=None,
using_cow: bool = False,
) -> list[Block]:
"""
fillna on the block with the value. If we fail, then convert to
ObjectBlock and try again
"""
# Caller is responsible for validating limit; if int it is strictly positive
inplace = validate_bool_kwarg(inplace, "inplace")
if not self._can_hold_na:
# can short-circuit the isna call
noop = True
else:
mask = isna(self.values)
mask, noop = validate_putmask(self.values, mask)
if noop:
# we can't process the value, but nothing to do
if inplace:
if using_cow:
return [self.copy(deep=False)]
# Arbitrarily imposing the convention that we ignore downcast
# on no-op when inplace=True
return [self]
else:
# GH#45423 consistent downcasting on no-ops.
nb = self.copy(deep=not using_cow)
nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow)
return nbs
if limit is not None:
mask[mask.cumsum(self.ndim - 1) > limit] = False
if inplace:
nbs = self.putmask(mask.T, value, using_cow=using_cow)
else:
# without _downcast, we would break
# test_fillna_dtype_conversion_equiv_replace
nbs = self.where(value, ~mask.T, _downcast=False)
# Note: blk._maybe_downcast vs self._maybe_downcast(nbs)
# makes a difference bc blk may have object dtype, which has
# different behavior in _maybe_downcast.
return extend_blocks(
[
blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow)
for blk in nbs
]
)
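# Sketch of the cumsum-based ``limit`` trimming above, via the public API
# (outputs assume pandas 2.0-era behavior):
#
#   >>> import pandas as pd
#   >>> ser = pd.Series([1.0, None, None, 4.0])
#   >>> ser.fillna(0.0, limit=1).tolist()   # only the first NaN is filled
#   [1.0, 0.0, nan, 4.0]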
def interpolate(
self,
*,
method: FillnaOptions = "pad",
axis: AxisInt = 0,
index: Index | None = None,
inplace: bool = False,
limit: int | None = None,
limit_direction: str = "forward",
limit_area: str | None = None,
fill_value: Any | None = None,
downcast: str | None = None,
using_cow: bool = False,
**kwargs,
) -> list[Block]:
inplace = validate_bool_kwarg(inplace, "inplace")
if not self._can_hold_na:
# If there are no NAs, then interpolate is a no-op
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
try:
m = missing.clean_fill_method(method)
except ValueError:
m = None
if m is None and self.dtype.kind != "f":
# only deal with floats
# bc we already checked that can_hold_na, we don't have int dtype here
# test_interp_basic checks that we make a copy here
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0:
# split improves performance in ndarray.copy()
return self.split_and_operate(
type(self).interpolate,
method=method,
axis=axis,
index=index,
inplace=inplace,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
downcast=downcast,
**kwargs,
)
refs = None
if inplace:
if using_cow and self.refs.has_reference():
data = self.values.copy()
else:
data = self.values
refs = self.refs
else:
data = self.values.copy()
data = cast(np.ndarray, data) # bc overridden by ExtensionBlock
missing.interpolate_array_2d(
data,
method=method,
axis=axis,
index=index,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
**kwargs,
)
nb = self.make_block_same_class(data, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow)
def diff(self, n: int, axis: AxisInt = 1) -> list[Block]:
"""return block for the diff of the values"""
# only reached with ndim == 2 and axis == 1
new_values = algos.diff(self.values, n, axis=axis)
return [self.make_block(values=new_values)]
def shift(
self, periods: int, axis: AxisInt = 0, fill_value: Any = None
) -> list[Block]:
"""shift the block by periods, possibly upcast"""
# convert integer to float if necessary. need to do a lot more than
# that, handle boolean etc also
# Note: periods is never 0 here, as that is handled at the top of
# NDFrame.shift. If that ever changes, we can do a check for periods=0
# and possibly avoid coercing.
if not lib.is_scalar(fill_value) and self.dtype != _dtype_obj:
# with object dtype there is nothing to promote, and the user can
# pass pretty much any weird fill_value they like
# see test_shift_object_non_scalar_fill
raise ValueError("fill_value must be a scalar")
fill_value = self._standardize_fill_value(fill_value)
try:
# error: Argument 1 to "np_can_hold_element" has incompatible type
# "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
casted = np_can_hold_element(
self.dtype, fill_value # type: ignore[arg-type]
)
except LossySetitemError:
nb = self.coerce_to_target_dtype(fill_value)
return nb.shift(periods, axis=axis, fill_value=fill_value)
else:
values = cast(np.ndarray, self.values)
new_values = shift(values, periods, axis, casted)
return [self.make_block(new_values)]
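# Example of the fill_value promotion above (public API, pandas 2.0-era
# behavior): the default NaN fill can't be held by int64, so the block is
# coerced first; an int fill_value keeps the dtype.
#
#   >>> import pandas as pd
#   >>> pd.Series([1, 2, 3]).shift(1).dtype
#   dtype('float64')
#   >>> pd.Series([1, 2, 3]).shift(1, fill_value=0).dtype
#   dtype('int64')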
@final
def quantile(
self,
qs: Index, # with dtype float64
interpolation: QuantileInterpolation = "linear",
axis: AxisInt = 0,
) -> Block:
"""
compute the quantiles of the block's values
Parameters
----------
qs : Index
The quantiles to be computed in float64.
interpolation : str, default 'linear'
Type of interpolation.
axis : int, default 0
Axis to compute.
Returns
-------
Block
"""
# We should always have ndim == 2 because Series dispatches to DataFrame
assert self.ndim == 2
assert axis == 1 # only ever called this way
assert is_list_like(qs) # caller is responsible for this
result = quantile_compat(self.values, np.asarray(qs._values), interpolation)
# ensure_block_shape needed for cases where we start with EA and result
# is ndarray, e.g. IntegerArray, SparseArray
result = ensure_block_shape(result, ndim=2)
return new_block_2d(result, placement=self._mgr_locs)
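# Minimal usage sketch; the 2D/axis==1 invariants asserted above are
# arranged by DataFrame.quantile (output assumes pandas 2.0-era behavior):
#
#   >>> import pandas as pd
#   >>> pd.DataFrame({"a": [1.0, 2.0, 3.0]}).quantile([0.5])
#          a
#   0.5  2.0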
def round(self, decimals: int, using_cow: bool = False) -> Block:
"""
Rounds the values.
If the block is not of an integer or float dtype, nothing happens.
This is consistent with DataFrame.round behavior.
(Note: Series.round would raise)
Parameters
----------
decimals: int,
Number of decimal places to round to.
Caller is responsible for validating this
using_cow: bool,
Whether Copy on Write is enabled right now
"""
if not self.is_numeric or self.is_bool:
return self.copy(deep=not using_cow)
refs = None
# TODO: round only defined on BaseMaskedArray
# Series also does this, so would need to fix both places
# error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]"
# has no attribute "round"
values = self.values.round(decimals) # type: ignore[union-attr]
if values is self.values:
refs = self.refs
if not using_cow:
# Normally would need to do this before, but
# numpy only returns same array when round operation
# is no-op
# https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636
values = values.copy()
return self.make_block_same_class(values, refs=refs)
# ---------------------------------------------------------------------
# Abstract Methods Overridden By EABackedBlock and NumpyBlock
def delete(self, loc) -> list[Block]:
"""Deletes the locs from the block.
We split the block to avoid copying the underlying data. We create new
blocks for every connected segment of the initial block that is not deleted.
The new blocks point to the initial array.
"""
if not is_list_like(loc):
loc = [loc]
if self.ndim == 1:
values = cast(np.ndarray, self.values)
values = np.delete(values, loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
if np.max(loc) >= self.values.shape[0]:
raise IndexError
# Add one out-of-bounds indexer as maximum to collect
# all columns after our last indexer if any
loc = np.concatenate([loc, [self.values.shape[0]]])
mgr_locs_arr = self._mgr_locs.as_array
new_blocks: list[Block] = []
previous_loc = -1
# TODO(CoW): This is tricky, if parent block goes out of scope
# all split blocks are referencing each other even though they
# don't share data
refs = self.refs if self.refs.has_reference() else None
for idx in loc:
if idx == previous_loc + 1:
# There is no column between current and last idx
pass
else:
# No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[slice, slice]"
values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa
locs = mgr_locs_arr[previous_loc + 1 : idx]
nb = type(self)(
values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs
)
new_blocks.append(nb)
previous_loc = idx
return new_blocks
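# Sketch of the connected-segment splitting above (internal API; assumes
# a pandas 2.0-era build). Deleting row-loc 1 from a 4x2 block yields one
# block per surviving contiguous run, each viewing the original array:
#
#   >>> import numpy as np, pandas as pd
#   >>> (blk,) = pd.DataFrame(np.ones((2, 4)))._mgr.blocks   # 4x2 float64
#   >>> [nb.shape for nb in blk.delete([1])]
#   [(1, 2), (2, 2)]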
@property
def is_view(self) -> bool:
"""return a boolean if I am possibly a view"""
raise AbstractMethodError(self)
@property
def array_values(self) -> ExtensionArray:
"""
The array that Series.array returns. Always an ExtensionArray.
"""
raise AbstractMethodError(self)
def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
"""
return an internal format, currently just the ndarray
this is often overridden to handle to_dense like operations
"""
raise AbstractMethodError(self)
def values_for_json(self) -> np.ndarray:
raise AbstractMethodError(self)
class EABackedBlock(Block):
"""
Mixin for Block subclasses backed by ExtensionArray.
"""
values: ExtensionArray
def setitem(self, indexer, value, using_cow: bool = False):
"""
Attempt self.values[indexer] = value, possibly creating a new array.
This differs from Block.setitem by not allowing setitem to change
the dtype of the Block.
Parameters
----------
indexer : tuple, list-like, array-like, slice, int
The subset of self.values to set
value : object
The value being set
using_cow: bool, default False
Signaling if CoW is used.
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
orig_indexer = indexer
orig_value = value
indexer = self._unwrap_setitem_indexer(indexer)
value = self._maybe_squeeze_arg(value)
values = self.values
if values.ndim == 2:
# TODO(GH#45419): string[pyarrow] tests break if we transpose
# unconditionally
values = values.T
check_setitem_lengths(indexer, value, values)
try:
values[indexer] = value
except (ValueError, TypeError) as err:
_catch_deprecated_value_error(err)
if is_interval_dtype(self.dtype):
# see TestSetitemFloatIntervalWithIntIntervalValues
nb = self.coerce_to_target_dtype(orig_value)
return nb.setitem(orig_indexer, orig_value)
elif isinstance(self, NDArrayBackedExtensionBlock):
nb = self.coerce_to_target_dtype(orig_value)
return nb.setitem(orig_indexer, orig_value)
else:
raise
else:
return self
def where(
self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False
) -> list[Block]:
# _downcast private bc we only specify it when calling from fillna
arr = self.values.T
cond = extract_bool_array(cond)
orig_other = other
orig_cond = cond
other = self._maybe_squeeze_arg(other)
cond = self._maybe_squeeze_arg(cond)
if other is lib.no_default:
other = self.fill_value
icond, noop = validate_putmask(arr, ~cond)
if noop:
# GH#44181, GH#45135
# Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast
if using_cow:
return [self.copy(deep=False)]
return [self.copy()]
try:
res_values = arr._where(cond, other).T
except (ValueError, TypeError) as err:
_catch_deprecated_value_error(err)
if self.ndim == 1 or self.shape[0] == 1:
if is_interval_dtype(self.dtype):
# TestSetitemFloatIntervalWithIntIntervalValues
blk = self.coerce_to_target_dtype(orig_other)
nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
return self._maybe_downcast(
nbs, downcast=_downcast, using_cow=using_cow
)
elif isinstance(self, NDArrayBackedExtensionBlock):
# NB: not (yet) the same as
# isinstance(values, NDArrayBackedExtensionArray)
blk = self.coerce_to_target_dtype(orig_other)
nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
return self._maybe_downcast(
nbs, downcast=_downcast, using_cow=using_cow
)
else:
raise
else:
# Same pattern we use in Block.putmask
is_array = isinstance(orig_other, (np.ndarray, ExtensionArray))
res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
n = orig_other
if is_array:
# we have a different value per-column
n = orig_other[:, i : i + 1]
submask = orig_cond[:, i : i + 1]
rbs = nb.where(n, submask, using_cow=using_cow)
res_blocks.extend(rbs)
return res_blocks
nb = self.make_block_same_class(res_values)
return [nb]
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
"""
See Block.putmask.__doc__
"""
mask = extract_bool_array(mask)
if new is lib.no_default:
new = self.fill_value
values = self.values
if values.ndim == 2:
values = values.T
orig_new = new
orig_mask = mask
new = self._maybe_squeeze_arg(new)
mask = self._maybe_squeeze_arg(mask)
if not mask.any():
if using_cow:
return [self.copy(deep=False)]
return [self]
if using_cow and self.refs.has_reference():
values = values.copy()
self = self.make_block_same_class( # type: ignore[assignment]
values.T if values.ndim == 2 else values
)
try:
# Caller is responsible for ensuring matching lengths
values._putmask(mask, new)
except (TypeError, ValueError) as err:
_catch_deprecated_value_error(err)
if self.ndim == 1 or self.shape[0] == 1:
if is_interval_dtype(self.dtype):
# Discussion about what we want to support in the general
# case GH#39584
blk = self.coerce_to_target_dtype(orig_new)
return blk.putmask(orig_mask, orig_new)
elif isinstance(self, NDArrayBackedExtensionBlock):
# NB: not (yet) the same as
# isinstance(values, NDArrayBackedExtensionArray)
blk = self.coerce_to_target_dtype(orig_new)
return blk.putmask(orig_mask, orig_new)
else:
raise
else:
# Same pattern we use in Block.putmask
is_array = isinstance(orig_new, (np.ndarray, ExtensionArray))
res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
n = orig_new
if is_array:
# we have a different value per-column
n = orig_new[:, i : i + 1]
submask = orig_mask[:, i : i + 1]
rbs = nb.putmask(submask, n)
res_blocks.extend(rbs)
return res_blocks
return [self]
def delete(self, loc) -> list[Block]:
# This will be unnecessary if/when __array_function__ is implemented
if self.ndim == 1:
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
elif self.values.ndim == 1:
# We get here through to_stata
return []
return super().delete(loc)
@cache_readonly
def array_values(self) -> ExtensionArray:
return self.values
def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
"""
return object dtype as boxed values, such as Timestamps/Timedelta
"""
values: ArrayLike = self.values
if dtype == _dtype_obj:
values = values.astype(object)
# TODO(EA2D): reshape not needed with 2D EAs
return np.asarray(values).reshape(self.shape)
def values_for_json(self) -> np.ndarray:
return np.asarray(self.values)
def interpolate(
self,
*,
method: FillnaOptions = "pad",
axis: int = 0,
inplace: bool = False,
limit: int | None = None,
fill_value=None,
using_cow: bool = False,
**kwargs,
):
values = self.values
if values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T
else:
new_values = values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)
class ExtensionBlock(libinternals.Block, EABackedBlock):
"""
Block for holding extension types.
Notes
-----
This holds all 3rd-party extension array types. It's also the immediate
parent class for our internal extension types' blocks.
ExtensionArrays are limited to 1-D.
"""
_can_consolidate = False
_validate_ndim = False
is_extension = True
values: ExtensionArray
def fillna(
self,
value,
limit: int | None = None,
inplace: bool = False,
downcast=None,
using_cow: bool = False,
) -> list[Block]:
if is_interval_dtype(self.dtype):
# Block.fillna handles coercion (test_fillna_interval)
return super().fillna(
value=value,
limit=limit,
inplace=inplace,
downcast=downcast,
using_cow=using_cow,
)
if using_cow and self._can_hold_na and not self.values._hasna:
refs = self.refs
new_values = self.values
else:
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
nb = self.make_block_same_class(new_values, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow=using_cow)
@cache_readonly
def shape(self) -> Shape:
# TODO(EA2D): override unnecessary with 2D EAs
if self.ndim == 1:
return (len(self.values),)
return len(self._mgr_locs), len(self.values)
def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
# In the case where we have a tuple[slice, int], the slice will always
# be slice(None)
# We _could_ make the annotation more specific, but mypy would
# complain about override mismatch:
# Literal[0] | tuple[Literal[0], int] | tuple[slice, int]
# Note: only reached with self.ndim == 2
if isinstance(i, tuple):
# TODO(EA2D): unnecessary with 2D EAs
col, loc = i
if not com.is_null_slice(col) and col != 0:
raise IndexError(f"{self} only contains one item")
if isinstance(col, slice):
# the is_null_slice check above assures that col is slice(None)
# so what we want is a view on all our columns and row loc
if loc < 0:
loc += len(self.values)
# Note: loc:loc+1 vs [[loc]] makes a difference when called
# from fast_xs because we want to get a view back.
return self.values[loc : loc + 1]
return self.values[loc]
else:
if i != 0:
raise IndexError(f"{self} only contains one item")
return self.values
def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
# When an ndarray, we should have locs.tolist() == [0]
# When a BlockPlacement we should have list(locs) == [0]
if copy:
self.values = self.values.copy()
self.values[:] = values
def _maybe_squeeze_arg(self, arg):
"""
If necessary, squeeze a (N, 1) ndarray to (N,)
"""
# e.g. if we are passed a 2D mask for putmask
if (
isinstance(arg, (np.ndarray, ExtensionArray))
and arg.ndim == self.values.ndim + 1
):
# TODO(EA2D): unnecessary with 2D EAs
assert arg.shape[1] == 1
# error: No overload variant of "__getitem__" of "ExtensionArray"
# matches argument type "Tuple[slice, int]"
arg = arg[:, 0] # type: ignore[call-overload]
elif isinstance(arg, ABCDataFrame):
# 2022-01-06 only reached for setitem
# TODO: should we avoid getting here with DataFrame?
assert arg.shape[1] == 1
arg = arg._ixs(0, axis=1)._values
return arg
def _unwrap_setitem_indexer(self, indexer):
"""
Adapt a 2D-indexer to our 1D values.
This is intended for 'setitem', not 'iget' or '_slice'.
"""
# TODO: ATM this doesn't work for iget/_slice, can we change that?
if isinstance(indexer, tuple) and len(indexer) == 2:
# TODO(EA2D): not needed with 2D EAs
# Should never have length > 2. Caller is responsible for checking.
# Length 1 is reached via setitem_single_block and setitem_single_column
# each of which pass indexer=(pi,)
if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
# GH#44703 went through indexing.maybe_convert_ix
first, second = indexer
if not (
second.size == 1 and (second == 0).all() and first.shape[1] == 1
):
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas/"
)
indexer = first[:, 0]
elif lib.is_integer(indexer[1]) and indexer[1] == 0:
# reached via setitem_single_block passing the whole indexer
indexer = indexer[0]
elif com.is_null_slice(indexer[1]):
indexer = indexer[0]
elif is_list_like(indexer[1]) and indexer[1][0] == 0:
indexer = indexer[0]
else:
raise NotImplementedError(
"This should not be reached. Please report a bug at "
"github.com/pandas-dev/pandas/"
)
return indexer
@property
def is_view(self) -> bool:
"""Extension arrays are never treated as views."""
return False
@cache_readonly
def is_numeric(self):
return self.values.dtype._is_numeric
def _slice(
self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp]
) -> ExtensionArray:
"""
Return a slice of my values.
Parameters
----------
slicer : slice, ndarray[int], or ndarray[bool]
Valid (non-reducing) indexer for self.values.
Returns
-------
ExtensionArray
"""
# Notes: ndarray[bool] is only reachable when via getitem_mgr, which
# is only for Series, i.e. self.ndim == 1.
# return same dims as we currently have
if self.ndim == 2:
# reached via getitem_block via _slice_take_blocks_ax0
# TODO(EA2D): won't be necessary with 2D EAs
if not isinstance(slicer, slice):
raise AssertionError(
"invalid slicing for a 1-ndim ExtensionArray", slicer
)
# GH#32959 only full-slicers along fake-dim0 are valid
# TODO(EA2D): won't be necessary with 2D EAs
# range(1) instead of self._mgr_locs to avoid exception on [::-1]
# see test_iloc_getitem_slice_negative_step_ea_block
new_locs = range(1)[slicer]
if not len(new_locs):
raise AssertionError(
"invalid slicing for a 1-ndim ExtensionArray", slicer
)
slicer = slice(None)
return self.values[slicer]
@final
def getitem_block_index(self, slicer: slice) -> ExtensionBlock:
"""
Perform __getitem__-like specialized to slicing along index.
"""
# GH#42787 in principle this is equivalent to values[..., slicer], but we don't
# require subclasses of ExtensionArray to support that form (for now).
new_values = self.values[slicer]
return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
def diff(self, n: int, axis: AxisInt = 1) -> list[Block]:
# only reached with ndim == 2 and axis == 1
# TODO(EA2D): Can share with NDArrayBackedExtensionBlock
new_values = algos.diff(self.values, n, axis=0)
return [self.make_block(values=new_values)]
def shift(
self, periods: int, axis: AxisInt = 0, fill_value: Any = None
) -> list[Block]:
"""
Shift the block by `periods`.
Dispatches to underlying ExtensionArray and re-boxes in an
ExtensionBlock.
"""
new_values = self.values.shift(periods=periods, fill_value=fill_value)
return [self.make_block_same_class(new_values)]
def _unstack(
self,
unstacker,
fill_value,
new_placement: npt.NDArray[np.intp],
needs_masking: npt.NDArray[np.bool_],
):
# ExtensionArray-safe unstack.
# We override ObjectBlock._unstack, which unstacks directly on the
# values of the array. For EA-backed blocks, this would require
# converting to a 2-D ndarray of objects.
# Instead, we unstack an ndarray of integer positions, followed by
# a `take` on the actual values.
# Caller is responsible for ensuring self.shape[-1] == len(unstacker.index)
new_values, mask = unstacker.arange_result
# Note: these next two lines ensure that
# mask.sum() == sum(len(nb.mgr_locs) for nb in blocks)
# which the calling function needs in order to pass verify_integrity=False
# to the BlockManager constructor
new_values = new_values.T[mask]
new_placement = new_placement[mask]
# needs_masking[i] calculated once in BlockManager.unstack tells
# us if there are any -1s in the relevant indices. When False,
# that allows us to go through a faster path in 'take', among
# other things avoiding e.g. Categorical._validate_scalar.
blocks = [
# TODO: could cast to object depending on fill_value?
type(self)(
self.values.take(
indices, allow_fill=needs_masking[i], fill_value=fill_value
),
BlockPlacement(place),
ndim=2,
)
for i, (indices, place) in enumerate(zip(new_values, new_placement))
]
return blocks, mask
class NumpyBlock(libinternals.NumpyBlock, Block):
values: np.ndarray
@property
def is_view(self) -> bool:
"""return a boolean if I am possibly a view"""
return self.values.base is not None
@property
def array_values(self) -> ExtensionArray:
return PandasArray(self.values)
def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
if dtype == _dtype_obj:
return self.values.astype(_dtype_obj)
return self.values
def values_for_json(self) -> np.ndarray:
return self.values
class NumericBlock(NumpyBlock):
__slots__ = ()
is_numeric = True
class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock):
"""
Block backed by an NDArrayBackedExtensionArray
"""
values: NDArrayBackedExtensionArray
# error: Signature of "is_extension" incompatible with supertype "Block"
@cache_readonly
def is_extension(self) -> bool: # type: ignore[override]
# i.e. datetime64tz, PeriodDtype
return not isinstance(self.dtype, np.dtype)
@property
def is_view(self) -> bool:
"""return a boolean if I am possibly a view"""
# check the ndarray values of the DatetimeIndex values
return self.values._ndarray.base is not None
def diff(self, n: int, axis: AxisInt = 0) -> list[Block]:
"""
1st discrete difference.
Parameters
----------
n : int
Number of periods to diff.
axis : int, default 0
Axis to diff upon.
Returns
-------
A list with a new Block.
Notes
-----
The arguments here are mimicking shift so they are called correctly
by apply.
"""
# only reached with ndim == 2 and axis == 1
values = self.values
new_values = values - values.shift(n, axis=axis)
return [self.make_block(new_values)]
def shift(
self, periods: int, axis: AxisInt = 0, fill_value: Any = None
) -> list[Block]:
values = self.values
new_values = values.shift(periods, fill_value=fill_value, axis=axis)
return [self.make_block_same_class(new_values)]
def _catch_deprecated_value_error(err: Exception) -> None:
"""
We catch ValueError for now, but only a specific one raised by DatetimeArray
which will no longer be raised in version 2.0.
"""
if isinstance(err, ValueError):
if isinstance(err, IncompatibleFrequency):
pass
elif "'value.closed' is" in str(err):
# IntervalDtype mismatched 'closed'
pass
class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
"""Block for datetime64[ns], timedelta64[ns]."""
__slots__ = ()
is_numeric = False
values: DatetimeArray | TimedeltaArray
def values_for_json(self) -> np.ndarray:
return self.values._ndarray
def interpolate(
self,
*,
method: FillnaOptions = "pad",
index: Index | None = None,
axis: int = 0,
inplace: bool = False,
limit: int | None = None,
fill_value=None,
using_cow: bool = False,
**kwargs,
):
values = self.values
# error: Non-overlapping equality check (left operand type:
# "Literal['backfill', 'bfill', 'ffill', 'pad']", right operand type:
# "Literal['linear']") [comparison-overlap]
if method == "linear": # type: ignore[comparison-overlap]
# TODO: GH#50950 implement for arbitrary EAs
refs = None
if using_cow:
if inplace and not self.refs.has_reference():
data_out = values._ndarray
refs = self.refs
else:
data_out = values._ndarray.copy()
else:
data_out = values._ndarray if inplace else values._ndarray.copy()
missing.interpolate_array_2d(
data_out, method=method, limit=limit, index=index, axis=axis
)
new_values = type(values)._simple_new(data_out, dtype=values.dtype)
return self.make_block_same_class(new_values, refs=refs)
elif values.ndim == 2 and axis == 0:
# NDArrayBackedExtensionArray.fillna assumes axis=1
            new_values = values.T.fillna(
                value=fill_value, method=method, limit=limit
            ).T
else:
new_values = values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)
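
    # Illustrative sketch, not part of the pandas source (assumes
    # ``import pandas as pd``): the non-"linear" paths above delegate to
    # ``fillna`` on the array, so pad-filling a datetime column behaves like
    #
    #   >>> ser = pd.Series(pd.to_datetime(["2000-01-01", None]))
    #   >>> ser.ffill()[1]
    #   Timestamp('2000-01-01 00:00:00')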


class DatetimeTZBlock(DatetimeLikeBlock):
    """implement a datetime64 block with a tz attribute"""

    values: DatetimeArray

    __slots__ = ()
    is_extension = True
    _validate_ndim = True
    _can_consolidate = False

    # Don't use values_for_json from DatetimeLikeBlock since it is
    # an invalid optimization here (it would drop the tz)
    values_for_json = NDArrayBackedExtensionBlock.values_for_json


class ObjectBlock(NumpyBlock):
    __slots__ = ()
    is_object = True

    @maybe_split
    def convert(
        self,
        *,
        copy: bool = True,
        using_cow: bool = False,
    ) -> list[Block]:
        """
        Attempt to cast any object types to better types; return a copy of
        the block (if copy=True). By definition we ARE an ObjectBlock!
        """
if self.dtype != _dtype_obj:
# GH#50067 this should be impossible in ObjectBlock, but until
# that is fixed, we short-circuit here.
if using_cow:
return [self.copy(deep=False)]
return [self]
values = self.values
if values.ndim == 2:
# maybe_split ensures we only get here with values.shape[0] == 1,
# avoid doing .ravel as that might make a copy
values = values[0]
res_values = lib.maybe_convert_objects(
values,
convert_datetime=True,
convert_timedelta=True,
convert_period=True,
convert_interval=True,
)
refs = None
if copy and res_values is values:
res_values = values.copy()
elif res_values is values and using_cow:
refs = self.refs
res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values, refs=refs)]
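
    # Illustrative sketch, not part of the pandas source:
    # ``lib.maybe_convert_objects`` used above infers a better dtype where
    # one exists and otherwise returns the object-dtype input, e.g.
    #
    #   >>> lib.maybe_convert_objects(np.array([1, 2], dtype=object))
    #   array([1, 2])
    #   >>> lib.maybe_convert_objects(np.array(["a", 1], dtype=object))
    #   array(['a', 1], dtype=object)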


# -----------------------------------------------------------------
# Constructor Helpers


def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
    """
    Input validation for values passed to __init__. Ensure that
    any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
    that we do not have string dtypes.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    values : np.ndarray or ExtensionArray
    """
# Caller is responsible for ensuring PandasArray is already extracted.
if isinstance(values, np.ndarray):
values = ensure_wrapped_if_datetimelike(values)
if issubclass(values.dtype.type, str):
values = np.array(values, dtype=object)
if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None:
# freq is only stored in DatetimeIndex/TimedeltaIndex, not in Series/DataFrame
values = values._with_freq(None)
return values
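

# Illustrative sketch, not part of the pandas source, of the coercions above:
#
#   >>> maybe_coerce_values(np.array(["a", "b"])).dtype     # str -> object
#   dtype('O')
#   >>> type(maybe_coerce_values(np.array([0, 1], dtype="M8[ns]"))).__name__
#   'DatetimeArray'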


def get_block_type(dtype: DtypeObj) -> type[Block]:
    """
    Find the appropriate Block subclass to use for the given values and dtype.

    Parameters
    ----------
    dtype : numpy or pandas dtype

    Returns
    -------
    cls : class, subclass of Block
    """
# We use kind checks because it is much more performant
# than is_foo_dtype
kind = dtype.kind
cls: type[Block]
if isinstance(dtype, SparseDtype):
# Need this first(ish) so that Sparse[datetime] is sparse
cls = ExtensionBlock
elif isinstance(dtype, DatetimeTZDtype):
cls = DatetimeTZBlock
elif isinstance(dtype, PeriodDtype):
cls = NDArrayBackedExtensionBlock
elif isinstance(dtype, ExtensionDtype):
# Note: need to be sure PandasArray is unwrapped before we get here
cls = ExtensionBlock
elif kind in ["M", "m"]:
cls = DatetimeLikeBlock
elif kind in ["f", "c", "i", "u", "b"]:
cls = NumericBlock
else:
cls = ObjectBlock
return cls
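

# Illustrative sketch, not part of the pandas source, of the mapping above:
#
#   >>> get_block_type(np.dtype("float64")).__name__
#   'NumericBlock'
#   >>> get_block_type(np.dtype("M8[ns]")).__name__
#   'DatetimeLikeBlock'
#   >>> get_block_type(np.dtype(object)).__name__
#   'ObjectBlock'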


def new_block_2d(
values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None
):
# new_block specialized to case with
# ndim=2
# isinstance(placement, BlockPlacement)
# check_ndim/ensure_block_shape already checked
klass = get_block_type(values.dtype)
values = maybe_coerce_values(values)
return klass(values, ndim=2, placement=placement, refs=refs)


def new_block(
values, placement, *, ndim: int, refs: BlockValuesRefs | None = None
) -> Block:
# caller is responsible for ensuring values is NOT a PandasArray
if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)
check_ndim(values, placement, ndim)
klass = get_block_type(values.dtype)
values = maybe_coerce_values(values)
return klass(values, ndim=ndim, placement=placement, refs=refs)
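

# Illustrative sketch, not part of the pandas source: these constructors are
# internal, but for a 2D float block the flow is roughly
#
#   >>> vals = np.arange(6, dtype="f8").reshape(2, 3)
#   >>> blk = new_block(vals, placement=[0, 1], ndim=2)
#   >>> type(blk).__name__, blk.shape
#   ('NumericBlock', (2, 3))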


def check_ndim(values, placement: BlockPlacement, ndim: int) -> None:
    """
    ndim inference and validation.

    Validates that values.ndim and ndim are consistent.
    Validates that len(values) and len(placement) are consistent.

    Parameters
    ----------
    values : array-like
    placement : BlockPlacement
    ndim : int

    Raises
    ------
    ValueError : the number of dimensions do not match
    """
if values.ndim > ndim:
# Check for both np.ndarray and ExtensionArray
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim > ndim [{values.ndim} > {ndim}]"
)
if not is_1d_only_ea_dtype(values.dtype):
# TODO(EA2D): special case not needed with 2D EAs
if values.ndim != ndim:
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim != ndim [{values.ndim} != {ndim}]"
)
if len(placement) != len(values):
raise ValueError(
f"Wrong number of items passed {len(values)}, "
f"placement implies {len(placement)}"
)
elif ndim == 2 and len(placement) != 1:
# TODO(EA2D): special case unnecessary with 2D EAs
raise ValueError("need to split")


def extract_pandas_array(
    values: np.ndarray | ExtensionArray, dtype: DtypeObj | None, ndim: int
) -> tuple[np.ndarray | ExtensionArray, DtypeObj | None]:
    """
    Ensure that we don't allow PandasArray / PandasDtype in internals.
    """
# For now, blocks should be backed by ndarrays when possible.
if isinstance(values, ABCPandasArray):
values = values.to_numpy()
if ndim and ndim > 1:
# TODO(EA2D): special case not needed with 2D EAs
values = np.atleast_2d(values)
if isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype
return values, dtype
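

# Illustrative sketch, not part of the pandas source: a PandasArray is
# unwrapped to its backing ndarray, e.g.
#
#   >>> vals, dtype = extract_pandas_array(PandasArray(np.arange(3)), None, 1)
#   >>> type(vals).__name__, dtype
#   ('ndarray', None)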


# -----------------------------------------------------------------


def extend_blocks(result, blocks=None) -> list[Block]:
    """Return a new, extended list of blocks, given the result."""
if blocks is None:
blocks = []
if isinstance(result, list):
for r in result:
if isinstance(r, list):
blocks.extend(r)
else:
blocks.append(r)
else:
assert isinstance(result, Block), type(result)
blocks.append(result)
return blocks
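

# Illustrative sketch, not part of the pandas source (``blk_a`` etc. are
# hypothetical Block objects): nested lists are flattened one level, so
# conceptually
#
#   >>> extend_blocks([blk_a, [blk_b, blk_c]])
#   [blk_a, blk_b, blk_c]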


def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
    """
    Reshape if possible to have values.ndim == ndim.
    """
if values.ndim < ndim:
if not is_1d_only_ea_dtype(values.dtype):
# TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023
# block.shape is incorrect for "2D" ExtensionArrays
# We can't, and don't need to, reshape.
values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values)
values = values.reshape(1, -1)
return values
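

# Illustrative sketch, not part of the pandas source (assumes
# ``import pandas as pd``):
#
#   >>> ensure_block_shape(np.array([1, 2, 3]), ndim=2).shape
#   (1, 3)
#   >>> ensure_block_shape(pd.array([1, 2, 3]), ndim=2).shape  # 1D-only EA
#   (3,)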


def to_native_types(
    values: ArrayLike,
    *,
    na_rep: str = "nan",
    quoting=None,
    float_format=None,
    decimal: str = ".",
    **kwargs,
) -> np.ndarray:
    """convert to our native types format"""
if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm":
# GH#40754 Convert categorical datetimes to datetime array
values = algos.take_nd(
values.categories._values,
ensure_platform_int(values._codes),
fill_value=na_rep,
)
values = ensure_wrapped_if_datetimelike(values)
if isinstance(values, (DatetimeArray, TimedeltaArray)):
if values.ndim == 1:
result = values._format_native_types(na_rep=na_rep, **kwargs)
result = result.astype(object, copy=False)
return result
# GH#21734 Process every column separately, they might have different formats
results_converted = []
for i in range(len(values)):
result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs)
results_converted.append(result.astype(object, copy=False))
return np.vstack(results_converted)
elif values.dtype.kind == "f" and not is_sparse(values):
# see GH#13418: no special formatting is desired at the
# output (important for appropriate 'quoting' behaviour),
# so do not pass it through the FloatArrayFormatter
if float_format is None and decimal == ".":
mask = isna(values)
if not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype="object")
values[mask] = na_rep
values = values.astype(object, copy=False)
return values
from pandas.io.formats.format import FloatArrayFormatter
formatter = FloatArrayFormatter(
values,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
quoting=quoting,
fixed_width=False,
)
res = formatter.get_result_as_array()
res = res.astype(object, copy=False)
return res
elif isinstance(values, ExtensionArray):
mask = isna(values)
new_values = np.asarray(values.astype(object))
new_values[mask] = na_rep
return new_values
else:
mask = isna(values)
itemsize = writers.word_len(na_rep)
if values.dtype != _dtype_obj and not quoting and itemsize:
values = values.astype(str)
if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
# enlarge for the na_rep
values = values.astype(f"<U{itemsize}")
else:
values = np.array(values, dtype="object")
values[mask] = na_rep
values = values.astype(object, copy=False)
return values
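

# Illustrative sketch, not part of the pandas source, of the float fast path
# above (float_format=None, decimal="." and no quoting):
#
#   >>> to_native_types(np.array([1.5, np.nan]), na_rep="NULL")
#   array(['1.5', 'NULL'], dtype=object)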


def external_values(values: ArrayLike) -> ArrayLike:
    """
    The array that Series.values returns (public attribute).

    This has some historical constraints, and is overridden in block
    subclasses to return the correct array (e.g. period returns
    object ndarray and datetimetz a datetime64[ns] ndarray instead of
    proper extension array).
    """
if isinstance(values, (PeriodArray, IntervalArray)):
return values.astype(object)
elif isinstance(values, (DatetimeArray, TimedeltaArray)):
# NB: for datetime64tz this is different from np.asarray(values), since
# that returns an object-dtype ndarray of Timestamps.
# Avoid raising in .astype in casting from dt64tz to dt64
values = values._ndarray
if isinstance(values, np.ndarray) and using_copy_on_write():
values = values.view()
values.flags.writeable = False
# TODO(CoW) we should also mark our ExtensionArrays as read-only
return values
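

# Illustrative sketch, not part of the pandas source (assumes
# ``import pandas as pd`` and uses the semi-private ``._data`` for
# illustration):
#
#   >>> pa = pd.period_range("2000", periods=2, freq="D")._data
#   >>> external_values(pa).dtype                  # PeriodArray -> object
#   dtype('O')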