from __future__ import annotations from functools import wraps import re from typing import ( TYPE_CHECKING, Any, Callable, Iterable, Sequence, cast, final, ) import numpy as np from pandas._config import using_copy_on_write from pandas._libs import ( internals as libinternals, lib, writers, ) from pandas._libs.internals import ( BlockPlacement, BlockValuesRefs, ) from pandas._libs.missing import NA from pandas._libs.tslibs import IncompatibleFrequency from pandas._typing import ( ArrayLike, AxisInt, DtypeObj, F, FillnaOptions, IgnoreRaise, QuantileInterpolation, Shape, npt, ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.astype import ( astype_array_safe, astype_is_view, ) from pandas.core.dtypes.cast import ( LossySetitemError, can_hold_element, find_result_type, maybe_downcast_to_dtype, np_can_hold_element, ) from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, is_1d_only_ea_obj, is_dtype_equal, is_interval_dtype, is_list_like, is_sparse, is_string_dtype, ) from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, ExtensionDtype, PandasDtype, PeriodDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, na_value_for_dtype, ) from pandas.core import missing import pandas.core.algorithms as algos from pandas.core.array_algos.putmask import ( extract_bool_array, putmask_inplace, putmask_without_repeat, setitem_datetimelike_compat, validate_putmask, ) from pandas.core.array_algos.quantile import quantile_compat from pandas.core.array_algos.replace import ( compare_or_regex_search, replace_regex, should_use_regex, ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, DatetimeArray, ExtensionArray, IntervalArray, PandasArray, PeriodArray, TimedeltaArray, ) from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, ) from pandas.core.indexers import check_setitem_lengths if TYPE_CHECKING: from pandas.core.api import Index from pandas.core.arrays._mixins import NDArrayBackedExtensionArray # comparison is faster than is_object_dtype _dtype_obj = np.dtype("object") def maybe_split(meth: F) -> F: """ If we have a multi-column block, split and operate block-wise. Otherwise use the original method. """ @wraps(meth) def newfunc(self, *args, **kwargs) -> list[Block]: if self.ndim == 1 or self.shape[0] == 1: return meth(self, *args, **kwargs) else: # Split and operate column-by-column return self.split_and_operate(meth, *args, **kwargs) return cast(F, newfunc) class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure Index-ignorant; let the container take care of that """ values: np.ndarray | ExtensionArray ndim: int refs: BlockValuesRefs __init__: Callable __slots__ = () is_numeric = False is_object = False is_extension = False _can_consolidate = True _validate_ndim = True @final @cache_readonly def _consolidate_key(self): return self._can_consolidate, self.dtype.name @final @cache_readonly def _can_hold_na(self) -> bool: """ Can we store NA values in this Block? """ dtype = self.dtype if isinstance(dtype, np.dtype): return dtype.kind not in ["b", "i", "u"] return dtype._can_hold_na @final @property def is_bool(self) -> bool: """ We can be bool if a) we are bool dtype or b) object dtype with bool objects. """ return self.values.dtype == np.dtype(bool) @final def external_values(self): return external_values(self.values) @final @cache_readonly def fill_value(self): # Used in reindex_indexer return na_value_for_dtype(self.dtype, compat=False) @final def _standardize_fill_value(self, value): # if we are passed a scalar None, convert it here if self.dtype != _dtype_obj and is_valid_na_for_dtype(value, self.dtype): value = self.fill_value return value @property def mgr_locs(self) -> BlockPlacement: return self._mgr_locs @mgr_locs.setter def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None: self._mgr_locs = new_mgr_locs @final def make_block( self, values, placement=None, refs: BlockValuesRefs | None = None ) -> Block: """ Create a new block, with type inference propagate any values that are not specified """ if placement is None: placement = self._mgr_locs if self.is_extension: values = ensure_block_shape(values, ndim=self.ndim) # TODO: perf by not going through new_block # We assume maybe_coerce_values has already been called return new_block(values, placement=placement, ndim=self.ndim, refs=refs) @final def make_block_same_class( self, values, placement: BlockPlacement | None = None, refs: BlockValuesRefs | None = None, ) -> Block: """Wrap given values in a block of same type as self.""" # Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet # relied on it, as of 2.0 the caller is responsible for this. if placement is None: placement = self._mgr_locs # We assume maybe_coerce_values has already been called return type(self)(values, placement=placement, ndim=self.ndim, refs=refs) @final def __repr__(self) -> str: # don't want to print out all of the items here name = type(self).__name__ if self.ndim == 1: result = f"{name}: {len(self)} dtype: {self.dtype}" else: shape = " x ".join([str(s) for s in self.shape]) result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}" return result @final def __len__(self) -> int: return len(self.values) @final def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block: """ Perform __getitem__-like, return result as block. Only supports slices that preserve dimensionality. """ # Note: the only place where we are called with ndarray[intp] # is from internals.concat, and we can verify that never happens # with 1-column blocks, i.e. never for ExtensionBlock. new_mgr_locs = self._mgr_locs[slicer] new_values = self._slice(slicer) refs = self.refs if isinstance(slicer, slice) else None return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) @final def getitem_block_columns( self, slicer: slice, new_mgr_locs: BlockPlacement ) -> Block: """ Perform __getitem__-like, return result as block. Only supports slices that preserve dimensionality. """ new_values = self._slice(slicer) if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs) @final def _can_hold_element(self, element: Any) -> bool: """require the same dtype as ourselves""" element = extract_array(element, extract_numpy=True) return can_hold_element(self.values, element) @final def should_store(self, value: ArrayLike) -> bool: """ Should we set self.values[indexer] = value inplace or do we need to cast? Parameters ---------- value : np.ndarray or ExtensionArray Returns ------- bool """ # faster equivalent to is_dtype_equal(value.dtype, self.dtype) try: return value.dtype == self.dtype except TypeError: return False # --------------------------------------------------------------------- # Apply/Reduce and Helpers @final def apply(self, func, **kwargs) -> list[Block]: """ apply the function to my values; return a block if we are not one """ result = func(self.values, **kwargs) return self._split_op_result(result) @final def reduce(self, func) -> list[Block]: # We will apply the function and reshape the result into a single-row # Block with the same mgr_locs; squeezing will be done at a higher level assert self.ndim == 2 result = func(self.values) if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) else: res_values = result.reshape(-1, 1) nb = self.make_block(res_values) return [nb] @final def _split_op_result(self, result: ArrayLike) -> list[Block]: # See also: split_and_operate if result.ndim > 1 and isinstance(result.dtype, ExtensionDtype): # TODO(EA2D): unnecessary with 2D EAs # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] for i, loc in enumerate(self._mgr_locs): if not is_1d_only_ea_obj(result): vals = result[i : i + 1] else: vals = result[i] block = self.make_block(values=vals, placement=loc) nbs.append(block) return nbs nb = self.make_block(result) return [nb] @final def _split(self) -> list[Block]: """ Split a block into a list of single-column blocks. """ assert self.ndim == 2 new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) new_blocks.append(nb) return new_blocks @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: """ Split the block and apply func column-by-column. Parameters ---------- func : Block method *args **kwargs Returns ------- List[Block] """ assert self.ndim == 2 and self.shape[0] != 1 res_blocks = [] for nb in self._split(): rbs = func(nb, *args, **kwargs) res_blocks.extend(rbs) return res_blocks # --------------------------------------------------------------------- # Up/Down-casting @final def coerce_to_target_dtype(self, other) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise we can also safely try to coerce to the same dtype and will receive the same block """ new_dtype = find_result_type(self.values, other) return self.astype(new_dtype, copy=False) @final def _maybe_downcast( self, blocks: list[Block], downcast=None, using_cow: bool = False ) -> list[Block]: if downcast is False: return blocks if self.dtype == _dtype_obj: # TODO: does it matter that self.dtype might not match blocks[i].dtype? # GH#44241 We downcast regardless of the argument; # respecting 'downcast=None' may be worthwhile at some point, # but ATM it breaks too much existing code. # split and convert the blocks return extend_blocks( [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] ) if downcast is None: return blocks return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) @final @maybe_split def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: """ downcast specialized to 2D case post-validation. Refactored to allow use of maybe_split. """ new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) refs = self.refs if using_cow and new_values is self.values else None return [self.make_block(new_values, refs=refs)] def convert( self, *, copy: bool = True, using_cow: bool = False, ) -> list[Block]: """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! """ if not copy and using_cow: return [self.copy(deep=False)] return [self.copy()] if copy else [self] # --------------------------------------------------------------------- # Array-Like Methods @cache_readonly def dtype(self) -> DtypeObj: return self.values.dtype @final def astype( self, dtype: DtypeObj, copy: bool = False, errors: IgnoreRaise = "raise", using_cow: bool = False, ) -> Block: """ Coerce to the new dtype. Parameters ---------- dtype : np.dtype or ExtensionDtype copy : bool, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object using_cow: bool, default False Signaling if copy on write copy logic is used. Returns ------- Block """ values = self.values new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) new_values = maybe_coerce_values(new_values) refs = None if using_cow and astype_is_view(values.dtype, new_values.dtype): refs = self.refs newb = self.make_block(new_values, refs=refs) if newb.shape != self.shape: raise TypeError( f"cannot set astype for copy = [{copy}] for dtype " f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) return newb @final def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: """convert to our native types format""" result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs) return self.make_block(result) @final def copy(self, deep: bool = True) -> Block: """copy constructor""" values = self.values refs: BlockValuesRefs | None if deep: values = values.copy() refs = None else: refs = self.refs return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs) # --------------------------------------------------------------------- # Replace @final def replace( self, to_replace, value, inplace: bool = False, # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, ) -> list[Block]: """ replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. """ # Note: the checks we do in NDFrame.replace ensure we never get # here with listlike to_replace or value, as those cases # go through replace_list values = self.values if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 if using_cow and (self.refs.has_reference() or not inplace): blk = self.copy() elif using_cow: blk = self.copy(deep=False) else: blk = self if inplace else self.copy() values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. if using_cow: return [self.copy(deep=False)] else: return [self] if inplace else [self.copy()] if mask is None: mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. if using_cow: return [self.copy(deep=False)] else: return [self] if inplace else [self.copy()] elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? if using_cow: if inplace: blk = self.copy(deep=self.refs.has_reference()) else: blk = self.copy() else: blk = self if inplace else self.copy() putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN blocks = blk.convert(copy=False, using_cow=using_cow) else: blocks = [blk] return blocks elif self.ndim == 1 or self.shape[0] == 1: if value is None or value is NA: blk = self.astype(np.dtype(object)) else: blk = self.coerce_to_target_dtype(value) return blk.replace( to_replace=to_replace, value=value, inplace=True, mask=mask, ) else: # split so that we only upcast where necessary blocks = [] for i, nb in enumerate(self._split()): blocks.extend( type(self).replace( nb, to_replace=to_replace, value=value, inplace=True, mask=mask[i : i + 1], using_cow=using_cow, ) ) return blocks @final def _replace_regex( self, to_replace, value, inplace: bool = False, mask=None, using_cow: bool = False, ) -> list[Block]: """ Replace elements by the given value. Parameters ---------- to_replace : object or pattern Scalar to replace or regular expression to match. value : object Replacement object. inplace : bool, default False Perform inplace modification. mask : array-like of bool, optional True indicate corresponding element is ignored. using_cow: bool, default False Specifying if copy on write is enabled. Returns ------- List[Block] """ if not self._can_hold_element(to_replace): # i.e. only ObjectBlock, but could in principle include a # String ExtensionBlock if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] rx = re.compile(to_replace) if using_cow: if inplace and not self.refs.has_reference(): refs = self.refs new_values = self.values else: refs = None new_values = self.values.copy() else: refs = None new_values = self.values if inplace else self.values.copy() replace_regex(new_values, rx, value, mask) block = self.make_block(new_values, refs=refs) return block.convert(copy=False, using_cow=using_cow) @final def replace_list( self, src_list: Iterable[Any], dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, using_cow: bool = False, ) -> list[Block]: """ See BlockManager.replace_list docstring. """ values = self.values if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 if using_cow and inplace: blk = self.copy(deep=self.refs.has_reference()) else: blk = self if inplace else self.copy() values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) ] if not len(pairs): if using_cow: return [self.copy(deep=False)] # shortcut, nothing to replace return [self] if inplace else [self.copy()] src_len = len(pairs) - 1 if is_string_dtype(values.dtype): # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations na_mask = ~isna(values) masks: Iterable[npt.NDArray[np.bool_]] = ( extract_bool_array( cast( ArrayLike, compare_or_regex_search( values, s[0], regex=regex, mask=na_mask ), ) ) for s in pairs ) else: # GH#38086 faster if we know we dont need to check for regex masks = (missing.mask_missing(values, s[0]) for s in pairs) # Materialize if inplace = True, since the masks can change # as we replace if inplace: masks = list(masks) if using_cow and inplace: # Don't set up refs here, otherwise we will think that we have # references when we check again later rb = [self] else: rb = [self if inplace else self.copy()] for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] # GH-39338: _replace_coerce can split a block into # single-column blocks, so track the index so we know # where to index into the mask for blk_num, blk in enumerate(rb): if len(rb) == 1: m = mask else: mib = mask assert not isinstance(mib, bool) m = mib[blk_num : blk_num + 1] # error: Argument "mask" to "_replace_coerce" of "Block" has # incompatible type "Union[ExtensionArray, ndarray[Any, Any], bool]"; # expected "ndarray[Any, dtype[bool_]]" result = blk._replace_coerce( to_replace=src, value=dest, mask=m, inplace=inplace, regex=regex, using_cow=using_cow, ) if convert and blk.is_object and not all(x is None for x in dest_list): # GH#44498 avoid unwanted cast-back result = extend_blocks( [ b.convert(copy=True and not using_cow, using_cow=using_cow) for b in result ] ) new_rb.extend(result) rb = new_rb return rb @final def _replace_coerce( self, to_replace, value, mask: npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, using_cow: bool = False, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another value. Parameters ---------- to_replace : object or pattern Scalar to replace or regular expression to match. value : object Replacement object. mask : np.ndarray[bool] True indicate corresponding element is ignored. inplace : bool, default True Perform inplace modification. regex : bool, default False If true, perform regular expression substitution. Returns ------- List[Block] """ if should_use_regex(regex, to_replace): return self._replace_regex( to_replace, value, inplace=inplace, mask=mask, ) else: if value is None: # gh-45601, gh-45836, gh-46634 if mask.any(): has_ref = self.refs.has_reference() nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) if (nb is self or using_cow) and not inplace: nb = nb.copy() elif inplace and has_ref and nb.refs.has_reference(): # no copy in astype and we had refs before nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] return self.replace( to_replace=to_replace, value=value, inplace=inplace, mask=mask, using_cow=using_cow, ) # --------------------------------------------------------------------- # 2D Methods - Shared by NumpyBlock and NDArrayBackedExtensionBlock # but not ExtensionBlock def _maybe_squeeze_arg(self, arg: np.ndarray) -> np.ndarray: """ For compatibility with 1D-only ExtensionArrays. """ return arg def _unwrap_setitem_indexer(self, indexer): """ For compatibility with 1D-only ExtensionArrays. """ return indexer # NB: this cannot be made cache_readonly because in mgr.set_values we pin # new .values that can have different shape GH#42631 @property def shape(self) -> Shape: return self.values.shape def iget(self, i: int | tuple[int, int] | tuple[slice, int]) -> np.ndarray: # In the case where we have a tuple[slice, int], the slice will always # be slice(None) # Note: only reached with self.ndim == 2 # Invalid index type "Union[int, Tuple[int, int], Tuple[slice, int]]" # for "Union[ndarray[Any, Any], ExtensionArray]"; expected type # "Union[int, integer[Any]]" return self.values[i] # type: ignore[index] def _slice( self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp] ) -> ArrayLike: """return a slice of my values""" return self.values[slicer] def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: """ Modify block values in-place with new item value. If copy=True, first copy the underlying values in place before modifying (for Copy-on-Write). Notes ----- `set_inplace` never creates a new array or new Block, whereas `setitem` _may_ create a new array and always creates a new Block. Caller is responsible for checking values.dtype == self.dtype. """ if copy: self.values = self.values.copy() self.values[locs] = values def take_nd( self, indexer: npt.NDArray[np.intp], axis: AxisInt, new_mgr_locs: BlockPlacement | None = None, fill_value=lib.no_default, ) -> Block: """ Take values according to indexer and return them as a block. """ values = self.values if fill_value is lib.no_default: fill_value = self.fill_value allow_fill = False else: allow_fill = True # Note: algos.take_nd has upcast logic similar to coerce_to_target_dtype new_values = algos.take_nd( values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value ) # Called from three places in managers, all of which satisfy # these assertions if isinstance(self, ExtensionBlock): # NB: in this case, the 'axis' kwarg will be ignored in the # algos.take_nd call above. assert not (self.ndim == 1 and new_mgr_locs is None) assert not (axis == 0 and new_mgr_locs is None) if new_mgr_locs is None: new_mgr_locs = self._mgr_locs if not is_dtype_equal(new_values.dtype, self.dtype): return self.make_block(new_values, new_mgr_locs) else: return self.make_block_same_class(new_values, new_mgr_locs) def _unstack( self, unstacker, fill_value, new_placement: npt.NDArray[np.intp], needs_masking: npt.NDArray[np.bool_], ): """ Return a list of unstacked blocks of self Parameters ---------- unstacker : reshape._Unstacker fill_value : int Only used in ExtensionBlock._unstack new_placement : np.ndarray[np.intp] allow_fill : bool needs_masking : np.ndarray[bool] Returns ------- blocks : list of Block New blocks of unstacked values. mask : array-like of bool The mask of columns of `blocks` we should keep. """ new_values, mask = unstacker.get_new_values( self.values.T, fill_value=fill_value ) mask = mask.any(0) # TODO: in all tests we have mask.all(); can we rely on that? # Note: these next two lines ensure that # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks) # which the calling function needs in order to pass verify_integrity=False # to the BlockManager constructor new_values = new_values.T[mask] new_placement = new_placement[mask] bp = BlockPlacement(new_placement) blocks = [new_block_2d(new_values, placement=bp)] return blocks, mask # --------------------------------------------------------------------- def setitem(self, indexer, value, using_cow: bool = False) -> Block: """ Attempt self.values[indexer] = value, possibly creating a new array. Parameters ---------- indexer : tuple, list-like, array-like, slice, int The subset of self.values to set value : object The value being set using_cow: bool, default False Signaling if CoW is used. Returns ------- Block Notes ----- `indexer` is a direct slice/positional indexer. `value` must be a compatible shape. """ value = self._standardize_fill_value(value) values = cast(np.ndarray, self.values) if self.ndim == 2: values = values.T # length checking check_setitem_lengths(indexer, value, values) value = extract_array(value, extract_numpy=True) try: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: # current dtype cannot store value, coerce to common dtype nb = self.coerce_to_target_dtype(value) return nb.setitem(indexer, value) else: if self.dtype == _dtype_obj: # TODO: avoid having to construct values[indexer] vi = values[indexer] if lib.is_list_like(vi): # checking lib.is_scalar here fails on # test_iloc_setitem_custom_object casted = setitem_datetimelike_compat(values, len(vi), casted) if using_cow and self.refs.has_reference(): values = values.copy() self = self.make_block_same_class( values.T if values.ndim == 2 else values ) if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] values[indexer] = casted return self def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block Return the resulting block(s). Parameters ---------- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object using_cow: bool, default False Returns ------- List[Block] """ orig_mask = mask values = cast(np.ndarray, self.values) mask, noop = validate_putmask(values.T, mask) assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) if new is lib.no_default: new = self.fill_value new = self._standardize_fill_value(new) new = extract_array(new, extract_numpy=True) if noop: if using_cow: return [self.copy(deep=False)] return [self] try: casted = np_can_hold_element(values.dtype, new) if using_cow and self.refs.has_reference(): # Do this here to avoid copying twice values = values.copy() self = self.make_block_same_class(values) putmask_without_repeat(values.T, mask, casted) if using_cow: return [self.copy(deep=False)] return [self] except LossySetitemError: if self.ndim == 1 or self.shape[0] == 1: # no need to split columns if not is_list_like(new): # using just new[indexer] can't save us the need to cast return self.coerce_to_target_dtype(new).putmask(mask, new) else: indexer = mask.nonzero()[0] nb = self.setitem(indexer, new[indexer], using_cow=using_cow) return [nb] else: is_array = isinstance(new, np.ndarray) res_blocks = [] nbs = self._split() for i, nb in enumerate(nbs): n = new if is_array: # we have a different value per-column n = new[:, i : i + 1] submask = orig_mask[:, i : i + 1] rbs = nb.putmask(submask, n, using_cow=using_cow) res_blocks.extend(rbs) return res_blocks def where( self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False ) -> list[Block]: """ evaluate the block; return result block(s) from the result Parameters ---------- other : a ndarray/object cond : np.ndarray[bool], SparseArray[bool], or BooleanArray _downcast : str or None, default "infer" Private because we only specify it when calling from fillna. Returns ------- List[Block] """ assert cond.ndim == self.ndim assert not isinstance(other, (ABCIndex, ABCSeries, ABCDataFrame)) transpose = self.ndim == 2 cond = extract_bool_array(cond) # EABlocks override where values = cast(np.ndarray, self.values) orig_other = other if transpose: values = values.T icond, noop = validate_putmask(values, ~cond) if noop: # GH-39595: Always return a copy; short-circuit up/downcasting if using_cow: return [self.copy(deep=False)] return [self.copy()] if other is lib.no_default: other = self.fill_value other = self._standardize_fill_value(other) try: # try/except here is equivalent to a self._can_hold_element check, # but this gets us back 'casted' which we will re-use below; # without using 'casted', expressions.where may do unwanted upcasts. casted = np_can_hold_element(values.dtype, other) except (ValueError, TypeError, LossySetitemError): # we cannot coerce, return a compat dtype if self.ndim == 1 or self.shape[0] == 1: # no need to split columns block = self.coerce_to_target_dtype(other) blocks = block.where(orig_other, cond, using_cow=using_cow) return self._maybe_downcast( blocks, downcast=_downcast, using_cow=using_cow ) else: # since _maybe_downcast would split blocks anyway, we # can avoid some potential upcast/downcast by splitting # on the front end. is_array = isinstance(other, (np.ndarray, ExtensionArray)) res_blocks = [] nbs = self._split() for i, nb in enumerate(nbs): oth = other if is_array: # we have a different value per-column oth = other[:, i : i + 1] submask = cond[:, i : i + 1] rbs = nb.where( oth, submask, _downcast=_downcast, using_cow=using_cow ) res_blocks.extend(rbs) return res_blocks else: other = casted alt = setitem_datetimelike_compat(values, icond.sum(), other) if alt is not other: if is_list_like(other) and len(other) < len(values): # call np.where with other to get the appropriate ValueError np.where(~icond, values, other) raise NotImplementedError( "This should not be reached; call to np.where above is " "expected to raise ValueError. Please report a bug at " "github.com/pandas-dev/pandas" ) result = values.copy() np.putmask(result, icond, alt) else: # By the time we get here, we should have all Series/Index # args extracted to ndarray if ( is_list_like(other) and not isinstance(other, np.ndarray) and len(other) == self.shape[-1] ): # If we don't do this broadcasting here, then expressions.where # will broadcast a 1D other to be row-like instead of # column-like. other = np.array(other).reshape(values.shape) # If lengths don't match (or len(other)==1), we will raise # inside expressions.where, see test_series_where # Note: expressions.where may upcast. result = expressions.where(~icond, values, other) # The np_can_hold_element check _should_ ensure that we always # have result.dtype == self.dtype here. if transpose: result = result.T return [self.make_block(result)] def fillna( self, value, limit: int | None = None, inplace: bool = False, downcast=None, using_cow: bool = False, ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to ObjectBlock and try again """ # Caller is responsible for validating limit; if int it is strictly positive inplace = validate_bool_kwarg(inplace, "inplace") if not self._can_hold_na: # can short-circuit the isna call noop = True else: mask = isna(self.values) mask, noop = validate_putmask(self.values, mask) if noop: # we can't process the value, but nothing to do if inplace: if using_cow: return [self.copy(deep=False)] # Arbitrarily imposing the convention that we ignore downcast # on no-op when inplace=True return [self] else: # GH#45423 consistent downcasting on no-ops. nb = self.copy(deep=not using_cow) nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow) return nbs if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False if inplace: nbs = self.putmask(mask.T, value, using_cow=using_cow) else: # without _downcast, we would break # test_fillna_dtype_conversion_equiv_replace nbs = self.where(value, ~mask.T, _downcast=False) # Note: blk._maybe_downcast vs self._maybe_downcast(nbs) # makes a difference bc blk may have object dtype, which has # different behavior in _maybe_downcast. return extend_blocks( [ blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow) for blk in nbs ] ) def interpolate( self, *, method: FillnaOptions = "pad", axis: AxisInt = 0, index: Index | None = None, inplace: bool = False, limit: int | None = None, limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, downcast: str | None = None, using_cow: bool = False, **kwargs, ) -> list[Block]: inplace = validate_bool_kwarg(inplace, "inplace") if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] try: m = missing.clean_fill_method(method) except ValueError: m = None if m is None and self.dtype.kind != "f": # only deal with floats # bc we already checked that can_hold_na, we don't have int dtype here # test_interp_basic checks that we make a copy here if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0: # split improves performance in ndarray.copy() return self.split_and_operate( type(self).interpolate, method=method, axis=axis, index=index, inplace=inplace, limit=limit, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, downcast=downcast, **kwargs, ) refs = None if inplace: if using_cow and self.refs.has_reference(): data = self.values.copy() else: data = self.values refs = self.refs else: data = self.values.copy() data = cast(np.ndarray, data) # bc overridden by ExtensionBlock missing.interpolate_array_2d( data, method=method, axis=axis, index=index, limit=limit, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, **kwargs, ) nb = self.make_block_same_class(data, refs=refs) return nb._maybe_downcast([nb], downcast, using_cow) def diff(self, n: int, axis: AxisInt = 1) -> list[Block]: """return block for the diff of the values""" # only reached with ndim == 2 and axis == 1 new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] def shift( self, periods: int, axis: AxisInt = 0, fill_value: Any = None ) -> list[Block]: """shift the block by periods, possibly upcast""" # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also # Note: periods is never 0 here, as that is handled at the top of # NDFrame.shift. If that ever changes, we can do a check for periods=0 # and possibly avoid coercing. if not lib.is_scalar(fill_value) and self.dtype != _dtype_obj: # with object dtype there is nothing to promote, and the user can # pass pretty much any weird fill_value they like # see test_shift_object_non_scalar_fill raise ValueError("fill_value must be a scalar") fill_value = self._standardize_fill_value(fill_value) try: # error: Argument 1 to "np_can_hold_element" has incompatible type # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" casted = np_can_hold_element( self.dtype, fill_value # type: ignore[arg-type] ) except LossySetitemError: nb = self.coerce_to_target_dtype(fill_value) return nb.shift(periods, axis=axis, fill_value=fill_value) else: values = cast(np.ndarray, self.values) new_values = shift(values, periods, axis, casted) return [self.make_block(new_values)] @final def quantile( self, qs: Index, # with dtype float64 interpolation: QuantileInterpolation = "linear", axis: AxisInt = 0, ) -> Block: """ compute the quantiles of the Parameters ---------- qs : Index The quantiles to be computed in float64. interpolation : str, default 'linear' Type of interpolation. axis : int, default 0 Axis to compute. Returns ------- Block """ # We should always have ndim == 2 because Series dispatches to DataFrame assert self.ndim == 2 assert axis == 1 # only ever called this way assert is_list_like(qs) # caller is responsible for this result = quantile_compat(self.values, np.asarray(qs._values), interpolation) # ensure_block_shape needed for cases where we start with EA and result # is ndarray, e.g. IntegerArray, SparseArray result = ensure_block_shape(result, ndim=2) return new_block_2d(result, placement=self._mgr_locs) def round(self, decimals: int, using_cow: bool = False) -> Block: """ Rounds the values. If the block is not of an integer or float dtype, nothing happens. This is consistent with DataFrame.round behavivor. (Note: Series.round would raise) Parameters ---------- decimals: int, Number of decimal places to round to. Caller is responsible for validating this using_cow: bool, Whether Copy on Write is enabled right now """ if not self.is_numeric or self.is_bool: return self.copy(deep=not using_cow) refs = None # TODO: round only defined on BaseMaskedArray # Series also does this, so would need to fix both places # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" # has no attribute "round" values = self.values.round(decimals) # type: ignore[union-attr] if values is self.values: refs = self.refs if not using_cow: # Normally would need to do this before, but # numpy only returns same array when round operation # is no-op # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 values = values.copy() return self.make_block_same_class(values, refs=refs) # --------------------------------------------------------------------- # Abstract Methods Overridden By EABackedBlock and NumpyBlock def delete(self, loc) -> list[Block]: """Deletes the locs from the block. We split the block to avoid copying the underlying data. We create new blocks for every connected segment of the initial block that is not deleted. The new blocks point to the initial array. """ if not is_list_like(loc): loc = [loc] if self.ndim == 1: values = cast(np.ndarray, self.values) values = np.delete(values, loc) mgr_locs = self._mgr_locs.delete(loc) return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] if np.max(loc) >= self.values.shape[0]: raise IndexError # Add one out-of-bounds indexer as maximum to collect # all columns after our last indexer if any loc = np.concatenate([loc, [self.values.shape[0]]]) mgr_locs_arr = self._mgr_locs.as_array new_blocks: list[Block] = [] previous_loc = -1 # TODO(CoW): This is tricky, if parent block goes out of scope # all split blocks are referencing each other even though they # don't share data refs = self.refs if self.refs.has_reference() else None for idx in loc: if idx == previous_loc + 1: # There is no column between current and last idx pass else: # No overload variant of "__getitem__" of "ExtensionArray" matches # argument type "Tuple[slice, slice]" values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa locs = mgr_locs_arr[previous_loc + 1 : idx] nb = type(self)( values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs ) new_blocks.append(nb) previous_loc = idx return new_blocks @property def is_view(self) -> bool: """return a boolean if I am possibly a view""" raise AbstractMethodError(self) @property def array_values(self) -> ExtensionArray: """ The array that Series.array returns. Always an ExtensionArray. """ raise AbstractMethodError(self) def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: """ return an internal format, currently just the ndarray this is often overridden to handle to_dense like operations """ raise AbstractMethodError(self) def values_for_json(self) -> np.ndarray: raise AbstractMethodError(self) class EABackedBlock(Block): """ Mixin for Block subclasses backed by ExtensionArray. """ values: ExtensionArray def setitem(self, indexer, value, using_cow: bool = False): """ Attempt self.values[indexer] = value, possibly creating a new array. This differs from Block.setitem by not allowing setitem to change the dtype of the Block. Parameters ---------- indexer : tuple, list-like, array-like, slice, int The subset of self.values to set value : object The value being set using_cow: bool, default False Signaling if CoW is used. Returns ------- Block Notes ----- `indexer` is a direct slice/positional indexer. `value` must be a compatible shape. """ orig_indexer = indexer orig_value = value indexer = self._unwrap_setitem_indexer(indexer) value = self._maybe_squeeze_arg(value) values = self.values if values.ndim == 2: # TODO(GH#45419): string[pyarrow] tests break if we transpose # unconditionally values = values.T check_setitem_lengths(indexer, value, values) try: values[indexer] = value except (ValueError, TypeError) as err: _catch_deprecated_value_error(err) if is_interval_dtype(self.dtype): # see TestSetitemFloatIntervalWithIntIntervalValues nb = self.coerce_to_target_dtype(orig_value) return nb.setitem(orig_indexer, orig_value) elif isinstance(self, NDArrayBackedExtensionBlock): nb = self.coerce_to_target_dtype(orig_value) return nb.setitem(orig_indexer, orig_value) else: raise else: return self def where( self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False ) -> list[Block]: # _downcast private bc we only specify it when calling from fillna arr = self.values.T cond = extract_bool_array(cond) orig_other = other orig_cond = cond other = self._maybe_squeeze_arg(other) cond = self._maybe_squeeze_arg(cond) if other is lib.no_default: other = self.fill_value icond, noop = validate_putmask(arr, ~cond) if noop: # GH#44181, GH#45135 # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast if using_cow: return [self.copy(deep=False)] return [self.copy()] try: res_values = arr._where(cond, other).T except (ValueError, TypeError) as err: _catch_deprecated_value_error(err) if self.ndim == 1 or self.shape[0] == 1: if is_interval_dtype(self.dtype): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( nbs, downcast=_downcast, using_cow=using_cow ) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( nbs, downcast=_downcast, using_cow=using_cow ) else: raise else: # Same pattern we use in Block.putmask is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) res_blocks = [] nbs = self._split() for i, nb in enumerate(nbs): n = orig_other if is_array: # we have a different value per-column n = orig_other[:, i : i + 1] submask = orig_cond[:, i : i + 1] rbs = nb.where(n, submask, using_cow=using_cow) res_blocks.extend(rbs) return res_blocks nb = self.make_block_same_class(res_values) return [nb] def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: """ See Block.putmask.__doc__ """ mask = extract_bool_array(mask) if new is lib.no_default: new = self.fill_value values = self.values if values.ndim == 2: values = values.T orig_new = new orig_mask = mask new = self._maybe_squeeze_arg(new) mask = self._maybe_squeeze_arg(mask) if not mask.any(): if using_cow: return [self.copy(deep=False)] return [self] if using_cow and self.refs.has_reference(): values = values.copy() self = self.make_block_same_class( # type: ignore[assignment] values.T if values.ndim == 2 else values ) try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) except (TypeError, ValueError) as err: _catch_deprecated_value_error(err) if self.ndim == 1 or self.shape[0] == 1: if is_interval_dtype(self.dtype): # Discussion about what we want to support in the general # case GH#39584 blk = self.coerce_to_target_dtype(orig_new) return blk.putmask(orig_mask, orig_new) elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) blk = self.coerce_to_target_dtype(orig_new) return blk.putmask(orig_mask, orig_new) else: raise else: # Same pattern we use in Block.putmask is_array = isinstance(orig_new, (np.ndarray, ExtensionArray)) res_blocks = [] nbs = self._split() for i, nb in enumerate(nbs): n = orig_new if is_array: # we have a different value per-column n = orig_new[:, i : i + 1] submask = orig_mask[:, i : i + 1] rbs = nb.putmask(submask, n) res_blocks.extend(rbs) return res_blocks return [self] def delete(self, loc) -> list[Block]: # This will be unnecessary if/when __array_function__ is implemented if self.ndim == 1: values = self.values.delete(loc) mgr_locs = self._mgr_locs.delete(loc) return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] elif self.values.ndim == 1: # We get here through to_stata return [] return super().delete(loc) @cache_readonly def array_values(self) -> ExtensionArray: return self.values def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: """ return object dtype as boxed values, such as Timestamps/Timedelta """ values: ArrayLike = self.values if dtype == _dtype_obj: values = values.astype(object) # TODO(EA2D): reshape not needed with 2D EAs return np.asarray(values).reshape(self.shape) def values_for_json(self) -> np.ndarray: return np.asarray(self.values) def interpolate( self, *, method: FillnaOptions = "pad", axis: int = 0, inplace: bool = False, limit: int | None = None, fill_value=None, using_cow: bool = False, **kwargs, ): values = self.values if values.ndim == 2 and axis == 0: # NDArrayBackedExtensionArray.fillna assumes axis=1 new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T else: new_values = values.fillna(value=fill_value, method=method, limit=limit) return self.make_block_same_class(new_values) class ExtensionBlock(libinternals.Block, EABackedBlock): """ Block for holding extension types. Notes ----- This holds all 3rd-party extension array types. It's also the immediate parent class for our internal extension types' blocks. ExtensionArrays are limited to 1-D. """ _can_consolidate = False _validate_ndim = False is_extension = True values: ExtensionArray def fillna( self, value, limit: int | None = None, inplace: bool = False, downcast=None, using_cow: bool = False, ) -> list[Block]: if is_interval_dtype(self.dtype): # Block.fillna handles coercion (test_fillna_interval) return super().fillna( value=value, limit=limit, inplace=inplace, downcast=downcast, using_cow=using_cow, ) if using_cow and self._can_hold_na and not self.values._hasna: refs = self.refs new_values = self.values else: refs = None new_values = self.values.fillna(value=value, method=None, limit=limit) nb = self.make_block_same_class(new_values, refs=refs) return nb._maybe_downcast([nb], downcast, using_cow=using_cow) @cache_readonly def shape(self) -> Shape: # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: return (len(self.values),) return len(self._mgr_locs), len(self.values) def iget(self, i: int | tuple[int, int] | tuple[slice, int]): # In the case where we have a tuple[slice, int], the slice will always # be slice(None) # We _could_ make the annotation more specific, but mypy would # complain about override mismatch: # Literal[0] | tuple[Literal[0], int] | tuple[slice, int] # Note: only reached with self.ndim == 2 if isinstance(i, tuple): # TODO(EA2D): unnecessary with 2D EAs col, loc = i if not com.is_null_slice(col) and col != 0: raise IndexError(f"{self} only contains one item") if isinstance(col, slice): # the is_null_slice check above assures that col is slice(None) # so what we want is a view on all our columns and row loc if loc < 0: loc += len(self.values) # Note: loc:loc+1 vs [[loc]] makes a difference when called # from fast_xs because we want to get a view back. return self.values[loc : loc + 1] return self.values[loc] else: if i != 0: raise IndexError(f"{self} only contains one item") return self.values def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: # When an ndarray, we should have locs.tolist() == [0] # When a BlockPlacement we should have list(locs) == [0] if copy: self.values = self.values.copy() self.values[:] = values def _maybe_squeeze_arg(self, arg): """ If necessary, squeeze a (N, 1) ndarray to (N,) """ # e.g. if we are passed a 2D mask for putmask if ( isinstance(arg, (np.ndarray, ExtensionArray)) and arg.ndim == self.values.ndim + 1 ): # TODO(EA2D): unnecessary with 2D EAs assert arg.shape[1] == 1 # error: No overload variant of "__getitem__" of "ExtensionArray" # matches argument type "Tuple[slice, int]" arg = arg[:, 0] # type: ignore[call-overload] elif isinstance(arg, ABCDataFrame): # 2022-01-06 only reached for setitem # TODO: should we avoid getting here with DataFrame? assert arg.shape[1] == 1 arg = arg._ixs(0, axis=1)._values return arg def _unwrap_setitem_indexer(self, indexer): """ Adapt a 2D-indexer to our 1D values. This is intended for 'setitem', not 'iget' or '_slice'. """ # TODO: ATM this doesn't work for iget/_slice, can we change that? if isinstance(indexer, tuple) and len(indexer) == 2: # TODO(EA2D): not needed with 2D EAs # Should never have length > 2. Caller is responsible for checking. # Length 1 is reached vis setitem_single_block and setitem_single_column # each of which pass indexer=(pi,) if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer): # GH#44703 went through indexing.maybe_convert_ix first, second = indexer if not ( second.size == 1 and (second == 0).all() and first.shape[1] == 1 ): raise NotImplementedError( "This should not be reached. Please report a bug at " "github.com/pandas-dev/pandas/" ) indexer = first[:, 0] elif lib.is_integer(indexer[1]) and indexer[1] == 0: # reached via setitem_single_block passing the whole indexer indexer = indexer[0] elif com.is_null_slice(indexer[1]): indexer = indexer[0] elif is_list_like(indexer[1]) and indexer[1][0] == 0: indexer = indexer[0] else: raise NotImplementedError( "This should not be reached. Please report a bug at " "github.com/pandas-dev/pandas/" ) return indexer @property def is_view(self) -> bool: """Extension arrays are never treated as views.""" return False @cache_readonly def is_numeric(self): return self.values.dtype._is_numeric def _slice( self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp] ) -> ExtensionArray: """ Return a slice of my values. Parameters ---------- slicer : slice, ndarray[int], or ndarray[bool] Valid (non-reducing) indexer for self.values. Returns ------- ExtensionArray """ # Notes: ndarray[bool] is only reachable when via getitem_mgr, which # is only for Series, i.e. self.ndim == 1. # return same dims as we currently have if self.ndim == 2: # reached via getitem_block via _slice_take_blocks_ax0 # TODO(EA2D): won't be necessary with 2D EAs if not isinstance(slicer, slice): raise AssertionError( "invalid slicing for a 1-ndim ExtensionArray", slicer ) # GH#32959 only full-slicers along fake-dim0 are valid # TODO(EA2D): won't be necessary with 2D EAs # range(1) instead of self._mgr_locs to avoid exception on [::-1] # see test_iloc_getitem_slice_negative_step_ea_block new_locs = range(1)[slicer] if not len(new_locs): raise AssertionError( "invalid slicing for a 1-ndim ExtensionArray", slicer ) slicer = slice(None) return self.values[slicer] @final def getitem_block_index(self, slicer: slice) -> ExtensionBlock: """ Perform __getitem__-like specialized to slicing along index. """ # GH#42787 in principle this is equivalent to values[..., slicer], but we don't # require subclasses of ExtensionArray to support that form (for now). new_values = self.values[slicer] return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) def diff(self, n: int, axis: AxisInt = 1) -> list[Block]: # only reached with ndim == 2 and axis == 1 # TODO(EA2D): Can share with NDArrayBackedExtensionBlock new_values = algos.diff(self.values, n, axis=0) return [self.make_block(values=new_values)] def shift( self, periods: int, axis: AxisInt = 0, fill_value: Any = None ) -> list[Block]: """ Shift the block by `periods`. Dispatches to underlying ExtensionArray and re-boxes in an ExtensionBlock. """ new_values = self.values.shift(periods=periods, fill_value=fill_value) return [self.make_block_same_class(new_values)] def _unstack( self, unstacker, fill_value, new_placement: npt.NDArray[np.intp], needs_masking: npt.NDArray[np.bool_], ): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require # converting to a 2-D ndarray of objects. # Instead, we unstack an ndarray of integer positions, followed by # a `take` on the actual values. # Caller is responsible for ensuring self.shape[-1] == len(unstacker.index) new_values, mask = unstacker.arange_result # Note: these next two lines ensure that # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks) # which the calling function needs in order to pass verify_integrity=False # to the BlockManager constructor new_values = new_values.T[mask] new_placement = new_placement[mask] # needs_masking[i] calculated once in BlockManager.unstack tells # us if there are any -1s in the relevant indices. When False, # that allows us to go through a faster path in 'take', among # other things avoiding e.g. Categorical._validate_scalar. blocks = [ # TODO: could cast to object depending on fill_value? type(self)( self.values.take( indices, allow_fill=needs_masking[i], fill_value=fill_value ), BlockPlacement(place), ndim=2, ) for i, (indices, place) in enumerate(zip(new_values, new_placement)) ] return blocks, mask class NumpyBlock(libinternals.NumpyBlock, Block): values: np.ndarray @property def is_view(self) -> bool: """return a boolean if I am possibly a view""" return self.values.base is not None @property def array_values(self) -> ExtensionArray: return PandasArray(self.values) def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: if dtype == _dtype_obj: return self.values.astype(_dtype_obj) return self.values def values_for_json(self) -> np.ndarray: return self.values class NumericBlock(NumpyBlock): __slots__ = () is_numeric = True class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray """ values: NDArrayBackedExtensionArray # error: Signature of "is_extension" incompatible with supertype "Block" @cache_readonly def is_extension(self) -> bool: # type: ignore[override] # i.e. datetime64tz, PeriodDtype return not isinstance(self.dtype, np.dtype) @property def is_view(self) -> bool: """return a boolean if I am possibly a view""" # check the ndarray values of the DatetimeIndex values return self.values._ndarray.base is not None def diff(self, n: int, axis: AxisInt = 0) -> list[Block]: """ 1st discrete difference. Parameters ---------- n : int Number of periods to diff. axis : int, default 0 Axis to diff upon. Returns ------- A list with a new Block. Notes ----- The arguments here are mimicking shift so they are called correctly by apply. """ # only reached with ndim == 2 and axis == 1 values = self.values new_values = values - values.shift(n, axis=axis) return [self.make_block(new_values)] def shift( self, periods: int, axis: AxisInt = 0, fill_value: Any = None ) -> list[Block]: values = self.values new_values = values.shift(periods, fill_value=fill_value, axis=axis) return [self.make_block_same_class(new_values)] def _catch_deprecated_value_error(err: Exception) -> None: """ We catch ValueError for now, but only a specific one raised by DatetimeArray which will no longer be raised in version.2.0. """ if isinstance(err, ValueError): if isinstance(err, IncompatibleFrequency): pass elif "'value.closed' is" in str(err): # IntervalDtype mismatched 'closed' pass class DatetimeLikeBlock(NDArrayBackedExtensionBlock): """Block for datetime64[ns], timedelta64[ns].""" __slots__ = () is_numeric = False values: DatetimeArray | TimedeltaArray def values_for_json(self) -> np.ndarray: return self.values._ndarray def interpolate( self, *, method: FillnaOptions = "pad", index: Index | None = None, axis: int = 0, inplace: bool = False, limit: int | None = None, fill_value=None, using_cow: bool = False, **kwargs, ): values = self.values # error: Non-overlapping equality check (left operand type: # "Literal['backfill', 'bfill', 'ffill', 'pad']", right operand type: # "Literal['linear']") [comparison-overlap] if method == "linear": # type: ignore[comparison-overlap] # TODO: GH#50950 implement for arbitrary EAs refs = None if using_cow: if inplace and not self.refs.has_reference(): data_out = values._ndarray refs = self.refs else: data_out = values._ndarray.copy() else: data_out = values._ndarray if inplace else values._ndarray.copy() missing.interpolate_array_2d( data_out, method=method, limit=limit, index=index, axis=axis ) new_values = type(values)._simple_new(data_out, dtype=values.dtype) return self.make_block_same_class(new_values, refs=refs) elif values.ndim == 2 and axis == 0: # NDArrayBackedExtensionArray.fillna assumes axis=1 new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T else: new_values = values.fillna(value=fill_value, method=method, limit=limit) return self.make_block_same_class(new_values) class DatetimeTZBlock(DatetimeLikeBlock): """implement a datetime64 block with a tz attribute""" values: DatetimeArray __slots__ = () is_extension = True _validate_ndim = True _can_consolidate = False # Don't use values_for_json from DatetimeLikeBlock since it is # an invalid optimization here(drop the tz) values_for_json = NDArrayBackedExtensionBlock.values_for_json class ObjectBlock(NumpyBlock): __slots__ = () is_object = True @maybe_split def convert( self, *, copy: bool = True, using_cow: bool = False, ) -> list[Block]: """ attempt to cast any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! """ if self.dtype != _dtype_obj: # GH#50067 this should be impossible in ObjectBlock, but until # that is fixed, we short-circuit here. if using_cow: return [self.copy(deep=False)] return [self] values = self.values if values.ndim == 2: # maybe_split ensures we only get here with values.shape[0] == 1, # avoid doing .ravel as that might make a copy values = values[0] res_values = lib.maybe_convert_objects( values, convert_datetime=True, convert_timedelta=True, convert_period=True, convert_interval=True, ) refs = None if copy and res_values is values: res_values = values.copy() elif res_values is values and using_cow: refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) return [self.make_block(res_values, refs=refs)] # ----------------------------------------------------------------- # Constructor Helpers def maybe_coerce_values(values: ArrayLike) -> ArrayLike: """ Input validation for values passed to __init__. Ensure that any datetime64/timedelta64 dtypes are in nanoseconds. Ensure that we do not have string dtypes. Parameters ---------- values : np.ndarray or ExtensionArray Returns ------- values : np.ndarray or ExtensionArray """ # Caller is responsible for ensuring PandasArray is already extracted. if isinstance(values, np.ndarray): values = ensure_wrapped_if_datetimelike(values) if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None: # freq is only stored in DatetimeIndex/TimedeltaIndex, not in Series/DataFrame values = values._with_freq(None) return values def get_block_type(dtype: DtypeObj): """ Find the appropriate Block subclass to use for the given values and dtype. Parameters ---------- dtype : numpy or pandas dtype Returns ------- cls : class, subclass of Block """ # We use kind checks because it is much more performant # than is_foo_dtype kind = dtype.kind cls: type[Block] if isinstance(dtype, SparseDtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock elif isinstance(dtype, DatetimeTZDtype): cls = DatetimeTZBlock elif isinstance(dtype, PeriodDtype): cls = NDArrayBackedExtensionBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock elif kind in ["M", "m"]: cls = DatetimeLikeBlock elif kind in ["f", "c", "i", "u", "b"]: cls = NumericBlock else: cls = ObjectBlock return cls def new_block_2d( values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None ): # new_block specialized to case with # ndim=2 # isinstance(placement, BlockPlacement) # check_ndim/ensure_block_shape already checked klass = get_block_type(values.dtype) values = maybe_coerce_values(values) return klass(values, ndim=2, placement=placement, refs=refs) def new_block( values, placement, *, ndim: int, refs: BlockValuesRefs | None = None ) -> Block: # caller is responsible for ensuring values is NOT a PandasArray if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) check_ndim(values, placement, ndim) klass = get_block_type(values.dtype) values = maybe_coerce_values(values) return klass(values, ndim=ndim, placement=placement, refs=refs) def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: """ ndim inference and validation. Validates that values.ndim and ndim are consistent. Validates that len(values) and len(placement) are consistent. Parameters ---------- values : array-like placement : BlockPlacement ndim : int Raises ------ ValueError : the number of dimensions do not match """ if values.ndim > ndim: # Check for both np.ndarray and ExtensionArray raise ValueError( "Wrong number of dimensions. " f"values.ndim > ndim [{values.ndim} > {ndim}]" ) if not is_1d_only_ea_dtype(values.dtype): # TODO(EA2D): special case not needed with 2D EAs if values.ndim != ndim: raise ValueError( "Wrong number of dimensions. " f"values.ndim != ndim [{values.ndim} != {ndim}]" ) if len(placement) != len(values): raise ValueError( f"Wrong number of items passed {len(values)}, " f"placement implies {len(placement)}" ) elif ndim == 2 and len(placement) != 1: # TODO(EA2D): special case unnecessary with 2D EAs raise ValueError("need to split") def extract_pandas_array( values: np.ndarray | ExtensionArray, dtype: DtypeObj | None, ndim: int ) -> tuple[np.ndarray | ExtensionArray, DtypeObj | None]: """ Ensure that we don't allow PandasArray / PandasDtype in internals. """ # For now, blocks should be backed by ndarrays when possible. if isinstance(values, ABCPandasArray): values = values.to_numpy() if ndim and ndim > 1: # TODO(EA2D): special case not needed with 2D EAs values = np.atleast_2d(values) if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype return values, dtype # ----------------------------------------------------------------- def extend_blocks(result, blocks=None) -> list[Block]: """return a new extended blocks, given the result""" if blocks is None: blocks = [] if isinstance(result, list): for r in result: if isinstance(r, list): blocks.extend(r) else: blocks.append(r) else: assert isinstance(result, Block), type(result) blocks.append(result) return blocks def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ Reshape if possible to have values.ndim == ndim. """ if values.ndim < ndim: if not is_1d_only_ea_dtype(values.dtype): # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values) values = values.reshape(1, -1) return values def to_native_types( values: ArrayLike, *, na_rep: str = "nan", quoting=None, float_format=None, decimal: str = ".", **kwargs, ) -> np.ndarray: """convert to our native types format""" if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": # GH#40754 Convert categorical datetimes to datetime array values = algos.take_nd( values.categories._values, ensure_platform_int(values._codes), fill_value=na_rep, ) values = ensure_wrapped_if_datetimelike(values) if isinstance(values, (DatetimeArray, TimedeltaArray)): if values.ndim == 1: result = values._format_native_types(na_rep=na_rep, **kwargs) result = result.astype(object, copy=False) return result # GH#21734 Process every column separately, they might have different formats results_converted = [] for i in range(len(values)): result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs) results_converted.append(result.astype(object, copy=False)) return np.vstack(results_converted) elif values.dtype.kind == "f" and not is_sparse(values): # see GH#13418: no special formatting is desired at the # output (important for appropriate 'quoting' behaviour), # so do not pass it through the FloatArrayFormatter if float_format is None and decimal == ".": mask = isna(values) if not quoting: values = values.astype(str) else: values = np.array(values, dtype="object") values[mask] = na_rep values = values.astype(object, copy=False) return values from pandas.io.formats.format import FloatArrayFormatter formatter = FloatArrayFormatter( values, na_rep=na_rep, float_format=float_format, decimal=decimal, quoting=quoting, fixed_width=False, ) res = formatter.get_result_as_array() res = res.astype(object, copy=False) return res elif isinstance(values, ExtensionArray): mask = isna(values) new_values = np.asarray(values.astype(object)) new_values[mask] = na_rep return new_values else: mask = isna(values) itemsize = writers.word_len(na_rep) if values.dtype != _dtype_obj and not quoting and itemsize: values = values.astype(str) if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: # enlarge for the na_rep values = values.astype(f" ArrayLike: """ The array that Series.values returns (public attribute). This has some historical constraints, and is overridden in block subclasses to return the correct array (e.g. period returns object ndarray and datetimetz a datetime64[ns] ndarray instead of proper extension array). """ if isinstance(values, (PeriodArray, IntervalArray)): return values.astype(object) elif isinstance(values, (DatetimeArray, TimedeltaArray)): # NB: for datetime64tz this is different from np.asarray(values), since # that returns an object-dtype ndarray of Timestamps. # Avoid raising in .astype in casting from dt64tz to dt64 values = values._ndarray if isinstance(values, np.ndarray) and using_copy_on_write(): values = values.view() values.flags.writeable = False # TODO(CoW) we should also mark our ExtensionArrays as read-only return values