from datetime import datetime, timedelta import inspect import re from typing import TYPE_CHECKING, Any, List, Optional, Type, Union, cast import warnings import numpy as np from pandas._libs import NaT, algos as libalgos, internals as libinternals, lib, writers from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike, Scalar, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( astype_nansafe, convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, infer_dtype_from_scalar, maybe_box_datetimelike, maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects, ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_object_dtype, is_period_dtype, is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndexClass, ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat import pandas.core.algorithms as algos from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, DatetimeArray, ExtensionArray, PandasArray, PandasDtype, TimedeltaArray, ) from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, is_scalar_indexer, ) import pandas.core.missing as missing from pandas.core.nanops import nanpercentile if TYPE_CHECKING: from pandas import Index class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure Index-ignorant; let the container take care of that """ values: Union[np.ndarray, ExtensionArray] __slots__ = ["_mgr_locs", "values", "ndim"] is_numeric = False is_float = False is_integer = False is_complex = False is_datetime = False is_datetimetz = False is_timedelta = False is_bool = False is_object = False is_extension = False _can_hold_na = False _can_consolidate = True _validate_ndim = True @classmethod def _simple_new( cls, values: ArrayLike, placement: BlockPlacement, ndim: int ) -> "Block": """ Fastpath constructor, does *no* validation """ obj = object.__new__(cls) obj.ndim = ndim obj.values = values obj._mgr_locs = placement return obj def __init__(self, values, placement, ndim: int): """ Parameters ---------- values : np.ndarray or ExtensionArray placement : BlockPlacement (or castable) ndim : int 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame """ # TODO(EA2D): ndim will be unnecessary with 2D EAs self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = self._maybe_coerce_values(values) if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise ValueError( f"Wrong number of items passed {len(self.values)}, " f"placement implies {len(self.mgr_locs)}" ) def _maybe_coerce_values(self, values): """ Ensure we have correctly-typed values. Parameters ---------- values : np.ndarray, ExtensionArray, Index Returns ------- np.ndarray or ExtensionArray """ return values def _check_ndim(self, values, ndim): """ ndim inference and validation. Infers ndim from 'values' if not provided to __init__. Validates that values.ndim and ndim are consistent if and only if the class variable '_validate_ndim' is True. Parameters ---------- values : array-like ndim : int or None Returns ------- ndim : int Raises ------ ValueError : the number of dimensions do not match """ if ndim is None: ndim = values.ndim if self._validate_ndim and values.ndim != ndim: raise ValueError( "Wrong number of dimensions. " f"values.ndim != ndim [{values.ndim} != {ndim}]" ) return ndim @property def _holder(self): """ The array-like that can hold the underlying values. None for 'Block', overridden by subclasses that don't use an ndarray. """ return None @property def _consolidate_key(self): return self._can_consolidate, self.dtype.name @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ values = self.values values = cast(np.ndarray, values) return values.base is not None @property def is_categorical(self) -> bool: return self._holder is Categorical @property def is_datelike(self) -> bool: """ return True if I am a non-datelike """ return self.is_datetime or self.is_timedelta def external_values(self): """ The array that Series.values returns (public attribute). This has some historical constraints, and is overridden in block subclasses to return the correct array (e.g. period returns object ndarray and datetimetz a datetime64[ns] ndarray instead of proper extension array). """ return self.values def internal_values(self): """ The array that Series._values returns (internal values). """ return self.values def array_values(self) -> ExtensionArray: """ The array that Series.array returns. Always an ExtensionArray. """ return PandasArray(self.values) def get_values(self, dtype=None): """ return an internal format, currently just the ndarray this is often overridden to handle to_dense like operations """ if is_object_dtype(dtype): return self.values.astype(object) return self.values def get_block_values_for_json(self) -> np.ndarray: """ This is used in the JSON C code. """ # TODO(EA2D): reshape will be unnecessary with 2D EAs return np.asarray(self.values).reshape(self.shape) @property def fill_value(self): return np.nan @property def mgr_locs(self): return self._mgr_locs @mgr_locs.setter def mgr_locs(self, new_mgr_locs): if not isinstance(new_mgr_locs, libinternals.BlockPlacement): new_mgr_locs = libinternals.BlockPlacement(new_mgr_locs) self._mgr_locs = new_mgr_locs def make_block(self, values, placement=None) -> "Block": """ Create a new block, with type inference propagate any values that are not specified """ if placement is None: placement = self.mgr_locs if self.is_extension: values = _block_shape(values, ndim=self.ndim) return make_block(values, placement=placement, ndim=self.ndim) def make_block_same_class(self, values, placement=None, ndim=None): """ Wrap given values in a block of same type as self. """ if placement is None: placement = self.mgr_locs if ndim is None: ndim = self.ndim return type(self)(values, placement=placement, ndim=ndim) def __repr__(self) -> str: # don't want to print out all of the items here name = type(self).__name__ if self.ndim == 1: result = f"{name}: {len(self)} dtype: {self.dtype}" else: shape = " x ".join(str(s) for s in self.shape) result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}" return result def __len__(self) -> int: return len(self.values) def __getstate__(self): return self.mgr_locs.indexer, self.values def __setstate__(self, state): self.mgr_locs = libinternals.BlockPlacement(state[0]) self.values = state[1] self.ndim = self.values.ndim def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] def getitem_block(self, slicer, new_mgr_locs=None): """ Perform __getitem__-like, return result as block. As of now, only supports slices that preserve dimensionality. """ if new_mgr_locs is None: axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer new_mgr_locs = self.mgr_locs[axis0_slicer] elif not isinstance(new_mgr_locs, BlockPlacement): new_mgr_locs = BlockPlacement(new_mgr_locs) new_values = self._slice(slicer) if self._validate_ndim and new_values.ndim != self.ndim: raise ValueError("Only same dim slicing is allowed") return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) @property def shape(self): return self.values.shape @property def dtype(self): return self.values.dtype def iget(self, i): return self.values[i] def set_inplace(self, locs, values): """ Modify block values in-place with new item value. Notes ----- `set` never creates a new array or new Block, whereas `setitem` _may_ create a new array and always creates a new Block. """ self.values[locs] = values def delete(self, loc) -> None: """ Delete given loc(-s) from block in-place. """ self.values = np.delete(self.values, loc, 0) self.mgr_locs = self.mgr_locs.delete(loc) def apply(self, func, **kwargs) -> List["Block"]: """ apply the function to my values; return a block if we are not one """ with np.errstate(all="ignore"): result = func(self.values, **kwargs) return self._split_op_result(result) def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: # We will apply the function and reshape the result into a single-row # Block with the same mgr_locs; squeezing will be done at a higher level assert self.ndim == 2 try: result = func(self.values) except (TypeError, NotImplementedError): if ignore_failures: return [] raise if np.ndim(result) == 0: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) else: res_values = result.reshape(-1, 1) nb = self.make_block(res_values) return [nb] def _split_op_result(self, result) -> List["Block"]: # See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: # TODO(EA2D): unnecessary with 2D EAs # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] for i, loc in enumerate(self.mgr_locs): vals = result[i] block = self.make_block(values=vals, placement=[loc]) nbs.append(block) return nbs if not isinstance(result, Block): result = self.make_block(result) return [result] def fillna( self, value, limit=None, inplace: bool = False, downcast=None ) -> List["Block"]: """ fillna on the block with the value. If we fail, then convert to ObjectBlock and try again """ inplace = validate_bool_kwarg(inplace, "inplace") mask = isna(self.values) mask = _extract_bool_array(mask) if limit is not None: limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: if inplace: return [self] else: return [self.copy()] if self._can_hold_element(value): nb = self if inplace else self.copy() nb._putmask_simple(mask, value) # TODO: should be nb._maybe_downcast? return self._maybe_downcast([nb], downcast) # we can't process the value, but nothing to do if not mask.any(): return [self] if inplace else [self.copy()] # operate column-by-column def f(mask, val, idx): block = self.coerce_to_target_dtype(value) # slice out our block if idx is not None: # i.e. self.ndim == 2 block = block.getitem_block(slice(idx, idx + 1)) return block.fillna(value, limit=limit, inplace=inplace, downcast=None) return self.split_and_operate(None, f, inplace) def _split(self) -> List["Block"]: """ Split a block into a list of single-column blocks. """ assert self.ndim == 2 new_blocks = [] for i, ref_loc in enumerate(self.mgr_locs): vals = self.values[slice(i, i + 1)] nb = self.make_block(vals, [ref_loc]) new_blocks.append(nb) return new_blocks def split_and_operate( self, mask, f, inplace: bool, ignore_failures: bool = False ) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle masking which will not change a block unless needed. Parameters ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) inplace : bool ignore_failures : bool, default False Returns ------- list of blocks """ if mask is None: mask = np.broadcast_to(True, shape=self.shape) new_values = self.values def make_a_block(nv, ref_loc): if isinstance(nv, list): assert len(nv) == 1, nv assert isinstance(nv[0], Block) block = nv[0] else: # Put back the dimension that was taken from it and make # a block out of the result. nv = _block_shape(nv, ndim=self.ndim) block = self.make_block(values=nv, placement=ref_loc) return block # ndim == 1 if self.ndim == 1: if mask.any(): nv = f(mask, new_values, None) else: nv = new_values if inplace else new_values.copy() block = make_a_block(nv, self.mgr_locs) return [block] # ndim > 1 new_blocks = [] for i, ref_loc in enumerate(self.mgr_locs): m = mask[i] v = new_values[i] # need a new block if m.any() or m.size == 0: # Apply our function; we may ignore_failures if this is a # reduction that is dropping nuisance columns GH#37827 try: nv = f(m, v, i) except TypeError: if ignore_failures: continue else: raise else: nv = v if inplace else v.copy() block = make_a_block(nv, [ref_loc]) new_blocks.append(block) return new_blocks def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: # no need to downcast our float # unless indicated if downcast is None and (self.is_float or self.is_datelike): return blocks return extend_blocks([b.downcast(downcast) for b in blocks]) def downcast(self, dtypes=None) -> List["Block"]: """ try to downcast each item to the dict of dtypes if present """ # turn it off completely if dtypes is False: return [self] values = self.values if self.ndim == 1: # try to cast all non-floats here if dtypes is None: dtypes = "infer" nv = maybe_downcast_to_dtype(values, dtypes) return [self.make_block(nv)] # ndim > 1 if dtypes is None: return [self] if not (dtypes == "infer" or isinstance(dtypes, dict)): raise ValueError( "downcast must have a dictionary or 'infer' as its argument" ) elif dtypes != "infer": raise AssertionError("dtypes as dict is not supported yet") # operate column-by-column # this is expensive as it splits the blocks items-by-item def f(mask, val, idx): val = maybe_downcast_to_dtype(val, dtype="infer") return val return self.split_and_operate(None, f, False) def astype(self, dtype, copy: bool = False, errors: str = "raise"): """ Coerce to the new dtype. Parameters ---------- dtype : str, dtype convertible copy : bool, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'ignore' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object Returns ------- Block """ errors_legal_values = ("raise", "ignore") if errors not in errors_legal_values: invalid_arg = ( "Expected value of kwarg 'errors' to be one of " f"{list(errors_legal_values)}. Supplied value is '{errors}'" ) raise ValueError(invalid_arg) if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): msg = ( f"Expected an instance of {dtype.__name__}, " "but got the class instead. Try instantiating 'dtype'." ) raise TypeError(msg) if dtype is not None: dtype = pandas_dtype(dtype) # may need to convert to categorical if is_categorical_dtype(dtype): if is_categorical_dtype(self.values.dtype): # GH 10696/18593: update an existing categorical efficiently return self.make_block(self.values.astype(dtype, copy=copy)) return self.make_block(Categorical(self.values, dtype=dtype)) dtype = pandas_dtype(dtype) # astype processing if is_dtype_equal(self.dtype, dtype): if copy: return self.copy() return self # force the copy here if self.is_extension: try: values = self.values.astype(dtype) except (ValueError, TypeError): if errors == "ignore": values = self.values else: raise else: if issubclass(dtype.type, str): # use native type formatting for datetime/tz/timedelta if self.is_datelike: values = self.to_native_types().values # astype formatting else: # Because we have neither is_extension nor is_datelike, # self.values already has the correct shape values = self.values else: values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only vals1d = values.ravel() try: values = astype_nansafe(vals1d, dtype, copy=True) except (ValueError, TypeError): # e.g. astype_nansafe can fail on object-dtype of strings # trying to convert to float if errors == "raise": raise newb = self.copy() if copy else self return newb # TODO(EA2D): special case not needed with 2D EAs if isinstance(values, np.ndarray): values = values.reshape(self.shape) newb = self.make_block(values) if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: raise TypeError( f"cannot set astype for copy = [{copy}] for dtype " f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) return newb def convert( self, copy: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, ) -> List["Block"]: """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! """ return [self.copy()] if copy else [self] def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ dtype = self.values.dtype.type tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, dtype) return isinstance(element, dtype) def should_store(self, value: ArrayLike) -> bool: """ Should we set self.values[indexer] = value inplace or do we need to cast? Parameters ---------- value : np.ndarray or ExtensionArray Returns ------- bool """ return is_dtype_equal(value.dtype, self.dtype) def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format """ values = self.values mask = isna(values) itemsize = writers.word_len(na_rep) if not self.is_object and not quoting and itemsize: values = values.astype(str) if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: # enlarge for the na_rep values = values.astype(f" List["Block"]: """ replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. It is used in ObjectBlocks. It is here for API compatibility. """ inplace = validate_bool_kwarg(inplace, "inplace") original_to_replace = to_replace if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. return [self] if inplace else [self.copy()] values = self.values if lib.is_scalar(to_replace) and isinstance(values, np.ndarray): # The only non-DatetimeLike class that also has a non-trivial # try_coerce_args is ObjectBlock, but that overrides replace, # so does not get here. to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype) mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. return [self] if inplace else [self.copy()] if not self._can_hold_element(value): blk = self.astype(object) return blk.replace( to_replace=original_to_replace, value=value, inplace=True, regex=regex, ) blk = self if inplace else self.copy() blk._putmask_simple(mask, value) blocks = blk.convert(numeric=False, copy=not inplace) return blocks def _replace_regex( self, to_replace, value, inplace: bool = False, convert: bool = True, mask=None, ) -> List["Block"]: """ Replace elements by the given value. Parameters ---------- to_replace : object or pattern Scalar to replace or regular expression to match. value : object Replacement object. inplace : bool, default False Perform inplace modification. convert : bool, default True If true, try to coerce any object types to better types. mask : array-like of bool, optional True indicate corresponding element is ignored. Returns ------- List[Block] """ if not self._can_hold_element(to_replace): # i.e. only ObjectBlock, but could in principle include a # String ExtensionBlock return [self] if inplace else [self.copy()] rx = re.compile(to_replace) new_values = self.values if inplace else self.values.copy() replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) if convert: nbs = block.convert(numeric=False) else: nbs = [block] return nbs def _replace_list( self, src_list: List[Any], dest_list: List[Any], inplace: bool = False, regex: bool = False, ) -> List["Block"]: """ See BlockManager._replace_list docstring. """ # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) ] if not len(pairs): # shortcut, nothing to replace return [self] if inplace else [self.copy()] src_len = len(pairs) - 1 def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: """ Generate a bool array by perform an equality check, or perform an element-wise regular expression matching """ if isna(s): return ~mask s = maybe_box_datetimelike(s) return compare_or_regex_search(self.values, s, regex, mask) if self.is_object: # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations mask = ~isna(self.values) masks = [comp(s[0], mask, regex) for s in pairs] else: # GH#38086 faster if we know we dont need to check for regex masks = [missing.mask_missing(self.values, s[0]) for s in pairs] masks = [_extract_bool_array(x) for x in masks] rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(pairs): convert = i == src_len # only convert once at the end new_rb: List["Block"] = [] # GH-39338: _replace_coerce can split a block into # single-column blocks, so track the index so we know # where to index into the mask for blk_num, blk in enumerate(rb): if len(rb) == 1: m = masks[i] else: mib = masks[i] assert not isinstance(mib, bool) m = mib[blk_num : blk_num + 1] result = blk._replace_coerce( to_replace=src, value=dest, mask=m, inplace=inplace, regex=regex, ) if convert and blk.is_object: result = extend_blocks( [b.convert(numeric=False, copy=True) for b in result] ) new_rb.extend(result) rb = new_rb return rb def setitem(self, indexer, value): """ Attempt self.values[indexer] = value, possibly creating a new array. Parameters ---------- indexer : tuple, list-like, array-like, slice The subset of self.values to set value : object The value being set Returns ------- Block Notes ----- `indexer` is a direct slice/positional indexer. `value` must be a compatible shape. """ transpose = self.ndim == 2 if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") # coerce None values, if appropriate if value is None: if self.is_numeric: value = np.nan # coerce if block dtype can store value values = self.values if self._can_hold_element(value): # We only get here for non-Extension Blocks, so _try_coerce_args # is only relevant for DatetimeBlock and TimedeltaBlock if lib.is_scalar(value): value = convert_scalar_for_putitemlike(value, values.dtype) else: # current dtype cannot store value, coerce to common dtype if hasattr(value, "dtype"): dtype = value.dtype elif lib.is_scalar(value) and not isna(value): dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) else: # e.g. we are bool dtype and value is nan # TODO: watch out for case with listlike value and scalar/empty indexer dtype, _ = maybe_promote(np.array(value).dtype) return self.astype(dtype).setitem(indexer, value) dtype = find_common_type([values.dtype, dtype]) assert not is_dtype_equal(self.dtype, dtype) # otherwise should have _can_hold_element return self.astype(dtype).setitem(indexer, value) # value must be storable at this moment if is_extension_array_dtype(getattr(value, "dtype", None)): # We need to be careful not to allow through strings that # can be parsed to EADtypes is_ea_value = True arr_value = value else: is_ea_value = False arr_value = np.array(value) if transpose: values = values.T # length checking check_setitem_lengths(indexer, value, values) exact_match = ( len(arr_value.shape) and arr_value.shape[0] == values.shape[0] and arr_value.size == values.size ) if is_empty_indexer(indexer, arr_value): # GH#8669 empty indexers pass elif is_scalar_indexer(indexer, self.ndim): # setting a single element for each dim and with a rhs that could # be e.g. a list; see GH#6043 values[indexer] = value elif exact_match and is_categorical_dtype(arr_value.dtype): # GH25495 - If the current dtype is not categorical, # we need to create a new categorical block values[indexer] = value return self.make_block(Categorical(self.values, dtype=arr_value.dtype)) elif exact_match and is_ea_value: # GH#32395 if we're going to replace the values entirely, just # substitute in the new array return self.make_block(arr_value) # if we are an exact match (ex-broadcasting), # then use the resultant dtype elif exact_match: # We are setting _all_ of the array's values, so can cast to new dtype values[indexer] = value values = values.astype(arr_value.dtype, copy=False) # set else: values[indexer] = value if transpose: values = values.T block = self.make_block(values) return block def _putmask_simple(self, mask: np.ndarray, value: Any): """ Like putmask but a) we do not cast on failure b) we do not handle repeating or truncating like numpy. Parameters ---------- mask : np.ndarray[bool] We assume _extract_bool_array has already been called. value : Any We assume self._can_hold_element(value) """ values = self.values if lib.is_scalar(value) and isinstance(values, np.ndarray): value = convert_scalar_for_putitemlike(value, values.dtype) if self.is_extension or (self.is_object and not lib.is_scalar(value)): # GH#19266 using np.putmask gives unexpected results with listlike value if is_list_like(value) and len(value) == len(values): values[mask] = value[mask] else: values[mask] = value else: # GH#37833 np.putmask is more performant than __setitem__ np.putmask(values, mask, value) def putmask( self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ putmask the data to the block; it is possible that we may create a new dtype of block Return the resulting block(s). Parameters ---------- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object inplace : bool, default False Perform inplace modification. axis : int transpose : bool, default False Set to True if self is stored with axes reversed. Returns ------- List[Block] """ mask = _extract_bool_array(mask) assert not isinstance(new, (ABCIndexClass, ABCSeries, ABCDataFrame)) new_values = self.values # delay copy if possible. # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: # FIXME: make sure we have compatible NA new = self.fill_value if self._can_hold_element(new): # We only get here for non-Extension Blocks, so _try_coerce_args # is only relevant for DatetimeBlock and TimedeltaBlock if lib.is_scalar(new): new = convert_scalar_for_putitemlike(new, self.values.dtype) if transpose: new_values = new_values.T # If the default repeat behavior in np.putmask would go in the # wrong direction, then explicitly repeat and reshape new instead if getattr(new, "ndim", 0) >= 1: if self.ndim - 1 == new.ndim and axis == 1: new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) new = new.astype(new_values.dtype) if new_values is self.values and not inplace: new_values = new_values.copy() # we require exact matches between the len of the # values we are setting (or is compat). np.putmask # doesn't check this and will simply truncate / pad # the output, but we want sane error messages # # TODO: this prob needs some better checking # for 2D cases if ( is_list_like(new) and np.any(mask[mask]) and getattr(new, "ndim", 1) == 1 ): if mask[mask].shape[-1] == len(new): # GH 30567 # If length of ``new`` is less than the length of ``new_values``, # `np.putmask` would first repeat the ``new`` array and then # assign the masked values hence produces incorrect result. # `np.place` on the other hand uses the ``new`` values at it is # to place in the masked locations of ``new_values`` np.place(new_values, mask, new) elif mask.shape[-1] == len(new) or len(new) == 1: np.putmask(new_values, mask, new) else: raise ValueError("cannot assign mismatch length to masked array") else: np.putmask(new_values, mask, new) # maybe upcast me elif mask.any(): if transpose: mask = mask.T if isinstance(new, np.ndarray): new = new.T axis = new_values.ndim - axis - 1 # Pseudo-broadcast if getattr(new, "ndim", 0) >= 1: if self.ndim - 1 == new.ndim: new_shape = list(new.shape) new_shape.insert(axis, 1) new = new.reshape(tuple(new_shape)) # operate column-by-column def f(mask, val, idx): if idx is None: # ndim==1 case. n = new else: if isinstance(new, np.ndarray): n = np.squeeze(new[idx % new.shape[0]]) else: n = np.array(new) # type of the new block dtype, _ = maybe_promote(n.dtype) # we need to explicitly astype here to make a copy n = n.astype(dtype) nv = _putmask_smart(val, mask, n) return nv new_blocks = self.split_and_operate(mask, f, inplace) return new_blocks if inplace: return [self] if transpose: if new_values is None: new_values = self.values if inplace else self.values.copy() new_values = new_values.T return [self.make_block(new_values)] def coerce_to_target_dtype(self, other): """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise we can also safely try to coerce to the same dtype and will receive the same block """ # if we cannot then coerce to object dtype, _ = infer_dtype_from(other, pandas_dtype=True) if is_dtype_equal(self.dtype, dtype): return self if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): # we don't upcast to bool return self.astype(object) elif (self.is_float or self.is_complex) and ( is_integer_dtype(dtype) or is_float_dtype(dtype) ): # don't coerce float/complex to int return self elif self.is_datetime or is_datetime64_any_dtype(dtype): # The is_dtype_equal check above ensures that at most one of # these two conditions hold, so we must cast to object. return self.astype(object) elif self.is_timedelta or is_timedelta64_dtype(dtype): # The is_dtype_equal check above ensures that at most one of # these two conditions hold, so we must cast to object. return self.astype(object) try: return self.astype(dtype) except (ValueError, TypeError, OverflowError): return self.astype(object) def interpolate( self, method: str = "pad", axis: int = 0, index: Optional["Index"] = None, inplace: bool = False, limit: Optional[int] = None, limit_direction: str = "forward", limit_area: Optional[str] = None, fill_value: Optional[Any] = None, coerce: bool = False, downcast: Optional[str] = None, **kwargs, ): inplace = validate_bool_kwarg(inplace, "inplace") if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op return self if inplace else self.copy() # a fill na type method try: m = missing.clean_fill_method(method) except ValueError: m = None if m is not None: if fill_value is not None: # similar to validate_fillna_kwargs raise ValueError("Cannot pass both fill_value and method") return self._interpolate_with_fill( method=m, axis=axis, inplace=inplace, limit=limit, limit_area=limit_area, downcast=downcast, ) # validate the interp method m = missing.clean_interp_method(method, **kwargs) assert index is not None # for mypy return self._interpolate( method=m, index=index, axis=axis, limit=limit, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, inplace=inplace, downcast=downcast, **kwargs, ) def _interpolate_with_fill( self, method: str = "pad", axis: int = 0, inplace: bool = False, limit: Optional[int] = None, limit_area: Optional[str] = None, downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") assert self._can_hold_na # checked by caller values = self.values if inplace else self.values.copy() values = missing.interpolate_2d( values, method=method, axis=axis, limit=limit, limit_area=limit_area, ) blocks = [self.make_block_same_class(values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) def _interpolate( self, method: str, index: "Index", fill_value: Optional[Any] = None, axis: int = 0, limit: Optional[int] = None, limit_direction: str = "forward", limit_area: Optional[str] = None, inplace: bool = False, downcast: Optional[str] = None, **kwargs, ) -> List["Block"]: """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, "inplace") data = self.values if inplace else self.values.copy() # only deal with floats if not self.is_float: if not self.is_integer: return [self] data = data.astype(np.float64) if fill_value is None: fill_value = self.fill_value if method in ("krogh", "piecewise_polynomial", "pchip"): if not index.is_monotonic: raise ValueError( f"{method} interpolation requires that the index be monotonic." ) # process 1-d slices in the axis direction def func(yvalues: np.ndarray) -> np.ndarray: # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. not an arg to missing.interpolate_1d return missing.interpolate_1d( xvalues=index, yvalues=yvalues, method=method, limit=limit, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, bounds_error=False, **kwargs, ) # interp each column independently interp_values = np.apply_along_axis(func, axis, data) blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_default): """ Take values according to indexer and return them as a block.bb """ # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock # so need to preserve types # sparse is treated like an ndarray, but needs .get_values() shaping values = self.values if fill_value is lib.no_default: fill_value = self.fill_value allow_fill = False else: allow_fill = True new_values = algos.take_nd( values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value ) # Called from three places in managers, all of which satisfy # this assertion assert not (axis == 0 and new_mgr_locs is None) if new_mgr_locs is None: new_mgr_locs = self.mgr_locs if not is_dtype_equal(new_values.dtype, self.dtype): return self.make_block(new_values, new_mgr_locs) else: return self.make_block_same_class(new_values, new_mgr_locs) def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) return [self.make_block(values=new_values)] def shift(self, periods: int, axis: int = 0, fill_value=None): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also new_values, fill_value = maybe_upcast(self.values, fill_value) new_values = shift(new_values, periods, axis, fill_value) return [self.make_block(new_values)] def where( self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: """ evaluate the block; return result block(s) from the result Parameters ---------- other : a ndarray/object cond : np.ndarray[bool], SparseArray[bool], or BooleanArray errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object try_cast: bool, default False axis : int, default 0 Returns ------- List[Block] """ import pandas.core.computation.expressions as expressions cond = _extract_bool_array(cond) assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) assert errors in ["raise", "ignore"] transpose = self.ndim == 2 values = self.values orig_other = other if transpose: values = values.T # If the default broadcasting would go in the wrong direction, then # explicitly reshape other instead if getattr(other, "ndim", 0) >= 1: if values.ndim - 1 == other.ndim and axis == 1: other = other.reshape(tuple(other.shape + (1,))) elif transpose and values.ndim == self.ndim - 1: # TODO(EA2D): not neceesssary with 2D EAs cond = cond.T if not hasattr(cond, "shape"): raise ValueError("where must have a condition that is ndarray like") if cond.ravel("K").all(): result = values.copy() else: # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) if ( (self.is_integer or self.is_bool) and lib.is_float(other) and np.isnan(other) ): # GH#3733 special case to avoid object-dtype casting # and go through numexpr path instead. # In integer case, np.where will cast to floats pass elif not self._can_hold_element(other): # we cannot coerce, return a compat dtype # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) blocks = block.where( orig_other, cond, errors=errors, try_cast=try_cast, axis=axis ) return self._maybe_downcast(blocks, "infer") if not ( (self.is_integer or self.is_bool) and lib.is_float(other) and np.isnan(other) ): # convert datetime to datetime64, timedelta to timedelta64 other = convert_scalar_for_putitemlike(other, values.dtype) # By the time we get here, we should have all Series/Index # args extracted to ndarray result = expressions.where(cond, values, other) if self._can_hold_na or self.ndim == 1: if transpose: result = result.T return [self.make_block(result)] # might need to separate out blocks axis = cond.ndim - 1 cond = cond.swapaxes(axis, 0) mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) result_blocks: List["Block"] = [] for m in [mask, ~mask]: if m.any(): result = cast(np.ndarray, result) # EABlock overrides where taken = result.take(m.nonzero()[0], axis=axis) r = maybe_downcast_numeric(taken, self.dtype) nb = self.make_block(r.T, placement=self.mgr_locs[m]) result_blocks.append(nb) return result_blocks def _unstack(self, unstacker, fill_value, new_placement): """ Return a list of unstacked blocks of self Parameters ---------- unstacker : reshape._Unstacker fill_value : int Only used in ExtensionBlock._unstack Returns ------- blocks : list of Block New blocks of unstacked values. mask : array_like of bool The mask of columns of `blocks` we should keep. """ new_values, mask = unstacker.get_new_values( self.values.T, fill_value=fill_value ) mask = mask.any(0) # TODO: in all tests we have mask.all(); can we rely on that? new_values = new_values.T[mask] new_placement = new_placement[mask] blocks = [make_block(new_values, placement=new_placement)] return blocks, mask def quantile(self, qs, interpolation="linear", axis: int = 0): """ compute the quantiles of the Parameters ---------- qs: a scalar or list of the quantiles to be computed interpolation: type of interpolation, default 'linear' axis: axis to compute, default 0 Returns ------- Block """ # We should always have ndim == 2 because Series dispatches to DataFrame assert self.ndim == 2 values = self.get_values() is_empty = values.shape[axis] == 0 orig_scalar = not is_list_like(qs) if orig_scalar: # make list-like, unpack later qs = [qs] if is_empty: # create the array of na_values # 2d len(values) * len(qs) result = np.repeat( np.array([self.fill_value] * len(qs)), len(values) ).reshape(len(values), len(qs)) else: # asarray needed for Sparse, see GH#24600 mask = np.asarray(isna(values)) result = nanpercentile( values, np.array(qs) * 100, axis=axis, na_value=self.fill_value, mask=mask, ndim=values.ndim, interpolation=interpolation, ) result = np.array(result, copy=False) result = result.T if orig_scalar and not lib.is_scalar(result): # result could be scalar in case with is_empty and self.ndim == 1 assert result.shape[-1] == 1, result.shape result = result[..., 0] result = lib.item_from_zerodim(result) ndim = np.ndim(result) return make_block(result, placement=np.arange(len(result)), ndim=ndim) def _replace_coerce( self, to_replace, value, mask: np.ndarray, inplace: bool = True, regex: bool = False, ) -> List["Block"]: """ Replace value corresponding to the given boolean array with another value. Parameters ---------- to_replace : object or pattern Scalar to replace or regular expression to match. value : object Replacement object. mask : np.ndarray[bool] True indicate corresponding element is ignored. inplace : bool, default True Perform inplace modification. regex : bool, default False If true, perform regular expression substitution. Returns ------- List[Block] """ if mask.any(): if not regex: nb = self.coerce_to_target_dtype(value) if nb is self and not inplace: nb = nb.copy() nb._putmask_simple(mask, value) return [nb] else: regex = _should_use_regex(regex, to_replace) if regex: return self._replace_regex( to_replace, value, inplace=inplace, convert=False, mask=mask, ) return self.replace(to_replace, value, inplace=inplace, regex=False) return [self] class ExtensionBlock(Block): """ Block for holding extension types. Notes ----- This holds all 3rd-party extension array types. It's also the immediate parent class for our internal extension types' blocks, CategoricalBlock. ExtensionArrays are limited to 1-D. """ _can_consolidate = False _validate_ndim = False is_extension = True values: ExtensionArray def __init__(self, values, placement, ndim: int): """ Initialize a non-consolidatable block. 'ndim' may be inferred from 'placement'. This will call continue to call __init__ for the other base classes mixed in with this Mixin. """ # Placement must be converted to BlockPlacement so that we can check # its length if not isinstance(placement, libinternals.BlockPlacement): placement = libinternals.BlockPlacement(placement) # Maybe infer ndim from placement if ndim is None: if len(placement) != 1: ndim = 1 else: ndim = 2 super().__init__(values, placement, ndim=ndim) if self.ndim == 2 and len(self.mgr_locs) != 1: # TODO(EA2D): check unnecessary with 2D EAs raise AssertionError("block.size != values.size") @property def shape(self): # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: return (len(self.values),) return len(self.mgr_locs), len(self.values) def iget(self, col): if self.ndim == 2 and isinstance(col, tuple): # TODO(EA2D): unnecessary with 2D EAs col, loc = col if not com.is_null_slice(col) and col != 0: raise IndexError(f"{self} only contains one item") elif isinstance(col, slice): if col != slice(None): raise NotImplementedError(col) return self.values[[loc]] return self.values[loc] else: if col != 0: raise IndexError(f"{self} only contains one item") return self.values def set_inplace(self, locs, values): # NB: This is a misnomer, is supposed to be inplace but is not, # see GH#33457 assert locs.tolist() == [0] self.values = values def putmask( self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ See Block.putmask.__doc__ """ inplace = validate_bool_kwarg(inplace, "inplace") mask = _extract_bool_array(mask) new_values = self.values if inplace else self.values.copy() if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): new = new[mask] mask = safe_reshape(mask, new_values.shape) new_values[mask] = new return [self.make_block(values=new_values)] def _maybe_coerce_values(self, values): """ Unbox to an extension array. This will unbox an ExtensionArray stored in an Index or Series. ExtensionArrays pass through. No dtype coercion is done. Parameters ---------- values : Index, Series, ExtensionArray Returns ------- ExtensionArray """ return extract_array(values) @property def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) @property def fill_value(self): # Used in reindex_indexer return self.values.dtype.na_value @property def _can_hold_na(self): # The default ExtensionArray._can_hold_na is True return self._holder._can_hold_na @property def is_view(self) -> bool: """Extension arrays are never treated as views.""" return False @property def is_numeric(self): return self.values.dtype._is_numeric def setitem(self, indexer, value): """ Attempt self.values[indexer] = value, possibly creating a new array. This differs from Block.setitem by not allowing setitem to change the dtype of the Block. Parameters ---------- indexer : tuple, list-like, array-like, slice The subset of self.values to set value : object The value being set Returns ------- Block Notes ----- `indexer` is a direct slice/positional indexer. `value` must be a compatible shape. """ if not self._can_hold_element(value): # This is only relevant for DatetimeTZBlock, which has a # non-trivial `_can_hold_element`. # https://github.com/pandas-dev/pandas/issues/24020 # Need a dedicated setitem until GH#24020 (type promotion in setitem # for extension arrays) is designed and implemented. return self.astype(object).setitem(indexer, value) if isinstance(indexer, tuple): # TODO(EA2D): not needed with 2D EAs # we are always 1-D indexer = indexer[0] check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value return self def get_values(self, dtype=None): # ExtensionArrays must be iterable, so this works. # TODO(EA2D): reshape not needed with 2D EAs return np.asarray(self.values).reshape(self.shape) def array_values(self) -> ExtensionArray: return self.values def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """override to use ExtensionArray astype for the conversion""" values = self.values mask = isna(values) values = np.asarray(values.astype(object)) values[mask] = na_rep # TODO(EA2D): reshape not needed with 2D EAs # we are expected to return a 2-d ndarray return self.make_block(values) def take_nd( self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default ): """ Take values according to indexer and return them as a block. """ if fill_value is lib.no_default: fill_value = None # TODO(EA2D): special case not needed with 2D EAs # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True) # Called from three places in managers, all of which satisfy # this assertion assert not (self.ndim == 1 and new_mgr_locs is None) if new_mgr_locs is None: new_mgr_locs = self.mgr_locs return self.make_block_same_class(new_values, new_mgr_locs) def _can_hold_element(self, element: Any) -> bool: # TODO: We may need to think about pushing this onto the array. # We're doing the same as CategoricalBlock here. return True def _slice(self, slicer): """ Return a slice of my values. Parameters ---------- slicer : slice, ndarray[int], or a tuple of these Valid (non-reducing) indexer for self.values. Returns ------- np.ndarray or ExtensionArray """ # return same dims as we currently have if not isinstance(slicer, tuple) and self.ndim == 2: # reached via getitem_block via _slice_take_blocks_ax0 # TODO(EA2D): wont be necessary with 2D EAs slicer = (slicer, slice(None)) if isinstance(slicer, tuple) and len(slicer) == 2: first = slicer[0] if not isinstance(first, slice): raise AssertionError( "invalid slicing for a 1-ndim ExtensionArray", first ) # GH#32959 only full-slicers along fake-dim0 are valid # TODO(EA2D): wont be necessary with 2D EAs new_locs = self.mgr_locs[first] if len(new_locs): # effectively slice(None) slicer = slicer[1] else: raise AssertionError( "invalid slicing for a 1-ndim ExtensionArray", slicer ) return self.values[slicer] def fillna(self, value, limit=None, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() values = values.fillna(value=value, limit=limit) return [ self.make_block_same_class( values=values, placement=self.mgr_locs, ndim=self.ndim ) ] def interpolate( self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs ): values = self.values if inplace else self.values.copy() return self.make_block_same_class( values=values.fillna(value=fill_value, method=method, limit=limit), placement=self.mgr_locs, ) def diff(self, n: int, axis: int = 1) -> List["Block"]: if axis == 0 and n != 0: # n==0 case will be a no-op so let is fall through # Since we only have one column, the result will be all-NA. # Create this result by shifting along axis=0 past the length of # our values. return super().diff(len(self.values), axis=0) if axis == 1: # TODO(EA2D): unnecessary with 2D EAs # we are by definition 1D. axis = 0 return super().diff(n, axis) def shift( self, periods: int, axis: int = 0, fill_value: Any = None ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. Dispatches to underlying ExtensionArray and re-boxes in an ExtensionBlock. """ return [ self.make_block_same_class( self.values.shift(periods=periods, fill_value=fill_value), placement=self.mgr_locs, ndim=self.ndim, ) ] def where( self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: cond = _extract_bool_array(cond) assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) if isinstance(other, np.ndarray) and other.ndim == 2: # TODO(EA2D): unnecessary with 2D EAs assert other.shape[1] == 1 other = other[:, 0] if isinstance(cond, np.ndarray) and cond.ndim == 2: # TODO(EA2D): unnecessary with 2D EAs assert cond.shape[1] == 1 cond = cond[:, 0] if lib.is_scalar(other) and isna(other): # The default `other` for Series / Frame is np.nan # we want to replace that with the correct NA value # for the type other = self.dtype.na_value if is_sparse(self.values): # TODO(SparseArray.__setitem__): remove this if condition # We need to re-infer the type of the data after doing the # where, for cases where the subtypes don't match dtype = None else: dtype = self.dtype result = self.values.copy() icond = ~cond if lib.is_scalar(other): set_other = other else: set_other = other[icond] try: result[icond] = set_other except (NotImplementedError, TypeError): # NotImplementedError for class not implementing `__setitem__` # TypeError for SparseArray, which implements just to raise # a TypeError result = self._holder._from_sequence( np.where(cond, self.values, other), dtype=dtype ) return [self.make_block_same_class(result, placement=self.mgr_locs)] def _unstack(self, unstacker, fill_value, new_placement): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require # converting to a 2-D ndarray of objects. # Instead, we unstack an ndarray of integer positions, followed by # a `take` on the actual values. n_rows = self.shape[-1] dummy_arr = np.arange(n_rows) new_values, mask = unstacker.get_new_values(dummy_arr, fill_value=-1) mask = mask.any(0) # TODO: in all tests we have mask.all(); can we rely on that? blocks = [ self.make_block_same_class( self.values.take(indices, allow_fill=True, fill_value=fill_value), [place], ) for indices, place in zip(new_values.T, new_placement) ] return blocks, mask class ObjectValuesExtensionBlock(ExtensionBlock): """ Block providing backwards-compatibility for `.values`. Used by PeriodArray and IntervalArray to ensure that Series[T].values is an ndarray of objects. """ def external_values(self): return self.values.astype(object) def _can_hold_element(self, element: Any) -> bool: if is_valid_nat_for_dtype(element, self.dtype): return True if isinstance(element, list) and len(element) == 0: return True tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, self.dtype.type) return isinstance(element, self.dtype.type) class NumericBlock(Block): __slots__ = () is_numeric = True _can_hold_na = True class FloatBlock(NumericBlock): __slots__ = () is_float = True def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( tipo.type, np.timedelta64 ) return isinstance( element, (float, int, np.floating, np.int_) ) and not isinstance( element, (bool, np.bool_, np.timedelta64), ) def to_native_types( self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs ): """ convert to our native types format """ values = self.values # see gh-13418: no special formatting is desired at the # output (important for appropriate 'quoting' behaviour), # so do not pass it through the FloatArrayFormatter if float_format is None and decimal == ".": mask = isna(values) if not quoting: values = values.astype(str) else: values = np.array(values, dtype="object") values[mask] = na_rep return self.make_block(values) from pandas.io.formats.format import FloatArrayFormatter formatter = FloatArrayFormatter( values, na_rep=na_rep, float_format=float_format, decimal=decimal, quoting=quoting, fixed_width=False, ) res = formatter.get_result_as_array() return self.make_block(res) class ComplexBlock(NumericBlock): __slots__ = () is_complex = True def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) return isinstance( element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, np.bool_)) class IntBlock(NumericBlock): __slots__ = () is_integer = True _can_hold_na = False def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return ( issubclass(tipo.type, np.integer) and not issubclass(tipo.type, np.timedelta64) and self.dtype.itemsize >= tipo.itemsize ) # We have not inferred an integer from the dtype # check if we have a builtin int or a float equal to an int return is_integer(element) or (is_float(element) and element.is_integer()) class DatetimeLikeBlockMixin(Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" @property def _holder(self): return DatetimeArray @property def fill_value(self): return np.datetime64("NaT", "ns") def get_values(self, dtype=None): """ return object dtype as boxed values, such as Timestamps/Timedelta """ if is_object_dtype(dtype): # DTA/TDA constructor and astype can handle 2D return self._holder(self.values).astype(object) return self.values def internal_values(self): # Override to return DatetimeArray and TimedeltaArray return self.array_values() def array_values(self): return self._holder._simple_new(self.values) def iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta # TODO(EA2D): this can be removed if we ever have 2D EA return self.array_values().reshape(self.shape)[key] def diff(self, n: int, axis: int = 0) -> List["Block"]: """ 1st discrete difference. Parameters ---------- n : int Number of periods to diff. axis : int, default 0 Axis to diff upon. Returns ------- A list with a new TimeDeltaBlock. Notes ----- The arguments here are mimicking shift so they are called correctly by apply. """ # TODO(EA2D): reshape not necessary with 2D EAs values = self.array_values().reshape(self.shape) new_values = values - values.shift(n, axis=axis) return [ TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer, ndim=self.ndim) ] def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs values = self.array_values() new_values = values.shift(periods, fill_value=fill_value, axis=axis) return self.make_block_same_class(new_values) def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ arr = self.array_values() result = arr._format_native_types(na_rep=na_rep, **kwargs) return self.make_block(result) class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True @property def _can_hold_na(self): return True def _maybe_coerce_values(self, values): """ Input validation for values passed to __init__. Ensure that we have datetime64ns, coercing if necessary. Parameters ---------- values : array-like Must be convertible to datetime64 Returns ------- values : ndarray[datetime64ns] Overridden by DatetimeTZBlock. """ if values.dtype != DT64NS_DTYPE: values = conversion.ensure_datetime64ns(values) if isinstance(values, DatetimeArray): values = values._data assert isinstance(values, np.ndarray), type(values) return values def astype(self, dtype, copy: bool = False, errors: str = "raise"): """ these automatically copy, so copy=True has no effect raise on an except if raise == True """ dtype = pandas_dtype(dtype) # if we are passed a datetime64[ns, tz] if is_datetime64tz_dtype(dtype): values = self.values if copy: # this should be the only copy values = values.copy() values = DatetimeArray._simple_new(values.view("i8"), dtype=dtype) return self.make_block(values) # delegate return super().astype(dtype=dtype, copy=copy, errors=errors) def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: if isinstance(element, list) and len(element) == 0: # Following DatetimeArray._validate_setitem_value # convention, we treat this as object-dtype # (even though tipo is float64) return True elif self.is_datetimetz: # require exact match, since non-nano does not exist return is_dtype_equal(tipo, self.dtype) or is_valid_nat_for_dtype( element, self.dtype ) # GH#27419 if we get a non-nano datetime64 object return is_datetime64_dtype(tipo) elif element is NaT: return True elif isinstance(element, datetime): if self.is_datetimetz: return tz_compare(element.tzinfo, self.dtype.tz) return element.tzinfo is None return is_valid_nat_for_dtype(element, self.dtype) def set_inplace(self, locs, values): """ See Block.set.__doc__ """ values = conversion.ensure_datetime64ns(values, copy=False) self.values[locs] = values class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ values: DatetimeArray __slots__ = () is_datetimetz = True is_extension = True internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types diff = DatetimeBlock.diff fill_value = np.datetime64("NaT", "ns") array_values = ExtensionBlock.array_values @property def _holder(self): return DatetimeArray def _maybe_coerce_values(self, values): """ Input validation for values passed to __init__. Ensure that we have datetime64TZ, coercing if necessary. Parameters ---------- values : array-like Must be convertible to datetime64 Returns ------- values : DatetimeArray """ if not isinstance(values, self._holder): values = self._holder(values) if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") return values @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ # check the ndarray values of the DatetimeIndex values return self.values._data.base is not None def get_values(self, dtype=None): """ Returns an ndarray of values. Parameters ---------- dtype : np.dtype Only `object`-like dtypes are respected here (not sure why). Returns ------- values : ndarray When ``dtype=object``, then and object-dtype ndarray of boxed values is returned. Otherwise, an M8[ns] ndarray is returned. DatetimeArray is always 1-d. ``get_values`` will reshape the return value to be the same dimensionality as the block. """ values = self.values if is_object_dtype(dtype): values = values.astype(object) # TODO(EA2D): reshape unnecessary with 2D EAs # Ensure that our shape is correct for DataFrame. # ExtensionArrays are always 1-D, even in a DataFrame when # the analogous NumPy-backed column would be a 2-D ndarray. return np.asarray(values).reshape(self.shape) def external_values(self): # NB: this is different from np.asarray(self.values), since that # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) def fillna(self, value, limit=None, inplace=False, downcast=None): # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. if self._can_hold_element(value): return super().fillna(value, limit, inplace, downcast) # different timezones, or a non-tz return self.astype(object).fillna( value, limit=limit, inplace=inplace, downcast=downcast ) def quantile(self, qs, interpolation="linear", axis=0): naive = self.values.view("M8[ns]") # TODO(EA2D): kludge for 2D block with 1D values naive = naive.reshape(self.shape) blk = self.make_block(naive) res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis) # TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) return self.make_block_same_class(aware, ndim=res_blk.ndim) def _check_ndim(self, values, ndim): """ ndim inference and validation. This is overriden by the DatetimeTZBlock to check the case of 2D data (values.ndim == 2), which should only be allowed if ndim is also 2. The case of 1D array is still allowed with both ndim of 1 or 2, as if the case for other EAs. Therefore, we are only checking `values.ndim > ndim` instead of `values.ndim != ndim` as for consolidated blocks. """ if ndim is None: ndim = values.ndim if values.ndim > ndim: raise ValueError( "Wrong number of dimensions. " f"values.ndim != ndim [{values.ndim} != {ndim}]" ) return ndim class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () is_timedelta = True _can_hold_na = True is_numeric = False fill_value = np.timedelta64("NaT", "ns") def _maybe_coerce_values(self, values): if values.dtype != TD64NS_DTYPE: # non-nano we will convert to nano if values.dtype.kind != "m": # caller is responsible for ensuring timedelta64 dtype raise TypeError(values.dtype) # pragma: no cover values = TimedeltaArray._from_sequence(values)._data if isinstance(values, TimedeltaArray): values = values._data assert isinstance(values, np.ndarray), type(values) return values @property def _holder(self): return TimedeltaArray def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) elif element is NaT: return True elif isinstance(element, (timedelta, np.timedelta64)): return True return is_valid_nat_for_dtype(element, self.dtype) def fillna(self, value, **kwargs): # TODO(EA2D): if we operated on array_values, TDA.fillna would handle # raising here. if is_integer(value): # Deprecation GH#24694, GH#19233 raise TypeError( "Passing integers to fillna for timedelta64[ns] dtype is no " "longer supported. To obtain the old behavior, pass " "`pd.Timedelta(seconds=n)` instead." ) return super().fillna(value, **kwargs) class BoolBlock(NumericBlock): __slots__ = () is_bool = True _can_hold_na = False def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.bool_) return isinstance(element, (bool, np.bool_)) class ObjectBlock(Block): __slots__ = () is_object = True _can_hold_na = True def _maybe_coerce_values(self, values): if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) return values @property def is_bool(self): """ we can be a bool if we have only bool values but are of type object """ return lib.is_bool_array(self.values.ravel("K")) def reduce(self, func, ignore_failures: bool = False) -> List[Block]: """ For object-dtype, we operate column-wise. """ assert self.ndim == 2 values = self.values if len(values) > 1: # split_and_operate expects func with signature (mask, values, inplace) def mask_func(mask, values, inplace): if values.ndim == 1: values = values.reshape(1, -1) return func(values) return self.split_and_operate( None, mask_func, False, ignore_failures=ignore_failures ) try: res = func(values) except TypeError: if not ignore_failures: raise return [] assert isinstance(res, np.ndarray) assert res.ndim == 1 res = res.reshape(1, -1) return [self.make_block_same_class(res)] def convert( self, copy: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, ) -> List["Block"]: """ attempt to cast any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! """ # operate column-by-column def f(mask, val, idx): shape = val.shape values = soft_convert_objects( val.ravel(), datetime=datetime, numeric=numeric, timedelta=timedelta, copy=copy, ) if isinstance(values, np.ndarray): # TODO(EA2D): allow EA once reshape is supported values = values.reshape(shape) return values if self.ndim == 2: blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) blocks = [self.make_block(values)] return blocks def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: if downcast is not None: return blocks # split and convert the blocks return extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) def _can_hold_element(self, element: Any) -> bool: return True def replace( self, to_replace, value, inplace: bool = False, regex: bool = False, ) -> List["Block"]: # Note: the checks we do in NDFrame.replace ensure we never get # here with listlike to_replace or value, as those cases # go through _replace_list regex = _should_use_regex(regex, to_replace) if regex: return self._replace_regex(to_replace, value, inplace=inplace) else: return super().replace(to_replace, value, inplace=inplace, regex=False) def _should_use_regex(regex: bool, to_replace: Any) -> bool: """ Decide whether to treat `to_replace` as a regular expression. """ if is_re(to_replace): regex = True regex = regex and is_re_compilable(to_replace) # Don't use regex if the pattern is empty. regex = regex and re.compile(to_replace).pattern != "" return regex class CategoricalBlock(ExtensionBlock): __slots__ = () def _replace_list( self, src_list: List[Any], dest_list: List[Any], inplace: bool = False, regex: bool = False, ) -> List["Block"]: if len(algos.unique(dest_list)) == 1: # We likely got here by tiling value inside NDFrame.replace, # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) return super()._replace_list(src_list, dest_list, inplace, regex) def replace( self, to_replace, value, inplace: bool = False, regex: bool = False, ) -> List["Block"]: inplace = validate_bool_kwarg(inplace, "inplace") result = self if inplace else self.copy() result.values.replace(to_replace, value, inplace=True) return [result] # ----------------------------------------------------------------- # Constructor Helpers def get_block_type(values, dtype=None): """ Find the appropriate Block subclass to use for the given values and dtype. Parameters ---------- values : ndarray-like dtype : numpy or pandas dtype Returns ------- cls : class, subclass of Block """ dtype = dtype or values.dtype vtype = dtype.type cls: Type[Block] if is_sparse(dtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock elif is_categorical_dtype(values.dtype): cls = CategoricalBlock elif issubclass(vtype, np.datetime64): assert not is_datetime64tz_dtype(values.dtype) cls = DatetimeBlock elif is_datetime64tz_dtype(values.dtype): cls = DatetimeTZBlock elif is_interval_dtype(dtype) or is_period_dtype(dtype): cls = ObjectValuesExtensionBlock elif is_extension_array_dtype(values.dtype): # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock elif issubclass(vtype, np.floating): cls = FloatBlock elif issubclass(vtype, np.timedelta64): assert issubclass(vtype, np.integer) cls = TimeDeltaBlock elif issubclass(vtype, np.complexfloating): cls = ComplexBlock elif issubclass(vtype, np.integer): cls = IntBlock elif dtype == np.bool_: cls = BoolBlock else: cls = ObjectBlock return cls def make_block(values, placement, klass=None, ndim=None, dtype=None): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. if isinstance(values, ABCPandasArray): values = values.to_numpy() if ndim and ndim > 1: # TODO(EA2D): special case not needed with 2D EAs values = np.atleast_2d(values) if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype): # TODO: This is no longer hit internally; does it need to be retained # for e.g. pyarrow? values = DatetimeArray._simple_new(values, dtype=dtype) return klass(values, ndim=ndim, placement=placement) # ----------------------------------------------------------------- def extend_blocks(result, blocks=None): """ return a new extended blocks, given the result """ if blocks is None: blocks = [] if isinstance(result, list): for r in result: if isinstance(r, list): blocks.extend(r) else: blocks.append(r) else: assert isinstance(result, Block), type(result) blocks.append(result) return blocks def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ guarantee the shape of the values to be at least 1 d """ if values.ndim < ndim: shape = values.shape if not is_extension_array_dtype(values.dtype): # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. # error: "ExtensionArray" has no attribute "reshape" values = values.reshape(tuple((1,) + shape)) # type: ignore[attr-defined] return values def safe_reshape(arr, new_shape: Shape): """ If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): 1) If `arr` is a ExtensionArray or Index, `arr` will be returned as is. 2) If `arr` is a Series, the `_values` attribute will be reshaped and returned. Parameters ---------- arr : array-like, object to be reshaped new_shape : int or tuple of ints, the new shape """ if isinstance(arr, ABCSeries): arr = arr._values if not is_extension_array_dtype(arr.dtype): # Note: this will include TimedeltaArray and tz-naive DatetimeArray # TODO(EA2D): special case will be unnecessary with 2D EAs arr = np.asarray(arr).reshape(new_shape) return arr def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray: """ Return a new ndarray, try to preserve dtype if possible. Parameters ---------- v : np.ndarray `values`, updated in-place. mask : np.ndarray[bool] Applies to both sides (array like). n : `new values` either scalar or an array like aligned with `values` Returns ------- values : ndarray with updated values this *may* be a copy of the original See Also -------- ndarray.putmask """ # we cannot use np.asarray() here as we cannot have conversions # that numpy does when numeric are mixed with strings # n should be the length of the mask or a scalar here if not is_list_like(n): n = np.repeat(n, len(mask)) # see if we are only masking values that if putted # will work in the current dtype try: nn = n[mask] except TypeError: # TypeError: only integer scalar arrays can be converted to a scalar index pass else: # make sure that we have a nullable type # if we have nulls if not isna_compat(v, nn[0]): pass elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): # only compare integers/floats pass elif not (is_float_dtype(v.dtype) or is_integer_dtype(v.dtype)): # only compare integers/floats pass else: # we ignore ComplexWarning here with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", np.ComplexWarning) nn_at = nn.astype(v.dtype) comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = v.copy() nv[mask] = nn_at return nv n = np.asarray(n) def _putmask_preserve(nv, n): try: nv[mask] = n[mask] except (IndexError, ValueError): nv[mask] = n return nv # preserves dtype if possible if v.dtype.kind == n.dtype.kind: return _putmask_preserve(v, n) # change the dtype if needed dtype, _ = maybe_promote(n.dtype) v = v.astype(dtype) return _putmask_preserve(v, n) def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. """ if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... # Except for BooleanArray, this is equivalent to just # np.asarray(mask, dtype=bool) mask = mask.to_numpy(dtype=bool, na_value=False) assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype return mask