from collections import defaultdict
import itertools
from typing import (
    Any,
    Callable,
    DefaultDict,
    Dict,
    List,
    Optional,
    Sequence,
    Tuple,
    TypeVar,
    Union,
)
import warnings

import numpy as np

from pandas._libs import internals as libinternals, lib
from pandas._typing import ArrayLike, DtypeObj, Label, Shape
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
    find_common_type,
    infer_dtype_from_scalar,
    maybe_promote,
)
from pandas.core.dtypes.common import (
    DT64NS_DTYPE,
    is_dtype_equal,
    is_extension_array_dtype,
    is_list_like,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries
from pandas.core.dtypes.missing import array_equals, isna

import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import Index, ensure_index
from pandas.core.internals.blocks import (
    Block,
    CategoricalBlock,
    DatetimeTZBlock,
    ExtensionBlock,
    ObjectValuesExtensionBlock,
    extend_blocks,
    get_block_type,
    make_block,
    safe_reshape,
)
from pandas.core.internals.ops import blockwise_all, operate_blockwise

# TODO: flexible with index=None and/or items=None

T = TypeVar("T", bound="BlockManager")


class BlockManager(PandasObject):
    """
    Core internal data structure to implement DataFrame, Series, etc.

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the
    DataFrame public API class.

    Attributes
    ----------
    shape
    ndim
    axes
    values
    items

    Methods
    -------
    set_axis(axis, new_labels)
    copy(deep=True)
    get_dtypes
    apply(func, axes, block_filter_fn)
    get_bool_data
    get_numeric_data
    get_slice(slice_like, axis)
    get(label)
    iget(loc)
    take(indexer, axis)
    reindex_axis(new_labels, axis)
    reindex_indexer(new_labels, indexer, axis)
    delete(label)
    insert(loc, label, value)
    set(label, value)

    Parameters
    ----------
    blocks : Sequence of Block
    axes : Sequence of Index
    do_integrity_check : bool, default True

    Notes
    -----
    This is *not* a public API class.
    """

    __slots__ = [
        "axes",
        "blocks",
        "_known_consolidated",
        "_is_consolidated",
        "_blknos",
        "_blklocs",
    ]

    _blknos: np.ndarray
    _blklocs: np.ndarray

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        do_integrity_check: bool = True,
    ):
        self.axes = [ensure_index(ax) for ax in axes]
        self.blocks: Tuple[Block, ...] = tuple(blocks)

        for block in blocks:
            if self.ndim != block.ndim:
                raise AssertionError(
                    f"Number of Block dimensions ({block.ndim}) must equal "
                    f"number of axes ({self.ndim})"
                )

        if do_integrity_check:
            self._verify_integrity()

        # Populate known_consolidated, blknos, and blklocs lazily
        self._known_consolidated = False
        self._blknos = None
        self._blklocs = None

    @classmethod
    def from_blocks(cls, blocks: List[Block], axes: List[Index]):
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        return cls(blocks, axes, do_integrity_check=False)

    @property
    def blknos(self):
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this
        column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()
        return self._blknos

    @property
    def blklocs(self):
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()
        return self._blklocs

    def make_empty(self: T, axes=None) -> T:
        """ return an empty BlockManager with the items axis of len 0 """
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            nb = blk.make_block_same_class(arr, placement=slice(0, 0), ndim=1)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    @property
    def shape(self) -> Shape:
        return tuple(len(ax) for ax in self.axes)

    @property
    def ndim(self) -> int:
        return len(self.axes)

    def set_axis(self, axis: int, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        old_len = len(self.axes[axis])
        new_len = len(new_labels)

        if new_len != old_len:
            raise ValueError(
                f"Length mismatch: Expected axis has {old_len} elements, new "
                f"values have {new_len} elements"
            )

        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    def _rebuild_blknos_and_blklocs(self) -> None:
        """
        Update mgr._blknos / mgr._blklocs.
        """
        new_blknos = np.empty(self.shape[0], dtype=np.intp)
        new_blklocs = np.empty(self.shape[0], dtype=np.intp)
        new_blknos.fill(-1)
        new_blklocs.fill(-1)

        for blkno, blk in enumerate(self.blocks):
            rl = blk.mgr_locs
            new_blknos[rl.indexer] = blkno
            new_blklocs[rl.indexer] = np.arange(len(rl))

        if (new_blknos == -1).any():
            # TODO: can we avoid this? it isn't cheap
            raise AssertionError("Gaps in blk ref_locs")

        self._blknos = new_blknos
        self._blklocs = new_blklocs

    @property
    def items(self) -> Index:
        return self.axes[0]

    def get_dtypes(self):
        dtypes = np.array([blk.dtype for blk in self.blocks])
        return algos.take_1d(dtypes, self.blknos, allow_fill=False)
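
    # Illustrative sketch of the blknos/blklocs mapping (kept as a comment so
    # nothing runs at import time; ``_mgr`` is a pandas-internal attribute and
    # may change between versions):
    #
    #   >>> import pandas as pd
    #   >>> df = pd.DataFrame({"a": [1], "b": [1.0], "c": [2]})
    #   >>> mgr = df._mgr
    #   >>> blk = mgr.blocks[mgr.blknos[0]]   # block holding column "a"
    #   >>> blk.dtype, mgr.blklocs[0]         # dtype, row of "a" within blk
    #   (dtype('int64'), 0)
    #
    # Exact block ordering is an implementation detail; the invariant is only
    # that blocks[blknos[i]].mgr_locs contains i at position blklocs[i].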

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
        axes_array = list(self.axes)

        extra_state = {
            "0.14.1": {
                "axes": axes_array,
                "blocks": [
                    {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
                    for b in self.blocks
                ],
            }
        }

        # First three elements of the state are to maintain forward
        # compatibility with 0.13.1.
        return axes_array, block_values, block_items, extra_state

    def __setstate__(self, state):
        def unpickle_block(values, mgr_locs, ndim: int):
            # TODO(EA2D): ndim would be unnecessary with 2D EAs
            return make_block(values, placement=mgr_locs, ndim=ndim)

        if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
            state = state[3]["0.14.1"]
            self.axes = [ensure_index(ax) for ax in state["axes"]]
            ndim = len(self.axes)
            self.blocks = tuple(
                unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
                for b in state["blocks"]
            )
        else:
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        self._post_setstate()

    def _post_setstate(self) -> None:
        self._is_consolidated = False
        self._known_consolidated = False
        self._rebuild_blknos_and_blklocs()

    def __len__(self) -> int:
        return len(self.items)

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )

    def reduce(
        self: T, func: Callable, ignore_failures: bool = False
    ) -> Tuple[T, np.ndarray]:
        """
        Apply reduction function blockwise, returning a single-row
        BlockManager.

        Parameters
        ----------
        func : reduction function
        ignore_failures : bool, default False
            Whether to drop blocks where func raises TypeError.

        Returns
        -------
        BlockManager
        np.ndarray
            Indexer of mgr_locs that are retained.
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: List[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func, ignore_failures)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        if ignore_failures:
            if res_blocks:
                indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks])
                new_mgr = self._combine(res_blocks, copy=False, index=index)
            else:
                indexer = []
                new_mgr = type(self).from_blocks([], [Index([]), index])
        else:
            indexer = np.arange(self.shape[0])
            new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr, indexer

    def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager":
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)
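
    # Sketch of how ``reduce`` composes (illustrative, not executed): the
    # callable should reduce along axis=1 of each block's values, e.g.
    #
    #   new_mgr, indexer = mgr.reduce(lambda x: x.sum(axis=1))
    #   new_mgr.shape    # -> (ncols, 1): one entry left per original column
    #
    # With ignore_failures=True, blocks whose dtype rejects the reduction
    # (raising TypeError) are dropped, and ``indexer`` records which
    # mgr_locs survived.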

    def apply(
        self: T,
        f,
        align_keys: Optional[List[str]] = None,
        ignore_failures: bool = False,
        **kwargs,
    ) -> T:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys : List[str] or None, default None
        ignore_failures : bool, default False
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: List[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:

            if aligned_args:

                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        #  obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            try:
                if callable(f):
                    applied = b.apply(f, **kwargs)
                else:
                    applied = getattr(b, f)(**kwargs)
            except (TypeError, NotImplementedError):
                if not ignore_failures:
                    raise
                continue
            result_blocks = extend_blocks(applied, result_blocks)

        if ignore_failures:
            return self._combine(result_blocks)

        if len(result_blocks) == 0:
            return self.make_empty(self.axes)

        return type(self).from_blocks(result_blocks, self.axes)
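
    # Sketch (illustrative, not executed): ``apply`` is the central dispatch
    # used by the methods below, either by Block-method name
    #
    #   mgr.apply("astype", dtype="float64", copy=True)
    #
    # or with a callable applied blockwise.  When a kwarg name appears in
    # align_keys (e.g. "other" in ``where``), that argument is sliced with
    # b.mgr_locs per block, so each block sees only its own columns of the
    # aligned operand.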

    def quantile(
        self,
        axis: int = 0,
        consolidate: bool = True,
        transposed: bool = False,
        interpolation="linear",
        qs=None,
        numeric_only=None,
    ) -> "BlockManager":
        """
        Iterate over blocks applying quantile reduction.

        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        axis : int, default 0
            Reduction axis.
        consolidate : bool, default True
            Join together blocks having the same dtype.
        transposed : bool, default False
            We are holding transposed data.
        interpolation : str, default 'linear'
            Type of interpolation.
        qs : a scalar or list of the quantiles to be computed
        numeric_only : ignored

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        #  simplify some of the code here and in the blocks
        assert self.ndim >= 2

        if consolidate:
            self._consolidate_inplace()

        def get_axe(block, qs, axes):
            # Because Series dispatches to DataFrame, we will always have
            #  block.ndim == 2
            from pandas import Float64Index

            if is_list_like(qs):
                ax = Float64Index(qs)
            else:
                ax = axes[0]
            return ax

        axes, blocks = [], []
        for b in self.blocks:
            block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)

            axe = get_axe(b, qs, axes=self.axes)

            axes.append(axe)
            blocks.append(block)

        # note that some DatetimeTZ, Categorical are always ndim==1
        ndim = {b.ndim for b in blocks}
        assert 0 not in ndim, ndim

        if 2 in ndim:

            new_axes = list(self.axes)

            # multiple blocks that are reduced
            if len(blocks) > 1:
                new_axes[1] = axes[0]

                # reset the placement to the original
                for b, sb in zip(blocks, self.blocks):
                    b.mgr_locs = sb.mgr_locs
            else:
                new_axes[axis] = Index(np.concatenate([ax._values for ax in axes]))

            if transposed:
                new_axes = new_axes[::-1]
                blocks = [
                    b.make_block(b.values.T, placement=np.arange(b.shape[1]))
                    for b in blocks
                ]

            return type(self)(blocks, new_axes)

        # single block, i.e. ndim == {1}
        values = concat_compat([b.values for b in blocks])

        # compute the orderings of our original data
        if len(self.blocks) > 1:
            indexer = np.empty(len(self.axes[0]), dtype=np.intp)
            i = 0
            for b in self.blocks:
                for j in b.mgr_locs:
                    indexer[j] = i
                    i = i + 1

            values = values.take(indexer)

        return SingleBlockManager(
            make_block(values, ndim=1, placement=np.arange(len(values))), axes[0]
        )

    def isna(self, func) -> "BlockManager":
        return self.apply("apply", func=func)

    def where(
        self, other, cond, align: bool, errors: str, try_cast: bool, axis: int
    ) -> "BlockManager":
        if align:
            align_keys = ["other", "cond"]
        else:
            align_keys = ["cond"]
            other = extract_array(other, extract_numpy=True)

        return self.apply(
            "where",
            align_keys=align_keys,
            other=other,
            cond=cond,
            errors=errors,
            try_cast=try_cast,
            axis=axis,
        )

    def setitem(self, indexer, value) -> "BlockManager":
        return self.apply("setitem", indexer=indexer, value=value)

    def putmask(self, mask, new, align: bool = True, axis: int = 0):
        transpose = self.ndim == 2

        if align:
            align_keys = ["new", "mask"]
        else:
            align_keys = ["mask"]
            new = extract_array(new, extract_numpy=True)

        return self.apply(
            "putmask",
            align_keys=align_keys,
            mask=mask,
            new=new,
            inplace=True,
            axis=axis,
            transpose=transpose,
        )

    def diff(self, n: int, axis: int) -> "BlockManager":
        return self.apply("diff", n=n, axis=axis)

    def interpolate(self, **kwargs) -> "BlockManager":
        return self.apply("interpolate", **kwargs)

    def shift(self, periods: int, axis: int, fill_value) -> "BlockManager":
        if fill_value is lib.no_default:
            fill_value = None

        if axis == 0 and self.ndim == 2 and self.nblocks > 1:
            # GH#35488 we need to watch out for multi-block cases
            # We only get here with fill_value not-lib.no_default
            ncols = self.shape[0]
            if periods > 0:
                indexer = [-1] * periods + list(range(ncols - periods))
            else:
                nper = abs(periods)
                indexer = list(range(nper, ncols)) + [-1] * nper
            result = self.reindex_indexer(
                self.items,
                indexer,
                axis=0,
                fill_value=fill_value,
                allow_dups=True,
                consolidate=False,
            )
            return result

        return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)

    def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager":
        return self.apply(
            "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
        )

    def downcast(self) -> "BlockManager":
        return self.apply("downcast")

    def astype(
        self, dtype, copy: bool = False, errors: str = "raise"
    ) -> "BlockManager":
        return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

    def convert(
        self,
        copy: bool = True,
        datetime: bool = True,
        numeric: bool = True,
        timedelta: bool = True,
    ) -> "BlockManager":
        return self.apply(
            "convert",
            copy=copy,
            datetime=datetime,
            numeric=numeric,
            timedelta=timedelta,
        )

    def replace(self, to_replace, value, inplace: bool, regex: bool) -> "BlockManager":
        assert np.ndim(value) == 0, value
        return self.apply(
            "replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex
        )

    def replace_list(
        self: T,
        src_list: List[Any],
        dest_list: List[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """ do a list replace """
        inplace = validate_bool_kwarg(inplace, "inplace")

        bm = self.apply(
            "_replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
        )
        bm._consolidate_inplace()
        return bm
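
    # Sketch of the multi-block ``shift`` fast path above (illustrative):
    # shifting along the items axis is re-expressed as a reindex with -1
    # markers, e.g. for periods=1 and ncols=3
    #
    #   indexer = [-1, 0, 1]   # new column 0 is an all-NA fill column
    #
    # and reindex_indexer(..., allow_dups=True, consolidate=False) then
    # materializes the fill column via _make_na_block.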
""" return self.apply("to_native_types", **kwargs) def is_consolidated(self) -> bool: """ Return True if more than one block with the same dtype """ if not self._known_consolidated: self._consolidate_check() return self._is_consolidated def _consolidate_check(self) -> None: dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True @property def is_numeric_mixed_type(self) -> bool: return all(block.is_numeric for block in self.blocks) @property def any_extension_types(self) -> bool: """Whether any of the blocks in this manager are extension blocks""" return any(block.is_extension for block in self.blocks) @property def is_view(self) -> bool: """ return a boolean if we are a single block and are a view """ if len(self.blocks) == 1: return self.blocks[0].is_view # It is technically possible to figure out which blocks are views # e.g. [ b.values.base is not None for b in self.blocks ] # but then we have the case of possibly some blocks being a view # and some blocks not. setting in theory is possible on the non-view # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit # complicated return False def get_bool_data(self, copy: bool = False) -> "BlockManager": """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. Parameters ---------- copy : bool, default False Whether to copy the blocks """ new_blocks = [] for blk in self.blocks: if blk.dtype == bool: new_blocks.append(blk) elif blk.is_object: nbs = blk._split() for nb in nbs: if nb.is_bool: new_blocks.append(nb) return self._combine(new_blocks, copy) def get_numeric_data(self, copy: bool = False) -> "BlockManager": """ Parameters ---------- copy : bool, default False Whether to copy the blocks """ return self._combine([b for b in self.blocks if b.is_numeric], copy) def _combine( self: T, blocks: List[Block], copy: bool = True, index: Optional[Index] = None ) -> T: """ return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() # FIXME: optimization potential indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks: List[Block] = [] for b in blocks: b = b.copy(deep=copy) b.mgr_locs = inv_indexer[b.mgr_locs.indexer] new_blocks.append(b) axes = list(self.axes) if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) return type(self).from_blocks(new_blocks, axes) def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: slicer = (slice(None), slobj) new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] else: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) new_axes[axis] = new_axes[axis][slobj] bm = type(self)(new_blocks, new_axes, do_integrity_check=False) return bm @property def nblocks(self) -> int: return len(self.blocks) def copy(self: T, deep=True) -> T: """ Make deep or shallow copy of BlockManager Parameters ---------- deep : bool or string, default True If False, return shallow copy (do not copy data) If 'all', copy data and a deep copy of the index Returns ------- BlockManager """ # this preserves the notion of view copying of axes if deep: # hit in e.g. 

    def copy(self: T, deep=True) -> T:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool or string, default True
            If False, return shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        res.axes = new_axes
        return res

    def as_array(
        self,
        transpose: bool = False,
        dtype=None,
        copy: bool = False,
        na_value=lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        transpose : bool, default False
            If True, transpose the return array.
        dtype : object, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose() if transpose else arr

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if self.is_single_block:
            blk = self.blocks[0]
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object
                arr = blk.values.to_numpy(dtype=dtype, na_value=na_value).reshape(
                    blk.shape
                )
            else:
                arr = np.asarray(blk.get_values())
                if dtype:
                    arr = arr.astype(dtype, copy=False)
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave
            copy = False

        if copy:
            arr = arr.copy()

        if na_value is not lib.no_default:
            arr[isna(arr)] = na_value

        return arr.transpose() if transpose else arr

    def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            dtype = _interleaved_dtype(self.blocks)

        # TODO: https://github.com/pandas-dev/pandas/issues/22791
        # Give EAs some input on what happens here. Sparse needs this.
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype
        elif is_extension_array_dtype(dtype):
            dtype = "object"
        elif is_dtype_equal(dtype, str):
            dtype = "object"

        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object
                arr = blk.values.to_numpy(dtype=dtype, na_value=na_value)
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    def to_dict(self, copy: bool = True):
        """
        Return a dict of str(dtype) -> BlockManager

        Parameters
        ----------
        copy : bool, default True

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """
        bd: Dict[str, List[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
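
    # Sketch (illustrative; ``_mgr`` is internal and may change): as_array /
    # _interleave promote mixed blocks to the common dtype chosen by
    # find_common_type, e.g.
    #
    #   pd.DataFrame({"a": [1], "b": [1.5]})._mgr.as_array().dtype
    #   # -> dtype('float64')
    #   pd.DataFrame({"a": [1], "b": ["x"]})._mgr.as_array().dtype
    #   # -> dtype('O')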

    def fast_xs(self, loc: int) -> ArrayLike:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if len(self.blocks) == 1:
            return self.blocks[0].iget((slice(None), loc))

        dtype = _interleaved_dtype(self.blocks)

        n = len(self)
        if is_extension_array_dtype(dtype):
            # we'll eventually construct an ExtensionArray.
            result = np.empty(n, dtype=object)
        else:
            result = np.empty(n, dtype=dtype)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if isinstance(dtype, ExtensionDtype):
            result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

        return result

    def consolidate(self) -> "BlockManager":
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm

    def _consolidate_inplace(self) -> None:
        if not self.is_consolidated():
            self.blocks = tuple(_consolidate(self.blocks))
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()

    def iget(self, i: int) -> "SingleBlockManager":
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for select a single-dim from a 2-dim BM
        return SingleBlockManager(
            block.make_block_same_class(
                values, placement=slice(0, len(values)), ndim=1
            ),
            self.axes[1],
        )

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    def idelete(self, indexer):
        """
        Delete selected locations in-place (new block and array, same BlockManager)
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        ref_loc_offset = -is_deleted.cumsum()

        is_blk_deleted = [False] * len(self.blocks)

        if isinstance(indexer, int):
            affected_start = indexer
        else:
            affected_start = is_deleted.nonzero()[0][0]

        for blkno, _ in _fast_count_smallints(self.blknos[affected_start:]):
            blk = self.blocks[blkno]
            bml = blk.mgr_locs
            blk_del = is_deleted[bml.indexer].nonzero()[0]

            if len(blk_del) == len(bml):
                is_blk_deleted[blkno] = True
                continue
            elif len(blk_del) != 0:
                blk.delete(blk_del)
                bml = blk.mgr_locs

            blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])

        # FIXME: use Index.delete as soon as it uses fastpath=True
        self.axes[0] = self.items[~is_deleted]
        self.blocks = tuple(
            b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno]
        )
        self._rebuild_blknos_and_blklocs()
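
    # Sketch (illustrative, not executed): ``iget`` is the single-column fast
    # path behind column access; it wraps one row of one block in a
    # SingleBlockManager without touching the other blocks:
    #
    #   sbm = mgr.iget(0)
    #   sbm.shape    # -> (nrows,), versus mgr.shape == (ncols, nrows)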

    def iset(self, loc: Union[int, slice, np.ndarray], value):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """
        value = extract_array(value, extract_numpy=True)
        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        #  can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        value_is_extension_type = is_extension_array_dtype(value)

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:
            if value.ndim == self.ndim - 1:
                value = safe_reshape(value, (1,) + value.shape)

                def value_getitem(placement):
                    return value

            else:

                def value_getitem(placement):
                    return value[placement.indexer]

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            #  containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))
            loc = [loc]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno]
            blk_locs = blklocs[val_locs.indexer]
            if blk.should_store(value):
                blk.set_inplace(blk_locs, value_getitem(val_locs))
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno)
                else:
                    blk.delete(blk_locs)
                    self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))

        if len(removed_blknos):
            # Remove blocks & update blknos accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.int64)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )

        if unfit_val_locs:
            unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_mgr_locs)

            new_blocks: List[Block] = []
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                #  one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    make_block(
                        values=value,
                        ndim=self.ndim,
                        placement=slice(mgr_loc, mgr_loc + 1),
                    )
                    for mgr_loc in unfit_mgr_locs
                )

                self._blknos[unfit_mgr_locs] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_mgr_locs] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    make_block(
                        values=value_getitem(unfit_val_items),
                        ndim=self.ndim,
                        placement=unfit_mgr_locs,
                    )
                )

                self._blknos[unfit_mgr_locs] = len(self.blocks)
                self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)

            # Newly created block's dtype may already be present.
            self._known_consolidated = False
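
    # Sketch of ``iset`` semantics (illustrative): when the target block can
    # hold the new values (blk.should_store), the write happens in place;
    # otherwise the affected columns are split off into new block(s).  E.g.
    # assigning float values into a column of an IntBlock does not mutate
    # the IntBlock -- the column is deleted from it and re-added as a new
    # float block, leaving the manager possibly unconsolidated.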

    def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False):
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : array_like
        allow_duplicates : bool
            If False, trying to insert non-unique item will raise
        """
        if not allow_duplicates and item in self.items:
            # Should this be a different kind of error??
            raise ValueError(f"cannot insert {item}, already exists")

        if not isinstance(loc, int):
            raise TypeError("loc must be int")

        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype):
            # TODO(EA2D): special case not needed with 2D EAs
            value = safe_reshape(value, (1,) + value.shape)

        block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))

        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            blk = self.blocks[blkno]
            if count == len(blk.mgr_locs):
                blk.mgr_locs = blk.mgr_locs.add(1)
            else:
                new_mgr_locs = blk.mgr_locs.as_array.copy()
                new_mgr_locs[new_mgr_locs >= loc] += 1
                blk.mgr_locs = new_mgr_locs

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        else:
            self._blklocs = np.insert(self._blklocs, loc, 0)
            self._blknos = np.insert(self._blknos, loc, len(self.blocks))

        self.axes[0] = new_axis
        self.blocks += (block,)

        self._known_consolidated = False

        if len(self.blocks) > 100:
            self._consolidate_inplace()

    def reindex_axis(
        self,
        new_index,
        axis: int,
        method=None,
        limit=None,
        fill_value=None,
        copy: bool = True,
        consolidate: bool = True,
        only_slice: bool = False,
    ):
        """
        Conform block manager to new index.
        """
        new_index = ensure_index(new_index)
        new_index, indexer = self.axes[axis].reindex(
            new_index, method=method, limit=limit
        )

        return self.reindex_indexer(
            new_index,
            indexer,
            axis=axis,
            fill_value=fill_value,
            copy=copy,
            consolidate=consolidate,
            only_slice=only_slice,
        )

    def reindex_indexer(
        self: T,
        new_axis,
        indexer,
        axis: int,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool = True,
        consolidate: bool = True,
        only_slice: bool = False,
    ) -> T:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray of int64 or None
            pandas-indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool, default True
        consolidate : bool, default True
            Whether to consolidate inplace before reindexing.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        """
        if indexer is None:
            if new_axis is self.axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result.axes = list(self.axes)
            result.axes[axis] = new_axis
            return result

        if consolidate:
            self._consolidate_inplace()

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self.axes[axis]._can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(
                indexer, fill_value=fill_value, only_slice=only_slice
            )
        else:
            new_blocks = [
                blk.take_nd(
                    indexer,
                    axis=axis,
                    fill_value=(
                        fill_value if fill_value is not None else blk.fill_value
                    ),
                )
                for blk in self.blocks
            ]

        new_axes = list(self.axes)
        new_axes[axis] = new_axis

        return type(self).from_blocks(new_blocks, new_axes)
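
    # Sketch (illustrative): reindex_indexer treats -1 entries in ``indexer``
    # as "introduce a missing column/row here"; along axis=0 those positions
    # become all-NA blocks from _make_na_block, e.g.
    #
    #   mgr.reindex_indexer(new_axis, indexer=[0, -1, 1], axis=0,
    #                       fill_value=np.nan, allow_dups=True)
    #
    # keeps items 0 and 1 and splices a float64 NaN block between them.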

    def _slice_take_blocks_ax0(
        self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False
    ):
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice, ndarray[bool], or list-like of ints
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type in ("slice", "mask"):
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return []
                return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    _, fill_value = maybe_promote(blk.dtype)

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    #  views instead of copies
                    blocks = [
                        blk.getitem_block([ml], new_mgr_locs=i)
                        for i, ml in enumerate(slobj)
                    ]
                    return blocks
                else:
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=slice(0, sllen),
                            fill_value=fill_value,
                        )
                    ]

        if sl_type in ("slice", "mask"):
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_1d(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_1d(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        #  blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(placement=mgr_locs, fill_value=fill_value)
                )
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate:
                    # A non-consolidatable block, it's easy, because there's
                    #  only one item and each mgr loc is a copy of that single
                    #  item.
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=False)
                        newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
                        blocks.append(newblk)

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    #  we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice:
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        #  views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            nb = blk.getitem_block([i], new_mgr_locs=ml)
                            blocks.append(nb)
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)

        return blocks

    def _make_na_block(self, placement, fill_value=None):

        if fill_value is None:
            fill_value = np.nan
        block_shape = list(self.shape)
        block_shape[0] = len(placement)

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        block_values = np.empty(block_shape, dtype=dtype)
        block_values.fill(fill_value)
        return make_block(block_values, placement=placement, ndim=block_values.ndim)
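
    # Sketch (illustrative): with only_slice=True, _slice_take_blocks_ax0
    # prefers blk.getitem_block (a view) over blk.take_nd (a copy), falling
    # back to one single-column view block per output column when the taker
    # cannot be expressed as a slice; operate_blockwise relies on this to
    # avoid copying operands (GH#33597, GH#32779).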
""" self._consolidate_inplace() indexer = ( np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") if isinstance(indexer, slice) else np.asanyarray(indexer, dtype="int64") ) n = self.shape[axis] if convert: indexer = maybe_convert_indices(indexer, n) if verify: if ((indexer == -1) | (indexer >= n)).any(): raise Exception("Indices must be nonzero and less than the axis length") new_labels = self.axes[axis].take(indexer) return self.reindex_indexer( new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True ) def equals(self, other: object) -> bool: if not isinstance(other, BlockManager): return False self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): return False if self.ndim == 1: # For SingleBlockManager (i.e.Series) if other.ndim != 1: return False left = self.blocks[0].values right = other.blocks[0].values return array_equals(left, right) return blockwise_all(self, other, array_equals) def unstack(self, unstacker, fill_value) -> "BlockManager": """ Return a BlockManager with all blocks unstacked.. Parameters ---------- unstacker : reshape._Unstacker fill_value : Any fill_value for newly introduced missing values. Returns ------- unstacked : BlockManager """ new_columns = unstacker.get_new_columns(self.items) new_index = unstacker.new_index new_blocks: List[Block] = [] columns_mask: List[np.ndarray] = [] for blk in self.blocks: blk_cols = self.items[blk.mgr_locs.indexer] new_items = unstacker.get_new_columns(blk_cols) new_placement = new_columns.get_indexer(new_items) blocks, mask = blk._unstack( unstacker, fill_value, new_placement=new_placement ) new_blocks.extend(blocks) columns_mask.extend(mask) new_columns = new_columns[columns_mask] bm = BlockManager(new_blocks, [new_columns, new_index]) return bm class SingleBlockManager(BlockManager): """ manage a single block with """ ndim = 1 _is_consolidated = True _known_consolidated = True __slots__ = () is_single_block = True def __init__( self, block: Block, axis: Index, do_integrity_check: bool = False, fastpath=lib.no_default, ): assert isinstance(block, Block), type(block) assert isinstance(axis, Index), type(axis) if fastpath is not lib.no_default: warnings.warn( "The `fastpath` keyword is deprecated and will be removed " "in a future version.", FutureWarning, stacklevel=2, ) self.axes = [axis] self.blocks = (block,) @classmethod def from_blocks( cls, blocks: List[Block], axes: List[Index] ) -> "SingleBlockManager": """ Constructor for BlockManager and SingleBlockManager with same signature. """ assert len(blocks) == 1 assert len(axes) == 1 return cls(blocks[0], axes[0], do_integrity_check=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> "SingleBlockManager": """ Constructor for if we have an array that is not yet a Block. 
""" block = make_block(array, placement=slice(0, len(index)), ndim=1) return cls(block, index) def _post_setstate(self): pass @property def _block(self) -> Block: return self.blocks[0] @property def _blknos(self): """ compat with BlockManager """ return None @property def _blklocs(self): """ compat with BlockManager """ return None def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager": if axis >= self.ndim: raise IndexError("Requested axis not found in manager") blk = self._block array = blk._slice(slobj) block = blk.make_block_same_class(array, placement=slice(0, len(array))) return type(self)(block, self.index[slobj]) @property def index(self) -> Index: return self.axes[0] @property def dtype(self) -> DtypeObj: return self._block.dtype def get_dtypes(self) -> np.ndarray: return np.array([self._block.dtype]) def external_values(self): """The array that Series.values returns""" return self._block.external_values() def internal_values(self): """The array that Series._values returns""" return self._block.internal_values() @property def _can_hold_na(self) -> bool: return self._block._can_hold_na def is_consolidated(self) -> bool: return True def _consolidate_check(self): pass def _consolidate_inplace(self): pass def idelete(self, indexer): """ Delete single location from SingleBlockManager. Ensures that self.blocks doesn't become empty. """ self._block.delete(indexer) self.axes[0] = self.axes[0].delete(indexer) def fast_xs(self, loc): """ fast path for getting a cross-section return a view of the data """ raise NotImplementedError("Use series._values[loc] instead") # -------------------------------------------------------------------- # Constructor Helpers def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: try: if len(blocks) == 1 and not isinstance(blocks[0], Block): # if blocks[0] is of length 0, return empty blocks if not len(blocks[0]): blocks = [] else: # It's OK if a single block is passed as values, its placement # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. blocks = [ make_block( values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 ) ] mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr except ValueError as e: blocks = [getattr(b, "values", b) for b in blocks] tot_items = sum(b.shape[0] for b in blocks) raise construction_error(tot_items, blocks[0].shape[1:], axes, e) def create_block_manager_from_arrays( arrays, names: Index, axes: List[Index] ) -> BlockManager: assert isinstance(names, Index) assert isinstance(axes, list) assert all(isinstance(x, Index) for x in axes) # ensure we dont have any PandasArrays when we call get_block_type # Note: just calling extract_array breaks tests that patch PandasArray._typ. 


def create_block_manager_from_arrays(
    arrays, names: Index, axes: List[Index]
) -> BlockManager:
    assert isinstance(names, Index)
    assert isinstance(axes, list)
    assert all(isinstance(x, Index) for x in axes)

    # ensure we don't have any PandasArrays when we call get_block_type
    # Note: just calling extract_array breaks tests that patch PandasArray._typ.
    arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays]
    try:
        blocks = _form_blocks(arrays, names, axes)
        mgr = BlockManager(blocks, axes)
        mgr._consolidate_inplace()
        return mgr
    except ValueError as e:
        raise construction_error(len(arrays), arrays[0].shape, axes, e)


def construction_error(tot_items, block_shape, axes, e=None):
    """ raise a helpful message about our construction """
    passed = tuple(map(int, [tot_items] + list(block_shape)))
    # Correcting the user facing error message during dataframe construction
    if len(passed) <= 2:
        passed = passed[::-1]

    implied = tuple(len(ax) for ax in axes)
    # Correcting the user facing error message during dataframe construction
    if len(implied) <= 2:
        implied = implied[::-1]

    # We return the exception object instead of raising it so that we
    #  can raise it in the caller; mypy plays better with that
    if passed == implied and e is not None:
        return e
    if block_shape[0] == 0:
        return ValueError("Empty data passed with indices specified.")
    return ValueError(f"Shape of passed values is {passed}, indices imply {implied}")


# -----------------------------------------------------------------------


def _form_blocks(arrays, names: Index, axes) -> List[Block]:
    # put "leftover" items in float bucket, where else?
    # generalize?
    items_dict: DefaultDict[str, List] = defaultdict(list)
    extra_locs = []

    names_idx = names
    if names_idx.equals(axes[0]):
        names_indexer = np.arange(len(names_idx))
    else:
        assert names_idx.intersection(axes[0]).is_unique
        names_indexer = names_idx.get_indexer_for(axes[0])

    for i, name_idx in enumerate(names_indexer):
        if name_idx == -1:
            extra_locs.append(i)
            continue

        k = names[name_idx]
        v = arrays[name_idx]

        block_type = get_block_type(v)
        items_dict[block_type.__name__].append((i, k, v))

    blocks: List[Block] = []
    if len(items_dict["FloatBlock"]):
        float_blocks = _multi_blockify(items_dict["FloatBlock"])
        blocks.extend(float_blocks)

    if len(items_dict["ComplexBlock"]):
        complex_blocks = _multi_blockify(items_dict["ComplexBlock"])
        blocks.extend(complex_blocks)

    if len(items_dict["TimeDeltaBlock"]):
        timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
        blocks.extend(timedelta_blocks)

    if len(items_dict["IntBlock"]):
        int_blocks = _multi_blockify(items_dict["IntBlock"])
        blocks.extend(int_blocks)

    if len(items_dict["DatetimeBlock"]):
        datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
        blocks.extend(datetime_blocks)

    if len(items_dict["DatetimeTZBlock"]):
        dttz_blocks = [
            make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2)
            for i, _, array in items_dict["DatetimeTZBlock"]
        ]
        blocks.extend(dttz_blocks)

    if len(items_dict["BoolBlock"]):
        bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_)
        blocks.extend(bool_blocks)

    if len(items_dict["ObjectBlock"]) > 0:
        object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
        blocks.extend(object_blocks)

    if len(items_dict["CategoricalBlock"]) > 0:
        cat_blocks = [
            make_block(array, klass=CategoricalBlock, placement=i, ndim=2)
            for i, _, array in items_dict["CategoricalBlock"]
        ]
        blocks.extend(cat_blocks)

    if len(items_dict["ExtensionBlock"]):
        external_blocks = [
            make_block(array, klass=ExtensionBlock, placement=i, ndim=2)
            for i, _, array in items_dict["ExtensionBlock"]
        ]
        blocks.extend(external_blocks)

    if len(items_dict["ObjectValuesExtensionBlock"]):
        external_blocks = [
            make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2)
            for i, _, array in items_dict["ObjectValuesExtensionBlock"]
        ]
        blocks.extend(external_blocks)

    if len(extra_locs):
        shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])

        # empty items -> dtype object
        block_values = np.empty(shape, dtype=object)
        block_values.fill(np.nan)

        na_block = make_block(block_values, placement=extra_locs, ndim=2)
        blocks.append(na_block)

    return blocks


def _simple_blockify(tuples, dtype) -> List[Block]:
    """
    return a single array of a block that has a single dtype; if dtype is
    not None, coerce to this dtype
    """
    values, placement = _stack_arrays(tuples, dtype)

    # TODO: CHECK DTYPE?
    if dtype is not None and values.dtype != dtype:  # pragma: no cover
        values = values.astype(dtype)

    block = make_block(values, placement=placement, ndim=2)
    return [block]


def _multi_blockify(tuples, dtype=None):
    """ return an array of blocks that potentially have different dtypes """
    # group by dtype
    grouper = itertools.groupby(tuples, lambda x: x[2].dtype)

    new_blocks = []
    for dtype, tup_block in grouper:

        values, placement = _stack_arrays(list(tup_block), dtype)

        block = make_block(values, placement=placement, ndim=2)
        new_blocks.append(block)

    return new_blocks


def _stack_arrays(tuples, dtype):
    def _asarray_compat(x):
        if isinstance(x, ABCSeries):
            return x._values
        else:
            return np.asarray(x)

    def _shape_compat(x) -> Shape:
        if isinstance(x, ABCSeries):
            return (len(x),)
        else:
            return x.shape

    placement, names, arrays = zip(*tuples)

    first = arrays[0]
    shape = (len(arrays),) + _shape_compat(first)

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = _asarray_compat(arr)

    return stacked, placement


def _interleaved_dtype(blocks: Sequence[Block]) -> Optional[DtypeObj]:
    """
    Find the common dtype for `blocks`.

    Parameters
    ----------
    blocks : List[Block]

    Returns
    -------
    dtype : np.dtype, ExtensionDtype, or None
        None is returned when `blocks` is empty.
    """
    if not len(blocks):
        return None

    return find_common_type([b.dtype for b in blocks])
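

# Illustrative sketch, not part of the pandas API: the common dtype used when
# interleaving heterogeneous blocks (see BlockManager._interleave).
def _demo_interleaved_dtype() -> None:
    int_blk = make_block(np.array([[1, 2]]), placement=np.array([0]), ndim=2)
    flt_blk = make_block(np.array([[1.5, 2.5]]), placement=np.array([1]), ndim=2)
    # int64 + float64 interleave as float64; mixing in object would give object
    assert _interleaved_dtype([int_blk, flt_blk]) == np.dtype("float64")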


def _consolidate(blocks):
    """
    Merge blocks having same dtype, exclude non-consolidating blocks
    """
    # sort by _can_consolidate, dtype
    gkey = lambda x: x._consolidate_key
    grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)

    new_blocks: List[Block] = []
    for (_can_consolidate, dtype), group_blocks in grouper:
        merged_blocks = _merge_blocks(
            list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
        )
        new_blocks = extend_blocks(merged_blocks, new_blocks)
    return new_blocks


def _merge_blocks(
    blocks: List[Block], dtype: DtypeObj, can_consolidate: bool
) -> List[Block]:

    if len(blocks) == 1:
        return blocks

    if can_consolidate:

        if dtype is None:
            if len({b.dtype for b in blocks}) != 1:
                raise AssertionError("_merge_blocks are invalid!")

        # TODO: optimization potential in case all mgrs contain slices and
        #  combination of those slices is a slice, too.
        new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
        new_values = np.vstack([b.values for b in blocks])

        argsort = np.argsort(new_mgr_locs)
        new_values = new_values[argsort]
        new_mgr_locs = new_mgr_locs[argsort]

        return [make_block(new_values, placement=new_mgr_locs, ndim=2)]

    # can't consolidate --> no merge
    return blocks


def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:
    """Faster version of set(arr) for sequences of small numbers."""
    counts = np.bincount(arr.astype(np.int_))
    nz = counts.nonzero()[0]
    return np.c_[nz, counts[nz]]


def _preprocess_slice_or_indexer(slice_or_indexer, length: int, allow_fill: bool):
    if isinstance(slice_or_indexer, slice):
        return (
            "slice",
            slice_or_indexer,
            libinternals.slice_len(slice_or_indexer, length),
        )
    elif (
        isinstance(slice_or_indexer, np.ndarray) and slice_or_indexer.dtype == np.bool_
    ):
        return "mask", slice_or_indexer, slice_or_indexer.sum()
    else:
        indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
        if not allow_fill:
            indexer = maybe_convert_indices(indexer, length)
        return "fancy", indexer, len(indexer)
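

# Illustrative sketches, not part of the pandas API: the small indexing
# helpers above in action.
def _demo_fast_count_smallints() -> None:
    # rows of (value, count), computed via np.bincount
    out = _fast_count_smallints(np.array([0, 2, 2, 3]))
    assert out.tolist() == [[0, 1], [2, 2], [3, 1]]


def _demo_preprocess_slice_or_indexer() -> None:
    # the three indexer kinds normalized by _preprocess_slice_or_indexer
    assert _preprocess_slice_or_indexer(slice(0, 2), 4, allow_fill=False)[0] == "slice"
    mask = np.array([True, False, True, False])
    assert _preprocess_slice_or_indexer(mask, 4, allow_fill=False)[0] == "mask"
    kind, indexer, n = _preprocess_slice_or_indexer([0, 2], 4, allow_fill=False)
    assert kind == "fancy" and n == 2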