from __future__ import annotations import copy as cp import itertools from typing import ( TYPE_CHECKING, Sequence, cast, ) import numpy as np from pandas._libs import ( NaT, internals as libinternals, ) from pandas._libs.missing import NA from pandas._typing import ( ArrayLike, AxisInt, DtypeObj, Manager, Shape, ) from pandas.util._decorators import cache_readonly from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( ensure_dtype_can_hold_na, find_common_type, np_find_common_type, ) from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, is_dtype_equal, is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, ExtensionDtype, ) from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, isna_all, ) import pandas.core.algorithms as algos from pandas.core.arrays import ( DatetimeArray, ExtensionArray, ) from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ( ArrayManager, NullArrayProxy, ) from pandas.core.internals.blocks import ( ensure_block_shape, new_block_2d, ) from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: from pandas import Index from pandas.core.internals.blocks import Block def _concatenate_array_managers( mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool ) -> Manager: """ Concatenate array managers into one. Parameters ---------- mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int copy : bool Returns ------- ArrayManager """ # reindex all arrays mgrs = [] for mgr, indexers in mgrs_indexers: axis1_made_copy = False for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer( axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True ) if ax == 1 and indexer is not None: axis1_made_copy = True if copy and concat_axis == 0 and not axis1_made_copy: # for concat_axis 1 we will always get a copy through concat_arrays mgr = mgr.copy() mgrs.append(mgr) if concat_axis == 1: # concatting along the rows -> concat the reindexed arrays # TODO(ArrayManager) doesn't yet preserve the correct dtype arrays = [ concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) for j in range(len(mgrs[0].arrays)) ] else: # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) return new_mgr def concat_arrays(to_concat: list) -> ArrayLike: """ Alternative for concat_compat but specialized for use in the ArrayManager. Differences: only deals with 1D arrays (no axis keyword), assumes ensure_wrapped_if_datetimelike and does not skip empty arrays to determine the dtype. In addition ensures that all NullArrayProxies get replaced with actual arrays. Parameters ---------- to_concat : list of arrays Returns ------- np.ndarray or ExtensionArray """ # ignore the all-NA proxies to determine the resulting dtype to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] dtypes = {x.dtype for x in to_concat_no_proxy} single_dtype = len(dtypes) == 1 if single_dtype: target_dtype = to_concat_no_proxy[0].dtype elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes): # GH#42092 target_dtype = np_find_common_type(*dtypes) else: target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) to_concat = [ arr.to_array(target_dtype) if isinstance(arr, NullArrayProxy) else astype_array(arr, target_dtype, copy=False) for arr in to_concat ] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) result = np.concatenate(to_concat) # TODO decide on exact behaviour (we shouldn't do this only for empty result) # see https://github.com/pandas-dev/pandas/issues/39817 if len(result) == 0: # all empties -> check for bool to not coerce to float kinds = {obj.dtype.kind for obj in to_concat_no_proxy} if len(kinds) != 1: if "b" in kinds: result = result.astype(object) return result def concatenate_managers( mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool ) -> Manager: """ Concatenate block managers into one. Parameters ---------- mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int copy : bool Returns ------- BlockManager """ # TODO(ArrayManager) this assumes that all managers are of the same type if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) # Assertions disabled for performance # for tup in mgrs_indexers: # # caller is responsible for ensuring this # indexers = tup[1] # assert concat_axis not in indexers if concat_axis == 0: return _concat_managers_axis0(mgrs_indexers, axes, copy) mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers ] concat_plan = _combine_concat_plans(concat_plans) blocks = [] for placement, join_units in concat_plan: unit = join_units[0] blk = unit.block if len(join_units) == 1 and not join_units[0].indexers: values = blk.values if copy: values = values.copy() else: values = values.view() fastpath = True elif _is_uniform_join_units(join_units): vals = [ju.block.values for ju in join_units] if not blk.is_extension: # _is_uniform_join_units ensures a single dtype, so # we can use np.concatenate, which is more performant # than concat_compat values = np.concatenate(vals, axis=1) else: # TODO(EA2D): special-casing not needed with 2D EAs values = concat_compat(vals, axis=1) values = ensure_block_shape(values, ndim=2) values = ensure_wrapped_if_datetimelike(values) fastpath = blk.values.dtype == values.dtype else: values = _concatenate_join_units(join_units, copy=copy) fastpath = False if fastpath: b = blk.make_block_same_class(values, placement=placement) else: b = new_block_2d(values, placement=placement) blocks.append(b) return BlockManager(tuple(blocks), axes) def _concat_managers_axis0( mgrs_indexers, axes: list[Index], copy: bool ) -> BlockManager: """ concat_managers specialized to concat_axis=0, with reindexing already having been done in _maybe_reindex_columns_na_proxy. """ had_reindexers = { i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers)) } mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) mgrs = [x[0] for x in mgrs_indexers] offset = 0 blocks = [] for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy made_copy = had_reindexers[i] for blk in mgr.blocks: if made_copy: nb = blk.copy(deep=False) elif copy: nb = blk.copy() else: # by slicing instead of copy(deep=False), we get a new array # object, see test_concat_copy nb = blk.getitem_block(slice(None)) nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) offset += len(mgr.items) result = BlockManager(tuple(blocks), axes) return result def _maybe_reindex_columns_na_proxy( axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] ) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: """ Reindex along columns so that all of the BlockManagers being concatenated have matching columns. Columns added in this reindexing have dtype=np.void, indicating they should be ignored when choosing a column's final dtype. """ new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = [] for mgr, indexers in mgrs_indexers: # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this # is a cheap reindexing. for i, indexer in indexers.items(): mgr = mgr.reindex_indexer( axes[i], indexers[i], axis=i, copy=False, only_slice=True, # only relevant for i==0 allow_dups=True, use_na_proxy=True, # only relevant for i==0 ) new_mgrs_indexers.append((mgr, {})) return new_mgrs_indexers def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]): """ Construct concatenation plan for given block manager and indexers. Parameters ---------- mgr : BlockManager indexers : dict of {axis: indexer} Returns ------- plan : list of (BlockPlacement, JoinUnit) tuples """ assert len(indexers) == 0 # Calculate post-reindex shape, save for item axis which will be separate # for each block anyway. mgr_shape_list = list(mgr.shape) for ax, indexer in indexers.items(): mgr_shape_list[ax] = len(indexer) mgr_shape = tuple(mgr_shape_list) assert 0 not in indexers if mgr.is_single_block: blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] blknos = mgr.blknos blklocs = mgr.blklocs plan = [] for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): assert placements.is_slice_like assert blkno != -1 join_unit_indexers = indexers.copy() shape_list = list(mgr_shape) shape_list[0] = len(placements) shape = tuple(shape_list) blk = mgr.blocks[blkno] ax0_blk_indexer = blklocs[placements.indexer] unit_no_ax0_reindexing = ( len(placements) == len(blk.mgr_locs) and # Fastpath detection of join unit not # needing to reindex its block: no ax0 # reindexing took place and block # placement was sequential before. ( (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1) or # Slow-ish detection: all indexer locs # are sequential (and length match is # checked above). (np.diff(ax0_blk_indexer) == 1).all() ) ) # Omit indexer if no item reindexing is required. if unit_no_ax0_reindexing: join_unit_indexers.pop(0, None) else: join_unit_indexers[0] = ax0_blk_indexer unit = JoinUnit(blk, shape, join_unit_indexers) plan.append((placements, unit)) return plan class JoinUnit: def __init__(self, block: Block, shape: Shape, indexers=None) -> None: # Passing shape explicitly is required for cases when block is None. # Note: block is None implies indexers is None, but not vice-versa if indexers is None: indexers = {} self.block = block self.indexers = indexers self.shape = shape def __repr__(self) -> str: return f"{type(self).__name__}({repr(self.block)}, {self.indexers})" @cache_readonly def needs_filling(self) -> bool: for indexer in self.indexers.values(): # FIXME: cache results of indexer == -1 checks. if (indexer == -1).any(): return True return False @cache_readonly def dtype(self) -> DtypeObj: blk = self.block if blk.values.dtype.kind == "V": raise AssertionError("Block is None, no dtype") if not self.needs_filling: return blk.dtype return ensure_dtype_can_hold_na(blk.dtype) def _is_valid_na_for(self, dtype: DtypeObj) -> bool: """ Check that we are all-NA of a type/dtype that is compatible with this dtype. Augments `self.is_na` with an additional check of the type of NA values. """ if not self.is_na: return False if self.block.dtype.kind == "V": return True if self.dtype == object: values = self.block.values return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K")) na_value = self.block.fill_value if na_value is NaT and not is_dtype_equal(self.dtype, dtype): # e.g. we are dt64 and other is td64 # fill_values match but we should not cast self.block.values to dtype # TODO: this will need updating if we ever have non-nano dt64/td64 return False if na_value is NA and needs_i8_conversion(dtype): # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat # e.g. self.dtype == "Int64" and dtype is td64, we dont want # to consider these as matching return False # TODO: better to use can_hold_element? return is_valid_na_for_dtype(na_value, dtype) @cache_readonly def is_na(self) -> bool: blk = self.block if blk.dtype.kind == "V": return True if not blk._can_hold_na: return False values = blk.values if values.size == 0: return True if isinstance(values.dtype, SparseDtype): return False if values.ndim == 1: # TODO(EA2D): no need for special case with 2D EAs val = values[0] if not is_scalar(val) or not isna(val): # ideally isna_all would do this short-circuiting return False return isna_all(values) else: val = values[0][0] if not is_scalar(val) or not isna(val): # ideally isna_all would do this short-circuiting return False return all(isna_all(row) for row in values) def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike if upcasted_na is None and self.block.dtype.kind != "V": # No upcasting is necessary fill_value = self.block.fill_value values = self.block.values else: fill_value = upcasted_na if self._is_valid_na_for(empty_dtype): # note: always holds when self.block.dtype.kind == "V" blk_dtype = self.block.dtype if blk_dtype == np.dtype("object"): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None if isinstance(empty_dtype, DatetimeTZDtype): # NB: exclude e.g. pyarrow[dt64tz] dtypes i8values = np.full(self.shape, fill_value._value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_1d_only_ea_dtype(empty_dtype): if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers: # avoid creating new empty array if we already have an array # with correct dtype that can be reindexed pass else: empty_dtype = cast(ExtensionDtype, empty_dtype) cls = empty_dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape assert ncols == 1, ncols empty_arr = -1 * np.ones((nrows,), dtype=np.intp) return missing_arr.take( empty_arr, allow_fill=True, fill_value=fill_value ) elif isinstance(empty_dtype, ExtensionDtype): # TODO: no tests get here, a handful would if we disabled # the dt64tz special-case above (which is faster) cls = empty_dtype.construct_array_type() missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype) missing_arr[:] = fill_value return missing_arr else: # NB: we should never get here with empty_dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values if self.block.is_bool: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.dtype("object")).values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. values = values.view() else: for ax, indexer in self.indexers.items(): values = algos.take_nd(values, indexer, axis=ax) return values def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike: """ Concatenate values from several join units along axis=1. """ empty_dtype = _get_empty_dtype(join_units) has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) to_concat = [ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) for ju in join_units ] if len(to_concat) == 1: # Only one block, nothing to concatenate. concat_values = to_concat[0] if copy: if isinstance(concat_values, np.ndarray): # non-reindexed (=not yet copied) arrays are made into a view # in JoinUnit.get_reindexed_values if concat_values.base is not None: concat_values = concat_values.copy() else: concat_values = concat_values.copy() elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat): # TODO(EA2D): special case not needed if all EAs used HybridBlocks # error: No overload variant of "__getitem__" of "ExtensionArray" matches # argument type "Tuple[int, slice]" to_concat = [ t if is_1d_only_ea_dtype(t.dtype) else t[0, :] # type: ignore[call-overload] for t in to_concat ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) concat_values = ensure_block_shape(concat_values, 2) else: concat_values = concat_compat(to_concat, axis=1) return concat_values def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): """ Find the NA value to go with this dtype. """ if isinstance(dtype, ExtensionDtype): return dtype.na_value elif dtype.kind in ["m", "M"]: return dtype.type("NaT") elif dtype.kind in ["f", "c"]: return dtype.type("NaN") elif dtype.kind == "b": # different from missing.na_value_for_dtype return None elif dtype.kind in ["i", "u"]: if not has_none_blocks: # different from missing.na_value_for_dtype return None return np.nan elif dtype.kind == "O": return np.nan raise NotImplementedError def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype """ if len(join_units) == 1: blk = join_units[0].block return blk.dtype if _is_uniform_reindex(join_units): empty_dtype = join_units[0].block.dtype return empty_dtype has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) dtypes = [unit.dtype for unit in join_units if not unit.is_na] if not len(dtypes): dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"] dtype = find_common_type(dtypes) if has_none_blocks: dtype = ensure_dtype_can_hold_na(dtype) return dtype def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can be concatenated using Block.concat_same_type instead of the generic _concatenate_join_units (which uses `concat_compat`). """ first = join_units[0].block if first.dtype.kind == "V": return False return ( # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64 all(type(ju.block) is type(first) for ju in join_units) and # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform all( is_dtype_equal(ju.block.dtype, first.dtype) # GH#42092 we only want the dtype_equal check for non-numeric blocks # (for now, may change but that would need a deprecation) or ju.block.dtype.kind in ["b", "i", "u"] for ju in join_units ) and # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. all(not ju.is_na or ju.block.is_extension for ju in join_units) and # no blocks with indexers (as then the dimensions do not fit) all(not ju.indexers for ju in join_units) and # only use this path when there is something to concatenate len(join_units) > 1 ) def _is_uniform_reindex(join_units) -> bool: return ( # TODO: should this be ju.block._can_hold_na? all(ju.block.is_extension for ju in join_units) and len({ju.block.dtype.name for ju in join_units}) == 1 ) def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: """ Reduce join_unit's shape along item axis to length. Extra items that didn't fit are returned as a separate block. """ if 0 not in join_unit.indexers: extra_indexers = join_unit.indexers if join_unit.block is None: extra_block = None else: extra_block = join_unit.block.getitem_block(slice(length, None)) join_unit.block = join_unit.block.getitem_block(slice(length)) else: extra_block = join_unit.block extra_indexers = cp.copy(join_unit.indexers) extra_indexers[0] = extra_indexers[0][length:] join_unit.indexers[0] = join_unit.indexers[0][:length] extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) def _combine_concat_plans(plans): """ Combine multiple concatenation plans into one. existing_plan is updated in-place. We only get here with concat_axis == 1. """ if len(plans) == 1: for p in plans[0]: yield p[0], [p[1]] else: # singleton list so we can modify it as a side-effect within _next_or_none num_ended = [0] def _next_or_none(seq): retval = next(seq, None) if retval is None: num_ended[0] += 1 return retval plans = list(map(iter, plans)) next_items = list(map(_next_or_none, plans)) while num_ended[0] != len(next_items): if num_ended[0] > 0: raise ValueError("Plan shapes are not aligned") placements, units = zip(*next_items) lengths = list(map(len, placements)) min_len, max_len = min(lengths), max(lengths) if min_len == max_len: yield placements[0], units next_items[:] = map(_next_or_none, plans) else: yielded_placement = None yielded_units = [None] * len(next_items) for i, (plc, unit) in enumerate(next_items): yielded_units[i] = unit if len(plc) > min_len: # _trim_join_unit updates unit in place, so only # placement needs to be sliced to skip min_len. next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len)) else: yielded_placement = plc next_items[i] = _next_or_none(plans[i]) yield yielded_placement, yielded_units