Inzynierka/Lib/site-packages/pandas/core/internals/concat.py

792 lines
26 KiB
Python
Raw Permalink Normal View History

2023-06-02 12:51:02 +02:00
from __future__ import annotations
import copy as cp
import itertools
from typing import (
TYPE_CHECKING,
Sequence,
cast,
)
import numpy as np
from pandas._libs import (
NaT,
internals as libinternals,
)
from pandas._libs.missing import NA
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
Manager,
Shape,
)
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
np_find_common_type,
)
from pandas.core.dtypes.common import (
is_1d_only_ea_dtype,
is_dtype_equal,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
isna_all,
)
import pandas.core.algorithms as algos
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
)
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
ArrayManager,
NullArrayProxy,
)
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block_2d,
)
from pandas.core.internals.managers import BlockManager
if TYPE_CHECKING:
from pandas import Index
from pandas.core.internals.blocks import Block
def _concatenate_array_managers(
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager:
"""
Concatenate array managers into one.
Parameters
----------
mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
Returns
-------
ArrayManager
"""
# reindex all arrays
mgrs = []
for mgr, indexers in mgrs_indexers:
axis1_made_copy = False
for ax, indexer in indexers.items():
mgr = mgr.reindex_indexer(
axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
)
if ax == 1 and indexer is not None:
axis1_made_copy = True
if copy and concat_axis == 0 and not axis1_made_copy:
# for concat_axis 1 we will always get a copy through concat_arrays
mgr = mgr.copy()
mgrs.append(mgr)
if concat_axis == 1:
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr
def concat_arrays(to_concat: list) -> ArrayLike:
"""
Alternative for concat_compat but specialized for use in the ArrayManager.
Differences: only deals with 1D arrays (no axis keyword), assumes
ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
the dtype.
In addition ensures that all NullArrayProxies get replaced with actual
arrays.
Parameters
----------
to_concat : list of arrays
Returns
-------
np.ndarray or ExtensionArray
"""
# ignore the all-NA proxies to determine the resulting dtype
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
dtypes = {x.dtype for x in to_concat_no_proxy}
single_dtype = len(dtypes) == 1
if single_dtype:
target_dtype = to_concat_no_proxy[0].dtype
elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
# GH#42092
target_dtype = np_find_common_type(*dtypes)
else:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
to_concat = [
arr.to_array(target_dtype)
if isinstance(arr, NullArrayProxy)
else astype_array(arr, target_dtype, copy=False)
for arr in to_concat
]
if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)
result = np.concatenate(to_concat)
# TODO decide on exact behaviour (we shouldn't do this only for empty result)
# see https://github.com/pandas-dev/pandas/issues/39817
if len(result) == 0:
# all empties -> check for bool to not coerce to float
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
if len(kinds) != 1:
if "b" in kinds:
result = result.astype(object)
return result
def concatenate_managers(
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager:
"""
Concatenate block managers into one.
Parameters
----------
mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
Returns
-------
BlockManager
"""
# TODO(ArrayManager) this assumes that all managers are of the same type
if isinstance(mgrs_indexers[0][0], ArrayManager):
return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
# Assertions disabled for performance
# for tup in mgrs_indexers:
# # caller is responsible for ensuring this
# indexers = tup[1]
# assert concat_axis not in indexers
if concat_axis == 0:
return _concat_managers_axis0(mgrs_indexers, axes, copy)
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
concat_plans = [
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
]
concat_plan = _combine_concat_plans(concat_plans)
blocks = []
for placement, join_units in concat_plan:
unit = join_units[0]
blk = unit.block
if len(join_units) == 1 and not join_units[0].indexers:
values = blk.values
if copy:
values = values.copy()
else:
values = values.view()
fastpath = True
elif _is_uniform_join_units(join_units):
vals = [ju.block.values for ju in join_units]
if not blk.is_extension:
# _is_uniform_join_units ensures a single dtype, so
# we can use np.concatenate, which is more performant
# than concat_compat
values = np.concatenate(vals, axis=1)
else:
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals, axis=1)
values = ensure_block_shape(values, ndim=2)
values = ensure_wrapped_if_datetimelike(values)
fastpath = blk.values.dtype == values.dtype
else:
values = _concatenate_join_units(join_units, copy=copy)
fastpath = False
if fastpath:
b = blk.make_block_same_class(values, placement=placement)
else:
b = new_block_2d(values, placement=placement)
blocks.append(b)
return BlockManager(tuple(blocks), axes)
def _concat_managers_axis0(
mgrs_indexers, axes: list[Index], copy: bool
) -> BlockManager:
"""
concat_managers specialized to concat_axis=0, with reindexing already
having been done in _maybe_reindex_columns_na_proxy.
"""
had_reindexers = {
i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers))
}
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
mgrs = [x[0] for x in mgrs_indexers]
offset = 0
blocks = []
for i, mgr in enumerate(mgrs):
# If we already reindexed, then we definitely don't need another copy
made_copy = had_reindexers[i]
for blk in mgr.blocks:
if made_copy:
nb = blk.copy(deep=False)
elif copy:
nb = blk.copy()
else:
# by slicing instead of copy(deep=False), we get a new array
# object, see test_concat_copy
nb = blk.getitem_block(slice(None))
nb._mgr_locs = nb._mgr_locs.add(offset)
blocks.append(nb)
offset += len(mgr.items)
result = BlockManager(tuple(blocks), axes)
return result
def _maybe_reindex_columns_na_proxy(
axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
"""
Reindex along columns so that all of the BlockManagers being concatenated
have matching columns.
Columns added in this reindexing have dtype=np.void, indicating they
should be ignored when choosing a column's final dtype.
"""
new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []
for mgr, indexers in mgrs_indexers:
# For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
# is a cheap reindexing.
for i, indexer in indexers.items():
mgr = mgr.reindex_indexer(
axes[i],
indexers[i],
axis=i,
copy=False,
only_slice=True, # only relevant for i==0
allow_dups=True,
use_na_proxy=True, # only relevant for i==0
)
new_mgrs_indexers.append((mgr, {}))
return new_mgrs_indexers
def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):
"""
Construct concatenation plan for given block manager and indexers.
Parameters
----------
mgr : BlockManager
indexers : dict of {axis: indexer}
Returns
-------
plan : list of (BlockPlacement, JoinUnit) tuples
"""
assert len(indexers) == 0
# Calculate post-reindex shape, save for item axis which will be separate
# for each block anyway.
mgr_shape_list = list(mgr.shape)
for ax, indexer in indexers.items():
mgr_shape_list[ax] = len(indexer)
mgr_shape = tuple(mgr_shape_list)
assert 0 not in indexers
if mgr.is_single_block:
blk = mgr.blocks[0]
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
blknos = mgr.blknos
blklocs = mgr.blklocs
plan = []
for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
assert placements.is_slice_like
assert blkno != -1
join_unit_indexers = indexers.copy()
shape_list = list(mgr_shape)
shape_list[0] = len(placements)
shape = tuple(shape_list)
blk = mgr.blocks[blkno]
ax0_blk_indexer = blklocs[placements.indexer]
unit_no_ax0_reindexing = (
len(placements) == len(blk.mgr_locs)
and
# Fastpath detection of join unit not
# needing to reindex its block: no ax0
# reindexing took place and block
# placement was sequential before.
(
(blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1)
or
# Slow-ish detection: all indexer locs
# are sequential (and length match is
# checked above).
(np.diff(ax0_blk_indexer) == 1).all()
)
)
# Omit indexer if no item reindexing is required.
if unit_no_ax0_reindexing:
join_unit_indexers.pop(0, None)
else:
join_unit_indexers[0] = ax0_blk_indexer
unit = JoinUnit(blk, shape, join_unit_indexers)
plan.append((placements, unit))
return plan
class JoinUnit:
def __init__(self, block: Block, shape: Shape, indexers=None) -> None:
# Passing shape explicitly is required for cases when block is None.
# Note: block is None implies indexers is None, but not vice-versa
if indexers is None:
indexers = {}
self.block = block
self.indexers = indexers
self.shape = shape
def __repr__(self) -> str:
return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"
@cache_readonly
def needs_filling(self) -> bool:
for indexer in self.indexers.values():
# FIXME: cache results of indexer == -1 checks.
if (indexer == -1).any():
return True
return False
@cache_readonly
def dtype(self) -> DtypeObj:
blk = self.block
if blk.values.dtype.kind == "V":
raise AssertionError("Block is None, no dtype")
if not self.needs_filling:
return blk.dtype
return ensure_dtype_can_hold_na(blk.dtype)
def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
"""
Check that we are all-NA of a type/dtype that is compatible with this dtype.
Augments `self.is_na` with an additional check of the type of NA values.
"""
if not self.is_na:
return False
if self.block.dtype.kind == "V":
return True
if self.dtype == object:
values = self.block.values
return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
na_value = self.block.fill_value
if na_value is NaT and not is_dtype_equal(self.dtype, dtype):
# e.g. we are dt64 and other is td64
# fill_values match but we should not cast self.block.values to dtype
# TODO: this will need updating if we ever have non-nano dt64/td64
return False
if na_value is NA and needs_i8_conversion(dtype):
# FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
# e.g. self.dtype == "Int64" and dtype is td64, we dont want
# to consider these as matching
return False
# TODO: better to use can_hold_element?
return is_valid_na_for_dtype(na_value, dtype)
@cache_readonly
def is_na(self) -> bool:
blk = self.block
if blk.dtype.kind == "V":
return True
if not blk._can_hold_na:
return False
values = blk.values
if values.size == 0:
return True
if isinstance(values.dtype, SparseDtype):
return False
if values.ndim == 1:
# TODO(EA2D): no need for special case with 2D EAs
val = values[0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return isna_all(values)
else:
val = values[0][0]
if not is_scalar(val) or not isna(val):
# ideally isna_all would do this short-circuiting
return False
return all(isna_all(row) for row in values)
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
values: ArrayLike
if upcasted_na is None and self.block.dtype.kind != "V":
# No upcasting is necessary
fill_value = self.block.fill_value
values = self.block.values
else:
fill_value = upcasted_na
if self._is_valid_na_for(empty_dtype):
# note: always holds when self.block.dtype.kind == "V"
blk_dtype = self.block.dtype
if blk_dtype == np.dtype("object"):
# we want to avoid filling with np.nan if we are
# using None; we already know that we are all
# nulls
values = self.block.values.ravel(order="K")
if len(values) and values[0] is None:
fill_value = None
if isinstance(empty_dtype, DatetimeTZDtype):
# NB: exclude e.g. pyarrow[dt64tz] dtypes
i8values = np.full(self.shape, fill_value._value)
return DatetimeArray(i8values, dtype=empty_dtype)
elif is_1d_only_ea_dtype(empty_dtype):
if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
# avoid creating new empty array if we already have an array
# with correct dtype that can be reindexed
pass
else:
empty_dtype = cast(ExtensionDtype, empty_dtype)
cls = empty_dtype.construct_array_type()
missing_arr = cls._from_sequence([], dtype=empty_dtype)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
elif isinstance(empty_dtype, ExtensionDtype):
# TODO: no tests get here, a handful would if we disabled
# the dt64tz special-case above (which is faster)
cls = empty_dtype.construct_array_type()
missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
missing_arr[:] = fill_value
return missing_arr
else:
# NB: we should never get here with empty_dtype integer or bool;
# if we did, the missing_arr.fill would cast to gibberish
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
return missing_arr
if (not self.indexers) and (not self.block._can_consolidate):
# preserve these for validation in concat_compat
return self.block.values
if self.block.is_bool:
# External code requested filling/upcasting, bool values must
# be upcasted to object to avoid being upcasted to numeric.
values = self.block.astype(np.dtype("object")).values
else:
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
values = self.block.values
if not self.indexers:
# If there's no indexing to be done, we want to signal outside
# code that this array must be copied explicitly. This is done
# by returning a view and checking `retval.base`.
values = values.view()
else:
for ax, indexer in self.indexers.items():
values = algos.take_nd(values, indexer, axis=ax)
return values
def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
"""
Concatenate values from several join units along axis=1.
"""
empty_dtype = _get_empty_dtype(join_units)
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
to_concat = [
ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
for ju in join_units
]
if len(to_concat) == 1:
# Only one block, nothing to concatenate.
concat_values = to_concat[0]
if copy:
if isinstance(concat_values, np.ndarray):
# non-reindexed (=not yet copied) arrays are made into a view
# in JoinUnit.get_reindexed_values
if concat_values.base is not None:
concat_values = concat_values.copy()
else:
concat_values = concat_values.copy()
elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
# TODO(EA2D): special case not needed if all EAs used HybridBlocks
# error: No overload variant of "__getitem__" of "ExtensionArray" matches
# argument type "Tuple[int, slice]"
to_concat = [
t
if is_1d_only_ea_dtype(t.dtype)
else t[0, :] # type: ignore[call-overload]
for t in to_concat
]
concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
concat_values = ensure_block_shape(concat_values, 2)
else:
concat_values = concat_compat(to_concat, axis=1)
return concat_values
def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
"""
Find the NA value to go with this dtype.
"""
if isinstance(dtype, ExtensionDtype):
return dtype.na_value
elif dtype.kind in ["m", "M"]:
return dtype.type("NaT")
elif dtype.kind in ["f", "c"]:
return dtype.type("NaN")
elif dtype.kind == "b":
# different from missing.na_value_for_dtype
return None
elif dtype.kind in ["i", "u"]:
if not has_none_blocks:
# different from missing.na_value_for_dtype
return None
return np.nan
elif dtype.kind == "O":
return np.nan
raise NotImplementedError
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
"""
Return dtype and N/A values to use when concatenating specified units.
Returned N/A value may be None which means there was no casting involved.
Returns
-------
dtype
"""
if len(join_units) == 1:
blk = join_units[0].block
return blk.dtype
if _is_uniform_reindex(join_units):
empty_dtype = join_units[0].block.dtype
return empty_dtype
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
dtypes = [unit.dtype for unit in join_units if not unit.is_na]
if not len(dtypes):
dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]
dtype = find_common_type(dtypes)
if has_none_blocks:
dtype = ensure_dtype_can_hold_na(dtype)
return dtype
def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
"""
Check if the join units consist of blocks of uniform type that can
be concatenated using Block.concat_same_type instead of the generic
_concatenate_join_units (which uses `concat_compat`).
"""
first = join_units[0].block
if first.dtype.kind == "V":
return False
return (
# exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
all(type(ju.block) is type(first) for ju in join_units)
and
# e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
all(
is_dtype_equal(ju.block.dtype, first.dtype)
# GH#42092 we only want the dtype_equal check for non-numeric blocks
# (for now, may change but that would need a deprecation)
or ju.block.dtype.kind in ["b", "i", "u"]
for ju in join_units
)
and
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
all(not ju.is_na or ju.block.is_extension for ju in join_units)
and
# no blocks with indexers (as then the dimensions do not fit)
all(not ju.indexers for ju in join_units)
and
# only use this path when there is something to concatenate
len(join_units) > 1
)
def _is_uniform_reindex(join_units) -> bool:
return (
# TODO: should this be ju.block._can_hold_na?
all(ju.block.is_extension for ju in join_units)
and len({ju.block.dtype.name for ju in join_units}) == 1
)
def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
"""
Reduce join_unit's shape along item axis to length.
Extra items that didn't fit are returned as a separate block.
"""
if 0 not in join_unit.indexers:
extra_indexers = join_unit.indexers
if join_unit.block is None:
extra_block = None
else:
extra_block = join_unit.block.getitem_block(slice(length, None))
join_unit.block = join_unit.block.getitem_block(slice(length))
else:
extra_block = join_unit.block
extra_indexers = cp.copy(join_unit.indexers)
extra_indexers[0] = extra_indexers[0][length:]
join_unit.indexers[0] = join_unit.indexers[0][:length]
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
join_unit.shape = (length,) + join_unit.shape[1:]
return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape)
def _combine_concat_plans(plans):
"""
Combine multiple concatenation plans into one.
existing_plan is updated in-place.
We only get here with concat_axis == 1.
"""
if len(plans) == 1:
for p in plans[0]:
yield p[0], [p[1]]
else:
# singleton list so we can modify it as a side-effect within _next_or_none
num_ended = [0]
def _next_or_none(seq):
retval = next(seq, None)
if retval is None:
num_ended[0] += 1
return retval
plans = list(map(iter, plans))
next_items = list(map(_next_or_none, plans))
while num_ended[0] != len(next_items):
if num_ended[0] > 0:
raise ValueError("Plan shapes are not aligned")
placements, units = zip(*next_items)
lengths = list(map(len, placements))
min_len, max_len = min(lengths), max(lengths)
if min_len == max_len:
yield placements[0], units
next_items[:] = map(_next_or_none, plans)
else:
yielded_placement = None
yielded_units = [None] * len(next_items)
for i, (plc, unit) in enumerate(next_items):
yielded_units[i] = unit
if len(plc) > min_len:
# _trim_join_unit updates unit in place, so only
# placement needs to be sliced to skip min_len.
next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len))
else:
yielded_placement = plc
next_items[i] = _next_or_none(plans[i])
yield yielded_placement, yielded_units