# projektAI/venv/Lib/site-packages/pandas/core/internals/concat.py

from collections import defaultdict
import copy
from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast
import numpy as np
from pandas._libs import NaT, internals as libinternals
from pandas._typing import DtypeObj, Shape
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
get_dtype,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_extension_array_dtype,
is_float_dtype,
is_numeric_dtype,
is_sparse,
is_timedelta64_dtype,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.missing import isna_all
import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray, ExtensionArray
from pandas.core.internals.blocks import make_block
from pandas.core.internals.managers import BlockManager

if TYPE_CHECKING:
from pandas.core.arrays.sparse.dtype import SparseDtype


def concatenate_block_managers(
mgrs_indexers, axes, concat_axis: int, copy: bool
) -> BlockManager:
"""
    Concatenate block managers into one.

    Parameters
----------
mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool

    Returns
-------
BlockManager
"""
concat_plans = [
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
]
concat_plan = _combine_concat_plans(concat_plans, concat_axis)
blocks = []
for placement, join_units in concat_plan:
if len(join_units) == 1 and not join_units[0].indexers:
b = join_units[0].block
values = b.values
if copy:
values = values.copy()
else:
values = values.view()
b = b.make_block_same_class(values, placement=placement)
elif _is_uniform_join_units(join_units):
blk = join_units[0].block
vals = [ju.block.values for ju in join_units]
if not blk.is_extension:
values = concat_compat(vals, axis=blk.ndim - 1)
else:
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals)
if not isinstance(values, ExtensionArray):
values = values.reshape(1, len(values))
b = make_block(values, placement=placement, ndim=blk.ndim)
else:
b = make_block(
_concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement,
ndim=len(axes),
)
blocks.append(b)
return BlockManager(blocks, axes)
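

# Illustrative usage sketch (not part of pandas): the public pd.concat entry
# point ultimately routes DataFrame concatenation through
# concatenate_block_managers. The helper below is a hypothetical demo; it is
# safe to call but unused by the library itself.
def _example_concatenate_roundtrip():
    """Hedged sketch: exercise this module via the public concat API."""
    import pandas as pd

    df1 = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
    df2 = pd.DataFrame({"a": [3], "c": ["x"]})
    # Mismatched columns produce block-less JoinUnits, exercising the
    # missing-value upcasting logic further down in this module.
    return pd.concat([df1, df2], axis=0)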


def _get_mgr_concatenation_plan(mgr, indexers):
"""
Construct concatenation plan for given block manager and indexers.

    Parameters
----------
mgr : BlockManager
indexers : dict of {axis: indexer}

    Returns
-------
plan : list of (BlockPlacement, JoinUnit) tuples
"""
    # Calculate the post-reindex shape, save for the item axis, which will be
    # separate for each block anyway.
mgr_shape_list = list(mgr.shape)
for ax, indexer in indexers.items():
mgr_shape_list[ax] = len(indexer)
mgr_shape = tuple(mgr_shape_list)
if 0 in indexers:
ax0_indexer = indexers.pop(0)
blknos = algos.take_1d(mgr.blknos, ax0_indexer, fill_value=-1)
blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1)
else:
if mgr.is_single_block:
blk = mgr.blocks[0]
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
ax0_indexer = None
blknos = mgr.blknos
blklocs = mgr.blklocs
plan = []
for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
assert placements.is_slice_like
join_unit_indexers = indexers.copy()
shape_list = list(mgr_shape)
shape_list[0] = len(placements)
shape = tuple(shape_list)
if blkno == -1:
unit = JoinUnit(None, shape)
else:
blk = mgr.blocks[blkno]
ax0_blk_indexer = blklocs[placements.indexer]
unit_no_ax0_reindexing = (
len(placements) == len(blk.mgr_locs)
and
# Fastpath detection of join unit not
# needing to reindex its block: no ax0
# reindexing took place and block
# placement was sequential before.
(
(
ax0_indexer is None
and blk.mgr_locs.is_slice_like
and blk.mgr_locs.as_slice.step == 1
)
or
# Slow-ish detection: all indexer locs
# are sequential (and length match is
# checked above).
(np.diff(ax0_blk_indexer) == 1).all()
)
)
# Omit indexer if no item reindexing is required.
if unit_no_ax0_reindexing:
join_unit_indexers.pop(0, None)
else:
join_unit_indexers[0] = ax0_blk_indexer
unit = JoinUnit(blk, shape, join_unit_indexers)
plan.append((placements, unit))
return plan
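

# Hedged sketch (assumed inputs): how the take_1d reindexing above marks
# missing items. Positions where the axis-0 indexer is -1 get blkno -1, which
# the plan turns into a block-less JoinUnit to be filled during concatenation.
def _example_blkno_reindex():
    import numpy as np

    blknos = np.array([0, 0, 1])        # block number owning each item
    ax0_indexer = np.array([2, 0, -1])  # reindexer; -1 marks a missing item
    # Equivalent to algos.take_1d(blknos, ax0_indexer, fill_value=-1)
    taken = np.where(ax0_indexer == -1, -1, blknos[ax0_indexer])
    return taken  # array([ 1,  0, -1])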


class JoinUnit:
def __init__(self, block, shape: Shape, indexers=None):
# Passing shape explicitly is required for cases when block is None.
if indexers is None:
indexers = {}
self.block = block
self.indexers = indexers
self.shape = shape

    def __repr__(self) -> str:
return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"

    @cache_readonly
def needs_filling(self) -> bool:
for indexer in self.indexers.values():
# FIXME: cache results of indexer == -1 checks.
if (indexer == -1).any():
return True
return False

    @cache_readonly
def dtype(self):
if self.block is None:
raise AssertionError("Block is None, no dtype")
if not self.needs_filling:
return self.block.dtype
else:
return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0])

    @cache_readonly
def is_na(self) -> bool:
if self.block is None:
return True
if not self.block._can_hold_na:
return False
        # Usually it is enough to check only a small fraction of values to see
        # whether a block is NOT null; chunking should help in such cases.
        # The value 1000 was chosen rather arbitrarily.
values = self.block.values
if is_sparse(self.block.values.dtype):
return False
elif self.block.is_extension:
# TODO(EA2D): no need for special case with 2D EAs
values_flat = values
else:
values_flat = values.ravel(order="K")
return isna_all(values_flat)

    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na):
if upcasted_na is None:
# No upcasting is necessary
fill_value = self.block.fill_value
values = self.block.get_values()
else:
fill_value = upcasted_na
if self.is_na:
if getattr(self.block, "is_object", False):
# we want to avoid filling with np.nan if we are
# using None; we already know that we are all
# nulls
values = self.block.values.ravel(order="K")
if len(values) and values[0] is None:
fill_value = None
if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype(
empty_dtype
):
if self.block is None:
# TODO(EA2D): special case unneeded with 2D EAs
return DatetimeArray(
np.full(self.shape[1], fill_value.value), dtype=empty_dtype
)
elif getattr(self.block, "is_categorical", False):
pass
elif getattr(self.block, "is_extension", False):
pass
elif is_extension_array_dtype(empty_dtype):
missing_arr = empty_dtype.construct_array_type()._from_sequence(
[], dtype=empty_dtype
)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
else:
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
return missing_arr
if (not self.indexers) and (not self.block._can_consolidate):
# preserve these for validation in concat_compat
return self.block.values
if self.block.is_bool and not self.block.is_categorical:
            # External code requested filling/upcasting; bool values must
            # be upcast to object to avoid being upcast to numeric.
values = self.block.astype(np.object_).values
elif self.block.is_extension:
values = self.block.values
else:
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
values = self.block.values
if not self.indexers:
# If there's no indexing to be done, we want to signal outside
# code that this array must be copied explicitly. This is done
# by returning a view and checking `retval.base`.
values = values.view()
else:
for ax, indexer in self.indexers.items():
values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value)
return values
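

# Hedged sketch: JoinUnit.is_na flattens 2D block values and defers to
# isna_all (imported above), which checks in chunks so it can short-circuit.
# The input here is an assumption for illustration.
def _example_is_na_check():
    import numpy as np

    values = np.full((1, 4), np.nan)  # a 2D block that is entirely null
    return isna_all(values.ravel(order="K"))  # True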


def _concatenate_join_units(join_units, concat_axis, copy):
"""
Concatenate values from several join units along selected axis.
"""
if concat_axis == 0 and len(join_units) > 1:
# Concatenating join units along ax0 is handled in _merge_blocks.
raise AssertionError("Concatenating join units along axis0")
empty_dtype, upcasted_na = _get_empty_dtype_and_na(join_units)
to_concat = [
ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
for ju in join_units
]
if len(to_concat) == 1:
# Only one block, nothing to concatenate.
concat_values = to_concat[0]
if copy:
if isinstance(concat_values, np.ndarray):
# non-reindexed (=not yet copied) arrays are made into a view
# in JoinUnit.get_reindexed_values
if concat_values.base is not None:
concat_values = concat_values.copy()
else:
concat_values = concat_values.copy()
elif any(isinstance(t, ExtensionArray) for t in to_concat):
        # Concatenating with at least one EA means we are concatenating a
        # single column; the non-EA values are 2D arrays with shape (1, n).
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
concat_values = concat_compat(to_concat, axis=0)
if not isinstance(concat_values, ExtensionArray) or (
isinstance(concat_values, DatetimeArray) and concat_values.tz is None
):
            # If the result of concat is not an EA but an ndarray, reshape to
            # 2D to put it into a non-EA Block.
            # Special case DatetimeArray, which *is* an EA but is put in a
            # consolidated 2D block.
concat_values = np.atleast_2d(concat_values)
else:
concat_values = concat_compat(to_concat, axis=concat_axis)
return concat_values
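

# Hedged toy of the EA branch above: 2D (1, n) block values are squeezed to
# 1D before concatenating with a 1D array, then restored to 2D for a non-EA
# Block. Plain NumPy stands in for the ExtensionArray here.
def _example_squeeze_concat_reshape():
    import numpy as np

    vals_2d = np.array([[1.0, 2.0]])  # non-EA block values, shape (1, n)
    vals_1d = np.array([3.0, 4.0])    # stand-in for a 1D ExtensionArray
    out = np.concatenate([vals_2d[0, :], vals_1d])  # both 1D, axis=0
    return np.atleast_2d(out)         # shape (1, 4), ready for a 2D Block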


def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]:
"""
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None, which means there was no casting involved.

    Returns
-------
dtype
na
"""
if len(join_units) == 1:
blk = join_units[0].block
if blk is None:
return np.dtype(np.float64), np.nan
if _is_uniform_reindex(join_units):
# FIXME: integrate property
empty_dtype = join_units[0].block.dtype
upcasted_na = join_units[0].block.fill_value
return empty_dtype, upcasted_na
has_none_blocks = False
dtypes = [None] * len(join_units)
for i, unit in enumerate(join_units):
if unit.block is None:
has_none_blocks = True
else:
dtypes[i] = unit.dtype
upcast_classes = _get_upcast_classes(join_units, dtypes)
# TODO: de-duplicate with maybe_promote?
# create the result
if "extension" in upcast_classes:
if len(upcast_classes) == 1:
cls = upcast_classes["extension"][0]
return cls, cls.na_value
else:
return np.dtype("object"), np.nan
elif "object" in upcast_classes:
return np.dtype(np.object_), np.nan
elif "bool" in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_), np.nan
else:
return np.dtype(np.bool_), None
elif "category" in upcast_classes:
return np.dtype(np.object_), np.nan
elif "datetimetz" in upcast_classes:
# GH-25014. We use NaT instead of iNaT, since this eventually
# ends up in DatetimeArray.take, which does not allow iNaT.
dtype = upcast_classes["datetimetz"]
return dtype[0], NaT
elif "datetime" in upcast_classes:
return np.dtype("M8[ns]"), np.datetime64("NaT", "ns")
elif "timedelta" in upcast_classes:
return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
    else:  # pragma: no cover
try:
common_dtype = np.find_common_type(upcast_classes, [])
except TypeError:
# At least one is an ExtensionArray
return np.dtype(np.object_), np.nan
else:
if is_float_dtype(common_dtype):
return common_dtype, common_dtype.type(np.nan)
elif is_numeric_dtype(common_dtype):
if has_none_blocks:
return np.dtype(np.float64), np.nan
else:
return common_dtype, None
msg = "invalid dtype determination in get_concat_dtype"
raise AssertionError(msg)
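

# Hedged illustration of the priority ladder above, observed via the public
# API: aligning an int column against rows it does not cover introduces a
# missing (None) block, so the numeric branch falls back to float64/NaN.
def _example_numeric_upcast():
    import pandas as pd

    df1 = pd.DataFrame({"a": [1, 2]})
    df2 = pd.DataFrame({"b": [3]}, index=[2])
    out = pd.concat([df1, df2])
    return out.dtypes  # both columns become float64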


def _get_upcast_classes(
join_units: Sequence[JoinUnit],
dtypes: Sequence[DtypeObj],
) -> Dict[str, List[DtypeObj]]:
"""Create mapping between upcast class names and lists of dtypes."""
upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue
upcast_cls = _select_upcast_cls_from_dtype(dtype)
        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, in which case the same upcasting rules are
        # applied to the null upcast classes.
if unit.is_na:
null_upcast_classes[upcast_cls].append(dtype)
else:
upcast_classes[upcast_cls].append(dtype)
if not upcast_classes:
upcast_classes = null_upcast_classes
return upcast_classes


def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
"""Select upcast class name based on dtype."""
if is_categorical_dtype(dtype):
return "category"
elif is_datetime64tz_dtype(dtype):
return "datetimetz"
elif is_extension_array_dtype(dtype):
return "extension"
elif issubclass(dtype.type, np.bool_):
return "bool"
elif issubclass(dtype.type, np.object_):
return "object"
elif is_datetime64_dtype(dtype):
return "datetime"
elif is_timedelta64_dtype(dtype):
return "timedelta"
elif is_sparse(dtype):
dtype = cast("SparseDtype", dtype)
return dtype.subtype.name
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
return dtype.name
else:
return "float"


def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
"""
Check if the join units consist of blocks of uniform type that can
be concatenated using Block.concat_same_type instead of the generic
_concatenate_join_units (which uses `concat_compat`).
"""
# TODO: require dtype match in addition to same type? e.g. DatetimeTZBlock
# cannot necessarily join
return (
# all blocks need to have the same type
all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa
and
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
all(not ju.is_na or ju.block.is_extension for ju in join_units)
and
# no blocks with indexers (as then the dimensions do not fit)
all(not ju.indexers for ju in join_units)
and
# only use this path when there is something to concatenate
len(join_units) > 1
)


def _is_uniform_reindex(join_units) -> bool:
return (
# TODO: should this be ju.block._can_hold_na?
all(ju.block and ju.block.is_extension for ju in join_units)
and len({ju.block.dtype.name for ju in join_units}) == 1
)


def _trim_join_unit(join_unit, length):
"""
Reduce join_unit's shape along item axis to length.

    Extra items that didn't fit are returned as a separate block.
"""
if 0 not in join_unit.indexers:
extra_indexers = join_unit.indexers
if join_unit.block is None:
extra_block = None
else:
extra_block = join_unit.block.getitem_block(slice(length, None))
join_unit.block = join_unit.block.getitem_block(slice(length))
else:
extra_block = join_unit.block
extra_indexers = copy.copy(join_unit.indexers)
extra_indexers[0] = extra_indexers[0][length:]
join_unit.indexers[0] = join_unit.indexers[0][:length]
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
join_unit.shape = (length,) + join_unit.shape[1:]
return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape)
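

# Hedged toy of the split performed above: the kept part stays in the original
# join unit, the remainder moves to the returned extra unit (data assumed).
def _example_trim_split():
    indexer = list(range(5))
    length = 3
    kept, extra = indexer[:length], indexer[length:]
    return kept, extra  # ([0, 1, 2], [3, 4])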


def _combine_concat_plans(plans, concat_axis):
"""
    Combine multiple concatenation plans into one.

    JoinUnits consumed from the input plans may be trimmed in place
    (via _trim_join_unit) while the combined plan is generated.
"""
if len(plans) == 1:
for p in plans[0]:
yield p[0], [p[1]]
elif concat_axis == 0:
offset = 0
for plan in plans:
last_plc = None
for plc, unit in plan:
yield plc.add(offset), [unit]
last_plc = plc
if last_plc is not None:
offset += last_plc.as_slice.stop
else:
num_ended = [0]

        def _next_or_none(seq):
retval = next(seq, None)
if retval is None:
num_ended[0] += 1
return retval
plans = list(map(iter, plans))
next_items = list(map(_next_or_none, plans))
while num_ended[0] != len(next_items):
if num_ended[0] > 0:
raise ValueError("Plan shapes are not aligned")
placements, units = zip(*next_items)
lengths = list(map(len, placements))
min_len, max_len = min(lengths), max(lengths)
if min_len == max_len:
yield placements[0], units
next_items[:] = map(_next_or_none, plans)
else:
yielded_placement = None
yielded_units = [None] * len(next_items)
for i, (plc, unit) in enumerate(next_items):
yielded_units[i] = unit
if len(plc) > min_len:
# _trim_join_unit updates unit in place, so only
# placement needs to be sliced to skip min_len.
next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len))
else:
yielded_placement = plc
next_items[i] = _next_or_none(plans[i])
yield yielded_placement, yielded_units
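

# Hedged pure-Python toy of the alignment loop above: at each step the merged
# plan emits a chunk of the minimum placement length and trims the longer
# entry, mirroring _trim_join_unit; the lists of lengths are assumptions.
def _example_plan_chunking():
    a, b = [3, 2], [2, 3]  # per-plan placement lengths; totals must agree
    chunks = []
    while a and b:
        m = min(a[0], b[0])
        chunks.append(m)
        for plan in (a, b):
            plan[0] -= m
            if plan[0] == 0:
                plan.pop(0)  # analogous to advancing via _next_or_none
    return chunks  # [2, 1, 2]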