projektAI/venv/Lib/site-packages/pandas/core/internals/managers.py

from collections import defaultdict
import itertools
from typing import (
Any,
Callable,
DefaultDict,
Dict,
List,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
)
import warnings
import numpy as np
from pandas._libs import internals as libinternals, lib
from pandas._typing import ArrayLike, DtypeObj, Label, Shape
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.cast import (
find_common_type,
infer_dtype_from_scalar,
maybe_promote,
)
from pandas.core.dtypes.common import (
DT64NS_DTYPE,
is_dtype_equal,
is_extension_array_dtype,
is_list_like,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries
from pandas.core.dtypes.missing import array_equals, isna
import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import Index, ensure_index
from pandas.core.internals.blocks import (
Block,
CategoricalBlock,
DatetimeTZBlock,
ExtensionBlock,
ObjectValuesExtensionBlock,
extend_blocks,
get_block_type,
make_block,
safe_reshape,
)
from pandas.core.internals.ops import blockwise_all, operate_blockwise
# TODO: flexible with index=None and/or items=None
T = TypeVar("T", bound="BlockManager")
class BlockManager(PandasObject):
"""
Core internal data structure to implement DataFrame, Series, etc.
Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
lightweight blocked set of labeled data to be manipulated by the DataFrame
public API class
Attributes
----------
shape
ndim
axes
values
items
Methods
-------
set_axis(axis, new_labels)
copy(deep=True)
get_dtypes
apply(func, axes, block_filter_fn)
get_bool_data
get_numeric_data
get_slice(slice_like, axis)
get(label)
iget(loc)
take(indexer, axis)
reindex_axis(new_labels, axis)
reindex_indexer(new_labels, indexer, axis)
delete(label)
insert(loc, label, value)
set(label, value)
Parameters
----------
blocks: Sequence of Block
axes: Sequence of Index
do_integrity_check: bool, default True
Notes
-----
This is *not* a public API class
"""
__slots__ = [
"axes",
"blocks",
"_known_consolidated",
"_is_consolidated",
"_blknos",
"_blklocs",
]
_blknos: np.ndarray
_blklocs: np.ndarray
def __init__(
self,
blocks: Sequence[Block],
axes: Sequence[Index],
do_integrity_check: bool = True,
):
self.axes = [ensure_index(ax) for ax in axes]
self.blocks: Tuple[Block, ...] = tuple(blocks)
for block in blocks:
if self.ndim != block.ndim:
raise AssertionError(
f"Number of Block dimensions ({block.ndim}) must equal "
f"number of axes ({self.ndim})"
)
if do_integrity_check:
self._verify_integrity()
        # Populate _known_consolidated, _blknos, and _blklocs lazily
self._known_consolidated = False
self._blknos = None
self._blklocs = None
@classmethod
def from_blocks(cls, blocks: List[Block], axes: List[Index]):
"""
Constructor for BlockManager and SingleBlockManager with same signature.
"""
return cls(blocks, axes, do_integrity_check=False)
@property
def blknos(self):
"""
Suppose we want to find the array corresponding to our i'th column.
blknos[i] identifies the block from self.blocks that contains this column.
blklocs[i] identifies the column of interest within
self.blocks[self.blknos[i]]
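
        Examples
        --------
        An illustrative sketch (the block ordering is an assumption, hence
        the skips): a frame with a float64 block holding column "b" and an
        int64 block holding columns "a" and "c" might give

        >>> import pandas as pd
        >>> df = pd.DataFrame({"a": [1], "b": [1.5], "c": [2]})
        >>> df._mgr.blknos   # doctest: +SKIP
        array([1, 0, 1])
        >>> df._mgr.blklocs  # doctest: +SKIP
        array([0, 0, 1])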
"""
if self._blknos is None:
# Note: these can be altered by other BlockManager methods.
self._rebuild_blknos_and_blklocs()
return self._blknos
@property
def blklocs(self):
"""
See blknos.__doc__
"""
if self._blklocs is None:
# Note: these can be altered by other BlockManager methods.
self._rebuild_blknos_and_blklocs()
return self._blklocs
def make_empty(self: T, axes=None) -> T:
""" return an empty BlockManager with the items axis of len 0 """
if axes is None:
axes = [Index([])] + self.axes[1:]
# preserve dtype if possible
if self.ndim == 1:
assert isinstance(self, SingleBlockManager) # for mypy
blk = self.blocks[0]
arr = blk.values[:0]
nb = blk.make_block_same_class(arr, placement=slice(0, 0), ndim=1)
blocks = [nb]
else:
blocks = []
return type(self).from_blocks(blocks, axes)
def __nonzero__(self) -> bool:
return True
# Python3 compat
__bool__ = __nonzero__
@property
def shape(self) -> Shape:
return tuple(len(ax) for ax in self.axes)
@property
def ndim(self) -> int:
return len(self.axes)
def set_axis(self, axis: int, new_labels: Index) -> None:
# Caller is responsible for ensuring we have an Index object.
old_len = len(self.axes[axis])
new_len = len(new_labels)
if new_len != old_len:
raise ValueError(
f"Length mismatch: Expected axis has {old_len} elements, new "
f"values have {new_len} elements"
)
self.axes[axis] = new_labels
@property
def is_single_block(self) -> bool:
# Assumes we are 2D; overridden by SingleBlockManager
return len(self.blocks) == 1
def _rebuild_blknos_and_blklocs(self) -> None:
"""
Update mgr._blknos / mgr._blklocs.
"""
new_blknos = np.empty(self.shape[0], dtype=np.intp)
new_blklocs = np.empty(self.shape[0], dtype=np.intp)
new_blknos.fill(-1)
new_blklocs.fill(-1)
for blkno, blk in enumerate(self.blocks):
rl = blk.mgr_locs
new_blknos[rl.indexer] = blkno
new_blklocs[rl.indexer] = np.arange(len(rl))
if (new_blknos == -1).any():
# TODO: can we avoid this? it isn't cheap
raise AssertionError("Gaps in blk ref_locs")
self._blknos = new_blknos
self._blklocs = new_blklocs
@property
def items(self) -> Index:
return self.axes[0]
def get_dtypes(self):
dtypes = np.array([blk.dtype for blk in self.blocks])
return algos.take_1d(dtypes, self.blknos, allow_fill=False)
def __getstate__(self):
block_values = [b.values for b in self.blocks]
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
axes_array = list(self.axes)
extra_state = {
"0.14.1": {
"axes": axes_array,
"blocks": [
{"values": b.values, "mgr_locs": b.mgr_locs.indexer}
for b in self.blocks
],
}
}
# First three elements of the state are to maintain forward
# compatibility with 0.13.1.
return axes_array, block_values, block_items, extra_state
def __setstate__(self, state):
def unpickle_block(values, mgr_locs, ndim: int):
# TODO(EA2D): ndim would be unnecessary with 2D EAs
return make_block(values, placement=mgr_locs, ndim=ndim)
if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
state = state[3]["0.14.1"]
self.axes = [ensure_index(ax) for ax in state["axes"]]
ndim = len(self.axes)
self.blocks = tuple(
unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
for b in state["blocks"]
)
else:
raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
self._post_setstate()
def _post_setstate(self) -> None:
self._is_consolidated = False
self._known_consolidated = False
self._rebuild_blknos_and_blklocs()
def __len__(self) -> int:
return len(self.items)
def __repr__(self) -> str:
output = type(self).__name__
for i, ax in enumerate(self.axes):
if i == 0:
output += f"\nItems: {ax}"
else:
output += f"\nAxis {i}: {ax}"
for block in self.blocks:
output += f"\n{block}"
return output
def _verify_integrity(self) -> None:
mgr_shape = self.shape
tot_items = sum(len(x.mgr_locs) for x in self.blocks)
for block in self.blocks:
if block.shape[1:] != mgr_shape[1:]:
raise construction_error(tot_items, block.shape[1:], self.axes)
if len(self.items) != tot_items:
raise AssertionError(
"Number of manager items must equal union of "
f"block items\n# manager items: {len(self.items)}, # "
f"tot_items: {tot_items}"
)
def reduce(
self: T, func: Callable, ignore_failures: bool = False
) -> Tuple[T, np.ndarray]:
"""
Apply reduction function blockwise, returning a single-row BlockManager.
Parameters
----------
func : reduction function
ignore_failures : bool, default False
Whether to drop blocks where func raises TypeError.
Returns
-------
BlockManager
np.ndarray
Indexer of mgr_locs that are retained.
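
        Examples
        --------
        Illustrative only (``mgr`` stands for any 2D BlockManager):

        >>> new_mgr, indexer = mgr.reduce(np.sum)  # doctest: +SKIP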
"""
# If 2D, we assume that we're operating column-wise
assert self.ndim == 2
res_blocks: List[Block] = []
for blk in self.blocks:
nbs = blk.reduce(func, ignore_failures)
res_blocks.extend(nbs)
index = Index([None]) # placeholder
if ignore_failures:
if res_blocks:
indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks])
new_mgr = self._combine(res_blocks, copy=False, index=index)
else:
indexer = []
new_mgr = type(self).from_blocks([], [Index([]), index])
else:
indexer = np.arange(self.shape[0])
new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
return new_mgr, indexer
def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager":
"""
Apply array_op blockwise with another (aligned) BlockManager.
"""
return operate_blockwise(self, other, array_op)
def apply(
self: T,
f,
align_keys: Optional[List[str]] = None,
ignore_failures: bool = False,
**kwargs,
) -> T:
"""
Iterate over the blocks, collect and create a new BlockManager.
Parameters
----------
f : str or callable
Name of the Block method to apply.
align_keys: List[str] or None, default None
ignore_failures: bool, default False
**kwargs
Keywords to pass to `f`
Returns
-------
BlockManager
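
        Examples
        --------
        Illustrative only; ``f`` may be a Block method name or a callable
        applied to each block's values:

        >>> mgr.apply("astype", dtype=object)     # doctest: +SKIP
        >>> mgr.apply(lambda values: values * 2)  # doctest: +SKIP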
"""
assert "filter" not in kwargs
align_keys = align_keys or []
result_blocks: List[Block] = []
# fillna: Series/DataFrame is responsible for making sure value is aligned
aligned_args = {k: kwargs[k] for k in align_keys}
for b in self.blocks:
if aligned_args:
for k, obj in aligned_args.items():
if isinstance(obj, (ABCSeries, ABCDataFrame)):
# The caller is responsible for ensuring that
# obj.axes[-1].equals(self.items)
if obj.ndim == 1:
kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
else:
kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
else:
# otherwise we have an ndarray
kwargs[k] = obj[b.mgr_locs.indexer]
try:
if callable(f):
applied = b.apply(f, **kwargs)
else:
applied = getattr(b, f)(**kwargs)
except (TypeError, NotImplementedError):
if not ignore_failures:
raise
continue
result_blocks = extend_blocks(applied, result_blocks)
if ignore_failures:
return self._combine(result_blocks)
if len(result_blocks) == 0:
return self.make_empty(self.axes)
return type(self).from_blocks(result_blocks, self.axes)
def quantile(
self,
axis: int = 0,
consolidate: bool = True,
transposed: bool = False,
interpolation="linear",
qs=None,
numeric_only=None,
) -> "BlockManager":
"""
Iterate over blocks applying quantile reduction.
This routine is intended for reduction type operations and
will do inference on the generated blocks.
Parameters
----------
        axis : int, default 0
            Reduction axis.
        consolidate : bool, default True
            Join together blocks having the same dtype.
        transposed : bool, default False
            Whether we are holding transposed data.
        interpolation : str, default 'linear'
            Type of interpolation.
        qs : scalar or list
            The quantile(s) to be computed.
        numeric_only : ignored
Returns
-------
BlockManager
"""
# Series dispatches to DataFrame for quantile, which allows us to
# simplify some of the code here and in the blocks
assert self.ndim >= 2
if consolidate:
self._consolidate_inplace()
def get_axe(block, qs, axes):
# Because Series dispatches to DataFrame, we will always have
# block.ndim == 2
from pandas import Float64Index
if is_list_like(qs):
ax = Float64Index(qs)
else:
ax = axes[0]
return ax
axes, blocks = [], []
for b in self.blocks:
block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)
axe = get_axe(b, qs, axes=self.axes)
axes.append(axe)
blocks.append(block)
# note that some DatetimeTZ, Categorical are always ndim==1
ndim = {b.ndim for b in blocks}
assert 0 not in ndim, ndim
if 2 in ndim:
new_axes = list(self.axes)
# multiple blocks that are reduced
if len(blocks) > 1:
new_axes[1] = axes[0]
# reset the placement to the original
for b, sb in zip(blocks, self.blocks):
b.mgr_locs = sb.mgr_locs
else:
new_axes[axis] = Index(np.concatenate([ax._values for ax in axes]))
if transposed:
new_axes = new_axes[::-1]
blocks = [
b.make_block(b.values.T, placement=np.arange(b.shape[1]))
for b in blocks
]
return type(self)(blocks, new_axes)
# single block, i.e. ndim == {1}
values = concat_compat([b.values for b in blocks])
# compute the orderings of our original data
if len(self.blocks) > 1:
indexer = np.empty(len(self.axes[0]), dtype=np.intp)
i = 0
for b in self.blocks:
for j in b.mgr_locs:
indexer[j] = i
i = i + 1
values = values.take(indexer)
return SingleBlockManager(
make_block(values, ndim=1, placement=np.arange(len(values))), axes[0]
)
def isna(self, func) -> "BlockManager":
return self.apply("apply", func=func)
def where(
self, other, cond, align: bool, errors: str, try_cast: bool, axis: int
) -> "BlockManager":
if align:
align_keys = ["other", "cond"]
else:
align_keys = ["cond"]
other = extract_array(other, extract_numpy=True)
return self.apply(
"where",
align_keys=align_keys,
other=other,
cond=cond,
errors=errors,
try_cast=try_cast,
axis=axis,
)
def setitem(self, indexer, value) -> "BlockManager":
return self.apply("setitem", indexer=indexer, value=value)
def putmask(self, mask, new, align: bool = True, axis: int = 0):
transpose = self.ndim == 2
if align:
align_keys = ["new", "mask"]
else:
align_keys = ["mask"]
new = extract_array(new, extract_numpy=True)
return self.apply(
"putmask",
align_keys=align_keys,
mask=mask,
new=new,
inplace=True,
axis=axis,
transpose=transpose,
)
def diff(self, n: int, axis: int) -> "BlockManager":
return self.apply("diff", n=n, axis=axis)
def interpolate(self, **kwargs) -> "BlockManager":
return self.apply("interpolate", **kwargs)
def shift(self, periods: int, axis: int, fill_value) -> "BlockManager":
if fill_value is lib.no_default:
fill_value = None
if axis == 0 and self.ndim == 2 and self.nblocks > 1:
# GH#35488 we need to watch out for multi-block cases
# We only get here with fill_value not-lib.no_default
ncols = self.shape[0]
if periods > 0:
indexer = [-1] * periods + list(range(ncols - periods))
else:
nper = abs(periods)
indexer = list(range(nper, ncols)) + [-1] * nper
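            # e.g. periods=2 with ncols=5 gives indexer [-1, -1, 0, 1, 2],
            # periods=-2 gives [2, 3, 4, -1, -1]; the -1 slots are filled
            # with fill_value by reindex_indexer via allow_fill.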
result = self.reindex_indexer(
self.items,
indexer,
axis=0,
fill_value=fill_value,
allow_dups=True,
consolidate=False,
)
return result
return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)
def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager":
return self.apply(
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)
def downcast(self) -> "BlockManager":
return self.apply("downcast")
def astype(
self, dtype, copy: bool = False, errors: str = "raise"
) -> "BlockManager":
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
def convert(
self,
copy: bool = True,
datetime: bool = True,
numeric: bool = True,
timedelta: bool = True,
) -> "BlockManager":
return self.apply(
"convert",
copy=copy,
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
)
def replace(self, to_replace, value, inplace: bool, regex: bool) -> "BlockManager":
assert np.ndim(value) == 0, value
return self.apply(
"replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex
)
def replace_list(
self: T,
src_list: List[Any],
dest_list: List[Any],
inplace: bool = False,
regex: bool = False,
) -> T:
""" do a list replace """
inplace = validate_bool_kwarg(inplace, "inplace")
bm = self.apply(
"_replace_list",
src_list=src_list,
dest_list=dest_list,
inplace=inplace,
regex=regex,
)
bm._consolidate_inplace()
return bm
def to_native_types(self, **kwargs) -> "BlockManager":
"""
Convert values to native types (strings / python objects) that are used
in formatting (repr / csv).
"""
return self.apply("to_native_types", **kwargs)
def is_consolidated(self) -> bool:
"""
        Return True if the blocks are consolidated, i.e. no two
        consolidatable blocks share the same dtype.
"""
if not self._known_consolidated:
self._consolidate_check()
return self._is_consolidated
def _consolidate_check(self) -> None:
dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
self._is_consolidated = len(dtypes) == len(set(dtypes))
self._known_consolidated = True
@property
def is_numeric_mixed_type(self) -> bool:
return all(block.is_numeric for block in self.blocks)
@property
def any_extension_types(self) -> bool:
"""Whether any of the blocks in this manager are extension blocks"""
return any(block.is_extension for block in self.blocks)
@property
def is_view(self) -> bool:
""" return a boolean if we are a single block and are a view """
if len(self.blocks) == 1:
return self.blocks[0].is_view
# It is technically possible to figure out which blocks are views
# e.g. [ b.values.base is not None for b in self.blocks ]
# but then we have the case of possibly some blocks being a view
# and some blocks not. setting in theory is possible on the non-view
# blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
# complicated
return False
def get_bool_data(self, copy: bool = False) -> "BlockManager":
"""
Select blocks that are bool-dtype and columns from object-dtype blocks
that are all-bool.
Parameters
----------
copy : bool, default False
Whether to copy the blocks
"""
new_blocks = []
for blk in self.blocks:
if blk.dtype == bool:
new_blocks.append(blk)
elif blk.is_object:
nbs = blk._split()
for nb in nbs:
if nb.is_bool:
new_blocks.append(nb)
return self._combine(new_blocks, copy)
def get_numeric_data(self, copy: bool = False) -> "BlockManager":
"""
Parameters
----------
copy : bool, default False
Whether to copy the blocks
"""
return self._combine([b for b in self.blocks if b.is_numeric], copy)
def _combine(
self: T, blocks: List[Block], copy: bool = True, index: Optional[Index] = None
) -> T:
""" return a new manager with the blocks """
if len(blocks) == 0:
return self.make_empty()
# FIXME: optimization potential
indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
new_blocks: List[Block] = []
for b in blocks:
b = b.copy(deep=copy)
b.mgr_locs = inv_indexer[b.mgr_locs.indexer]
new_blocks.append(b)
axes = list(self.axes)
if index is not None:
axes[-1] = index
axes[0] = self.items.take(indexer)
return type(self).from_blocks(new_blocks, axes)
def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager":
if axis == 0:
new_blocks = self._slice_take_blocks_ax0(slobj)
elif axis == 1:
slicer = (slice(None), slobj)
new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
else:
raise IndexError("Requested axis not found in manager")
new_axes = list(self.axes)
new_axes[axis] = new_axes[axis][slobj]
bm = type(self)(new_blocks, new_axes, do_integrity_check=False)
return bm
@property
def nblocks(self) -> int:
return len(self.blocks)
def copy(self: T, deep=True) -> T:
"""
Make deep or shallow copy of BlockManager
Parameters
----------
deep : bool or string, default True
If False, return shallow copy (do not copy data)
If 'all', copy data and a deep copy of the index
Returns
-------
BlockManager
"""
# this preserves the notion of view copying of axes
if deep:
# hit in e.g. tests.io.json.test_pandas
def copy_func(ax):
return ax.copy(deep=True) if deep == "all" else ax.view()
new_axes = [copy_func(ax) for ax in self.axes]
else:
new_axes = list(self.axes)
res = self.apply("copy", deep=deep)
res.axes = new_axes
return res
def as_array(
self,
transpose: bool = False,
dtype=None,
copy: bool = False,
na_value=lib.no_default,
) -> np.ndarray:
"""
Convert the blockmanager data into an numpy array.
Parameters
----------
transpose : bool, default False
If True, transpose the return array.
dtype : object, default None
Data type of the return array.
copy : bool, default False
If True then guarantee that a copy is returned. A value of
False does not guarantee that the underlying data is not
copied.
na_value : object, default lib.no_default
Value to be used as the missing value sentinel.
Returns
-------
arr : ndarray
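
        Examples
        --------
        Illustrative only (``mgr`` stands for any 2D BlockManager):

        >>> mgr.as_array(dtype="float64", na_value=0.0)  # doctest: +SKIP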
"""
if len(self.blocks) == 0:
arr = np.empty(self.shape, dtype=float)
return arr.transpose() if transpose else arr
# We want to copy when na_value is provided to avoid
# mutating the original object
copy = copy or na_value is not lib.no_default
if self.is_single_block:
blk = self.blocks[0]
if blk.is_extension:
# Avoid implicit conversion of extension blocks to object
arr = blk.values.to_numpy(dtype=dtype, na_value=na_value).reshape(
blk.shape
)
else:
arr = np.asarray(blk.get_values())
if dtype:
arr = arr.astype(dtype, copy=False)
else:
arr = self._interleave(dtype=dtype, na_value=na_value)
# The underlying data was copied within _interleave
copy = False
if copy:
arr = arr.copy()
if na_value is not lib.no_default:
arr[isna(arr)] = na_value
return arr.transpose() if transpose else arr
def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
"""
        Return ndarray from blocks with specified item order.
        Items must be contained in the blocks.
"""
if not dtype:
dtype = _interleaved_dtype(self.blocks)
# TODO: https://github.com/pandas-dev/pandas/issues/22791
# Give EAs some input on what happens here. Sparse needs this.
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
elif is_extension_array_dtype(dtype):
dtype = "object"
elif is_dtype_equal(dtype, str):
dtype = "object"
result = np.empty(self.shape, dtype=dtype)
itemmask = np.zeros(self.shape[0])
for blk in self.blocks:
rl = blk.mgr_locs
if blk.is_extension:
# Avoid implicit conversion of extension blocks to object
arr = blk.values.to_numpy(dtype=dtype, na_value=na_value)
else:
arr = blk.get_values(dtype)
result[rl.indexer] = arr
itemmask[rl.indexer] = 1
if not itemmask.all():
raise AssertionError("Some items were not contained in blocks")
return result
def to_dict(self, copy: bool = True):
"""
Return a dict of str(dtype) -> BlockManager
Parameters
----------
copy : bool, default True
Returns
-------
values : a dict of dtype -> BlockManager
"""
bd: Dict[str, List[Block]] = {}
for b in self.blocks:
bd.setdefault(str(b.dtype), []).append(b)
# TODO(EA2D): the combine will be unnecessary with 2D EAs
return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
def fast_xs(self, loc: int) -> ArrayLike:
"""
Return the array corresponding to `frame.iloc[loc]`.
Parameters
----------
loc : int
Returns
-------
np.ndarray or ExtensionArray
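
        Examples
        --------
        Illustrative only; this backs ``frame.iloc[loc]``:

        >>> df._mgr.fast_xs(0)  # doctest: +SKIP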
"""
if len(self.blocks) == 1:
return self.blocks[0].iget((slice(None), loc))
dtype = _interleaved_dtype(self.blocks)
n = len(self)
if is_extension_array_dtype(dtype):
# we'll eventually construct an ExtensionArray.
result = np.empty(n, dtype=object)
else:
result = np.empty(n, dtype=dtype)
for blk in self.blocks:
# Such assignment may incorrectly coerce NaT to None
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
for i, rl in enumerate(blk.mgr_locs):
result[rl] = blk.iget((i, loc))
if isinstance(dtype, ExtensionDtype):
result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)
return result
def consolidate(self) -> "BlockManager":
"""
Join together blocks having same dtype
Returns
-------
y : BlockManager
"""
if self.is_consolidated():
return self
bm = type(self)(self.blocks, self.axes)
bm._is_consolidated = False
bm._consolidate_inplace()
return bm
def _consolidate_inplace(self) -> None:
if not self.is_consolidated():
self.blocks = tuple(_consolidate(self.blocks))
self._is_consolidated = True
self._known_consolidated = True
self._rebuild_blknos_and_blklocs()
def iget(self, i: int) -> "SingleBlockManager":
"""
Return the data as a SingleBlockManager.
"""
block = self.blocks[self.blknos[i]]
values = block.iget(self.blklocs[i])
        # shortcut for selecting a single dim from a 2-dim BlockManager
return SingleBlockManager(
block.make_block_same_class(
values, placement=slice(0, len(values)), ndim=1
),
self.axes[1],
)
def iget_values(self, i: int) -> ArrayLike:
"""
Return the data for column i as the values (ndarray or ExtensionArray).
"""
block = self.blocks[self.blknos[i]]
values = block.iget(self.blklocs[i])
return values
def idelete(self, indexer):
"""
Delete selected locations in-place (new block and array, same BlockManager)
"""
is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
is_deleted[indexer] = True
ref_loc_offset = -is_deleted.cumsum()
is_blk_deleted = [False] * len(self.blocks)
if isinstance(indexer, int):
affected_start = indexer
else:
affected_start = is_deleted.nonzero()[0][0]
for blkno, _ in _fast_count_smallints(self.blknos[affected_start:]):
blk = self.blocks[blkno]
bml = blk.mgr_locs
blk_del = is_deleted[bml.indexer].nonzero()[0]
if len(blk_del) == len(bml):
is_blk_deleted[blkno] = True
continue
elif len(blk_del) != 0:
blk.delete(blk_del)
bml = blk.mgr_locs
blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])
# FIXME: use Index.delete as soon as it uses fastpath=True
self.axes[0] = self.items[~is_deleted]
self.blocks = tuple(
b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno]
)
self._rebuild_blknos_and_blklocs()
def iset(self, loc: Union[int, slice, np.ndarray], value):
"""
Set new item in-place. Does not consolidate. Adds new Block if not
contained in the current set of items
"""
value = extract_array(value, extract_numpy=True)
# FIXME: refactor, clearly separate broadcasting & zip-like assignment
# can prob also fix the various if tests for sparse/categorical
if self._blklocs is None and self.ndim > 1:
self._rebuild_blknos_and_blklocs()
value_is_extension_type = is_extension_array_dtype(value)
# categorical/sparse/datetimetz
if value_is_extension_type:
def value_getitem(placement):
return value
else:
if value.ndim == self.ndim - 1:
value = safe_reshape(value, (1,) + value.shape)
def value_getitem(placement):
return value
else:
def value_getitem(placement):
return value[placement.indexer]
if value.shape[1:] != self.shape[1:]:
raise AssertionError(
"Shape of new values must be compatible with manager shape"
)
if lib.is_integer(loc):
# We have 6 tests where loc is _not_ an int.
# In this case, get_blkno_placements will yield only one tuple,
# containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))
loc = [loc]
# Accessing public blknos ensures the public versions are initialized
blknos = self.blknos[loc]
blklocs = self.blklocs[loc].copy()
unfit_mgr_locs = []
unfit_val_locs = []
removed_blknos = []
for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
blk = self.blocks[blkno]
blk_locs = blklocs[val_locs.indexer]
if blk.should_store(value):
blk.set_inplace(blk_locs, value_getitem(val_locs))
else:
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
unfit_val_locs.append(val_locs)
# If all block items are unfit, schedule the block for removal.
if len(val_locs) == len(blk.mgr_locs):
removed_blknos.append(blkno)
else:
blk.delete(blk_locs)
self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
if len(removed_blknos):
# Remove blocks & update blknos accordingly
is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
is_deleted[removed_blknos] = True
new_blknos = np.empty(self.nblocks, dtype=np.int64)
new_blknos.fill(-1)
new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
self._blknos = new_blknos[self._blknos]
self.blocks = tuple(
blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
)
if unfit_val_locs:
unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
unfit_count = len(unfit_mgr_locs)
new_blocks: List[Block] = []
if value_is_extension_type:
# This code (ab-)uses the fact that EA blocks contain only
# one item.
# TODO(EA2D): special casing unnecessary with 2D EAs
new_blocks.extend(
make_block(
values=value,
ndim=self.ndim,
placement=slice(mgr_loc, mgr_loc + 1),
)
for mgr_loc in unfit_mgr_locs
)
self._blknos[unfit_mgr_locs] = np.arange(unfit_count) + len(self.blocks)
self._blklocs[unfit_mgr_locs] = 0
else:
# unfit_val_locs contains BlockPlacement objects
unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
new_blocks.append(
make_block(
values=value_getitem(unfit_val_items),
ndim=self.ndim,
placement=unfit_mgr_locs,
)
)
self._blknos[unfit_mgr_locs] = len(self.blocks)
self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
self.blocks += tuple(new_blocks)
# Newly created block's dtype may already be present.
self._known_consolidated = False
def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False):
"""
Insert item at selected position.
Parameters
----------
loc : int
item : hashable
value : array_like
allow_duplicates: bool
If False, trying to insert non-unique item will raise
"""
if not allow_duplicates and item in self.items:
# Should this be a different kind of error??
raise ValueError(f"cannot insert {item}, already exists")
if not isinstance(loc, int):
raise TypeError("loc must be int")
# insert to the axis; this could possibly raise a TypeError
new_axis = self.items.insert(loc, item)
if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype):
# TODO(EA2D): special case not needed with 2D EAs
value = safe_reshape(value, (1,) + value.shape)
block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
for blkno, count in _fast_count_smallints(self.blknos[loc:]):
blk = self.blocks[blkno]
if count == len(blk.mgr_locs):
blk.mgr_locs = blk.mgr_locs.add(1)
else:
new_mgr_locs = blk.mgr_locs.as_array.copy()
new_mgr_locs[new_mgr_locs >= loc] += 1
blk.mgr_locs = new_mgr_locs
# Accessing public blklocs ensures the public versions are initialized
if loc == self.blklocs.shape[0]:
# np.append is a lot faster, let's use it if we can.
self._blklocs = np.append(self._blklocs, 0)
self._blknos = np.append(self._blknos, len(self.blocks))
else:
self._blklocs = np.insert(self._blklocs, loc, 0)
self._blknos = np.insert(self._blknos, loc, len(self.blocks))
self.axes[0] = new_axis
self.blocks += (block,)
self._known_consolidated = False
if len(self.blocks) > 100:
self._consolidate_inplace()
def reindex_axis(
self,
new_index,
axis: int,
method=None,
limit=None,
fill_value=None,
copy: bool = True,
consolidate: bool = True,
only_slice: bool = False,
):
"""
Conform block manager to new index.
"""
new_index = ensure_index(new_index)
new_index, indexer = self.axes[axis].reindex(
new_index, method=method, limit=limit
)
return self.reindex_indexer(
new_index,
indexer,
axis=axis,
fill_value=fill_value,
copy=copy,
consolidate=consolidate,
only_slice=only_slice,
)
def reindex_indexer(
self: T,
new_axis,
indexer,
axis: int,
fill_value=None,
allow_dups: bool = False,
copy: bool = True,
consolidate: bool = True,
only_slice: bool = False,
) -> T:
"""
Parameters
----------
new_axis : Index
indexer : ndarray of int64 or None
axis : int
fill_value : object, default None
allow_dups : bool, default False
copy : bool, default True
consolidate: bool, default True
Whether to consolidate inplace before reindexing.
only_slice : bool, default False
Whether to take views, not copies, along columns.
pandas-indexer with -1's only.
"""
if indexer is None:
if new_axis is self.axes[axis] and not copy:
return self
result = self.copy(deep=copy)
result.axes = list(self.axes)
result.axes[axis] = new_axis
return result
if consolidate:
self._consolidate_inplace()
# some axes don't allow reindexing with dups
if not allow_dups:
self.axes[axis]._can_reindex(indexer)
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
if axis == 0:
new_blocks = self._slice_take_blocks_ax0(
indexer, fill_value=fill_value, only_slice=only_slice
)
else:
new_blocks = [
blk.take_nd(
indexer,
axis=axis,
fill_value=(
fill_value if fill_value is not None else blk.fill_value
),
)
for blk in self.blocks
]
new_axes = list(self.axes)
new_axes[axis] = new_axis
return type(self).from_blocks(new_blocks, new_axes)
def _slice_take_blocks_ax0(
self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False
):
"""
Slice/take blocks along axis=0.
        Overloaded for SingleBlockManager.
Parameters
----------
slice_or_indexer : slice, ndarray[bool], or list-like of ints
fill_value : scalar, default lib.no_default
only_slice : bool, default False
If True, we always return views on existing arrays, never copies.
This is used when called from ops.blockwise.operate_blockwise.
Returns
-------
new_blocks : list of Block
"""
allow_fill = fill_value is not lib.no_default
sl_type, slobj, sllen = _preprocess_slice_or_indexer(
slice_or_indexer, self.shape[0], allow_fill=allow_fill
)
if self.is_single_block:
blk = self.blocks[0]
if sl_type in ("slice", "mask"):
# GH#32959 EABlock would fail since we cant make 0-width
# TODO(EA2D): special casing unnecessary with 2D EAs
if sllen == 0:
return []
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
elif not allow_fill or self.ndim == 1:
if allow_fill and fill_value is None:
_, fill_value = maybe_promote(blk.dtype)
if not allow_fill and only_slice:
# GH#33597 slice instead of take, so we get
# views instead of copies
blocks = [
blk.getitem_block([ml], new_mgr_locs=i)
for i, ml in enumerate(slobj)
]
return blocks
else:
return [
blk.take_nd(
slobj,
axis=0,
new_mgr_locs=slice(0, sllen),
fill_value=fill_value,
)
]
if sl_type in ("slice", "mask"):
blknos = self.blknos[slobj]
blklocs = self.blklocs[slobj]
else:
blknos = algos.take_1d(
self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
)
blklocs = algos.take_1d(
self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
)
# When filling blknos, make sure blknos is updated before appending to
# blocks list, that way new blkno is exactly len(blocks).
blocks = []
group = not only_slice
for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
if blkno == -1:
# If we've got here, fill_value was not lib.no_default
blocks.append(
self._make_na_block(placement=mgr_locs, fill_value=fill_value)
)
else:
blk = self.blocks[blkno]
# Otherwise, slicing along items axis is necessary.
if not blk._can_consolidate:
# A non-consolidatable block, it's easy, because there's
# only one item and each mgr loc is a copy of that single
# item.
for mgr_loc in mgr_locs:
newblk = blk.copy(deep=False)
newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
blocks.append(newblk)
else:
# GH#32779 to avoid the performance penalty of copying,
# we may try to only slice
taker = blklocs[mgr_locs.indexer]
max_len = max(len(mgr_locs), taker.max() + 1)
if only_slice:
taker = lib.maybe_indices_to_slice(taker, max_len)
if isinstance(taker, slice):
nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs)
blocks.append(nb)
elif only_slice:
# GH#33597 slice instead of take, so we get
# views instead of copies
for i, ml in zip(taker, mgr_locs):
nb = blk.getitem_block([i], new_mgr_locs=ml)
blocks.append(nb)
else:
nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
blocks.append(nb)
return blocks
def _make_na_block(self, placement, fill_value=None):
if fill_value is None:
fill_value = np.nan
block_shape = list(self.shape)
block_shape[0] = len(placement)
dtype, fill_value = infer_dtype_from_scalar(fill_value)
block_values = np.empty(block_shape, dtype=dtype)
block_values.fill(fill_value)
return make_block(block_values, placement=placement, ndim=block_values.ndim)
def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True):
"""
Take items along any axis.
"""
self._consolidate_inplace()
indexer = (
np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64")
if isinstance(indexer, slice)
else np.asanyarray(indexer, dtype="int64")
)
n = self.shape[axis]
if convert:
indexer = maybe_convert_indices(indexer, n)
if verify:
if ((indexer == -1) | (indexer >= n)).any():
raise Exception("Indices must be nonzero and less than the axis length")
new_labels = self.axes[axis].take(indexer)
return self.reindex_indexer(
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
)
def equals(self, other: object) -> bool:
if not isinstance(other, BlockManager):
return False
self_axes, other_axes = self.axes, other.axes
if len(self_axes) != len(other_axes):
return False
if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
return False
if self.ndim == 1:
# For SingleBlockManager (i.e.Series)
if other.ndim != 1:
return False
left = self.blocks[0].values
right = other.blocks[0].values
return array_equals(left, right)
return blockwise_all(self, other, array_equals)
def unstack(self, unstacker, fill_value) -> "BlockManager":
"""
        Return a BlockManager with all blocks unstacked.
Parameters
----------
unstacker : reshape._Unstacker
fill_value : Any
fill_value for newly introduced missing values.
Returns
-------
unstacked : BlockManager
"""
new_columns = unstacker.get_new_columns(self.items)
new_index = unstacker.new_index
new_blocks: List[Block] = []
columns_mask: List[np.ndarray] = []
for blk in self.blocks:
blk_cols = self.items[blk.mgr_locs.indexer]
new_items = unstacker.get_new_columns(blk_cols)
new_placement = new_columns.get_indexer(new_items)
blocks, mask = blk._unstack(
unstacker, fill_value, new_placement=new_placement
)
new_blocks.extend(blocks)
columns_mask.extend(mask)
new_columns = new_columns[columns_mask]
bm = BlockManager(new_blocks, [new_columns, new_index])
return bm
class SingleBlockManager(BlockManager):
""" manage a single block with """
ndim = 1
_is_consolidated = True
_known_consolidated = True
__slots__ = ()
is_single_block = True
def __init__(
self,
block: Block,
axis: Index,
do_integrity_check: bool = False,
fastpath=lib.no_default,
):
assert isinstance(block, Block), type(block)
assert isinstance(axis, Index), type(axis)
if fastpath is not lib.no_default:
warnings.warn(
"The `fastpath` keyword is deprecated and will be removed "
"in a future version.",
FutureWarning,
stacklevel=2,
)
self.axes = [axis]
self.blocks = (block,)
@classmethod
def from_blocks(
cls, blocks: List[Block], axes: List[Index]
) -> "SingleBlockManager":
"""
Constructor for BlockManager and SingleBlockManager with same signature.
"""
assert len(blocks) == 1
assert len(axes) == 1
return cls(blocks[0], axes[0], do_integrity_check=False)
@classmethod
def from_array(cls, array: ArrayLike, index: Index) -> "SingleBlockManager":
"""
Constructor for if we have an array that is not yet a Block.
"""
block = make_block(array, placement=slice(0, len(index)), ndim=1)
return cls(block, index)
def _post_setstate(self):
pass
@property
def _block(self) -> Block:
return self.blocks[0]
@property
def _blknos(self):
""" compat with BlockManager """
return None
@property
def _blklocs(self):
""" compat with BlockManager """
return None
def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager":
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
blk = self._block
array = blk._slice(slobj)
block = blk.make_block_same_class(array, placement=slice(0, len(array)))
return type(self)(block, self.index[slobj])
@property
def index(self) -> Index:
return self.axes[0]
@property
def dtype(self) -> DtypeObj:
return self._block.dtype
def get_dtypes(self) -> np.ndarray:
return np.array([self._block.dtype])
def external_values(self):
"""The array that Series.values returns"""
return self._block.external_values()
def internal_values(self):
"""The array that Series._values returns"""
return self._block.internal_values()
@property
def _can_hold_na(self) -> bool:
return self._block._can_hold_na
def is_consolidated(self) -> bool:
return True
def _consolidate_check(self):
pass
def _consolidate_inplace(self):
pass
def idelete(self, indexer):
"""
Delete single location from SingleBlockManager.
Ensures that self.blocks doesn't become empty.
"""
self._block.delete(indexer)
self.axes[0] = self.axes[0].delete(indexer)
def fast_xs(self, loc):
"""
fast path for getting a cross-section
return a view of the data
"""
raise NotImplementedError("Use series._values[loc] instead")
# --------------------------------------------------------------------
# Constructor Helpers
def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager:
try:
if len(blocks) == 1 and not isinstance(blocks[0], Block):
# if blocks[0] is of length 0, return empty blocks
if not len(blocks[0]):
blocks = []
else:
# It's OK if a single block is passed as values, its placement
            # is basically "all items", but if there are many, don't bother
# converting, it's an error anyway.
blocks = [
make_block(
values=blocks[0], placement=slice(0, len(axes[0])), ndim=2
)
]
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
blocks = [getattr(b, "values", b) for b in blocks]
tot_items = sum(b.shape[0] for b in blocks)
raise construction_error(tot_items, blocks[0].shape[1:], axes, e)
def create_block_manager_from_arrays(
arrays, names: Index, axes: List[Index]
) -> BlockManager:
assert isinstance(names, Index)
assert isinstance(axes, list)
assert all(isinstance(x, Index) for x in axes)
# ensure we dont have any PandasArrays when we call get_block_type
# Note: just calling extract_array breaks tests that patch PandasArray._typ.
arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays]
try:
blocks = _form_blocks(arrays, names, axes)
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
raise construction_error(len(arrays), arrays[0].shape, axes, e)
def construction_error(tot_items, block_shape, axes, e=None):
""" raise a helpful message about our construction """
passed = tuple(map(int, [tot_items] + list(block_shape)))
# Correcting the user facing error message during dataframe construction
if len(passed) <= 2:
passed = passed[::-1]
implied = tuple(len(ax) for ax in axes)
# Correcting the user facing error message during dataframe construction
if len(implied) <= 2:
implied = implied[::-1]
# We return the exception object instead of raising it so that we
# can raise it in the caller; mypy plays better with that
if passed == implied and e is not None:
return e
if block_shape[0] == 0:
return ValueError("Empty data passed with indices specified.")
return ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
# -----------------------------------------------------------------------
def _form_blocks(arrays, names: Index, axes) -> List[Block]:
# put "leftover" items in float bucket, where else?
# generalize?
items_dict: DefaultDict[str, List] = defaultdict(list)
extra_locs = []
names_idx = names
if names_idx.equals(axes[0]):
names_indexer = np.arange(len(names_idx))
else:
assert names_idx.intersection(axes[0]).is_unique
names_indexer = names_idx.get_indexer_for(axes[0])
for i, name_idx in enumerate(names_indexer):
if name_idx == -1:
extra_locs.append(i)
continue
k = names[name_idx]
v = arrays[name_idx]
block_type = get_block_type(v)
items_dict[block_type.__name__].append((i, k, v))
blocks: List[Block] = []
if len(items_dict["FloatBlock"]):
float_blocks = _multi_blockify(items_dict["FloatBlock"])
blocks.extend(float_blocks)
if len(items_dict["ComplexBlock"]):
complex_blocks = _multi_blockify(items_dict["ComplexBlock"])
blocks.extend(complex_blocks)
if len(items_dict["TimeDeltaBlock"]):
timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
blocks.extend(timedelta_blocks)
if len(items_dict["IntBlock"]):
int_blocks = _multi_blockify(items_dict["IntBlock"])
blocks.extend(int_blocks)
if len(items_dict["DatetimeBlock"]):
datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
blocks.extend(datetime_blocks)
if len(items_dict["DatetimeTZBlock"]):
dttz_blocks = [
make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2)
for i, _, array in items_dict["DatetimeTZBlock"]
]
blocks.extend(dttz_blocks)
if len(items_dict["BoolBlock"]):
bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_)
blocks.extend(bool_blocks)
if len(items_dict["ObjectBlock"]) > 0:
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
blocks.extend(object_blocks)
if len(items_dict["CategoricalBlock"]) > 0:
cat_blocks = [
make_block(array, klass=CategoricalBlock, placement=i, ndim=2)
for i, _, array in items_dict["CategoricalBlock"]
]
blocks.extend(cat_blocks)
if len(items_dict["ExtensionBlock"]):
external_blocks = [
make_block(array, klass=ExtensionBlock, placement=i, ndim=2)
for i, _, array in items_dict["ExtensionBlock"]
]
blocks.extend(external_blocks)
if len(items_dict["ObjectValuesExtensionBlock"]):
external_blocks = [
make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2)
for i, _, array in items_dict["ObjectValuesExtensionBlock"]
]
blocks.extend(external_blocks)
if len(extra_locs):
shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])
# empty items -> dtype object
block_values = np.empty(shape, dtype=object)
block_values.fill(np.nan)
na_block = make_block(block_values, placement=extra_locs, ndim=2)
blocks.append(na_block)
return blocks
def _simple_blockify(tuples, dtype) -> List[Block]:
"""
    return a list containing a single block whose values share one dtype; if
    dtype is not None, coerce to this dtype
"""
values, placement = _stack_arrays(tuples, dtype)
# TODO: CHECK DTYPE?
if dtype is not None and values.dtype != dtype: # pragma: no cover
values = values.astype(dtype)
block = make_block(values, placement=placement, ndim=2)
return [block]
def _multi_blockify(tuples, dtype=None):
""" return an array of blocks that potentially have different dtypes """
# group by dtype
grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
new_blocks = []
for dtype, tup_block in grouper:
values, placement = _stack_arrays(list(tup_block), dtype)
block = make_block(values, placement=placement, ndim=2)
new_blocks.append(block)
return new_blocks
def _stack_arrays(tuples, dtype):
    # helpers to stack a mix of Series and ndarrays into one 2D array
def _asarray_compat(x):
if isinstance(x, ABCSeries):
return x._values
else:
return np.asarray(x)
def _shape_compat(x) -> Shape:
if isinstance(x, ABCSeries):
return (len(x),)
else:
return x.shape
placement, names, arrays = zip(*tuples)
first = arrays[0]
shape = (len(arrays),) + _shape_compat(first)
stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
stacked[i] = _asarray_compat(arr)
return stacked, placement
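# e.g. stacking three aligned length-4 arrays yields a (3, 4) ndarray plus
# the original placement tuple.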
def _interleaved_dtype(blocks: Sequence[Block]) -> Optional[DtypeObj]:
"""
Find the common dtype for `blocks`.
Parameters
----------
blocks : List[Block]
Returns
-------
dtype : np.dtype, ExtensionDtype, or None
None is returned when `blocks` is empty.
"""
if not len(blocks):
return None
return find_common_type([b.dtype for b in blocks])
def _consolidate(blocks):
"""
Merge blocks having same dtype, exclude non-consolidating blocks
"""
# sort by _can_consolidate, dtype
gkey = lambda x: x._consolidate_key
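    # _consolidate_key is (Block._can_consolidate, dtype name), e.g.
    # (True, "int64") for an int block and (False, "category") for a
    # Categorical, so sorting groups consolidatable same-dtype blocks together.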
grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
new_blocks: List[Block] = []
for (_can_consolidate, dtype), group_blocks in grouper:
merged_blocks = _merge_blocks(
list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
)
new_blocks = extend_blocks(merged_blocks, new_blocks)
return new_blocks
def _merge_blocks(
blocks: List[Block], dtype: DtypeObj, can_consolidate: bool
) -> List[Block]:
if len(blocks) == 1:
return blocks
if can_consolidate:
if dtype is None:
if len({b.dtype for b in blocks}) != 1:
raise AssertionError("_merge_blocks are invalid!")
# TODO: optimization potential in case all mgrs contain slices and
# combination of those slices is a slice, too.
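        # Worked example: blocks with mgr_locs [0, 2] and [1] concatenate to
        # locs [0, 2, 1]; the argsort below reorders both the stacked values
        # and the locs back to [0, 1, 2].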
new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
new_values = np.vstack([b.values for b in blocks])
argsort = np.argsort(new_mgr_locs)
new_values = new_values[argsort]
new_mgr_locs = new_mgr_locs[argsort]
return [make_block(new_values, placement=new_mgr_locs, ndim=2)]
# can't consolidate --> no merge
return blocks
def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:
"""Faster version of set(arr) for sequences of small numbers."""
counts = np.bincount(arr.astype(np.int_))
nz = counts.nonzero()[0]
return np.c_[nz, counts[nz]]
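# e.g. _fast_count_smallints(np.array([0, 0, 2])) returns [[0, 2], [2, 1]]:
# value 0 occurs twice, value 2 once.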
def _preprocess_slice_or_indexer(slice_or_indexer, length: int, allow_fill: bool):
if isinstance(slice_or_indexer, slice):
return (
"slice",
slice_or_indexer,
libinternals.slice_len(slice_or_indexer, length),
)
elif (
isinstance(slice_or_indexer, np.ndarray) and slice_or_indexer.dtype == np.bool_
):
return "mask", slice_or_indexer, slice_or_indexer.sum()
else:
indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
if not allow_fill:
indexer = maybe_convert_indices(indexer, length)
return "fancy", indexer, len(indexer)