from __future__ import annotations

import itertools
from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

import pandas._libs.reshape as libreshape
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
    find_common_type,
    maybe_promote,
)
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_integer,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.algorithms import (
    factorize,
    unique,
)
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    RangeIndex,
)
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
    get_group_index_sorter,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        Level,
        npt,
    )

    from pandas.core.arrays import ExtensionArray
    from pandas.core.indexes.frozen import FrozenList


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index.

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame will be used.

    Returns
    -------
    unstacked : DataFrame

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4
    """

    def __init__(
        self, index: MultiIndex, level: Level, constructor, sort: bool = True
    ) -> None:
        self.constructor = constructor
        self.sort = sort

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]
        if not self.sort:
            unique_codes = unique(self.index.codes[self.level])
            self.removed_level = self.removed_level.take(unique_codes)
            self.removed_level_full = self.removed_level_full.take(unique_codes)

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index combinations
        # will cause an int32 overflow on windows environments.
        # We want to check and raise a warning before this happens.
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = num_rows * num_columns

        # GH 26314: Previous ValueError raised was too restrictive for many users.
        if num_cells > np.iinfo(np.int32).max:
            warnings.warn(
                f"The following operation may generate {num_cells} cells "
                f"in the resulting pandas object.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        self._make_selectors()
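
    # For a sense of scale on the guard above (illustrative numbers, not from
    # the original source): a remaining index level with 100_000 entries and
    # 50_000 unstacked labels gives num_cells = 5_000_000_000, which exceeds
    # np.iinfo(np.int32).max (2_147_483_647), so the PerformanceWarning fires
    # before the reshape allocates the result.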

    @cache_readonly
    def _indexer_and_to_sort(
        self,
    ) -> tuple[
        npt.NDArray[np.intp],
        list[np.ndarray],  # each has _some_ signed integer dtype
    ]:
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self) -> list[np.ndarray]:
        indexer, to_sort = self._indexer_and_to_sort
        if self.sort:
            return [line.take(indexer) for line in to_sort]
        return to_sort

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        if self.sort:
            indexer, _ = self._indexer_and_to_sort

            sorted_values = algos.take_nd(values, indexer, axis=0)
            return sorted_values
        return values
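
    # A small trace of the reordering above (illustrative): for index codes
    # [[0, 0, 1, 1], [0, 1, 0, 1]] and level=0, ``to_sort`` moves the
    # unstacked level's codes to the end, giving [[0, 1, 0, 1], [0, 0, 1, 1]],
    # so rows are grouped by the levels that remain in the result index.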

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = tuple(len(x) for x in new_levels)

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        if self.sort:
            self.compressor = comp_index.searchsorted(np.arange(ngroups))
        else:
            self.compressor = np.sort(np.unique(comp_index, return_index=True)[1])
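
    # Worked example of the ``selector`` arithmetic above (illustrative, not
    # from the original source): with full_shape = (2, 2), lift = 0,
    # comp_index = [0, 0, 1, 1] and sorted_labels[-1] = [0, 1, 0, 1],
    # selector = [0, 1, 2, 3]: each input row lands at position
    # group * stride + label in the flattened result, and ``mask`` marks
    # exactly the cells that receive a value.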

    @cache_readonly
    def mask_all(self) -> bool:
        return bool(self.mask.all())

    @cache_readonly
    def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
        # We cache this for reuse in ExtensionBlock._unstack
        dummy_arr = np.arange(len(self.index), dtype=np.intp)
        new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
        # TODO: in all tests we have mask.any(0).all(); can we rely on that?
        return new_values, mask.any(0)

    def get_result(self, values, value_columns, fill_value) -> DataFrame:
        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(
            values, index=index, columns=columns, dtype=values.dtype
        )

    def get_new_values(self, values, fill_value=None):
        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = self.mask_all

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values? When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        dtype = values.dtype

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            if isinstance(dtype, ExtensionDtype):
                # GH#41875
                # We are assuming that fill_value can be held by this dtype,
                # unlike the non-EA case that promotes.
                cls = dtype.construct_array_type()
                new_values = cls._empty(result_shape, dtype=dtype)
                new_values[:] = fill_value
            else:
                dtype, fill_value = maybe_promote(dtype, fill_value)
                new_values = np.empty(result_shape, dtype=dtype)
                new_values.fill(fill_value)

        name = dtype.name
        new_mask = np.zeros(result_shape, dtype=bool)

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            # view as datetime64 so we can wrap in DatetimeArray and use
            #  DTA's view method
            new_values = new_values.view("M8[ns]")
            new_values = ensure_wrapped_if_datetimelike(new_values)
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
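
    # Shape bookkeeping for the mask_all fast path above (illustrative): with
    # full_shape = (length=2, width=3) and a single-column input (stride=1),
    # sorted_values.reshape(2, 3, 1).swapaxes(1, 2).reshape(2, 3) lays the six
    # sorted values out row-major, one row per remaining-index group and one
    # column per unstacked label; with stride > 1 the swapaxes keeps all
    # unstacked labels of a given source column in one contiguous block.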

    def get_new_columns(self, value_columns: Index | None):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._rename(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)

        new_levels: FrozenList | list[Index]

        if isinstance(value_columns, MultiIndex):
            # error: Cannot determine type of "__add__"  [has-type]
            new_levels = value_columns.levels + (  # type: ignore[has-type]
                self.removed_level_full,
            )
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [
                value_columns,
                self.removed_level_full,
            ]
            new_names = [value_columns.name, self.removed_name]

            new_codes = [propagator]

        repeater = self._repeater

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
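
    # To see the tiling (illustrative numbers, not from the original source):
    # with two value columns and three unstacked labels (lift = 0),
    # propagator = np.repeat(np.arange(2), 3) = [0, 0, 0, 1, 1, 1] spreads the
    # existing column codes, while np.tile(repeater, 2) = [0, 1, 2, 0, 1, 2]
    # (see _repeater below) cycles the new label codes underneath, yielding
    # the (value_column, removed_level) column MultiIndex.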

    @cache_readonly
    def _repeater(self) -> np.ndarray:
        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            stride = len(self.removed_level) + self.lift
            repeater = np.arange(stride) - self.lift

        return repeater

    @cache_readonly
    def new_index(self) -> MultiIndex | Index:
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )


def _unstack_multiple(
    data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index
    index = cast(MultiIndex, index)  # caller is responsible for checking

    # GH 19966 Make sure that if the MultiIndex has a tuple as a level name,
    # it is recognised as a whole
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = tuple(len(x) for x in clevels)
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            while clocs:
                val = clocs.pop(0)
                result = result.unstack(val, fill_value=fill_value, sort=sort)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        # GH#42579 deep=False to avoid consolidating
        dummy_df = data.copy(deep=False)
        dummy_df.index = dummy_index

        unstacked = dummy_df.unstack(
            "__placeholder__", fill_value=fill_value, sort=sort
        )
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes)

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked


def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
        else:
            level = level[0]

    if not is_integer(level) and not level == "__placeholder__":
        # check if level is valid in case of regular index
        obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
        else:
            return obj.T.stack(future_stack=True)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give nicer error messages when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_1d_only_ea_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value, sort=sort)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
        )
        return unstacker.get_result(
            obj._values, value_columns=None, fill_value=fill_value
        )


def _unstack_frame(
    obj: DataFrame, level, fill_value=None, sort: bool = True
) -> DataFrame:
    assert isinstance(obj.index, MultiIndex)  # checked by caller
    unstacker = _Unstacker(
        obj.index, level=level, constructor=obj._constructor, sort=sort
    )

    if not obj._can_fast_transpose:
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor_from_mgr(mgr, axes=mgr.axes)
    else:
        return unstacker.get_result(
            obj._values, value_columns=obj.columns, fill_value=fill_value
        )


def _unstack_extension_series(
    series: Series, level, fill_value, sort: bool
) -> DataFrame:
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.
    sort : bool
        Whether to sort the resulting MultiIndex levels

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value, sort=sort)

    # equiv: result.droplevel(level=0, axis=1)
    #  but this avoids an extra copy
    result.columns = result.columns._drop_level_numbers([0])
    return result
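

# A minimal usage sketch for _unstack_extension_series (illustrative, assuming
# a nullable "Int64" series; not part of the original source):
#
#   mi = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
#   ser = pd.Series(pd.array([1, 2, 3, 4], dtype="Int64"), index=mi)
#   res = _unstack_extension_series(ser, level=-1, fill_value=pd.NA, sort=True)
#   # res has columns "a" and "b", both of dtype Int64: the extension dtype
#   # survives the reshape instead of being cast to object or float.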


def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series or DataFrame
    """

    def stack_factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(
            frame, level_num=level_num, dropna=dropna, sort=sort
        )
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = stack_factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    new_values: ArrayLike
    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if isinstance(dtype, ExtensionDtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
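

# For reference, stack() on a plain 2x2 frame behaves like this (a sketch,
# not from the original source):
#
#   df = pd.DataFrame([[1, 2], [3, 4]], index=["one", "two"],
#                     columns=["a", "b"])
#   stack(df)
#   # one  a    1
#   #      b    2
#   # two  a    3
#   #      b    4
#   # dtype: int64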


def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna, sort=sort)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        while level:
            lev = level.pop(0)
            result = stack(result, lev, dropna=dropna, sort=sort)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            level = [v if v <= lev else v - 1 for v in level]

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result
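

# Renumbering trace for the integer path above (illustrative): stacking
# level=[0, 2] on a 3-level column index first stacks level 0; the remaining
# entry 2 now refers to what has become level 1, so it is decremented to 1
# before the second stack.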


def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index:
    """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
    if len(columns.levels) <= 2:
        return columns.levels[0]._rename(name=columns.names[0])

    levs = [
        [lev[c] if c >= 0 else None for c in codes]
        for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
    ]

    # Remove duplicate tuples in the MultiIndex.
    tuples = zip(*levs)
    unique_tuples = (key for key, _ in itertools.groupby(tuples))
    new_levs = zip(*unique_tuples)

    # The dtype of each level must be explicitly set to avoid inferring the wrong type.
    # See GH-36991.
    return MultiIndex.from_arrays(
        [
            # Not all indices can accept None values.
            Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
            for new_lev, lev in zip(new_levs, columns.levels)
        ],
        names=columns.names[:-1],
    )
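

# Illustrative trace (not from the original source): for columns
# [("x", "a", 1), ("x", "a", 2), ("y", "b", 1)], dropping the last level
# yields [("x", "a"), ("x", "a"), ("y", "b")]; the itertools.groupby pass
# collapses the adjacent duplicates to [("x", "a"), ("y", "b")]. Only
# adjacent duplicates need handling because _stack_multi_columns sorts the
# columns (when ``sort=True``) before calling this helper.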


def _stack_multi_columns(
    frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
) -> DataFrame:
    def _convert_level_number(level_num: int, columns: Index):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy(deep=False)
    mi_cols = this.columns  # cast(MultiIndex, this.columns)
    assert isinstance(mi_cols, MultiIndex)  # caller is responsible

    # this makes life much simpler
    if level_num != mi_cols.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = mi_cols
        for i in range(level_num, mi_cols.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = mi_cols = roll_columns

    if not mi_cols._is_lexsorted() and sort:
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, mi_cols)
        this = this.sort_index(level=level_to_sort, axis=1)
        mi_cols = this.columns

    mi_cols = cast(MultiIndex, mi_cols)
    new_columns = _stack_multi_column_index(mi_cols)

    # time to ravel the values
    new_data = {}
    level_vals = mi_cols.levels[-1]
    level_codes = unique(mi_cols.codes[-1])
    if sort:
        level_codes = np.sort(level_codes)
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            subset = this.iloc[:, loc]
            dtype = find_common_type(subset.dtypes.tolist())
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): won't need special case, can go through .values
                #  paths below (might change to ._values)
                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values.astype(dtype, copy=False) for _, x in subset.items()]
                )
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)
            else:
                value_slice = subset.values

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    if frame.columns.nlevels > 1:
        desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result


def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)


def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    # If we need to drop `level` from columns, it needs to be in descending order
    drop_levnums = sorted(level, reverse=True)
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels) if k not in level][::-1]
    )
    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    stack_cols_unique = stack_cols.unique()
    ordered_stack_cols_unique = ordered_stack_cols.unique()

    # Grab data for each unique index to be stacked
    buf = []
    for idx in stack_cols_unique:
        if len(frame.columns) == 1:
            data = frame.copy()
        else:
            # Take the data from frame corresponding to this idx value
            if len(level) == 1:
                idx = (idx,)
            gen = iter(idx)
            column_indexer = tuple(
                next(gen) if k in level else slice(None)
                for k in range(frame.columns.nlevels)
            )
            data = frame.loc[:, column_indexer]

            if len(level) < frame.columns.nlevels:
                data.columns = data.columns._drop_level_numbers(drop_levnums)
            elif stack_cols.nlevels == 1:
                if data.ndim == 1:
                    data.name = 0
                else:
                    data.columns = RangeIndex(len(data.columns))
        buf.append(data)

    result: Series | DataFrame
    if len(buf) > 0 and not frame.empty:
        result = concat(buf)
        ratio = len(result) // len(frame)
    else:
        # input is empty
        if len(level) < frame.columns.nlevels:
            # concat column order may be different from dropping the levels
            new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        else:
            new_columns = [0]
        result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
        ratio = 0

    if len(level) < frame.columns.nlevels:
        # concat column order may be different from dropping the levels
        desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.
    index_levels: list | FrozenList
    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
    else:
        codes, uniques = factorize(frame.index, use_na_sentinel=False)
        index_levels = [uniques]
        index_codes = list(np.tile(codes, (1, ratio)))
    if isinstance(stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols.unique()]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    result = result.take(idxs)

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None

    return result
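

# Worked example for the fast sort above (illustrative): with len_df = 2 rows
# and n_uniques = 3 stacked labels, concat() leaves the rows in column-block
# order [r0c0, r1c0, r0c1, r1c1, r0c2, r1c2]; idxs = np.tile([0, 2, 4], 2) +
# np.repeat([0, 1], 3) = [0, 2, 4, 1, 3, 5], which result.take(idxs) uses to
# reorder them to [r0c0, r0c1, r0c2, r1c0, r1c1, r1c2], i.e. by original row.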