import itertools
from typing import List, Optional, Union

import numpy as np

import pandas._libs.algos as libalgos
import pandas._libs.reshape as libreshape
from pandas._libs.sparse import IntIndex
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_bool_dtype,
    is_extension_array_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import Index, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
)


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame will be used.

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4

    Returns
    -------
    unstacked : DataFrame
    """

    def __init__(self, index: MultiIndex, level=-1, constructor=None):

        if constructor is None:
            constructor = DataFrame
        self.constructor = constructor

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index combinations
        # will cause int32 overflow on Windows environments.
        # We want to check and raise an error before this happens.
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)

        if num_rows > 0 and num_columns > 0 and num_cells <= 0:
            raise ValueError("Unstacked DataFrame is too big, causing int32 overflow")

        self._make_selectors()

    @cache_readonly
    def _indexer_and_to_sort(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = ensure_platform_int(indexer)

        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self):
        indexer, to_sort = self._indexer_and_to_sort
        return [line.take(indexer) for line in to_sort]

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        indexer, _ = self._indexer_and_to_sort

        sorted_values = algos.take_nd(values, indexer, axis=0)
        return sorted_values

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))

    def get_result(self, values, value_columns, fill_value):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(values, index=index, columns=columns)

    def get_new_values(self, values, fill_value=None):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = mask.all()

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values?  When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = maybe_promote(values.dtype, fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        elif is_bool_dtype(values.dtype):
            sorted_values = sorted_values.astype("object")
            new_values = new_values.astype("object")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

    def get_new_columns(self, value_columns):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._shallow_copy(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)
        if isinstance(value_columns, MultiIndex):
            new_levels = value_columns.levels + (self.removed_level_full,)
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [value_columns, self.removed_level_full]
            new_names = [value_columns.name, self.removed_name]
            new_codes = [propagator]

        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            repeater = np.arange(stride) - self.lift

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )

    @cache_readonly
    def new_index(self):
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )
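
# Illustrative sketch (comments only, not executed) of two behaviors implemented
# above: the ``lift`` handling turns a NaN in the unstacked level into an extra
# leading column, and ``_make_selectors`` rejects non-unique index/level
# combinations.  Roughly, assuming pandas and numpy are imported as usual:
#
#   >>> idx = pd.MultiIndex.from_tuples([("one", "a"), ("one", np.nan), ("two", "a")])
#   >>> pd.Series([1, 2, 3], index=idx).unstack()   # NaN column comes first
#
#   >>> idx = pd.MultiIndex.from_tuples([("one", "a"), ("one", "a")])
#   >>> pd.Series([1, 2], index=idx).unstack()
#   ValueError: Index contains duplicate entries, cannot reshape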


def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # GH 19966 Make sure that, if the MultiIndexed index has a tuple name, it is
    # recognised as a whole
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val, fill_value=fill_value)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
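
# Illustrative example of the multi-level path above (comments only, not
# executed).  Passing a list of levels routes through _unstack_multiple, which
# moves all of them to the columns in one pass via the "__placeholder__" level:
#
#   >>> idx = pd.MultiIndex.from_product(
#   ...     [["a", "b"], [1, 2], ["x", "y"]], names=["l1", "l2", "l3"]
#   ... )
#   >>> s = pd.Series(range(8), index=idx)
#   >>> s.unstack(["l2", "l3"]).columns.names   # roughly
#   FrozenList(['l2', 'l3'])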


def unstack(obj, level, fill_value=None):

    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value)
        else:
            level = level[0]

    # Prioritize integer interpretation (GH #21677):
    if not is_integer(level) and not level == "__placeholder__":
        level = obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
            return obj.T.stack(dropna=False)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give nicer error messages when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_extension_array_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim
        )
        return unstacker.get_result(
            obj.values, value_columns=None, fill_value=fill_value
        )


def _unstack_frame(obj, level, fill_value=None):
    if not obj._can_fast_transpose:
        unstacker = _Unstacker(obj.index, level=level)
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor(mgr)
    else:
        return _Unstacker(
            obj.index, level=level, constructor=obj._constructor
        ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value)


def _unstack_extension_series(series, level, fill_value):
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value)
    return result.droplevel(level=0, axis=1)
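
# Illustrative note (comments only): because the reshape is deferred to
# DataFrame.unstack on a one-column frame, each resulting column keeps the
# extension dtype.  For example, unstacking a Series with dtype="Int64" is
# expected to produce Int64 columns, with pd.NA (or the given fill_value)
# filling the cells introduced by the reshape.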


def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series
    """

    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
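
# Illustrative example of the flat-columns branch above (comments only, not
# executed); the exact rendering may differ slightly across pandas versions:
#
#   >>> df = pd.DataFrame([[0, 1], [2, 3]], index=["r0", "r1"], columns=["c0", "c1"])
#   >>> df.stack()
#   r0  c0    0
#       c1    1
#   r1  c0    2
#       c1    3
#   dtype: int64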


def stack_multiple(frame, level, dropna=True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        # Can't iterate directly through level as we might need to change
        # values as we go
        for index in range(len(level)):
            lev = level[index]
            result = stack(result, lev, dropna=dropna)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            updated_level = []
            for other in level:
                if other > lev:
                    updated_level.append(other - 1)
                else:
                    updated_level.append(other)
            level = updated_level

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result
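
# Illustrative note (comments only): when integer levels are passed, each call
# to ``stack`` removes one column level, so the remaining level numbers shift
# down by one; that is what the decrement loop above compensates for.  For
# example, stacking levels [0, 2] of a three-level column MultiIndex stacks
# level 0 first, after which the level originally numbered 2 is stacked as
# level 1.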


def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(
            zip(
                *[
                    lev.take(level_codes)
                    for lev, level_codes in zip(
                        this.columns.levels[:-1], this.columns.codes[:-1]
                    )
                ]
            )
        )
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0])
        unique_groups = new_columns

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_codes = sorted(set(this.columns.codes[-1]))
    level_vals_used = level_vals[level_codes]
    levsize = len(level_codes)
    drop_cols = []
    for key in unique_groups:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_homogeneous_type and is_extension_array_dtype(
                frame.dtypes.iloc[0]
            ):
                dtype = this[this.columns[loc]].dtypes.iloc[0]
                subset = this[this.columns[loc]]

                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.items()]
                )
                N, K = this.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result
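
# Illustrative example of stacking one level of MultiIndex columns (comments
# only, not executed); the exact rendering may differ slightly:
#
#   >>> cols = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]])
#   >>> df = pd.DataFrame([[1, 2, 3, 4]], columns=cols)
#   >>> df.stack()          # stacks the innermost ("x"/"y") level by default
#        A  B
#   0 x  1  3
#     y  2  4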


def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None,
) -> "DataFrame":
    """
    Convert categorical variable into dummy/indicator variables.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String to append to DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : str, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs; if False, NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data.

    See Also
    --------
    Series.str.get_dummies : Convert Series to dummy codes.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> pd.get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1       1       0       0       1       0
    1  2       0       1       1       0       0
    2  3       1       0       0       0       1

    >>> pd.get_dummies(pd.Series(list('abcaa')))
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0
    4  1  0  0

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
       b  c
    0  0  0
    1  1  0
    2  0  1
    3  0  0
    4  0  0

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    dtypes_to_encode = ["object", "category"]

    if isinstance(data, DataFrame):
        # determine columns being encoded
        if columns is None:
            data_to_encode = data.select_dtypes(include=dtypes_to_encode)
        elif not is_list_like(columns):
            raise TypeError("Input must be a list-like for parameter `columns`")
        else:
            data_to_encode = data[columns]

        # validate prefixes and separator to avoid silently dropping cols
        def check_len(item, name):

            if is_list_like(item):
                if not len(item) == data_to_encode.shape[1]:
                    len_msg = (
                        f"Length of '{name}' ({len(item)}) did not match the "
                        "length of the columns being encoded "
                        f"({data_to_encode.shape[1]})."
                    )
                    raise ValueError(len_msg)

        check_len(prefix, "prefix")
        check_len(prefix_sep, "prefix_sep")

        if isinstance(prefix, str):
            prefix = itertools.cycle([prefix])
        if isinstance(prefix, dict):
            prefix = [prefix[col] for col in data_to_encode.columns]

        if prefix is None:
            prefix = data_to_encode.columns

        # validate separators
        if isinstance(prefix_sep, str):
            prefix_sep = itertools.cycle([prefix_sep])
        elif isinstance(prefix_sep, dict):
            prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]

        with_dummies: List[DataFrame]
        if data_to_encode.shape == data.shape:
            # Encoding the entire df, do not prepend any dropped columns
            with_dummies = []
        elif columns is not None:
            # Encoding only cols specified in columns. Get all cols not in
            # columns to prepend to result.
            with_dummies = [data.drop(columns, axis=1)]
        else:
            # Encoding only object and category dtype columns. Get remaining
            # columns to prepend to result.
            with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]

        for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep):
            # col is (column_name, column), use just column data here
            dummy = _get_dummies_1d(
                col[1],
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
            with_dummies.append(dummy)
        result = concat(with_dummies, axis=1)
    else:
        result = _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )
    return result


def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels]

    index: Optional[Index]
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: Union[bool, float, int]
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: List[List] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
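
# Illustrative note on the two branches above (comments only, not executed):
# with sparse=False each dummy row is taken from an identity matrix, while
# with sparse=True one SparseArray-backed column is built per level, storing
# only the positions of the ones.  Roughly:
#
#   >>> pd.get_dummies(pd.Series(["a", "b", "a"]), sparse=True).dtypes
#   a    Sparse[uint8, 0]
#   b    Sparse[uint8, 0]
#   dtype: object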


def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int):
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)