# projektAI/venv/Lib/site-packages/pandas/core/groupby/generic.py

"""
Define the SeriesGroupBy and DataFrameGroupBy
classes that hold the groupby interfaces (and some implementations).
These classes are user facing as the result of ``df.groupby(...)`` operations,
which here return a DataFrameGroupBy object.
"""
from collections import abc, namedtuple
import copy
from functools import partial
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
FrozenSet,
Iterable,
List,
Mapping,
Optional,
Sequence,
Type,
TypeVar,
Union,
cast,
)
import warnings
import numpy as np
from pandas._libs import lib, reduction as libreduction
from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label
from pandas.util._decorators import Appender, Substitution, doc
from pandas.core.dtypes.cast import (
find_common_type,
maybe_cast_result_dtype,
maybe_downcast_numeric,
)
from pandas.core.dtypes.common import (
ensure_int64,
ensure_platform_int,
is_bool,
is_integer_dtype,
is_interval_dtype,
is_numeric_dtype,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.missing import isna, notna
from pandas.core import algorithms, nanops
from pandas.core.aggregation import (
agg_list_like,
aggregate,
maybe_mangle_lambdas,
reconstruct_func,
validate_func_kwargs,
)
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.base import DataError, SpecificationError
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
GroupBy,
_agg_template,
_apply_docs,
_transform_template,
get_groupby,
group_selection_context,
)
from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same
import pandas.core.indexes.base as ibase
from pandas.core.internals import BlockManager
from pandas.core.series import Series
from pandas.core.util.numba_ import maybe_use_numba
from pandas.plotting import boxplot_frame_groupby
if TYPE_CHECKING:
from pandas.core.internals import Block
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
# Blocked from using by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")
def generate_property(name: str, klass: Type[FrameOrSeries]):
"""
Create a property for a GroupBy subclass to dispatch to DataFrame/Series.
Parameters
----------
name : str
klass : {DataFrame, Series}
Returns
-------
property
"""
def prop(self):
return self._make_wrapper(name)
parent_method = getattr(klass, name)
prop.__doc__ = parent_method.__doc__ or ""
prop.__name__ = name
return property(prop)
def pin_allowlisted_properties(klass: Type[FrameOrSeries], allowlist: FrozenSet[str]):
"""
    Create GroupBy member defs for DataFrame/Series names in an allowlist.
Parameters
----------
klass : DataFrame or Series class
class where members are defined.
allowlist : frozenset[str]
Set of names of klass methods to be constructed
Returns
-------
class decorator
Notes
-----
Since we don't want to override methods explicitly defined in the
base class, any such name is skipped.
"""
def pinner(cls):
for name in allowlist:
if hasattr(cls, name):
# don't override anything that was explicitly defined
# in the base class
continue
prop = generate_property(name, klass)
setattr(cls, name, prop)
return cls
return pinner
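# Illustrative note (hypothetical allowlist): decorating a GroupBy subclass with
# ``@pin_allowlisted_properties(Series, frozenset({"dtype"}))`` would attach a
# ``dtype`` property that dispatches to ``Series.dtype`` via ``_make_wrapper``,
# just as the real allowlists below pin the allowlisted Series/DataFrame methods.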
@pin_allowlisted_properties(Series, base.series_apply_allowlist)
class SeriesGroupBy(GroupBy[Series]):
_apply_allowlist = base.series_apply_allowlist
def _iterate_slices(self) -> Iterable[Series]:
yield self._selected_obj
@property
def _selection_name(self):
"""
        since we are a Series, we by definition only have
        a single name, but it may be the result of a selection or
        the name of our object
"""
if self._selection is None:
return self.obj.name
else:
return self._selection
_agg_examples_doc = dedent(
"""
Examples
--------
>>> s = pd.Series([1, 2, 3, 4])
>>> s
0 1
1 2
2 3
3 4
dtype: int64
>>> s.groupby([1, 1, 2, 2]).min()
1 1
2 3
dtype: int64
>>> s.groupby([1, 1, 2, 2]).agg('min')
1 1
2 3
dtype: int64
>>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
min max
1 1 2
2 3 4
The output column names can be controlled by passing
the desired column names and aggregations as keyword arguments.
>>> s.groupby([1, 1, 2, 2]).agg(
... minimum='min',
... maximum='max',
... )
minimum maximum
1 1 2
2 3 4"""
)
@Appender(
_apply_docs["template"].format(
input="series", examples=_apply_docs["series_examples"]
)
)
def apply(self, func, *args, **kwargs):
return super().apply(func, *args, **kwargs)
@doc(_agg_template, examples=_agg_examples_doc, klass="Series")
def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
with group_selection_context(self):
data = self._selected_obj
result, index = self._aggregate_with_numba(
data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
)
return self.obj._constructor(result.ravel(), index=index, name=data.name)
relabeling = func is None
columns = None
if relabeling:
columns, func = validate_func_kwargs(kwargs)
kwargs = {}
if isinstance(func, str):
return getattr(self, func)(*args, **kwargs)
elif isinstance(func, abc.Iterable):
# Catch instances of lists / tuples
# but not the class list / tuple itself.
func = maybe_mangle_lambdas(func)
ret = self._aggregate_multiple_funcs(func)
if relabeling:
ret.columns = columns
else:
cyfunc = self._get_cython_func(func)
if cyfunc and not args and not kwargs:
return getattr(self, cyfunc)()
if self.grouper.nkeys > 1:
return self._python_agg_general(func, *args, **kwargs)
try:
return self._python_agg_general(func, *args, **kwargs)
except (ValueError, KeyError):
# TODO: KeyError is raised in _python_agg_general,
# see test_groupby.test_basic
result = self._aggregate_named(func, *args, **kwargs)
index = Index(sorted(result), name=self.grouper.names[0])
ret = create_series_with_explicit_dtype(
result, index=index, dtype_if_empty=object
)
if not self.as_index: # pragma: no cover
print("Warning, ignoring as_index=True")
if isinstance(ret, dict):
from pandas import concat
ret = concat(ret.values(), axis=1, keys=[key.label for key in ret.keys()])
return ret
agg = aggregate
def _aggregate_multiple_funcs(self, arg):
if isinstance(arg, dict):
# show the deprecation, but only if we
# have not shown a higher level one
# GH 15931
if isinstance(self._selected_obj, Series):
raise SpecificationError("nested renamer is not supported")
columns = list(arg.keys())
arg = arg.items()
elif any(isinstance(x, (tuple, list)) for x in arg):
arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
# indicated column order
columns = next(zip(*arg))
else:
# list of functions / function names
columns = []
for f in arg:
columns.append(com.get_callable_name(f) or f)
arg = zip(columns, arg)
results: Dict[base.OutputKey, FrameOrSeriesUnion] = {}
for idx, (name, func) in enumerate(arg):
obj = self
# reset the cache so that we
# only include the named selection
if name in self._selected_obj:
obj = copy.copy(obj)
obj._reset_cache()
obj._selection = name
results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func)
if any(isinstance(x, DataFrame) for x in results.values()):
# let higher level handle
return results
output = self._wrap_aggregated_output(results, index=None)
return self.obj._constructor_expanddim(output, columns=columns)
# TODO: index should not be Optional - see GH 35490
def _wrap_series_output(
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
index: Optional[Index],
) -> FrameOrSeriesUnion:
"""
Wraps the output of a SeriesGroupBy operation into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
index : pd.Index or None
Index to apply to the output.
Returns
-------
Series or DataFrame
Notes
-----
In the vast majority of cases output and columns will only contain one
element. The exception is operations that expand dimensions, like ohlc.
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)
result: FrameOrSeriesUnion
if len(output) > 1:
result = self.obj._constructor_expanddim(indexed_output, index=index)
result.columns = columns
elif not columns.empty:
result = self.obj._constructor(
indexed_output[0], index=index, name=columns[0]
)
else:
result = self.obj._constructor_expanddim()
return result
# TODO: Remove index argument, use self.grouper.result_index, see GH 35490
def _wrap_aggregated_output(
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
index: Optional[Index],
) -> FrameOrSeriesUnion:
"""
Wraps the output of a SeriesGroupBy aggregation into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
Returns
-------
Series or DataFrame
Notes
-----
In the vast majority of cases output will only contain one element.
The exception is operations that expand dimensions, like ohlc.
"""
result = self._wrap_series_output(output=output, index=index)
return self._reindex_output(result)
def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
) -> Series:
"""
        Wraps the output of a SeriesGroupBy transformation into the expected result.
Parameters
----------
output : dict[base.OutputKey, Union[Series, np.ndarray]]
            Dict with a single key (position 0) whose value holds the transformed values.
Returns
-------
Series
Notes
-----
output should always contain one element. It is specified as a dict
for consistency with DataFrame methods and _wrap_aggregated_output.
"""
assert len(output) == 1
result = self._wrap_series_output(output=output, index=self.obj.index)
# No transformations increase the ndim of the result
assert isinstance(result, Series)
return result
def _wrap_applied_output(
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.
Parameters
----------
keys : Index
Keys of groups that Series was grouped by.
values : Optional[List[Any]]
Applied output for each group.
not_indexed_same : bool, default False
Whether the applied outputs are not indexed the same as the group axes.
Returns
-------
DataFrame or Series
"""
if len(keys) == 0:
# GH #6265
return self.obj._constructor(
[], name=self._selection_name, index=keys, dtype=np.float64
)
assert values is not None
def _get_index() -> Index:
if self.grouper.nkeys > 1:
index = MultiIndex.from_tuples(keys, names=self.grouper.names)
else:
index = Index(keys, name=self.grouper.names[0])
return index
if isinstance(values[0], dict):
# GH #823 #24880
index = _get_index()
result: FrameOrSeriesUnion = self._reindex_output(
self.obj._constructor_expanddim(values, index=index)
)
# if self.observed is False,
# keep all-NaN rows created while re-indexing
result = result.stack(dropna=self.observed)
result.name = self._selection_name
return result
elif isinstance(values[0], (Series, DataFrame)):
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
else:
# GH #6265 #24880
result = self.obj._constructor(
data=values, index=_get_index(), name=self._selection_name
)
return self._reindex_output(result)
def _aggregate_named(self, func, *args, **kwargs):
result = {}
initialized = False
for name, group in self:
# Each step of this loop corresponds to
# libreduction._BaseGrouper._apply_to_group
group.name = name # NB: libreduction does not pin name
output = func(group, *args, **kwargs)
output = libreduction.extract_result(output)
if not initialized:
# We only do this validation on the first iteration
libreduction.check_result_array(output, 0)
initialized = True
result[name] = output
return result
@Substitution(klass="Series")
@Appender(_transform_template)
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
with group_selection_context(self):
data = self._selected_obj
result = self._transform_with_numba(
data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
)
return self.obj._constructor(
result.ravel(), index=data.index, name=data.name
)
func = self._get_cython_func(func) or func
if not isinstance(func, str):
return self._transform_general(func, *args, **kwargs)
elif func not in base.transform_kernel_allowlist:
msg = f"'{func}' is not a valid function name for transform(name)"
raise ValueError(msg)
elif func in base.cythonized_kernels or func in base.transformation_kernels:
# cythonized transform or canned "agg+broadcast"
return getattr(self, func)(*args, **kwargs)
# If func is a reduction, we need to broadcast the
# result to the whole group. Compute func result
# and deal with possible broadcasting below.
# Temporarily set observed for dealing with categoricals.
with com.temp_setattr(self, "observed", True):
result = getattr(self, func)(*args, **kwargs)
return self._transform_fast(result)
def _transform_general(self, func, *args, **kwargs):
"""
Transform with a non-str `func`.
"""
klass = type(self._selected_obj)
results = []
for name, group in self:
object.__setattr__(group, "name", name)
res = func(group, *args, **kwargs)
if isinstance(res, (DataFrame, Series)):
res = res._values
results.append(klass(res, index=group.index))
# check for empty "results" to avoid concat ValueError
if results:
from pandas.core.reshape.concat import concat
concatenated = concat(results)
result = self._set_result_index_ordered(concatenated)
else:
result = self.obj._constructor(dtype=np.float64)
# we will only try to coerce the result type if
# we have a numeric dtype, as these are *always* user-defined funcs
# the cython take a different path (and casting)
if is_numeric_dtype(result.dtype):
common_dtype = find_common_type([self._selected_obj.dtype, result.dtype])
if common_dtype is result.dtype:
result = maybe_downcast_numeric(result, self._selected_obj.dtype)
result.name = self._selected_obj.name
result.index = self._selected_obj.index
return result
def _transform_fast(self, result) -> Series:
"""
        Fast version of transform, only applicable to
        builtin/cythonizable functions.
"""
ids, _, ngroup = self.grouper.group_info
result = result.reindex(self.grouper.result_index, copy=False)
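        # Illustrative example (assumed values): if the per-group result is
        # [10, 20] for groups 0 and 1 and ids is [0, 1, 0], the take below
        # broadcasts the group-level values back to row level as [10, 20, 10].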
out = algorithms.take_1d(result._values, ids)
return self.obj._constructor(out, index=self.obj.index, name=self.obj.name)
def filter(self, func, dropna=True, *args, **kwargs):
"""
Return a copy of a Series excluding elements from groups that
do not satisfy the boolean criterion specified by func.
        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool, default True
            Drop groups that do not pass the filter. If False, groups that
            evaluate False are filled with NaNs.

        Returns
        -------
        filtered : Series

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
if isinstance(func, str):
wrapper = lambda x: getattr(x, func)(*args, **kwargs)
else:
wrapper = lambda x: func(x, *args, **kwargs)
# Interpret np.nan as False.
def true_and_notna(x) -> bool:
b = wrapper(x)
return b and notna(b)
try:
indices = [
self._get_index(name) for name, group in self if true_and_notna(group)
]
except (ValueError, TypeError) as err:
raise TypeError("the filter must return a boolean result") from err
filtered = self._apply_filter(indices, dropna)
return filtered
def nunique(self, dropna: bool = True) -> Series:
"""
Return number of unique elements in the group.
Returns
-------
Series
Number of unique values within each group.
"""
ids, _, _ = self.grouper.group_info
val = self.obj._values
codes, _ = algorithms.factorize(val, sort=False)
sorter = np.lexsort((codes, ids))
codes = codes[sorter]
ids = ids[sorter]
# group boundaries are where group ids change
# unique observations are where sorted values change
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
inc = np.r_[1, codes[1:] != codes[:-1]]
# 1st item of each group is a new unique observation
mask = codes == -1
if dropna:
inc[idx] = 1
inc[mask] = 0
else:
inc[mask & np.r_[False, mask[:-1]]] = 0
inc[idx] = 1
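        # Worked example (illustrative values): ids = [0, 0, 1, 1] with factorized
        # codes = [0, 0, 0, 1] gives idx = [0, 2] (group starts) and, after the
        # dropna branch above, inc = [1, 0, 1, 1]; np.add.reduceat(inc, idx) below
        # then yields [1, 2]: one unique value in the first group, two in the second.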
out = np.add.reduceat(inc, idx).astype("int64", copy=False)
if len(ids):
# NaN/NaT group exists if the head of ids is -1,
# so remove it from res and exclude its index from idx
if ids[0] == -1:
res = out[1:]
idx = idx[np.flatnonzero(idx)]
else:
res = out
else:
res = out[1:]
ri = self.grouper.result_index
# we might have duplications among the bins
if len(res) != len(ri):
res, out = np.zeros(len(ri), dtype=out.dtype), res
res[ids[idx]] = out
result = self.obj._constructor(res, index=ri, name=self._selection_name)
return self._reindex_output(result, fill_value=0)
@doc(Series.describe)
def describe(self, **kwargs):
result = self.apply(lambda x: x.describe(**kwargs))
if self.axis == 1:
return result.T
return result.unstack()
def value_counts(
self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
):
from pandas.core.reshape.merge import get_join_indexers
from pandas.core.reshape.tile import cut
if bins is not None and not np.iterable(bins):
# scalar bins cannot be done at top level
# in a backward compatible way
return self.apply(
Series.value_counts,
normalize=normalize,
sort=sort,
ascending=ascending,
bins=bins,
)
ids, _, _ = self.grouper.group_info
val = self.obj._values
# groupby removes null keys from groupings
mask = ids != -1
ids, val = ids[mask], val[mask]
if bins is None:
lab, lev = algorithms.factorize(val, sort=True)
llab = lambda lab, inc: lab[inc]
else:
# lab is a Categorical with categories an IntervalIndex
lab = cut(Series(val), bins, include_lowest=True)
lev = lab.cat.categories
lab = lev.take(lab.cat.codes, allow_fill=True, fill_value=lev._na_value)
llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
if is_interval_dtype(lab.dtype):
# TODO: should we do this inside II?
sorter = np.lexsort((lab.left, lab.right, ids))
else:
sorter = np.lexsort((lab, ids))
ids, lab = ids[sorter], lab[sorter]
# group boundaries are where group ids change
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
# new values are where sorted labels change
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
inc = np.r_[True, lchanges]
inc[idx] = True # group boundaries are also new values
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
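        # The line above turns run starts into run lengths: e.g. (illustrative)
        # inc = [True, False, True, True] has starts at positions 0, 2, 3, so
        # np.diff over [0, 2, 3, 4] gives per-value counts [2, 1, 1].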
# num. of times each group should be repeated
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
# multi-index components
codes = self.grouper.reconstructed_codes
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
names = self.grouper.names + [self._selection_name]
if dropna:
mask = codes[-1] != -1
if mask.all():
dropna = False
else:
out, codes = out[mask], [level_codes[mask] for level_codes in codes]
if normalize:
out = out.astype("float")
d = np.diff(np.r_[idx, len(ids)])
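            # ``d`` above holds each group's size, e.g. idx = [0, 2] with
            # len(ids) = 5 gives sizes [2, 3] (illustrative); the counts are then
            # divided by the repeated group sizes to turn them into frequencies.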
if dropna:
m = ids[lab == -1]
np.add.at(d, m, -1)
acc = rep(d)[mask]
else:
acc = rep(d)
out /= acc
if sort and bins is None:
cat = ids[inc][mask] if dropna else ids[inc]
sorter = np.lexsort((out if ascending else -out, cat))
out, codes[-1] = out[sorter], codes[-1][sorter]
if bins is None:
mi = MultiIndex(
levels=levels, codes=codes, names=names, verify_integrity=False
)
if is_integer_dtype(out):
out = ensure_int64(out)
return self.obj._constructor(out, index=mi, name=self._selection_name)
# for compat. with libgroupby.value_counts need to ensure every
# bin is present at every index level, null filled with zeros
diff = np.zeros(len(out), dtype="bool")
for level_codes in codes[:-1]:
diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
ncat, nbin = diff.sum(), len(levels[-1])
left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
right = [diff.cumsum() - 1, codes[-1]]
_, idx = get_join_indexers(left, right, sort=False, how="left")
out = np.where(idx != -1, out[idx], 0)
if sort:
sorter = np.lexsort((out if ascending else -out, left[0]))
out, left[-1] = out[sorter], left[-1][sorter]
# build the multi-index w/ full levels
def build_codes(lev_codes: np.ndarray) -> np.ndarray:
return np.repeat(lev_codes[diff], nbin)
codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
codes.append(left[-1])
mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
if is_integer_dtype(out):
out = ensure_int64(out)
return self.obj._constructor(out, index=mi, name=self._selection_name)
def count(self) -> Series:
"""
Compute count of group, excluding missing values.
Returns
-------
Series
Count of values within each group.
"""
ids, _, ngroups = self.grouper.group_info
val = self.obj._values
mask = (ids != -1) & ~isna(val)
ids = ensure_platform_int(ids)
minlength = ngroups or 0
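        # np.bincount over the non-null group ids (next line) gives per-group
        # counts, e.g. ids[mask] = [0, 0, 2] with minlength=3 -> [2, 0, 1]
        # (illustrative values).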
out = np.bincount(ids[mask], minlength=minlength)
result = self.obj._constructor(
out,
index=self.grouper.result_index,
name=self._selection_name,
dtype="int64",
)
return self._reindex_output(result, fill_value=0)
def _apply_to_column_groupbys(self, func):
""" return a pass thru """
return func(self)
def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
"""Calculate pct_change of each value to previous entry in group"""
# TODO: Remove this conditional when #23918 is fixed
if freq:
return self.apply(
lambda x: x.pct_change(
periods=periods, fill_method=fill_method, limit=limit, freq=freq
)
)
if fill_method is None: # GH30463
fill_method = "pad"
limit = 0
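        # Approach: fill within groups, then compare each value with the one
        # ``periods`` rows earlier in its group; e.g. a group [2, 4] shifted by 1
        # gives [NaN, 2], so (filled / shifted) - 1 -> [NaN, 1.0] (illustrative).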
filled = getattr(self, fill_method)(limit=limit)
fill_grp = filled.groupby(self.grouper.codes)
shifted = fill_grp.shift(periods=periods, freq=freq)
return (filled / shifted) - 1
@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
class DataFrameGroupBy(GroupBy[DataFrame]):
_apply_allowlist = base.dataframe_apply_allowlist
_agg_examples_doc = dedent(
"""
Examples
--------
>>> df = pd.DataFrame(
... {
... "A": [1, 1, 2, 2],
... "B": [1, 2, 3, 4],
... "C": [0.362838, 0.227877, 1.267767, -0.562860],
... }
... )
>>> df
A B C
0 1 1 0.362838
1 1 2 0.227877
2 2 3 1.267767
3 2 4 -0.562860
The aggregation is for each column.
>>> df.groupby('A').agg('min')
B C
A
1 1 0.227877
2 3 -0.562860
Multiple aggregations
>>> df.groupby('A').agg(['min', 'max'])
B C
min max min max
A
1 1 2 0.227877 0.362838
2 3 4 -0.562860 1.267767
Select a column for aggregation
>>> df.groupby('A').B.agg(['min', 'max'])
min max
A
1 1 2
2 3 4
Different aggregations per column
>>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
B C
min max sum
A
1 1 2 0.590715
2 3 4 0.704907
To control the output names with different aggregations per column,
pandas supports "named aggregation"
>>> df.groupby("A").agg(
... b_min=pd.NamedAgg(column="B", aggfunc="min"),
... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
b_min c_sum
A
1 1 0.590715
2 3 0.704907
- The keywords are the *output* column names
- The values are tuples whose first element is the column to select
and the second element is the aggregation to apply to that column.
Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
``['column', 'aggfunc']`` to make it clearer what the arguments are.
As usual, the aggregation can be a callable or a string alias.
See :ref:`groupby.aggregate.named` for more."""
)
@doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
with group_selection_context(self):
data = self._selected_obj
result, index = self._aggregate_with_numba(
data, func, *args, engine_kwargs=engine_kwargs, **kwargs
)
return self.obj._constructor(result, index=index, columns=data.columns)
relabeling, func, columns, order = reconstruct_func(func, **kwargs)
func = maybe_mangle_lambdas(func)
result, how = aggregate(self, func, *args, **kwargs)
if how is None:
return result
if result is None:
# grouper specific aggregations
if self.grouper.nkeys > 1:
return self._python_agg_general(func, *args, **kwargs)
elif args or kwargs:
result = self._aggregate_frame(func, *args, **kwargs)
elif self.axis == 1:
# _aggregate_multiple_funcs does not allow self.axis == 1
result = self._aggregate_frame(func)
else:
# try to treat as if we are passing a list
try:
result = agg_list_like(self, [func], _axis=self.axis)
# select everything except for the last level, which is the one
# containing the name of the function(s), see GH 32040
result.columns = result.columns.rename(
[self._selected_obj.columns.name] * result.columns.nlevels
).droplevel(-1)
except ValueError as err:
if "no results" not in str(err):
# raised directly by _aggregate_multiple_funcs
raise
result = self._aggregate_frame(func)
except AttributeError:
# catch exception from line 969
# (Series does not have attribute "columns"), see GH 35246
result = self._aggregate_frame(func)
if relabeling:
# used reordered index of columns
result = result.iloc[:, order]
result.columns = columns
if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
result.index = np.arange(len(result))
return result._convert(datetime=True)
agg = aggregate
def _iterate_slices(self) -> Iterable[Series]:
obj = self._selected_obj
if self.axis == 1:
obj = obj.T
if isinstance(obj, Series) and obj.name not in self.exclusions:
# Occurs when doing DataFrameGroupBy(...)["X"]
yield obj
else:
for label, values in obj.items():
if label in self.exclusions:
continue
yield values
def _cython_agg_general(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
) -> DataFrame:
agg_mgr = self._cython_agg_blocks(
how, alt=alt, numeric_only=numeric_only, min_count=min_count
)
return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items)
def _cython_agg_blocks(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
) -> BlockManager:
data: BlockManager = self._get_data_to_aggregate()
if numeric_only:
data = data.get_numeric_data(copy=False)
def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike:
# see if we can cast the values to the desired dtype
# this may not be the original dtype
assert not isinstance(result, DataFrame)
dtype = maybe_cast_result_dtype(values.dtype, how)
result = maybe_downcast_numeric(result, dtype)
if isinstance(values, Categorical) and isinstance(result, np.ndarray):
# If the Categorical op didn't raise, it is dtype-preserving
result = type(values)._from_sequence(result.ravel(), dtype=values.dtype)
# Note this will have result.dtype == dtype from above
elif isinstance(result, np.ndarray) and result.ndim == 1:
# We went through a SeriesGroupByPath and need to reshape
# GH#32223 includes case with IntegerArray values
result = result.reshape(1, -1)
# test_groupby_duplicate_columns gets here with
# result.dtype == int64, values.dtype=object, how="min"
return result
def py_fallback(bvalues: ArrayLike) -> ArrayLike:
# if self.grouper.aggregate fails, we fall back to a pure-python
# solution
# We get here with a) EADtypes and b) object dtype
obj: FrameOrSeriesUnion
# call our grouper again with only this block
if isinstance(bvalues, ExtensionArray):
# TODO(EA2D): special case not needed with 2D EAs
obj = Series(bvalues)
else:
obj = DataFrame(bvalues.T)
if obj.shape[1] == 1:
# Avoid call to self.values that can occur in DataFrame
# reductions; see GH#28949
obj = obj.iloc[:, 0]
# Create SeriesGroupBy with observed=True so that it does
# not try to add missing categories if grouping over multiple
            # Categoricals. This will be done later by self._reindex_output().
# Doing it here creates an error. See GH#34951
sgb = get_groupby(obj, self.grouper, observed=True)
result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
assert isinstance(result, (Series, DataFrame)) # for mypy
# In the case of object dtype block, it may have been split
# in the operation. We un-split here.
result = result._consolidate()
assert isinstance(result, (Series, DataFrame)) # for mypy
mgr = result._mgr
assert isinstance(mgr, BlockManager)
# unwrap DataFrame to get array
if len(mgr.blocks) != 1:
# We've split an object block! Everything we've assumed
# about a single block input returning a single block output
# is a lie. See eg GH-39329
return mgr.as_array()
else:
result = mgr.blocks[0].values
return result
def blk_func(bvalues: ArrayLike) -> ArrayLike:
try:
result = self.grouper._cython_operation(
"aggregate", bvalues, how, axis=1, min_count=min_count
)
except NotImplementedError:
# generally if we have numeric_only=False
# and non-applicable functions
# try to python agg
if alt is None:
# we cannot perform the operation
# in an alternate way, exclude the block
assert how == "ohlc"
raise
result = py_fallback(bvalues)
return cast_agg_result(result, bvalues, how)
# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
# NotImplementedError -> "ohlc" with wrong dtype
new_mgr = data.apply(blk_func, ignore_failures=True)
if not len(new_mgr):
raise DataError("No numeric types to aggregate")
return new_mgr
def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
if self.grouper.nkeys != 1:
raise AssertionError("Number of keys must be 1")
axis = self.axis
obj = self._obj_with_exclusions
result: Dict[Label, Union[NDFrame, np.ndarray]] = {}
if axis != obj._info_axis_number:
for name, data in self:
fres = func(data, *args, **kwargs)
result[name] = fres
else:
for name in self.indices:
data = self.get_group(name, obj=obj)
fres = func(data, *args, **kwargs)
result[name] = fres
return self._wrap_frame_output(result, obj)
def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
# only for axis==0
obj = self._obj_with_exclusions
result: Dict[Union[int, str], NDFrame] = {}
cannot_agg = []
for item in obj:
data = obj[item]
colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
try:
result[item] = colg.aggregate(func, *args, **kwargs)
except ValueError as err:
if "Must produce aggregated value" in str(err):
# raised in _aggregate_named, handle at higher level
# see test_apply_with_mutated_index
raise
# otherwise we get here from an AttributeError in _make_wrapper
cannot_agg.append(item)
continue
result_columns = obj.columns
if cannot_agg:
result_columns = result_columns.drop(cannot_agg)
return self.obj._constructor(result, columns=result_columns)
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
return self.obj._constructor(index=keys)
# GH12824
first_not_none = next(com.not_none(*values), None)
if first_not_none is None:
# GH9684 - All values are None, return an empty frame.
return self.obj._constructor()
elif isinstance(first_not_none, DataFrame):
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
key_index = self.grouper.result_index if self.as_index else None
if isinstance(first_not_none, (np.ndarray, Index)):
# GH#1738: values is list of arrays of unequal lengths
# fall through to the outer else clause
# TODO: sure this is right? we used to do this
# after raising AttributeError above
return self.obj._constructor_sliced(
values, index=key_index, name=self._selection_name
)
elif not isinstance(first_not_none, Series):
# values are not series or array-like but scalars
# self._selection_name not passed through to Series as the
# result should not take the name of original selection
# of columns
if self.as_index:
return self.obj._constructor_sliced(values, index=key_index)
else:
result = DataFrame(values, index=key_index, columns=[self._selection])
self._insert_inaxis_grouper_inplace(result)
return result
else:
# values are Series
return self._wrap_applied_output_series(
keys, values, not_indexed_same, first_not_none, key_index
)
def _wrap_applied_output_series(
self,
keys,
values: List[Series],
not_indexed_same: bool,
first_not_none,
key_index,
) -> FrameOrSeriesUnion:
# this is to silence a DeprecationWarning
# TODO: Remove when default dtype of empty Series is object
kwargs = first_not_none._construct_axes_dict()
backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs)
values = [x if (x is not None) else backup for x in values]
all_indexed_same = all_indexes_same(x.index for x in values)
# GH3596
# provide a reduction (Frame -> Series) if groups are
# unique
if self.squeeze:
applied_index = self._selected_obj._get_axis(self.axis)
singular_series = len(values) == 1 and applied_index.nlevels == 1
# assign the name to this series
if singular_series:
values[0].name = keys[0]
# GH2893
# we have series in the values array, we want to
# produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a
# single values
return self._concat_objects(
keys, values, not_indexed_same=not_indexed_same
)
# still a series
# path added as of GH 5545
elif all_indexed_same:
from pandas.core.reshape.concat import concat
return concat(values)
if not all_indexed_same:
# GH 8467
return self._concat_objects(keys, values, not_indexed_same=True)
# Combine values
# vstack+constructor is faster than concat and handles MI-columns
stacked_values = np.vstack([np.asarray(v) for v in values])
if self.axis == 0:
index = key_index
columns = first_not_none.index.copy()
if columns.name is None:
# GH6124 - propagate name of Series when it's consistent
names = {v.name for v in values}
if len(names) == 1:
columns.name = list(names)[0]
else:
index = first_not_none.index
columns = key_index
stacked_values = stacked_values.T
result = self.obj._constructor(stacked_values, index=index, columns=columns)
# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
so = self._selected_obj
if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any():
result = result._convert(datetime=True)
else:
result = result._convert(datetime=True)
if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
return self._reindex_output(result)
def _transform_general(self, func, *args, **kwargs):
from pandas.core.reshape.concat import concat
applied = []
obj = self._obj_with_exclusions
gen = self.grouper.get_iterator(obj, axis=self.axis)
fast_path, slow_path = self._define_paths(func, *args, **kwargs)
for name, group in gen:
object.__setattr__(group, "name", name)
# Try slow path and fast path.
try:
path, res = self._choose_path(fast_path, slow_path, group)
except TypeError:
return self._transform_item_by_item(obj, fast_path)
except ValueError as err:
msg = "transform must return a scalar value for each group"
raise ValueError(msg) from err
if isinstance(res, Series):
# we need to broadcast across the
# other dimension; this will preserve dtypes
# GH14457
if not np.prod(group.shape):
continue
elif res.index.is_(obj.index):
r = concat([res] * len(group.columns), axis=1)
r.columns = group.columns
r.index = group.index
else:
r = self.obj._constructor(
np.concatenate([res.values] * len(group.index)).reshape(
group.shape
),
columns=group.columns,
index=group.index,
)
applied.append(r)
else:
applied.append(res)
concat_index = obj.columns if self.axis == 0 else obj.index
other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1
concatenated = concat(applied, axis=self.axis, verify_integrity=False)
concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
return self._set_result_index_ordered(concatenated)
@Substitution(klass="DataFrame")
@Appender(_transform_template)
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
with group_selection_context(self):
data = self._selected_obj
result = self._transform_with_numba(
data, func, *args, engine_kwargs=engine_kwargs, **kwargs
)
return self.obj._constructor(result, index=data.index, columns=data.columns)
# optimized transforms
func = self._get_cython_func(func) or func
if not isinstance(func, str):
return self._transform_general(func, *args, **kwargs)
elif func not in base.transform_kernel_allowlist:
msg = f"'{func}' is not a valid function name for transform(name)"
raise ValueError(msg)
elif func in base.cythonized_kernels or func in base.transformation_kernels:
# cythonized transformation or canned "reduction+broadcast"
return getattr(self, func)(*args, **kwargs)
# GH 30918
# Use _transform_fast only when we know func is an aggregation
if func in base.reduction_kernels:
# If func is a reduction, we need to broadcast the
# result to the whole group. Compute func result
# and deal with possible broadcasting below.
# Temporarily set observed for dealing with categoricals.
with com.temp_setattr(self, "observed", True):
result = getattr(self, func)(*args, **kwargs)
if isinstance(result, DataFrame) and result.columns.equals(
self._obj_with_exclusions.columns
):
return self._transform_fast(result)
return self._transform_general(func, *args, **kwargs)
def _transform_fast(self, result: DataFrame) -> DataFrame:
"""
Fast transform path for aggregations
"""
obj = self._obj_with_exclusions
# for each col, reshape to size of original frame by take operation
ids, _, ngroup = self.grouper.group_info
result = result.reindex(self.grouper.result_index, copy=False)
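        # As in the Series case, each column of the per-group result is broadcast
        # back to the original row positions via a take on the group ids.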
output = [
algorithms.take_1d(result.iloc[:, i].values, ids)
for i, _ in enumerate(result.columns)
]
return self.obj._constructor._from_arrays(
output, columns=result.columns, index=obj.index
)
def _define_paths(self, func, *args, **kwargs):
if isinstance(func, str):
fast_path = lambda group: getattr(group, func)(*args, **kwargs)
slow_path = lambda group: group.apply(
lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
)
else:
fast_path = lambda group: func(group, *args, **kwargs)
slow_path = lambda group: group.apply(
lambda x: func(x, *args, **kwargs), axis=self.axis
)
return fast_path, slow_path
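    # The fast path calls ``func`` once on each whole group frame; the slow path
    # applies it column-by-column (or row-by-row) via ``DataFrame.apply``, which
    # is slower but accepts more callables. ``_choose_path`` compares the two.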
def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
path = slow_path
res = slow_path(group)
# if we make it here, test if we can use the fast path
try:
res_fast = fast_path(group)
except AssertionError:
raise
except Exception:
# GH#29631 For user-defined function, we can't predict what may be
# raised; see test_transform.test_transform_fastpath_raises
return path, res
# verify fast path does not change columns (and names), otherwise
# its results cannot be joined with those of the slow path
if not isinstance(res_fast, DataFrame):
return path, res
if not res_fast.columns.equals(group.columns):
return path, res
if res_fast.equals(res):
path = fast_path
return path, res
def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
# iterate through columns
output = {}
inds = []
for i, col in enumerate(obj):
try:
output[col] = self[col].transform(wrapper)
except TypeError:
# e.g. trying to call nanmean with string values
pass
else:
inds.append(i)
if not output:
raise TypeError("Transform function invalid for data types")
columns = obj.columns
if len(output) < len(obj.columns):
columns = columns.take(inds)
return self.obj._constructor(output, index=obj.index, columns=columns)
def filter(self, func, dropna=True, *args, **kwargs):
"""
Return a copy of a DataFrame excluding filtered elements.
Elements from groups are filtered if they do not satisfy the
boolean criterion specified by func.
Parameters
----------
func : function
Function to apply to each subframe. Should return True or False.
        dropna : bool, default True
            Drop groups that do not pass the filter. If False, groups that
            evaluate False are filled with NaNs.
Returns
-------
filtered : DataFrame
Notes
-----
        Each subframe is endowed with the attribute 'name' in case you need to know
which group you are working on.
Examples
--------
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
... 'foo', 'bar'],
... 'B' : [1, 2, 3, 4, 5, 6],
... 'C' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')
>>> grouped.filter(lambda x: x['B'].mean() > 3.)
A B C
1 bar 2 5.0
3 bar 4 1.0
5 bar 6 9.0
"""
indices = []
obj = self._selected_obj
gen = self.grouper.get_iterator(obj, axis=self.axis)
for name, group in gen:
object.__setattr__(group, "name", name)
res = func(group, *args, **kwargs)
try:
res = res.squeeze()
except AttributeError: # allow e.g., scalars and frames to pass
pass
# interpret the result of the filter
if is_bool(res) or (is_scalar(res) and isna(res)):
if res and notna(res):
indices.append(self._get_index(name))
else:
# non scalars aren't allowed
raise TypeError(
f"filter function returned a {type(res).__name__}, "
"but expected a scalar bool"
)
return self._apply_filter(indices, dropna)
def __getitem__(self, key):
if self.axis == 1:
# GH 37725
raise ValueError("Cannot subset columns when using axis=1")
# per GH 23566
if isinstance(key, tuple) and len(key) > 1:
# if len == 1, then it becomes a SeriesGroupBy and this is actually
# valid syntax, so don't raise warning
warnings.warn(
"Indexing with multiple keys (implicitly converted to a tuple "
"of keys) will be deprecated, use a list instead.",
FutureWarning,
stacklevel=2,
)
return super().__getitem__(key)
def _gotitem(self, key, ndim: int, subset=None):
"""
        Sub-classes to define; return a sliced object.
Parameters
----------
key : string / list of selections
ndim : {1, 2}
requested ndim of result
subset : object, default None
subset to act on
"""
if ndim == 2:
if subset is None:
subset = self.obj
return DataFrameGroupBy(
subset,
self.grouper,
axis=self.axis,
level=self.level,
grouper=self.grouper,
exclusions=self.exclusions,
selection=key,
as_index=self.as_index,
sort=self.sort,
group_keys=self.group_keys,
squeeze=self.squeeze,
observed=self.observed,
mutated=self.mutated,
dropna=self.dropna,
)
elif ndim == 1:
if subset is None:
subset = self.obj[key]
return SeriesGroupBy(
subset,
level=self.level,
grouper=self.grouper,
selection=key,
sort=self.sort,
group_keys=self.group_keys,
squeeze=self.squeeze,
observed=self.observed,
dropna=self.dropna,
)
raise AssertionError("invalid ndim for _gotitem")
def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame:
result_index = self.grouper.levels[0]
if self.axis == 0:
return self.obj._constructor(
result, index=obj.columns, columns=result_index
).T
else:
return self.obj._constructor(result, index=obj.index, columns=result_index)
def _get_data_to_aggregate(self) -> BlockManager:
obj = self._obj_with_exclusions
if self.axis == 1:
return obj.T._mgr
else:
return obj._mgr
def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
# zip in reverse so we can always insert at loc 0
columns = result.columns
for name, lev, in_axis in zip(
reversed(self.grouper.names),
reversed(self.grouper.get_group_levels()),
reversed([grp.in_axis for grp in self.grouper.groupings]),
):
# GH #28549
# When using .apply(-), name will be in columns already
if in_axis and name not in columns:
result.insert(0, name, lev)
def _wrap_aggregated_output(
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
index: Optional[Index],
) -> DataFrame:
"""
Wraps the output of DataFrameGroupBy aggregations into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
Returns
-------
DataFrame
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index([key.label for key in output])
columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
result = self.obj._constructor(indexed_output)
result.columns = columns
if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
result = result._consolidate()
else:
result.index = self.grouper.result_index
if self.axis == 1:
result = result.T
return self._reindex_output(result)
def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
) -> DataFrame:
"""
Wraps the output of DataFrameGroupBy transformations into the expected result.
Parameters
----------
output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
Data to wrap.
Returns
-------
DataFrame
"""
indexed_output = {key.position: val for key, val in output.items()}
result = self.obj._constructor(indexed_output)
if self.axis == 1:
result = result.T
result.columns = self.obj.columns
else:
columns = Index(key.label for key in output)
columns.name = self.obj.columns.name
result.columns = columns
result.index = self.obj.index
return result
def _wrap_agged_blocks(self, blocks: Sequence["Block"], items: Index) -> DataFrame:
if not self.as_index:
index = np.arange(blocks[0].values.shape[-1])
mgr = BlockManager(blocks, axes=[items, index])
result = self.obj._constructor(mgr)
self._insert_inaxis_grouper_inplace(result)
result = result._consolidate()
else:
index = self.grouper.result_index
mgr = BlockManager(blocks, axes=[items, index])
result = self.obj._constructor(mgr)
if self.axis == 1:
result = result.T
return self._reindex_output(result)._convert(datetime=True)
def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
yield colname, SeriesGroupBy(
self._selected_obj.iloc[:, i],
selection=colname,
grouper=self.grouper,
exclusions=self.exclusions,
)
def _apply_to_column_groupbys(self, func) -> DataFrame:
from pandas.core.reshape.concat import concat
return concat(
(func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()),
keys=self._selected_obj.columns,
axis=1,
)
def count(self) -> DataFrame:
"""
Compute count of group, excluding missing values.
Returns
-------
DataFrame
Count of values within each group.
"""
data = self._get_data_to_aggregate()
ids, _, ngroups = self.grouper.group_info
mask = ids != -1
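        # For each block of columns, count the entries that are both in a real
        # group (ids != -1) and non-missing; lib.count_level_2d tallies them per
        # group in a single pass.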
def hfunc(bvalues: ArrayLike) -> ArrayLike:
# TODO(2DEA): reshape would not be necessary with 2D EAs
if bvalues.ndim == 1:
# EA
masked = mask & ~isna(bvalues).reshape(1, -1)
else:
masked = mask & ~isna(bvalues)
counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1)
return counted
new_mgr = data.apply(hfunc)
# If we are grouping on categoricals we want unobserved categories to
# return zero, rather than the default of NaN which the reindexing in
# _wrap_agged_blocks() returns. GH 35028
with com.temp_setattr(self, "observed", True):
result = self._wrap_agged_blocks(new_mgr.blocks, items=data.items)
return self._reindex_output(result, fill_value=0)
def nunique(self, dropna: bool = True) -> DataFrame:
"""
Return DataFrame with counts of unique elements in each position.
Parameters
----------
dropna : bool, default True
Don't include NaN in the counts.
Returns
-------
        nunique : DataFrame
Examples
--------
>>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
... 'ham', 'ham'],
... 'value1': [1, 5, 5, 2, 5, 5],
... 'value2': list('abbaxy')})
>>> df
id value1 value2
0 spam 1 a
1 egg 5 b
2 egg 5 b
3 spam 2 a
4 ham 5 x
5 ham 5 y
>>> df.groupby('id').nunique()
value1 value2
id
egg 1 1
ham 1 2
spam 2 1
Check for rows with the same id but conflicting values:
>>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
id value1 value2
0 spam 1 a
3 spam 2 a
4 ham 5 x
5 ham 5 y
"""
from pandas.core.reshape.concat import concat
# TODO: this is duplicative of how GroupBy naturally works
# Try to consolidate with normal wrapping functions
obj = self._obj_with_exclusions
axis_number = obj._get_axis_number(self.axis)
other_axis = int(not axis_number)
if axis_number == 0:
iter_func = obj.items
else:
iter_func = obj.iterrows
results = concat(
[
SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique(
dropna
)
for label, content in iter_func()
],
axis=1,
)
results = cast(DataFrame, results)
if axis_number == 1:
results = results.T
results._get_axis(other_axis).names = obj._get_axis(other_axis).names
if not self.as_index:
results.index = ibase.default_index(len(results))
self._insert_inaxis_grouper_inplace(results)
return results
@Appender(DataFrame.idxmax.__doc__)
def idxmax(self, axis=0, skipna: bool = True):
axis = DataFrame._get_axis_number(axis)
numeric_only = None if axis == 0 else False
def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
return self._python_apply_general(func, self._obj_with_exclusions)
@Appender(DataFrame.idxmin.__doc__)
def idxmin(self, axis=0, skipna: bool = True):
axis = DataFrame._get_axis_number(axis)
numeric_only = None if axis == 0 else False
def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
return self._python_apply_general(func, self._obj_with_exclusions)
boxplot = boxplot_frame_groupby