"""
|
|||
|
Define the SeriesGroupBy and DataFrameGroupBy
|
|||
|
classes that hold the groupby interfaces (and some implementations).
|
|||
|
|
|||
|
These are user facing as the result of the ``df.groupby(...)`` operations,
|
|||
|
which here returns a DataFrameGroupBy object.
|
|||
|
"""
|
|||
|
from __future__ import annotations
|
|||
|
|
|||
|
from collections import abc
|
|||
|
from functools import partial
|
|||
|
from textwrap import dedent
|
|||
|
from typing import (
|
|||
|
TYPE_CHECKING,
|
|||
|
Any,
|
|||
|
Callable,
|
|||
|
Hashable,
|
|||
|
Iterable,
|
|||
|
Literal,
|
|||
|
Mapping,
|
|||
|
NamedTuple,
|
|||
|
Sequence,
|
|||
|
TypeVar,
|
|||
|
Union,
|
|||
|
cast,
|
|||
|
)
|
|||
|
|
|||
|
import numpy as np
|
|||
|
|
|||
|
from pandas._libs import (
|
|||
|
Interval,
|
|||
|
lib,
|
|||
|
reduction as libreduction,
|
|||
|
)
|
|||
|
from pandas._typing import (
|
|||
|
ArrayLike,
|
|||
|
Axis,
|
|||
|
AxisInt,
|
|||
|
CorrelationMethod,
|
|||
|
FillnaOptions,
|
|||
|
IndexLabel,
|
|||
|
Manager,
|
|||
|
Manager2D,
|
|||
|
SingleManager,
|
|||
|
TakeIndexer,
|
|||
|
)
|
|||
|
from pandas.errors import SpecificationError
|
|||
|
from pandas.util._decorators import (
|
|||
|
Appender,
|
|||
|
Substitution,
|
|||
|
doc,
|
|||
|
)
|
|||
|
|
|||
|
from pandas.core.dtypes.common import (
|
|||
|
ensure_int64,
|
|||
|
is_bool,
|
|||
|
is_categorical_dtype,
|
|||
|
is_dict_like,
|
|||
|
is_integer_dtype,
|
|||
|
is_interval_dtype,
|
|||
|
is_numeric_dtype,
|
|||
|
is_scalar,
|
|||
|
)
|
|||
|
from pandas.core.dtypes.missing import (
|
|||
|
isna,
|
|||
|
notna,
|
|||
|
)
|
|||
|
|
|||
|
from pandas.core import algorithms
|
|||
|
from pandas.core.apply import (
|
|||
|
GroupByApply,
|
|||
|
maybe_mangle_lambdas,
|
|||
|
reconstruct_func,
|
|||
|
validate_func_kwargs,
|
|||
|
)
|
|||
|
import pandas.core.common as com
|
|||
|
from pandas.core.frame import DataFrame
|
|||
|
from pandas.core.groupby import base
|
|||
|
from pandas.core.groupby.groupby import (
|
|||
|
GroupBy,
|
|||
|
GroupByPlot,
|
|||
|
_agg_template,
|
|||
|
_apply_docs,
|
|||
|
_transform_template,
|
|||
|
)
|
|||
|
from pandas.core.indexes.api import (
|
|||
|
Index,
|
|||
|
MultiIndex,
|
|||
|
all_indexes_same,
|
|||
|
default_index,
|
|||
|
)
|
|||
|
from pandas.core.series import Series
|
|||
|
from pandas.core.util.numba_ import maybe_use_numba
|
|||
|
|
|||
|
from pandas.plotting import boxplot_frame_groupby
|
|||
|
|
|||
|
if TYPE_CHECKING:
|
|||
|
from pandas import Categorical
|
|||
|
from pandas.core.generic import NDFrame
|
|||
|
|
|||
|
# TODO(typing) the return value on this callable should be any *scalar*.
|
|||
|
AggScalar = Union[str, Callable[..., Any]]
|
|||
|
# TODO: validate types on ScalarResult and move to _typing
|
|||
|
# Blocked from using by https://github.com/python/mypy/issues/1484
|
|||
|
# See note at _mangle_lambda_list
|
|||
|
ScalarResult = TypeVar("ScalarResult")
|
|||
|
|
|||
|
|
|||
|
class NamedAgg(NamedTuple):
    """
    Helper for column specific aggregation with control over output column names.

    Subclass of typing.NamedTuple.

    Parameters
    ----------
    column : Hashable
        Column label in the DataFrame to apply aggfunc.
    aggfunc : function or str
        Function to apply to the provided column. If string, the name of a built-in
        pandas function.

    Examples
    --------
    >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
    >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
    >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean)
    >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
         result_a  result_1
    key
    1          -1      10.5
    2           1      12.0
    """

    column: Hashable
    aggfunc: AggScalar


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        return self.obj._constructor(mgr, name=self.obj.name)

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._selected_obj
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single

    def _iterate_slices(self) -> Iterable[Series]:
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups without any groups to
                # iterate over, we have no output on which to do dtype
                # inference. We default to using the existing dtype.
                # xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self.grouper.result_index,
                    dtype=obj.dtype,
                )

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is bc the func does
                # a dictionary lookup on group.name, but group name is not
                # pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self.grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
        result = self.grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)
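            # Illustrative note (not executed): for arg=["min", "max"] the
            # strings have no callable name, so ``com.get_callable_name(f) or f``
            # keeps them as-is and ``arg`` becomes (name, func) pairs like
            # ("min", "min"), ("max", "max") that feed the loop below.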

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index, need to adjust index after
            # if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series(
    ...     [390.0, 350.0, 30.0, 20.0],
    ...     index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...     name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

@Substitution(klass="Series", example=__examples_series_doc)
|
|||
|
@Appender(_transform_template)
|
|||
|
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
|
|||
|
return self._transform(
|
|||
|
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
|
|||
|
)
|
|||
|
|
|||
|
    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b
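        # Illustrative note (not executed): if ``wrapper`` returns np.nan for a
        # group, ``notna(b)`` is False, so that group fails the criterion and is
        # treated the same as an explicit False result.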

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
        """
        ids, _, _ = self.grouper.group_info

        val = self.obj._values

        codes, _ = algorithms.factorize(val, sort=False)
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]
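        # Illustrative walk-through (not executed): with ids = [0, 0, 1, 1] and
        # sorted codes = [0, 0, 0, 1], the group boundaries below are idx = [0, 2]
        # and the "new value" flags become inc = [1, 0, 1, 1] after inc[idx] = 1,
        # so np.add.reduceat(inc, idx) yields [1, 2]: one unique value in the
        # first group and two in the second.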

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]
        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            if len(ids) > 0:
                # GH#21334s
                res[ids[idx]] = out

        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        index_names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {{'bfill', 'ffill', None}}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
        >>> ser
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Propagate non-null values forward or backward within each group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    3.0
        dtype: float64

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
        0    2.0
        1    2.0
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Only replace the first NaN element within a group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    NaN
        dtype: float64
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis = 0,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(Series.plot.__doc__)
    def plot(self):
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.idxmin.__doc__)
    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
        return result

    @doc(Series.idxmax.__doc__)
    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
        return result

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    @doc(Series.is_monotonic_increasing.__doc__)
    def is_monotonic_increasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    @doc(Series.is_monotonic_decreasing.__doc__)
    def is_monotonic_decreasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    @doc(Series.unique.__doc__)
    def unique(self) -> Series:
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B         C
    A
    1  5  2.590715
    2  9  2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
          B
    A
    1   1.0
    2   3.0
    """
    )

    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    # object vs float64 in test_cython_agg_empty_buckets
                    # float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)

                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups different path gets different
            # result dtype in empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            result = self.grouper.agg_series(obj, f)
            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        res = self._indexed_output_to_ndframe(output)
        return self._wrap_aggregated_output(res)

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    # Note: if we tried to just iterate over _obj_with_exclusions,
                    # we would break test_wrap_agg_out by yielding a column
                    # that is skipped here but not dropped from obj_with_exclusions
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self.grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self.grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            # fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            # after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor(res_mgr)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

@Substitution(klass="DataFrame", example=__examples_dataframe_doc)
|
|||
|
@Appender(_transform_template)
|
|||
|
def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
|
|||
|
return self._transform(
|
|||
|
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
|
|||
|
)
|
|||
|
|
|||
|
    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
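
    # Illustrative note: ``_define_paths`` builds two equivalent callables. The
    # fast path calls ``func`` on the whole group DataFrame at once; the slow
    # path routes it through ``DataFrame.apply``. ``_choose_path`` below
    # evaluates both on the first group and keeps the fast path only when the
    # results match.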

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            # a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            # raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need to know
        which group you are working on.

Functions that mutate the passed object can produce unexpected
|
|||
|
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
|
|||
|
for more details.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
|
|||
|
... 'foo', 'bar'],
|
|||
|
... 'B' : [1, 2, 3, 4, 5, 6],
|
|||
|
... 'C' : [2.0, 5., 8., 1., 2., 9.]})
|
|||
|
>>> grouped = df.groupby('A')
|
|||
|
>>> grouped.filter(lambda x: x['B'].mean() > 3.)
|
|||
|
A B C
|
|||
|
1 bar 2 5.0
|
|||
|
3 bar 4 1.0
|
|||
|
5 bar 6 9.0
|
|||
|
"""
|
|||
|
indices = []
|
|||
|
|
|||
|
obj = self._selected_obj
|
|||
|
gen = self.grouper.get_iterator(obj, axis=self.axis)
|
|||
|
|
|||
|
for name, group in gen:
|
|||
|
object.__setattr__(group, "name", name)
|
|||
|
|
|||
|
res = func(group, *args, **kwargs)
|
|||
|
|
|||
|
try:
|
|||
|
res = res.squeeze()
|
|||
|
except AttributeError: # allow e.g., scalars and frames to pass
|
|||
|
pass
|
|||
|
|
|||
|
# interpret the result of the filter
|
|||
|
if is_bool(res) or (is_scalar(res) and isna(res)):
|
|||
|
if notna(res) and res:
|
|||
|
indices.append(self._get_index(name))
|
|||
|
else:
|
|||
|
# non scalars aren't allowed
|
|||
|
raise TypeError(
|
|||
|
f"filter function returned a {type(res).__name__}, "
|
|||
|
"but expected a scalar bool"
|
|||
|
)
|
|||
|
|
|||
|
return self._apply_filter(indices, dropna)
|
|||
|
|
|||
|
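    # Column selection on a DataFrameGroupBy, e.g. ``gb["B"]`` yields a
    # SeriesGroupBy and ``gb[["B", "C"]]`` a DataFrameGroupBy; multi-element
    # tuples are rejected in favor of lists.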
    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
        return super().__getitem__(key)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        if ndim == 2:
            if subset is None:
                subset = self.obj
            return DataFrameGroupBy(
                subset,
                self.grouper,
                axis=self.axis,
                level=self.level,
                grouper=self.grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )
        elif ndim == 1:
            if subset is None:
                subset = self.obj[key]
            return SeriesGroupBy(
                subset,
                level=self.level,
                grouper=self.grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )

        raise AssertionError("invalid ndim for _gotitem")

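    # Returns the internal manager holding the data to aggregate, transposed
    # when grouping along ``axis=1`` and optionally restricted to numeric
    # columns.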
    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> Manager2D:
        obj = self._obj_with_exclusions
        if self.axis == 1:
            mgr = obj.T._mgr
        else:
            mgr = obj._mgr

        if numeric_only:
            mgr = mgr.get_numeric_data(copy=False)
        return mgr

    def _indexed_output_to_ndframe(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> DataFrame:
        """
        Wrap the dict result of a GroupBy aggregation into a DataFrame.
        """
        indexed_output = {key.position: val for key, val in output.items()}
        columns = Index([key.label for key in output])
        columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)

        result = self.obj._constructor(indexed_output)
        result.columns = columns
        return result

    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        return self.obj._constructor(mgr)

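    # Yields ``(column_name, SeriesGroupBy)`` pairs so that column-wise
    # operations can reuse the existing grouper, exclusions and observed flag.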
    def _iterate_column_groupbys(self, obj: DataFrame):
        for i, colname in enumerate(obj.columns):
            yield colname, SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self.grouper,
                exclusions=self.exclusions,
                observed=self.observed,
            )

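    # Applies ``func`` to each column's SeriesGroupBy and concatenates the
    # results along axis 1; an empty column set is short-circuited because
    # ``concat`` would raise on an empty list.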
    def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame:
        from pandas.core.reshape.concat import concat

        columns = obj.columns
        results = [
            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
        ]

        if not len(results):
            # concat would raise
            return DataFrame([], columns=columns, index=self.grouper.result_index)
        else:
            return concat(results, keys=columns, axis=1)

    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique: DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """

        if self.axis != 0:
            # see test_groupby_crash_on_nunique
            return self._python_apply_general(
                lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
            )

        obj = self._obj_with_exclusions
        results = self._apply_to_column_groupbys(
            lambda sgb: sgb.nunique(dropna), obj=obj
        )

        if not self.as_index:
            results.index = default_index(len(results))
            results = self._insert_inaxis_grouper(results)

        return results

    def idxmax(
        self,
        axis: Axis | None = None,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {{0 or 'index', 1 or 'columns'}}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
            If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)

        func.__name__ = "idxmax"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

    def idxmin(
        self,
        axis: Axis | None = None,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {{0 or 'index', 1 or 'columns'}}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
            If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)

        func.__name__ = "idxmin"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender education country
        0    male       low      US
        1    male    medium      FR
        2  female      high      US
        3    male       low      FR
        4  female      high      FR
        5    male       low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        Name: proportion, dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
        return self._value_counts(subset, normalize, sort, ascending, dropna)

    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit=None,
        downcast=None,
    ) -> DataFrame | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.DataFrame.fillna` as this
            will produce the same result and be more performant.
        method : {{'bfill', 'ffill', None}}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
            ``axis`` argument is ``0``, using ``axis=1`` here will produce
            the same results as :meth:`.DataFrame.fillna`. When the
            :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
            or ``axis=1`` here will produce the same results.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame
            Object with missing values filled.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "key": [0, 0, 1, 1, 1],
        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
        ...         "B": [2, 3, np.nan, np.nan, np.nan],
        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
        ...     }
        ... )
        >>> df
           key    A    B    C
        0    0  NaN  2.0  NaN
        1    0  2.0  3.0  NaN
        2    1  NaN  NaN  2.0
        3    1  3.0  NaN  NaN
        4    1  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along columns.

        >>> df.groupby("key").fillna(method="ffill")
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  2.0

        >>> df.groupby("key").fillna(method="bfill")
             A    B    C
        0  2.0  2.0  NaN
        1  2.0  3.0  NaN
        2  3.0  NaN  2.0
        3  3.0  NaN  NaN
        4  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along rows.

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
           key    A    B    C
        0  0.0  0.0  2.0  2.0
        1  0.0  2.0  3.0  3.0
        2  1.0  1.0  NaN  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  1.0  NaN  NaN

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
           key    A    B    C
        0  0.0  NaN  2.0  NaN
        1  0.0  2.0  3.0  NaN
        2  1.0  NaN  2.0  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  NaN  NaN  NaN

        Only replace the first NaN element within a group along rows.

        >>> df.groupby("key").fillna(method="ffill", limit=1)
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  NaN
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | None = 0,
        **kwargs,
    ) -> DataFrame:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.DataFrameGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        DataFrame
            A DataFrame containing the elements taken from each group.

        See Also
        --------
        DataFrame.take : Take elements from a DataFrame along an axis.
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df.groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 (default).

        Note how the indices selected in the result do not correspond to
        our input indices 0 and 1. That's because we are selecting the 0th
        and 1st rows, not rows whose indices equal 0 and 1.

        >>> gb.take([0, 1])
               name   class  max_speed
        1 4  falcon    bird      389.0
          3  parrot    bird       24.0
        2 2    lion  mammal       80.5
          1  monkey  mammal        NaN

        The order of the specified indices influences the order in the result.
        Here, the order is swapped from the previous example.

        >>> gb.take([1, 0])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 1  monkey  mammal        NaN
          2    lion  mammal       80.5

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 0  rabbit  mammal       15.0
          1  monkey  mammal        NaN
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> DataFrame:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.

            Specifying ``axis=None`` will apply the aggregation across both axes.

            .. versionadded:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
        ...            'lion', 'monkey', 'rabbit'],
        ...           ['bird', 'bird', 'bird', 'bird',
        ...            'mammal', 'mammal', 'mammal']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
        >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
        ...                                  80.5, 21.5, 15.0]},
        ...                   index=index)
        >>> df
                        max_speed
        name     class
        falcon   bird       389.0
        parrot   bird        24.0
        cockatoo bird        70.0
        kiwi     bird          NaN
        lion     mammal       80.5
        monkey   mammal       21.5
        rabbit   mammal       15.0
        >>> gb = df.groupby(["class"])
        >>> gb.skew()
                max_speed
        class
        bird     1.628296
        mammal   1.669046
        >>> gb.skew(skipna=False)
                max_speed
        class
        bird          NaN
        mammal   1.669046
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

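    # The accessors below reuse the corresponding DataFrame docstrings via
    # ``@doc``; most of them simply dispatch group by group through
    # ``_op_via_apply``.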
    @property
    @doc(DataFrame.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

    @doc(DataFrame.corr.__doc__)
    def corr(
        self,
        method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.cov.__doc__)
    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.hist.__doc__)
    def hist(
        self,
        column: IndexLabel = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.dtypes.__doc__)
    def dtypes(self) -> Series:
        # error: Incompatible return value type (got "DataFrame", expected "Series")
        return self.apply(lambda df: df.dtypes)  # type: ignore[return-value]

    @doc(DataFrame.corrwith.__doc__)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corrwith",
            other=other,
            axis=axis,
            drop=drop,
            method=method,
            numeric_only=numeric_only,
        )
        return result


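# Module-level helper for the apply-based transform path: broadcast a per-group
# Series result across the group's rows/columns, or realign a DataFrame result
# to the group's index.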
def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
            res_frame = obj._constructor(
                np.tile(res.values, (len(group.index), 1)),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    elif isinstance(res, DataFrame) and not res.index.is_(group.index):
        return res._align_frame(group)[0]
    else:
        return res