3087 lines
96 KiB
Python
3087 lines
96 KiB
Python
![]() |
"""
|
|||
|
Provide the groupby split-apply-combine paradigm. Define the GroupBy
|
|||
|
class providing the base-class of operations.
|
|||
|
|
|||
|
The SeriesGroupBy and DataFrameGroupBy sub-class
|
|||
|
(defined in pandas.core.groupby.generic)
|
|||
|
expose these user-facing objects to provide specific functionality.
|
|||
|
"""
|
|||
|
|
|||
|
from contextlib import contextmanager
|
|||
|
import datetime
|
|||
|
from functools import partial, wraps
|
|||
|
import inspect
|
|||
|
from textwrap import dedent
|
|||
|
import types
|
|||
|
from typing import (
|
|||
|
Callable,
|
|||
|
Dict,
|
|||
|
FrozenSet,
|
|||
|
Generic,
|
|||
|
Hashable,
|
|||
|
Iterable,
|
|||
|
Iterator,
|
|||
|
List,
|
|||
|
Mapping,
|
|||
|
Optional,
|
|||
|
Sequence,
|
|||
|
Set,
|
|||
|
Tuple,
|
|||
|
Type,
|
|||
|
TypeVar,
|
|||
|
Union,
|
|||
|
)
|
|||
|
|
|||
|
import numpy as np
|
|||
|
|
|||
|
from pandas._config.config import option_context
|
|||
|
|
|||
|
from pandas._libs import Timestamp, lib
|
|||
|
import pandas._libs.groupby as libgroupby
|
|||
|
from pandas._typing import (
|
|||
|
F,
|
|||
|
FrameOrSeries,
|
|||
|
FrameOrSeriesUnion,
|
|||
|
IndexLabel,
|
|||
|
Label,
|
|||
|
Scalar,
|
|||
|
final,
|
|||
|
)
|
|||
|
from pandas.compat.numpy import function as nv
|
|||
|
from pandas.errors import AbstractMethodError
|
|||
|
from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
|
|||
|
|
|||
|
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
|
|||
|
from pandas.core.dtypes.common import (
|
|||
|
ensure_float,
|
|||
|
is_bool_dtype,
|
|||
|
is_datetime64_dtype,
|
|||
|
is_extension_array_dtype,
|
|||
|
is_integer_dtype,
|
|||
|
is_numeric_dtype,
|
|||
|
is_object_dtype,
|
|||
|
is_scalar,
|
|||
|
is_timedelta64_dtype,
|
|||
|
)
|
|||
|
from pandas.core.dtypes.missing import isna, notna
|
|||
|
|
|||
|
from pandas.core import nanops
|
|||
|
import pandas.core.algorithms as algorithms
|
|||
|
from pandas.core.arrays import Categorical, DatetimeArray
|
|||
|
from pandas.core.base import DataError, PandasObject, SelectionMixin
|
|||
|
import pandas.core.common as com
|
|||
|
from pandas.core.frame import DataFrame
|
|||
|
from pandas.core.generic import NDFrame
|
|||
|
from pandas.core.groupby import base, numba_, ops
|
|||
|
from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex
|
|||
|
from pandas.core.series import Series
|
|||
|
from pandas.core.sorting import get_group_index_sorter
|
|||
|
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
|
|||
|
|
|||
|
_common_see_also = """
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Series.%(name)s : Apply a function %(name)s to a Series.
|
|||
|
DataFrame.%(name)s : Apply a function %(name)s
|
|||
|
to each row or column of a DataFrame.
|
|||
|
"""
|
|||
|
|
|||
|
# Docstring pieces for GroupBy.apply.  "template" is `.format()`-ed with
# input="series"/"dataframe" and one of the two examples entries below
# (see the @Appender decorator on `apply`).
_apply_docs = {
    "template": """
    Apply function `func` group-wise and combine the results together.

    The function passed to `apply` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. `apply` will
    then take care of combining the results back together into a single
    dataframe or series. `apply` is therefore a highly flexible
    grouping method.

    While `apply` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like `agg` or `transform`. Pandas offers a wide range of method that will
    be much faster than using `apply` for their specific purposes, so try to
    use them before reaching for `apply`.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to `func`.

    Returns
    -------
    applied : Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
                           'B': [1,2,3],
                           'C': [4,6, 5]})
    >>> g = df.groupby('A')

    Notice that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series.  `apply` combines the result for
    each group together into a new DataFrame:

    >>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
       B  C
    A
    a  1  2
    b  0  0

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64
    """,
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g = s.groupby(s.index)

    From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series.  `apply` combines the result for
    each group together into a new Series:

    >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
    0    0.0
    1    0.5
    2    4.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    Notes
    -----
    In the current implementation `apply` calls `func` twice on the
    first group to decide whether it can take a fast or slow code
    path. This can lead to unexpected behavior if `func` has
    side-effects, as they will take effect twice for the first
    group.

    Examples
    --------
    {examples}
    """,
}
|
|||
|
|
|||
|
_groupby_agg_method_template = """
|
|||
|
Compute {fname} of group values.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
numeric_only : bool, default {no}
|
|||
|
Include only float, int, boolean columns. If None, will attempt to use
|
|||
|
everything, then use only numeric data.
|
|||
|
min_count : int, default {mc}
|
|||
|
The required number of valid values to perform the operation. If fewer
|
|||
|
than ``min_count`` non-NA values are present the result will be NA.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Computed {fname} of values within each group.
|
|||
|
"""
|
|||
|
|
|||
|
_pipe_template = """
|
|||
|
Apply a function `func` with arguments to this %(klass)s object and return
|
|||
|
the function's result.
|
|||
|
|
|||
|
Use `.pipe` when you want to improve readability by chaining together
|
|||
|
functions that expect Series, DataFrames, GroupBy or Resampler objects.
|
|||
|
Instead of writing
|
|||
|
|
|||
|
>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
|
|||
|
|
|||
|
You can write
|
|||
|
|
|||
|
>>> (df.groupby('group')
|
|||
|
... .pipe(f)
|
|||
|
... .pipe(g, arg1=a)
|
|||
|
... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP
|
|||
|
|
|||
|
which is much more readable.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
func : callable or tuple of (callable, str)
|
|||
|
Function to apply to this %(klass)s object or, alternatively,
|
|||
|
a `(callable, data_keyword)` tuple where `data_keyword` is a
|
|||
|
string indicating the keyword of `callable` that expects the
|
|||
|
%(klass)s object.
|
|||
|
args : iterable, optional
|
|||
|
Positional arguments passed into `func`.
|
|||
|
kwargs : dict, optional
|
|||
|
A dictionary of keyword arguments passed into `func`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
object : the return type of `func`.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Series.pipe : Apply a function with arguments to a series.
|
|||
|
DataFrame.pipe: Apply a function with arguments to a dataframe.
|
|||
|
apply : Apply function to each group instead of to the
|
|||
|
full %(klass)s object.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
See more `here
|
|||
|
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
%(examples)s
|
|||
|
"""
|
|||
|
|
|||
|
_transform_template = """
|
|||
|
Call function producing a like-indexed %(klass)s on each group and
|
|||
|
return a %(klass)s having the same indexes as the original object
|
|||
|
filled with the transformed values
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
f : function
|
|||
|
Function to apply to each group.
|
|||
|
|
|||
|
Can also accept a Numba JIT function with
|
|||
|
``engine='numba'`` specified.
|
|||
|
|
|||
|
If the ``'numba'`` engine is chosen, the function must be
|
|||
|
a user defined function with ``values`` and ``index`` as the
|
|||
|
first and second arguments respectively in the function signature.
|
|||
|
Each group's index will be passed to the user defined function
|
|||
|
and optionally available for use.
|
|||
|
|
|||
|
.. versionchanged:: 1.1.0
|
|||
|
*args
|
|||
|
Positional arguments to pass to func.
|
|||
|
engine : str, default None
|
|||
|
* ``'cython'`` : Runs the function through C-extensions from cython.
|
|||
|
* ``'numba'`` : Runs the function through JIT compiled code from numba.
|
|||
|
* ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
|
|||
|
|
|||
|
.. versionadded:: 1.1.0
|
|||
|
engine_kwargs : dict, default None
|
|||
|
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
|
|||
|
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
|
|||
|
and ``parallel`` dictionary keys. The values must either be ``True`` or
|
|||
|
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
|
|||
|
``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
|
|||
|
applied to the function
|
|||
|
|
|||
|
.. versionadded:: 1.1.0
|
|||
|
**kwargs
|
|||
|
Keyword arguments to be passed into func.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
%(klass)s
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
%(klass)s.groupby.apply : Apply function func group-wise
|
|||
|
and combine the results together.
|
|||
|
%(klass)s.groupby.aggregate : Aggregate using one or more
|
|||
|
operations over the specified axis.
|
|||
|
%(klass)s.transform : Transforms the Series on each group
|
|||
|
based on the given function.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
Each group is endowed the attribute 'name' in case you need to know
|
|||
|
which group you are working on.
|
|||
|
|
|||
|
The current implementation imposes three requirements on f:
|
|||
|
|
|||
|
* f must return a value that either has the same shape as the input
|
|||
|
subframe or can be broadcast to the shape of the input subframe.
|
|||
|
For example, if `f` returns a scalar it will be broadcast to have the
|
|||
|
same shape as the input subframe.
|
|||
|
* if this is a DataFrame, f must support application column-by-column
|
|||
|
in the subframe. If f also supports application to the entire subframe,
|
|||
|
then a fast path is used starting from the second chunk.
|
|||
|
* f must not mutate groups. Mutation is not supported and may
|
|||
|
produce unexpected results.
|
|||
|
|
|||
|
When using ``engine='numba'``, there will be no "fall back" behavior internally.
|
|||
|
The group data and group index will be passed as numpy arrays to the JITed
|
|||
|
user defined function, and no alternative execution attempts will be tried.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
|
|||
|
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
|
|||
|
... 'foo', 'bar'],
|
|||
|
... 'B' : ['one', 'one', 'two', 'three',
|
|||
|
... 'two', 'two'],
|
|||
|
... 'C' : [1, 5, 5, 2, 5, 5],
|
|||
|
... 'D' : [2.0, 5., 8., 1., 2., 9.]})
|
|||
|
>>> grouped = df.groupby('A')
|
|||
|
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
|
|||
|
C D
|
|||
|
0 -1.154701 -0.577350
|
|||
|
1 0.577350 0.000000
|
|||
|
2 0.577350 1.154701
|
|||
|
3 -1.154701 -1.000000
|
|||
|
4 0.577350 -0.577350
|
|||
|
5 0.577350 1.000000
|
|||
|
|
|||
|
Broadcast result of the transformation
|
|||
|
|
|||
|
>>> grouped.transform(lambda x: x.max() - x.min())
|
|||
|
C D
|
|||
|
0 4 6.0
|
|||
|
1 3 8.0
|
|||
|
2 4 6.0
|
|||
|
3 3 8.0
|
|||
|
4 4 6.0
|
|||
|
5 3 8.0
|
|||
|
"""
|
|||
|
|
|||
|
_agg_template = """
|
|||
|
Aggregate using one or more operations over the specified axis.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
func : function, str, list or dict
|
|||
|
Function to use for aggregating the data. If a function, must either
|
|||
|
work when passed a {klass} or when passed to {klass}.apply.
|
|||
|
|
|||
|
Accepted combinations are:
|
|||
|
|
|||
|
- function
|
|||
|
- string function name
|
|||
|
- list of functions and/or function names, e.g. ``[np.sum, 'mean']``
|
|||
|
- dict of axis labels -> functions, function names or list of such.
|
|||
|
|
|||
|
Can also accept a Numba JIT function with
|
|||
|
``engine='numba'`` specified. Only passing a single function is supported
|
|||
|
with this engine.
|
|||
|
|
|||
|
If the ``'numba'`` engine is chosen, the function must be
|
|||
|
a user defined function with ``values`` and ``index`` as the
|
|||
|
first and second arguments respectively in the function signature.
|
|||
|
Each group's index will be passed to the user defined function
|
|||
|
and optionally available for use.
|
|||
|
|
|||
|
.. versionchanged:: 1.1.0
|
|||
|
*args
|
|||
|
Positional arguments to pass to func.
|
|||
|
engine : str, default None
|
|||
|
* ``'cython'`` : Runs the function through C-extensions from cython.
|
|||
|
* ``'numba'`` : Runs the function through JIT compiled code from numba.
|
|||
|
* ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
|
|||
|
|
|||
|
.. versionadded:: 1.1.0
|
|||
|
engine_kwargs : dict, default None
|
|||
|
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
|
|||
|
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
|
|||
|
and ``parallel`` dictionary keys. The values must either be ``True`` or
|
|||
|
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
|
|||
|
``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
|
|||
|
applied to the function
|
|||
|
|
|||
|
.. versionadded:: 1.1.0
|
|||
|
**kwargs
|
|||
|
Keyword arguments to be passed into func.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
{klass}
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
{klass}.groupby.apply : Apply function func group-wise
|
|||
|
and combine the results together.
|
|||
|
{klass}.groupby.transform : Aggregate using one or more
|
|||
|
operations over the specified axis.
|
|||
|
{klass}.aggregate : Transforms the Series on each group
|
|||
|
based on the given function.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
When using ``engine='numba'``, there will be no "fall back" behavior internally.
|
|||
|
The group data and group index will be passed as numpy arrays to the JITed
|
|||
|
user defined function, and no alternative execution attempts will be tried.
|
|||
|
{examples}
|
|||
|
"""
|
|||
|
|
|||
|
|
|||
|
@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.

    Plot calls and plot-method lookups are forwarded to each group via
    ``groupby.apply``, so plotting happens group-wise.
    """

    def __init__(self, groupby):
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        # Apply ``.plot(*args, **kwargs)`` to every group.
        def do_plot(grp):
            return grp.plot(*args, **kwargs)

        # The name "plot" lets downstream code recognize plotting calls.
        do_plot.__name__ = "plot"
        return self._groupby.apply(do_plot)

    def __getattr__(self, name: str):
        # Delegate e.g. ``.plot.line(...)`` to each group's plot accessor.
        def attr(*args, **kwargs):
            def delegate(grp):
                return getattr(grp.plot, name)(*args, **kwargs)

            return self._groupby.apply(delegate)

        return attr
|
|||
|
|
|||
|
|
|||
|
@contextmanager
def group_selection_context(groupby: "BaseGroupBy") -> Iterator["BaseGroupBy"]:
    """
    Context manager that enables a groupby object's group selection on
    entry and always clears it again on exit, even if an error occurs.
    """
    groupby._set_group_selection()
    try:
        yield groupby
    finally:
        groupby._reset_group_selection()
|
|||
|
|
|||
|
|
|||
|
# Acceptable types for the groupby ``by``/``keys`` argument: a single label,
# a list of labels, a callable (or list of callables) mapping labels, or a
# mapping from label to group name.
_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]
|
|||
|
|
|||
|
|
|||
|
class BaseGroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]):
    # Labels selected by _set_group_selection (the non-grouper columns);
    # None means no group-based selection is currently active.
    _group_selection: Optional[IndexLabel] = None
    # Method names that may be dispatched to the underlying object via
    # _make_wrapper; empty here, subclasses provide a non-empty set.
    _apply_allowlist: FrozenSet[str] = frozenset()
    # Construction arguments added to the inherited hidden-attribute set
    # (NOTE(review): _hidden_attrs semantics are defined on PandasObject —
    # presumably excludes these from tab completion; confirm there).
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "mutated",
        "obj",
        "observed",
        "sort",
        "squeeze",
    }
|
|||
|
|
|||
|
    def __init__(
        self,
        obj: FrameOrSeries,
        keys: Optional[_KeysArgType] = None,
        axis: int = 0,
        level: Optional[IndexLabel] = None,
        grouper: Optional["ops.BaseGrouper"] = None,
        exclusions: Optional[Set[Label]] = None,
        selection: Optional[IndexLabel] = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        squeeze: bool = False,
        observed: bool = False,
        mutated: bool = False,
        dropna: bool = True,
    ):
        """
        Initialize a groupby object over ``obj``.

        If ``grouper`` is not supplied, it is constructed from ``keys`` /
        ``level`` via ``get_grouper``; the remaining keyword arguments are
        stored as-is on the instance for use by the groupby operations.
        """

        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        # as_index=False (SQL-style grouped output) is only supported for
        # DataFrame input grouped along axis 0.
        if not as_index:
            if not isinstance(obj, DataFrame):
                raise TypeError("as_index=False only valid with DataFrame")
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = mutated
        self.dropna = dropna

        if grouper is None:
            # Local import to avoid a circular dependency with the grouper
            # module.
            from pandas.core.groupby.grouper import get_grouper

            # get_grouper may also replace obj (e.g. after resolving keys)
            # and compute the set of columns to exclude from aggregation.
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                mutated=self.mutated,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = exclusions or set()
|
|||
|
|
|||
|
@final
|
|||
|
def __len__(self) -> int:
|
|||
|
return len(self.groups)
|
|||
|
|
|||
|
@final
|
|||
|
def __repr__(self) -> str:
|
|||
|
# TODO: Better repr for GroupBy object
|
|||
|
return object.__repr__(self)
|
|||
|
|
|||
|
def _assure_grouper(self) -> None:
|
|||
|
"""
|
|||
|
We create the grouper on instantiation sub-classes may have a
|
|||
|
different policy.
|
|||
|
"""
|
|||
|
pass
|
|||
|
|
|||
|
@final
|
|||
|
@property
|
|||
|
def groups(self) -> Dict[Hashable, np.ndarray]:
|
|||
|
"""
|
|||
|
Dict {group name -> group labels}.
|
|||
|
"""
|
|||
|
self._assure_grouper()
|
|||
|
return self.grouper.groups
|
|||
|
|
|||
|
@final
|
|||
|
@property
|
|||
|
def ngroups(self) -> int:
|
|||
|
self._assure_grouper()
|
|||
|
return self.grouper.ngroups
|
|||
|
|
|||
|
@final
|
|||
|
@property
|
|||
|
def indices(self):
|
|||
|
"""
|
|||
|
Dict {group name -> group indices}.
|
|||
|
"""
|
|||
|
self._assure_grouper()
|
|||
|
return self.grouper.indices
|
|||
|
|
|||
|
@final
|
|||
|
def _get_indices(self, names):
|
|||
|
"""
|
|||
|
Safe get multiple indices, translate keys for
|
|||
|
datelike to underlying repr.
|
|||
|
"""
|
|||
|
|
|||
|
def get_converter(s):
|
|||
|
# possibly convert to the actual key types
|
|||
|
# in the indices, could be a Timestamp or a np.datetime64
|
|||
|
if isinstance(s, datetime.datetime):
|
|||
|
return lambda key: Timestamp(key)
|
|||
|
elif isinstance(s, np.datetime64):
|
|||
|
return lambda key: Timestamp(key).asm8
|
|||
|
else:
|
|||
|
return lambda key: key
|
|||
|
|
|||
|
if len(names) == 0:
|
|||
|
return []
|
|||
|
|
|||
|
if len(self.indices) > 0:
|
|||
|
index_sample = next(iter(self.indices))
|
|||
|
else:
|
|||
|
index_sample = None # Dummy sample
|
|||
|
|
|||
|
name_sample = names[0]
|
|||
|
if isinstance(index_sample, tuple):
|
|||
|
if not isinstance(name_sample, tuple):
|
|||
|
msg = "must supply a tuple to get_group with multiple grouping keys"
|
|||
|
raise ValueError(msg)
|
|||
|
if not len(name_sample) == len(index_sample):
|
|||
|
try:
|
|||
|
# If the original grouper was a tuple
|
|||
|
return [self.indices[name] for name in names]
|
|||
|
except KeyError as err:
|
|||
|
# turns out it wasn't a tuple
|
|||
|
msg = (
|
|||
|
"must supply a same-length tuple to get_group "
|
|||
|
"with multiple grouping keys"
|
|||
|
)
|
|||
|
raise ValueError(msg) from err
|
|||
|
|
|||
|
converters = [get_converter(s) for s in index_sample]
|
|||
|
names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
|
|||
|
|
|||
|
else:
|
|||
|
converter = get_converter(index_sample)
|
|||
|
names = (converter(name) for name in names)
|
|||
|
|
|||
|
return [self.indices.get(name, []) for name in names]
|
|||
|
|
|||
|
@final
|
|||
|
def _get_index(self, name):
|
|||
|
"""
|
|||
|
Safe get index, translate keys for datelike to underlying repr.
|
|||
|
"""
|
|||
|
return self._get_indices([name])[0]
|
|||
|
|
|||
|
@final
|
|||
|
@cache_readonly
|
|||
|
def _selected_obj(self):
|
|||
|
# Note: _selected_obj is always just `self.obj` for SeriesGroupBy
|
|||
|
|
|||
|
if self._selection is None or isinstance(self.obj, Series):
|
|||
|
if self._group_selection is not None:
|
|||
|
return self.obj[self._group_selection]
|
|||
|
return self.obj
|
|||
|
else:
|
|||
|
return self.obj[self._selection]
|
|||
|
|
|||
|
@final
|
|||
|
def _reset_group_selection(self) -> None:
|
|||
|
"""
|
|||
|
Clear group based selection.
|
|||
|
|
|||
|
Used for methods needing to return info on each group regardless of
|
|||
|
whether a group selection was previously set.
|
|||
|
"""
|
|||
|
if self._group_selection is not None:
|
|||
|
# GH12839 clear cached selection too when changing group selection
|
|||
|
self._group_selection = None
|
|||
|
self._reset_cache("_selected_obj")
|
|||
|
|
|||
|
@final
|
|||
|
def _set_group_selection(self) -> None:
|
|||
|
"""
|
|||
|
Create group based selection.
|
|||
|
|
|||
|
Used when selection is not passed directly but instead via a grouper.
|
|||
|
|
|||
|
NOTE: this should be paired with a call to _reset_group_selection
|
|||
|
"""
|
|||
|
grp = self.grouper
|
|||
|
if not (
|
|||
|
self.as_index
|
|||
|
and getattr(grp, "groupings", None) is not None
|
|||
|
and self.obj.ndim > 1
|
|||
|
and self._group_selection is None
|
|||
|
):
|
|||
|
return
|
|||
|
|
|||
|
groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]
|
|||
|
|
|||
|
if len(groupers):
|
|||
|
# GH12839 clear selected obj cache when group selection changes
|
|||
|
ax = self.obj._info_axis
|
|||
|
self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
|
|||
|
self._reset_cache("_selected_obj")
|
|||
|
|
|||
|
@final
|
|||
|
def _set_result_index_ordered(
|
|||
|
self, result: "OutputFrameOrSeries"
|
|||
|
) -> "OutputFrameOrSeries":
|
|||
|
# set the result index on the passed values object and
|
|||
|
# return the new object, xref 8046
|
|||
|
|
|||
|
# the values/counts are repeated according to the group index
|
|||
|
# shortcut if we have an already ordered grouper
|
|||
|
if not self.grouper.is_monotonic:
|
|||
|
index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
|
|||
|
result.set_axis(index, axis=self.axis, inplace=True)
|
|||
|
result = result.sort_index(axis=self.axis)
|
|||
|
|
|||
|
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
|
|||
|
return result
|
|||
|
|
|||
|
@final
|
|||
|
def _dir_additions(self) -> Set[str]:
|
|||
|
return self.obj._dir_additions() | self._apply_allowlist
|
|||
|
|
|||
|
def __getattr__(self, attr: str):
|
|||
|
if attr in self._internal_names_set:
|
|||
|
return object.__getattribute__(self, attr)
|
|||
|
if attr in self.obj:
|
|||
|
return self[attr]
|
|||
|
|
|||
|
raise AttributeError(
|
|||
|
f"'{type(self).__name__}' object has no attribute '{attr}'"
|
|||
|
)
|
|||
|
|
|||
|
    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each groups maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(self, func, *args, **kwargs):
        # com.pipe also handles the (callable, data_keyword) tuple form
        # described in _pipe_template.
        return com.pipe(self, func, *args, **kwargs)
|
|||
|
|
|||
|
plot = property(GroupByPlot)
|
|||
|
|
|||
|
    @final
    def _make_wrapper(self, name: str) -> Callable:
        """
        Build a callable that applies the allowlisted method ``name`` of the
        underlying object group-wise.

        Non-method attributes are resolved immediately via ``apply``;
        methods are wrapped so call-time args/kwargs are forwarded to each
        group.
        """
        assert name in self._apply_allowlist

        with group_selection_context(self):
            # need to setup the selection
            # as are not passed directly but in the grouper
            f = getattr(self._obj_with_exclusions, name)
            if not isinstance(f, types.MethodType):
                # Plain attribute (e.g. a property): fetch it per group now.
                return self.apply(lambda self: getattr(self, name))

        # Re-fetch from the class so ``f`` is unbound and can be applied to
        # each group object.
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            if "axis" in sig.parameters:
                if kwargs.get("axis", None) is None:
                    kwargs["axis"] = self.axis

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            return self._python_apply_general(curried, self._obj_with_exclusions)

        wrapper.__name__ = name
        return wrapper
|
|||
|
|
|||
|
@final
|
|||
|
def get_group(self, name, obj=None):
|
|||
|
"""
|
|||
|
Construct DataFrame from group with provided name.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
name : object
|
|||
|
The name of the group to get as a DataFrame.
|
|||
|
obj : DataFrame, default None
|
|||
|
The DataFrame to take the DataFrame out of. If
|
|||
|
it is None, the object groupby was called on will
|
|||
|
be used.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
group : same type as obj
|
|||
|
"""
|
|||
|
if obj is None:
|
|||
|
obj = self._selected_obj
|
|||
|
|
|||
|
inds = self._get_index(name)
|
|||
|
if not len(inds):
|
|||
|
raise KeyError(name)
|
|||
|
|
|||
|
return obj._take_with_is_copy(inds, axis=self.axis)
|
|||
|
|
|||
|
def __iter__(self) -> Iterator[Tuple[Label, FrameOrSeries]]:
|
|||
|
"""
|
|||
|
Groupby iterator.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Generator yielding sequence of (name, subsetted object)
|
|||
|
for each group
|
|||
|
"""
|
|||
|
return self.grouper.get_iterator(self.obj, axis=self.axis)
|
|||
|
|
|||
|
    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs):

        # NOTE(review): _is_builtin_func presumably maps builtins such as
        # ``sum`` to internal equivalents (defined on SelectionMixin) —
        # confirm there.
        func = self._is_builtin_func(func)

        # this is needed so we don't try and wrap strings. If we could
        # resolve functions to their callable functions prior, this
        # wouldn't be needed
        if args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    # Suppress numpy floating-point warnings inside the UDF.
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            elif hasattr(nanops, "nan" + func):
                # TODO: should we wrap this in to e.g. _is_builtin_func?
                f = getattr(nanops, "nan" + func)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with group_selection_context(self):
                    return self._python_apply_general(f, self._selected_obj)

        return result
|
|||
|
|
|||
|
@final
|
|||
|
def _python_apply_general(
|
|||
|
self, f: F, data: FrameOrSeriesUnion
|
|||
|
) -> FrameOrSeriesUnion:
|
|||
|
"""
|
|||
|
Apply function f in python space
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
f : callable
|
|||
|
Function to apply
|
|||
|
data : Series or DataFrame
|
|||
|
Data to apply f to
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
data after applying f
|
|||
|
"""
|
|||
|
keys, values, mutated = self.grouper.apply(f, data, self.axis)
|
|||
|
|
|||
|
return self._wrap_applied_output(
|
|||
|
keys, values, not_indexed_same=mutated or self.mutated
|
|||
|
)
|
|||
|
|
|||
|
def _iterate_slices(self) -> Iterable[Series]:
|
|||
|
raise AbstractMethodError(self)
|
|||
|
|
|||
|
def transform(self, func, *args, **kwargs):
|
|||
|
raise AbstractMethodError(self)
|
|||
|
|
|||
|
    @final
    def _cumcount_array(self, ascending: bool = True):
        """
        Return each row's position within its group, in original row order.

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        # Sort the ids so each group's rows form one contiguous block.
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        # run: True at the first position of each contiguous block of ids.
        run = np.r_[True, ids[:-1] != ids[1:]]
        # rep: the length of each block (gaps between consecutive starts).
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        # out: cumulative count of non-start positions; increases by one per
        # element within a block but carries an offset from earlier blocks.
        out = (~run).cumsum()

        if ascending:
            # Subtract each block's starting offset -> 0, 1, ..., len-1.
            out -= np.repeat(out[run], rep)
        else:
            # Subtract from each block's final value -> len-1, ..., 1, 0.
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        # Invert the sort so the counts line up with the original row order.
        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev].astype(np.int64, copy=False)
|
|||
|
|
|||
|
@final
|
|||
|
def _cython_transform(
|
|||
|
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
|
|||
|
):
|
|||
|
output: Dict[base.OutputKey, np.ndarray] = {}
|
|||
|
|
|||
|
for idx, obj in enumerate(self._iterate_slices()):
|
|||
|
name = obj.name
|
|||
|
is_numeric = is_numeric_dtype(obj.dtype)
|
|||
|
if numeric_only and not is_numeric:
|
|||
|
continue
|
|||
|
|
|||
|
try:
|
|||
|
result = self.grouper._cython_operation(
|
|||
|
"transform", obj._values, how, axis, **kwargs
|
|||
|
)
|
|||
|
except NotImplementedError:
|
|||
|
continue
|
|||
|
|
|||
|
key = base.OutputKey(label=name, position=idx)
|
|||
|
output[key] = result
|
|||
|
|
|||
|
if not output:
|
|||
|
raise DataError("No numeric types to aggregate")
|
|||
|
|
|||
|
return self._wrap_transformed_output(output)
|
|||
|
|
|||
|
    def _wrap_aggregated_output(
        self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index]
    ):
        """
        Package per-column aggregation results into a Series/DataFrame.

        Abstract hook; concrete subclasses provide the implementation.
        """
        raise AbstractMethodError(self)
|
|||
|
|
|||
|
    def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
        """
        Package per-column transform results into a Series/DataFrame.

        Abstract hook; concrete subclasses provide the implementation.
        """
        raise AbstractMethodError(self)
|
|||
|
|
|||
|
    def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
        """
        Package the results of a python-space apply into a Series/DataFrame.

        Abstract hook; concrete subclasses provide the implementation.
        """
        raise AbstractMethodError(self)
|
|||
|
|
|||
|
    @final
    def _agg_general(
        self,
        numeric_only: bool = True,
        min_count: int = -1,
        *,
        alias: str,
        npfunc: Callable,
    ):
        """
        Shared implementation behind the simple aggregations (sum/prod/min/...).

        Tries the Cython path (``alias``) first and falls back to a python-space
        ``aggregate`` using ``npfunc`` when the Cython kernel cannot handle the
        data.
        """
        with group_selection_context(self):
            # try a cython aggregation if we can
            result = None
            try:
                result = self._cython_agg_general(
                    how=alias,
                    alt=npfunc,
                    numeric_only=numeric_only,
                    min_count=min_count,
                )
            except DataError:
                # no aggregatable columns for the Cython path; fall through
                pass
            except NotImplementedError as err:
                if "function is not implemented for this dtype" in str(
                    err
                ) or "category dtype not supported" in str(err):
                    # raised in _get_cython_function, in some cases can
                    # be trimmed by implementing cython funcs for more dtypes
                    pass
                else:
                    raise

            # apply a non-cython aggregation
            if result is None:
                result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
            return result.__finalize__(self.obj, method="groupby")
|
|||
|
|
|||
|
    def _cython_agg_general(
        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
    ):
        """
        Aggregate each selected slice through the Cython ``how`` kernel.

        Raises
        ------
        DataError
            If no slice could be aggregated (e.g. all non-numeric with
            ``numeric_only=True``).
        """
        output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {}
        # Ideally we would be able to enumerate self._iterate_slices and use
        # the index from enumeration as the key of output, but ohlc in particular
        # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
        # need to slice that up into 1D arrays
        idx = 0
        for obj in self._iterate_slices():
            name = obj.name
            is_numeric = is_numeric_dtype(obj.dtype)
            if numeric_only and not is_numeric:
                continue

            result = self.grouper._cython_operation(
                "aggregate", obj._values, how, axis=0, min_count=min_count
            )

            if how == "ohlc":
                # e.g. ohlc
                agg_names = ["open", "high", "low", "close"]
                assert len(agg_names) == result.shape[1]
                # one output column per ohlc component, each at its own position
                for result_column, result_name in zip(result.T, agg_names):
                    key = base.OutputKey(label=result_name, position=idx)
                    output[key] = result_column
                    idx += 1
            else:
                assert result.ndim == 1
                key = base.OutputKey(label=name, position=idx)
                output[key] = result
                idx += 1

        if not output:
            raise DataError("No numeric types to aggregate")

        return self._wrap_aggregated_output(output, index=self.grouper.result_index)
|
|||
|
|
|||
|
    @final
    def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        if not callable(func):
            raise NotImplementedError(
                "Numba engine can only be used with a single function."
            )
        group_keys = self.grouper._get_group_keys()
        labels, _, n_groups = self.grouper.group_info
        # Sort rows so each group's rows are contiguous in memory.
        sorted_index = get_group_index_sorter(labels, n_groups)
        sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False)
        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        # Per-group [start, end) offsets into the sorted data.
        starts, ends = lib.generate_slices(sorted_labels, n_groups)

        numba_transform_func = numba_.generate_numba_transform_func(
            tuple(args), kwargs, func, engine_kwargs
        )
        result = numba_transform_func(
            sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns)
        )

        # Cache the jitted function so later calls skip recompilation.
        cache_key = (func, "groupby_transform")
        if cache_key not in NUMBA_FUNC_CACHE:
            NUMBA_FUNC_CACHE[cache_key] = numba_transform_func

        # result values needs to be resorted to their original positions since we
        # evaluated the data sorted by group
        return result.take(np.argsort(sorted_index), axis=0)
|
|||
|
|
|||
|
    @final
    def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        if not callable(func):
            raise NotImplementedError(
                "Numba engine can only be used with a single function."
            )
        group_keys = self.grouper._get_group_keys()
        labels, _, n_groups = self.grouper.group_info
        # Sort rows so each group's rows are contiguous in memory.
        sorted_index = get_group_index_sorter(labels, n_groups)
        sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False)
        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        # Per-group [start, end) offsets into the sorted data.
        starts, ends = lib.generate_slices(sorted_labels, n_groups)

        numba_agg_func = numba_.generate_numba_agg_func(
            tuple(args), kwargs, func, engine_kwargs
        )
        result = numba_agg_func(
            sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns)
        )

        # Cache the jitted function so later calls skip recompilation.
        cache_key = (func, "groupby_agg")
        if cache_key not in NUMBA_FUNC_CACHE:
            NUMBA_FUNC_CACHE[cache_key] = numba_agg_func

        # Build the result index from the group keys: a MultiIndex when
        # grouping on multiple keys, a flat Index otherwise.
        if self.grouper.nkeys > 1:
            index = MultiIndex.from_tuples(group_keys, names=self.grouper.names)
        else:
            index = Index(group_keys, name=self.grouper.names[0])
        return result, index
|
|||
|
|
|||
|
    @final
    def _python_agg_general(self, func, *args, **kwargs):
        """
        Aggregate each selected slice in python space via ``grouper.agg_series``.

        Slices whose dtype rejects ``func`` (TypeError) are silently skipped;
        if nothing aggregates, falls back to ``_python_apply_general``.
        """
        func = self._is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output: Dict[base.OutputKey, np.ndarray] = {}

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            if self.grouper.ngroups == 0:
                # agg_series below assumes ngroups > 0
                continue

            try:
                # if this function is invalid for this dtype, we will ignore it.
                result, counts = self.grouper.agg_series(obj, f)
            except TypeError:
                continue

            assert result is not None
            key = base.OutputKey(label=name, position=idx)

            if is_numeric_dtype(obj.dtype):
                result = maybe_downcast_to_dtype(result, obj.dtype)

            if self.grouper._filter_empty_groups:
                mask = counts.ravel() > 0

                # since we are masking, make sure that we have a float object
                values = result
                if is_numeric_dtype(values.dtype):
                    values = ensure_float(values)

                # drop empty groups, then try to restore the original dtype
                result = maybe_downcast_to_dtype(values[mask], result.dtype)

            output[key] = result

        if not output:
            # nothing aggregated (e.g. no groups); treat func as an apply
            return self._python_apply_general(f, self._selected_obj)

        return self._wrap_aggregated_output(output, index=self.grouper.result_index)
|
|||
|
|
|||
|
    @final
    def _concat_objects(self, keys, values, not_indexed_same: bool = False):
        """
        Concatenate per-group results back into a single Series/DataFrame,
        restoring the original row order where the index is preserved.
        """
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com.not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if not not_indexed_same:
            result = concat(values, axis=self.axis)
            # original axis, via a no-op filter (keeps selection semantics)
            ax = self.filter(lambda x: True).axes[self.axis]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                indexer, _ = result.index.get_indexer_non_unique(ax.values)
                indexer = algorithms.unique1d(indexer)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        elif self.group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        if isinstance(result, Series) and self._selection_name is not None:
            # propagate the selected column's name onto the result
            result.name = self._selection_name

        return result
|
|||
|
|
|||
|
@final
|
|||
|
def _apply_filter(self, indices, dropna):
|
|||
|
if len(indices) == 0:
|
|||
|
indices = np.array([], dtype="int64")
|
|||
|
else:
|
|||
|
indices = np.sort(np.concatenate(indices))
|
|||
|
if dropna:
|
|||
|
filtered = self._selected_obj.take(indices, axis=self.axis)
|
|||
|
else:
|
|||
|
mask = np.empty(len(self._selected_obj.index), dtype=bool)
|
|||
|
mask.fill(False)
|
|||
|
mask[indices.astype(int)] = True
|
|||
|
# mask fails to broadcast when passed to where; broadcast manually.
|
|||
|
mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
|
|||
|
filtered = self._selected_obj.where(mask) # Fill with NaNs.
|
|||
|
return filtered
|
|||
|
|
|||
|
|
|||
|
# To track operations that expand dimensions, like ohlc
|
|||
|
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)
|
|||
|
|
|||
|
|
|||
|
class GroupBy(BaseGroupBy[FrameOrSeries]):
|
|||
|
"""
|
|||
|
Class for grouping and aggregating relational data.
|
|||
|
|
|||
|
See aggregate, transform, and apply functions on this object.
|
|||
|
|
|||
|
It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
|
|||
|
|
|||
|
::
|
|||
|
|
|||
|
grouped = groupby(obj, ...)
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
obj : pandas object
|
|||
|
axis : int, default 0
|
|||
|
level : int, default None
|
|||
|
Level of MultiIndex
|
|||
|
groupings : list of Grouping objects
|
|||
|
Most users should ignore this
|
|||
|
exclusions : array-like, optional
|
|||
|
List of columns to exclude
|
|||
|
name : str
|
|||
|
Most users should ignore this
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
**Attributes**
|
|||
|
groups : dict
|
|||
|
{group name -> group labels}
|
|||
|
len(grouped) : int
|
|||
|
Number of groups
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
After grouping, see aggregate, apply, and transform functions. Here are
|
|||
|
some other brief notes about usage. When grouping by multiple groups, the
|
|||
|
result index will be a MultiIndex (hierarchical) by default.
|
|||
|
|
|||
|
Iteration produces (key, group) tuples, i.e. chunking the data by group. So
|
|||
|
you can write code like:
|
|||
|
|
|||
|
::
|
|||
|
|
|||
|
grouped = obj.groupby(keys, axis=axis)
|
|||
|
for key, group in grouped:
|
|||
|
# do something with the data
|
|||
|
|
|||
|
Function calls on GroupBy, if not specially implemented, "dispatch" to the
|
|||
|
grouped data. So if you group a DataFrame and wish to invoke the std()
|
|||
|
method on each group, you can simply do:
|
|||
|
|
|||
|
::
|
|||
|
|
|||
|
df.groupby(mapper).std()
|
|||
|
|
|||
|
rather than
|
|||
|
|
|||
|
::
|
|||
|
|
|||
|
df.groupby(mapper).aggregate(np.std)
|
|||
|
|
|||
|
You can pass arguments to these "wrapped" functions, too.
|
|||
|
|
|||
|
See the online documentation for full exposition on these topics and much
|
|||
|
more
|
|||
|
"""
|
|||
|
|
|||
|
@final
|
|||
|
@property
|
|||
|
def _obj_1d_constructor(self) -> Type["Series"]:
|
|||
|
# GH28330 preserve subclassed Series/DataFrames
|
|||
|
if isinstance(self.obj, DataFrame):
|
|||
|
return self.obj._constructor_sliced
|
|||
|
assert isinstance(self.obj, Series)
|
|||
|
return self.obj._constructor
|
|||
|
|
|||
|
@final
|
|||
|
def _bool_agg(self, val_test, skipna):
|
|||
|
"""
|
|||
|
Shared func to call any / all Cython GroupBy implementations.
|
|||
|
"""
|
|||
|
|
|||
|
def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]:
|
|||
|
if is_object_dtype(vals):
|
|||
|
vals = np.array([bool(x) for x in vals])
|
|||
|
else:
|
|||
|
vals = vals.astype(bool)
|
|||
|
|
|||
|
return vals.view(np.uint8), bool
|
|||
|
|
|||
|
def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
|
|||
|
return result.astype(inference, copy=False)
|
|||
|
|
|||
|
return self._get_cythonized_result(
|
|||
|
"group_any_all",
|
|||
|
aggregate=True,
|
|||
|
numeric_only=False,
|
|||
|
cython_dtype=np.dtype(np.uint8),
|
|||
|
needs_values=True,
|
|||
|
needs_mask=True,
|
|||
|
pre_processing=objs_to_bool,
|
|||
|
post_processing=result_to_bool,
|
|||
|
val_test=val_test,
|
|||
|
skipna=skipna,
|
|||
|
)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthful, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if any element
            is True within its respective group, False otherwise.
        """
        # dispatch to the shared any/all Cython implementation
        return self._bool_agg("any", skipna)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def all(self, skipna: bool = True):
        """
        Return True if all values in the group are truthful, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if all elements
            are True within its respective group, False otherwise.
        """
        # dispatch to the shared any/all Cython implementation
        return self._bool_agg("all", skipna)
|
|||
|
|
|||
|
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self):
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        # defined here only so the method shows up in the API docs;
        # subclasses supply the real implementation
        raise NotImplementedError
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(self, numeric_only: bool = True):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default True
            Include only float, int, boolean columns. If None, will attempt to use
            everything, then use only numeric data.

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
               C
        A B
        1 2.0  2
          4.0  1
        2 3.0  1
          5.0  2

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        # Cython path with a python-space Series.mean fallback
        return self._cython_agg_general(
            "mean",
            alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only),
            numeric_only=numeric_only,
        )
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def median(self, numeric_only=True):
        """
        Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex

        Parameters
        ----------
        numeric_only : bool, default True
            Include only float, int, boolean columns. If None, will attempt to use
            everything, then use only numeric data.

        Returns
        -------
        Series or DataFrame
            Median of values within each group.
        """
        # Cython path with a python-space Series.median fallback
        return self._cython_agg_general(
            "median",
            alt=lambda x, axis: Series(x).median(axis=axis, numeric_only=numeric_only),
            numeric_only=numeric_only,
        )
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def std(self, ddof: int = 1):
        """
        Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Standard deviation of values within each group.
        """
        # std = sqrt of the group variance computed by group_var_float64
        return self._get_cythonized_result(
            "group_var_float64",
            aggregate=True,
            needs_counts=True,
            needs_values=True,
            needs_2d=True,
            cython_dtype=np.dtype(np.float64),
            post_processing=lambda vals, inference: np.sqrt(vals),
            ddof=ddof,
        )
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def var(self, ddof: int = 1):
|
|||
|
"""
|
|||
|
Compute variance of groups, excluding missing values.
|
|||
|
|
|||
|
For multiple groupings, the result index will be a MultiIndex.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
ddof : int, default 1
|
|||
|
Degrees of freedom.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Variance of values within each group.
|
|||
|
"""
|
|||
|
if ddof == 1:
|
|||
|
return self._cython_agg_general(
|
|||
|
"var", alt=lambda x, axis: Series(x).var(ddof=ddof)
|
|||
|
)
|
|||
|
else:
|
|||
|
func = lambda x: x.var(ddof=ddof)
|
|||
|
with group_selection_context(self):
|
|||
|
return self._python_agg_general(func)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def sem(self, ddof: int = 1):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        Returns
        -------
        Series or DataFrame
            Standard error of the mean of values within each group.
        """
        # sem = std / sqrt(count), computed per group
        result = self.std(ddof=ddof)
        if result.ndim == 1:
            result /= np.sqrt(self.count())
        else:
            # DataFrame case: divide only the non-excluded columns, aligned
            # positionally between the std result and the counts
            cols = result.columns.difference(self.exclusions).unique()
            counts = self.count()
            result_ilocs = result.columns.get_indexer_for(cols)
            count_ilocs = counts.columns.get_indexer_for(cols)
            result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
        return result
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def size(self) -> FrameOrSeriesUnion:
        """
        Compute group sizes.

        Returns
        -------
        DataFrame or Series
            Number of rows in each group as a Series if as_index is True
            or a DataFrame if as_index is False.
        """
        result = self.grouper.size()

        # GH28330 preserve subclassed Series/DataFrames through calls
        if issubclass(self.obj._constructor, Series):
            result = self._obj_1d_constructor(result, name=self.obj.name)
        else:
            result = self._obj_1d_constructor(result)

        if not self.as_index:
            # as_index=False: expose the group labels as regular columns
            result = result.rename("size").reset_index()

        # unobserved groups get size 0, not NaN
        return self._reindex_output(result, fill_value=0)
|
|||
|
|
|||
|
    @final
    @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
    def sum(self, numeric_only: bool = True, min_count: int = 0):

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _agg_general() returns. GH #31422
        with com.temp_setattr(self, "observed", True):
            result = self._agg_general(
                numeric_only=numeric_only,
                min_count=min_count,
                alias="add",
                npfunc=np.sum,
            )

        # fill unobserved groups with 0 rather than NaN
        return self._reindex_output(result, fill_value=0)
|
|||
|
|
|||
|
    @final
    @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
    def prod(self, numeric_only: bool = True, min_count: int = 0):
        # delegate to the shared Cython-with-fallback aggregation machinery
        return self._agg_general(
            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
        )
|
|||
|
|
|||
|
    @final
    @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
    def min(self, numeric_only: bool = False, min_count: int = -1):
        # delegate to the shared Cython-with-fallback aggregation machinery
        return self._agg_general(
            numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min
        )
|
|||
|
|
|||
|
    @final
    @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
    def max(self, numeric_only: bool = False, min_count: int = -1):
        # delegate to the shared Cython-with-fallback aggregation machinery
        return self._agg_general(
            numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max
        )
|
|||
|
|
|||
|
@final
|
|||
|
@doc(_groupby_agg_method_template, fname="first", no=False, mc=-1)
|
|||
|
def first(self, numeric_only: bool = False, min_count: int = -1):
|
|||
|
def first_compat(obj: FrameOrSeries, axis: int = 0):
|
|||
|
def first(x: Series):
|
|||
|
"""Helper function for first item that isn't NA."""
|
|||
|
arr = x.array[notna(x.array)]
|
|||
|
if not len(arr):
|
|||
|
return np.nan
|
|||
|
return arr[0]
|
|||
|
|
|||
|
if isinstance(obj, DataFrame):
|
|||
|
return obj.apply(first, axis=axis)
|
|||
|
elif isinstance(obj, Series):
|
|||
|
return first(obj)
|
|||
|
else:
|
|||
|
raise TypeError(type(obj))
|
|||
|
|
|||
|
return self._agg_general(
|
|||
|
numeric_only=numeric_only,
|
|||
|
min_count=min_count,
|
|||
|
alias="first",
|
|||
|
npfunc=first_compat,
|
|||
|
)
|
|||
|
|
|||
|
@final
|
|||
|
@doc(_groupby_agg_method_template, fname="last", no=False, mc=-1)
|
|||
|
def last(self, numeric_only: bool = False, min_count: int = -1):
|
|||
|
def last_compat(obj: FrameOrSeries, axis: int = 0):
|
|||
|
def last(x: Series):
|
|||
|
"""Helper function for last item that isn't NA."""
|
|||
|
arr = x.array[notna(x.array)]
|
|||
|
if not len(arr):
|
|||
|
return np.nan
|
|||
|
return arr[-1]
|
|||
|
|
|||
|
if isinstance(obj, DataFrame):
|
|||
|
return obj.apply(last, axis=axis)
|
|||
|
elif isinstance(obj, Series):
|
|||
|
return last(obj)
|
|||
|
else:
|
|||
|
raise TypeError(type(obj))
|
|||
|
|
|||
|
return self._agg_general(
|
|||
|
numeric_only=numeric_only,
|
|||
|
min_count=min_count,
|
|||
|
alias="last",
|
|||
|
npfunc=last_compat,
|
|||
|
)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ohlc(self) -> DataFrame:
        """
        Compute open, high, low and close values of a group, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex

        Returns
        -------
        DataFrame
            Open, high, low and close values within each group.
        """
        # run the ohlc Cython aggregation on each column groupby
        return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc"))
|
|||
|
|
|||
|
    @final
    @doc(DataFrame.describe)
    def describe(self, **kwargs):
        with group_selection_context(self):
            result = self.apply(lambda x: x.describe(**kwargs))
            if self.axis == 1:
                # NOTE(review): for axis=1 the per-group stats come out
                # transposed relative to the axis=0 layout — confirm intent
                return result.T
            return result.unstack()
|
|||
|
|
|||
|
    @final
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a string
        "string" -> "frequency".

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                            a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                            a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                    a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1
        """
        # NOTE(review): imported locally, presumably to avoid a circular
        # import at module load — confirm before hoisting to module level
        from pandas.core.resample import get_resampler_for_grouping

        return get_resampler_for_grouping(self, rule, *args, **kwargs)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def rolling(self, *args, **kwargs):
        """
        Return a rolling grouper, providing rolling functionality per group.

        ``*args``/``**kwargs`` are forwarded to ``RollingGroupby``.
        """
        from pandas.core.window import RollingGroupby

        return RollingGroupby(self, *args, **kwargs)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def expanding(self, *args, **kwargs):
        """
        Return an expanding grouper, providing expanding
        functionality per group.

        ``*args``/``**kwargs`` are forwarded to ``ExpandingGroupby``.
        """
        from pandas.core.window import ExpandingGroupby

        return ExpandingGroupby(self, *args, **kwargs)
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ewm(self, *args, **kwargs):
        """
        Return an ewm grouper, providing ewm functionality per group.

        ``*args``/``**kwargs`` are forwarded to ``ExponentialMovingWindowGroupby``.
        """
        from pandas.core.window import ExponentialMovingWindowGroupby

        return ExponentialMovingWindowGroupby(self, *args, **kwargs)
|
|||
|
|
|||
|
    @final
    def _fill(self, direction, limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad : Returns Series with minimum number of char in object.
        backfill : Backward fill the missing values in the dataset.
        """
        # Need int value for Cython; -1 means "no limit"
        if limit is None:
            limit = -1

        return self._get_cythonized_result(
            "group_fillna_indexer",
            numeric_only=False,
            needs_mask=True,
            cython_dtype=np.dtype(np.int64),
            result_is_index=True,
            direction=direction,
            limit=limit,
            dropna=self.dropna,
        )
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
def pad(self, limit=None):
|
|||
|
"""
|
|||
|
Forward fill the values.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
limit : int, optional
|
|||
|
Limit of how many values to fill.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Object with missing values filled.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Series.pad: Returns Series with minimum number of char in object.
|
|||
|
DataFrame.pad: Object with missing values filled or None if inplace=True.
|
|||
|
Series.fillna: Fill NaN values of a Series.
|
|||
|
DataFrame.fillna: Fill NaN values of a DataFrame.
|
|||
|
"""
|
|||
|
return self._fill("ffill", limit=limit)
|
|||
|
|
|||
|
ffill = pad
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
def backfill(self, limit=None):
|
|||
|
"""
|
|||
|
Backward fill the values.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
limit : int, optional
|
|||
|
Limit of how many values to fill.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Object with missing values filled.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Series.backfill : Backward fill the missing values in the dataset.
|
|||
|
DataFrame.backfill: Backward fill the missing values in the dataset.
|
|||
|
Series.fillna: Fill NaN values of a Series.
|
|||
|
DataFrame.fillna: Fill NaN values of a DataFrame.
|
|||
|
"""
|
|||
|
return self._fill("bfill", limit=limit)
|
|||
|
|
|||
|
bfill = backfill
|
|||
|
|
|||
|
    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
        """
        Take the nth row from each group if n is an int, or a subset of rows
        if n is a list of ints.

        If dropna, will take the nth non-null row, dropna is either
        'all' or 'any'; this is equivalent to calling dropna(how=dropna)
        before the groupby.

        Parameters
        ----------
        n : int or list of ints
            A single nth value for the row or a list of nth values.
        dropna : None or str, optional
            Apply the specified dropna operation before counting which row is
            the nth row. Needs to be None, 'any' or 'all'.

        Returns
        -------
        Series or DataFrame
            N-th value within each group.
        %(see_also)s
        Examples
        --------

        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
             B
        A
        1  NaN
        2  3.0
        >>> g.nth(1)
             B
        A
        1  2.0
        2  5.0
        >>> g.nth(-1)
             B
        A
        1  4.0
        2  5.0
        >>> g.nth([0, 1])
             B
        A
        1  NaN
        1  2.0
        2  3.0
        2  5.0

        Specifying `dropna` allows count ignoring ``NaN``

        >>> g.nth(0, dropna='any')
             B
        A
        1  2.0
        2  3.0

        NaNs denote group exhausted when using dropna

        >>> g.nth(3, dropna='any')
            B
        A
        1 NaN
        2 NaN

        Specifying `as_index=False` in `groupby` keeps the original index.

        >>> df.groupby('A', as_index=False).nth(1)
           A    B
        1  1  2.0
        4  2  5.0
        """
        valid_containers = (set, list, tuple)
        # isinstance accepts nested tuples of types, so this covers both the
        # container types and a plain int.
        if not isinstance(n, (valid_containers, int)):
            raise TypeError("n needs to be an int or a list/set/tuple of ints")

        if not dropna:

            # Fast path: select rows by position within each group via masks.
            if isinstance(n, int):
                nth_values = [n]
            elif isinstance(n, valid_containers):
                nth_values = list(set(n))

            nth_array = np.array(nth_values, dtype=np.intp)
            with group_selection_context(self):

                # Non-negative n matches the forward cumulative count;
                # negative n matches the backward count (offset by one so
                # that -1 means the last row of the group).
                mask_left = np.in1d(self._cumcount_array(), nth_array)
                mask_right = np.in1d(
                    self._cumcount_array(ascending=False) + 1, -nth_array
                )
                mask = mask_left | mask_right

                ids, _, _ = self.grouper.group_info

                # Drop NA values in grouping (code -1 marks unassigned rows)
                mask = mask & (ids != -1)

                out = self._selected_obj[mask]
                if not self.as_index:
                    return out

                result_index = self.grouper.result_index
                out.index = result_index[ids[mask]]

                # Unobserved categories must still appear in the result when
                # observed=False.
                if not self.observed and isinstance(result_index, CategoricalIndex):
                    out = out.reindex(result_index)

                out = self._reindex_output(out)
                return out.sort_index() if self.sort else out

        # dropna is truthy
        if isinstance(n, valid_containers):
            raise ValueError("dropna option with a list of nth values is not supported")

        if dropna not in ["any", "all"]:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError(
                "For a DataFrame groupby, dropna must be "
                "either None, 'any' or 'all', "
                f"(was passed {dropna})."
            )

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        # max_len: minimum group size required for position n to exist.
        max_len = n if n >= 0 else -1 - n
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:

            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = axis[axis.isin(dropped.index)]

        else:

            # create a grouper with the original parameters, but on dropped
            # object
            from pandas.core.groupby.grouper import get_grouper

            grouper, _, _ = get_grouper(
                dropped,
                key=self.keys,
                axis=self.axis,
                level=self.level,
                sort=self.sort,
                mutated=self.mutated,
            )

        # Recurse on the dropped frame; dropna is omitted so the recursive
        # call takes the plain-positional branch above.
        grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
        sizes, result = grb.size(), grb.nth(n)
        # Groups too small to contain position n become NaN below.
        mask = (sizes < max_len)._values

        # set the results which don't meet the criteria
        if len(result) and mask.any():
            result.loc[mask] = np.nan

        # reset/reindex to the original groups
        if len(self.obj) == len(dropped) or len(result) == len(
            self.grouper.result_index
        ):
            result.index = self.grouper.result_index
        else:
            result = result.reindex(self.grouper.result_index)

        return result
|
|||
|
|
|||
|
@final
|
|||
|
def quantile(self, q=0.5, interpolation: str = "linear"):
|
|||
|
"""
|
|||
|
Return group values at the given quantile, a la numpy.percentile.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
q : float or array-like, default 0.5 (50% quantile)
|
|||
|
Value(s) between 0 and 1 providing the quantile(s) to compute.
|
|||
|
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
|
|||
|
Method to use when the desired quantile falls between two points.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Return type determined by caller of GroupBy object.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Series.quantile : Similar method for Series.
|
|||
|
DataFrame.quantile : Similar method for DataFrame.
|
|||
|
numpy.percentile : NumPy method to compute qth percentile.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> df = pd.DataFrame([
|
|||
|
... ['a', 1], ['a', 2], ['a', 3],
|
|||
|
... ['b', 1], ['b', 3], ['b', 5]
|
|||
|
... ], columns=['key', 'val'])
|
|||
|
>>> df.groupby('key').quantile()
|
|||
|
val
|
|||
|
key
|
|||
|
a 2.0
|
|||
|
b 3.0
|
|||
|
"""
|
|||
|
from pandas import concat
|
|||
|
|
|||
|
def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
|
|||
|
if is_object_dtype(vals):
|
|||
|
raise TypeError(
|
|||
|
"'quantile' cannot be performed against 'object' dtypes!"
|
|||
|
)
|
|||
|
|
|||
|
inference = None
|
|||
|
if is_integer_dtype(vals.dtype):
|
|||
|
if is_extension_array_dtype(vals.dtype):
|
|||
|
vals = vals.to_numpy(dtype=float, na_value=np.nan)
|
|||
|
inference = np.int64
|
|||
|
elif is_bool_dtype(vals.dtype) and is_extension_array_dtype(vals.dtype):
|
|||
|
vals = vals.to_numpy(dtype=float, na_value=np.nan)
|
|||
|
elif is_datetime64_dtype(vals.dtype):
|
|||
|
inference = "datetime64[ns]"
|
|||
|
vals = np.asarray(vals).astype(float)
|
|||
|
elif is_timedelta64_dtype(vals.dtype):
|
|||
|
inference = "timedelta64[ns]"
|
|||
|
vals = np.asarray(vals).astype(float)
|
|||
|
|
|||
|
return vals, inference
|
|||
|
|
|||
|
def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
|
|||
|
if inference:
|
|||
|
# Check for edge case
|
|||
|
if not (
|
|||
|
is_integer_dtype(inference)
|
|||
|
and interpolation in {"linear", "midpoint"}
|
|||
|
):
|
|||
|
vals = vals.astype(inference)
|
|||
|
|
|||
|
return vals
|
|||
|
|
|||
|
if is_scalar(q):
|
|||
|
return self._get_cythonized_result(
|
|||
|
"group_quantile",
|
|||
|
aggregate=True,
|
|||
|
numeric_only=False,
|
|||
|
needs_values=True,
|
|||
|
needs_mask=True,
|
|||
|
cython_dtype=np.dtype(np.float64),
|
|||
|
pre_processing=pre_processor,
|
|||
|
post_processing=post_processor,
|
|||
|
q=q,
|
|||
|
interpolation=interpolation,
|
|||
|
)
|
|||
|
else:
|
|||
|
results = [
|
|||
|
self._get_cythonized_result(
|
|||
|
"group_quantile",
|
|||
|
aggregate=True,
|
|||
|
needs_values=True,
|
|||
|
needs_mask=True,
|
|||
|
cython_dtype=np.dtype(np.float64),
|
|||
|
pre_processing=pre_processor,
|
|||
|
post_processing=post_processor,
|
|||
|
q=qi,
|
|||
|
interpolation=interpolation,
|
|||
|
)
|
|||
|
for qi in q
|
|||
|
]
|
|||
|
result = concat(results, axis=self.axis, keys=q)
|
|||
|
# fix levels to place quantiles on the inside
|
|||
|
# TODO(GH-10710): Ideally, we could write this as
|
|||
|
# >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :]
|
|||
|
# but this hits https://github.com/pandas-dev/pandas/issues/10710
|
|||
|
# which doesn't reorder the list-like `q` on the inner level.
|
|||
|
order = list(range(1, result.axes[self.axis].nlevels)) + [0]
|
|||
|
|
|||
|
# temporarily saves the index names
|
|||
|
index_names = np.array(result.axes[self.axis].names)
|
|||
|
|
|||
|
# set index names to positions to avoid confusion
|
|||
|
result.axes[self.axis].names = np.arange(len(index_names))
|
|||
|
|
|||
|
# place quantiles on the inside
|
|||
|
if isinstance(result, Series):
|
|||
|
result = result.reorder_levels(order)
|
|||
|
else:
|
|||
|
result = result.reorder_levels(order, axis=self.axis)
|
|||
|
|
|||
|
# restore the index names in order
|
|||
|
result.axes[self.axis].names = index_names[order]
|
|||
|
|
|||
|
# reorder rows to keep things sorted
|
|||
|
indices = (
|
|||
|
np.arange(result.shape[self.axis])
|
|||
|
.reshape([len(q), self.ngroups])
|
|||
|
.T.flatten()
|
|||
|
)
|
|||
|
return result.take(indices, axis=self.axis)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
def ngroup(self, ascending: bool = True):
|
|||
|
"""
|
|||
|
Number each group from 0 to the number of groups - 1.
|
|||
|
|
|||
|
This is the enumerative complement of cumcount. Note that the
|
|||
|
numbers given to the groups match the order in which the groups
|
|||
|
would be seen when iterating over the groupby object, not the
|
|||
|
order they are first observed.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
ascending : bool, default True
|
|||
|
If False, number in reverse, from number of group - 1 to 0.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series
|
|||
|
Unique numbers for each group.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
.cumcount : Number the rows in each group.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> df = pd.DataFrame({"A": list("aaabba")})
|
|||
|
>>> df
|
|||
|
A
|
|||
|
0 a
|
|||
|
1 a
|
|||
|
2 a
|
|||
|
3 b
|
|||
|
4 b
|
|||
|
5 a
|
|||
|
>>> df.groupby('A').ngroup()
|
|||
|
0 0
|
|||
|
1 0
|
|||
|
2 0
|
|||
|
3 1
|
|||
|
4 1
|
|||
|
5 0
|
|||
|
dtype: int64
|
|||
|
>>> df.groupby('A').ngroup(ascending=False)
|
|||
|
0 1
|
|||
|
1 1
|
|||
|
2 1
|
|||
|
3 0
|
|||
|
4 0
|
|||
|
5 1
|
|||
|
dtype: int64
|
|||
|
>>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
|
|||
|
0 0
|
|||
|
1 0
|
|||
|
2 1
|
|||
|
3 3
|
|||
|
4 2
|
|||
|
5 0
|
|||
|
dtype: int64
|
|||
|
"""
|
|||
|
with group_selection_context(self):
|
|||
|
index = self._selected_obj.index
|
|||
|
result = self._obj_1d_constructor(self.grouper.group_info[0], index)
|
|||
|
if not ascending:
|
|||
|
result = self.ngroups - 1 - result
|
|||
|
return result
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
def cumcount(self, ascending: bool = True):
|
|||
|
"""
|
|||
|
Number each item in each group from 0 to the length of that group - 1.
|
|||
|
|
|||
|
Essentially this is equivalent to
|
|||
|
|
|||
|
.. code-block:: python
|
|||
|
|
|||
|
self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
ascending : bool, default True
|
|||
|
If False, number in reverse, from length of group - 1 to 0.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series
|
|||
|
Sequence number of each element within each group.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
.ngroup : Number the groups themselves.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
|
|||
|
... columns=['A'])
|
|||
|
>>> df
|
|||
|
A
|
|||
|
0 a
|
|||
|
1 a
|
|||
|
2 a
|
|||
|
3 b
|
|||
|
4 b
|
|||
|
5 a
|
|||
|
>>> df.groupby('A').cumcount()
|
|||
|
0 0
|
|||
|
1 1
|
|||
|
2 2
|
|||
|
3 0
|
|||
|
4 1
|
|||
|
5 3
|
|||
|
dtype: int64
|
|||
|
>>> df.groupby('A').cumcount(ascending=False)
|
|||
|
0 3
|
|||
|
1 2
|
|||
|
2 1
|
|||
|
3 1
|
|||
|
4 0
|
|||
|
5 0
|
|||
|
dtype: int64
|
|||
|
"""
|
|||
|
with group_selection_context(self):
|
|||
|
index = self._selected_obj._get_axis(self.axis)
|
|||
|
cumcounts = self._cumcount_array(ascending=ascending)
|
|||
|
return self._obj_1d_constructor(cumcounts, index)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def rank(
|
|||
|
self,
|
|||
|
method: str = "average",
|
|||
|
ascending: bool = True,
|
|||
|
na_option: str = "keep",
|
|||
|
pct: bool = False,
|
|||
|
axis: int = 0,
|
|||
|
):
|
|||
|
"""
|
|||
|
Provide the rank of values within each group.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
|
|||
|
* average: average rank of group.
|
|||
|
* min: lowest rank in group.
|
|||
|
* max: highest rank in group.
|
|||
|
* first: ranks assigned in order they appear in the array.
|
|||
|
* dense: like 'min', but rank always increases by 1 between groups.
|
|||
|
ascending : bool, default True
|
|||
|
False for ranks by high (1) to low (N).
|
|||
|
na_option : {'keep', 'top', 'bottom'}, default 'keep'
|
|||
|
* keep: leave NA values where they are.
|
|||
|
* top: smallest rank if ascending.
|
|||
|
* bottom: smallest rank if descending.
|
|||
|
pct : bool, default False
|
|||
|
Compute percentage rank of data within each group.
|
|||
|
axis : int, default 0
|
|||
|
The axis of the object over which to compute the rank.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
DataFrame with ranking of values within each group
|
|||
|
"""
|
|||
|
if na_option not in {"keep", "top", "bottom"}:
|
|||
|
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
|
|||
|
raise ValueError(msg)
|
|||
|
return self._cython_transform(
|
|||
|
"rank",
|
|||
|
numeric_only=False,
|
|||
|
ties_method=method,
|
|||
|
ascending=ascending,
|
|||
|
na_option=na_option,
|
|||
|
pct=pct,
|
|||
|
axis=axis,
|
|||
|
)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def cumprod(self, axis=0, *args, **kwargs):
|
|||
|
"""
|
|||
|
Cumulative product for each group.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
"""
|
|||
|
nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
|
|||
|
if axis != 0:
|
|||
|
return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
|
|||
|
|
|||
|
return self._cython_transform("cumprod", **kwargs)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def cumsum(self, axis=0, *args, **kwargs):
|
|||
|
"""
|
|||
|
Cumulative sum for each group.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
"""
|
|||
|
nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
|
|||
|
if axis != 0:
|
|||
|
return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
|
|||
|
|
|||
|
return self._cython_transform("cumsum", **kwargs)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def cummin(self, axis=0, **kwargs):
|
|||
|
"""
|
|||
|
Cumulative min for each group.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
"""
|
|||
|
if axis != 0:
|
|||
|
return self.apply(lambda x: np.minimum.accumulate(x, axis))
|
|||
|
|
|||
|
return self._cython_transform("cummin", numeric_only=False)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def cummax(self, axis=0, **kwargs):
|
|||
|
"""
|
|||
|
Cumulative max for each group.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
"""
|
|||
|
if axis != 0:
|
|||
|
return self.apply(lambda x: np.maximum.accumulate(x, axis))
|
|||
|
|
|||
|
return self._cython_transform("cummax", numeric_only=False)
|
|||
|
|
|||
|
    @final
    def _get_cythonized_result(
        self,
        how: str,
        cython_dtype: np.dtype,
        aggregate: bool = False,
        numeric_only: bool = True,
        needs_counts: bool = False,
        needs_values: bool = False,
        needs_2d: bool = False,
        min_count: Optional[int] = None,
        needs_mask: bool = False,
        needs_ngroups: bool = False,
        result_is_index: bool = False,
        pre_processing=None,
        post_processing=None,
        **kwargs,
    ):
        """
        Get result for Cythonized functions.

        Parameters
        ----------
        how : str, Cythonized function name to be called
        cython_dtype : np.dtype
            Type of the array that will be modified by the Cython call.
        aggregate : bool, default False
            Whether the result should be aggregated to match the number of
            groups
        numeric_only : bool, default True
            Whether only numeric datatypes should be computed
        needs_counts : bool, default False
            Whether the counts should be a part of the Cython call
        needs_values : bool, default False
            Whether the values should be a part of the Cython call
            signature
        needs_2d : bool, default False
            Whether the values and result of the Cython call signature
            are 2-dimensional.
        min_count : int, default None
            When not None, min_count for the Cython call
        needs_mask : bool, default False
            Whether boolean mask needs to be part of the Cython call
            signature
        needs_ngroups : bool, default False
            Whether number of groups is part of the Cython call signature
        result_is_index : bool, default False
            Whether the result of the Cython operation is an index of
            values to be retrieved, instead of the actual values themselves
        pre_processing : function, default None
            Function to be applied to `values` prior to passing to Cython.
            Function should return a tuple where the first element is the
            values to be passed to Cython and the second element is an optional
            type which the values should be converted to after being returned
            by the Cython operation. This function is also responsible for
            raising a TypeError if the values have an invalid type. Raises
            if `needs_values` is False.
        post_processing : function, default None
            Function to be applied to result of Cython function. Should accept
            an array of values as the first argument and type inferences as its
            second argument, i.e. the signature should be
            (ndarray, Type).
        **kwargs : dict
            Extra arguments to be passed back to Cython funcs

        Returns
        -------
        `Series` or `DataFrame` with filled values
        """
        # Validate mutually-dependent flags up front.
        if result_is_index and aggregate:
            raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
        if post_processing and not callable(post_processing):
            raise ValueError("'post_processing' must be a callable!")
        if pre_processing:
            if not callable(pre_processing):
                raise ValueError("'pre_processing' must be a callable!")
            if not needs_values:
                raise ValueError(
                    "Cannot use 'pre_processing' without specifying 'needs_values'!"
                )

        grouper = self.grouper

        labels, _, ngroups = grouper.group_info
        output: Dict[base.OutputKey, np.ndarray] = {}
        base_func = getattr(libgroupby, how)

        error_msg = ""
        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            values = obj._values

            if numeric_only and not is_numeric_dtype(values):
                continue

            # Aggregations produce one row per group; transforms are
            # same-length as the input.
            if aggregate:
                result_sz = ngroups
            else:
                result_sz = len(values)

            # `result` is filled in place by the Cython routine.
            result = np.zeros(result_sz, dtype=cython_dtype)
            if needs_2d:
                result = result.reshape((-1, 1))
            # Build the Cython call by partially applying positional
            # arguments in the order the Cython signatures expect:
            # result, [counts], [values], labels, [min_count], [mask],
            # [ngroups], then keyword kwargs.
            func = partial(base_func, result)

            inferences = None

            if needs_counts:
                counts = np.zeros(self.ngroups, dtype=np.int64)
                func = partial(func, counts)

            if needs_values:
                vals = values
                if pre_processing:
                    try:
                        vals, inferences = pre_processing(vals)
                    except TypeError as e:
                        # Remember the error; if every column fails we
                        # re-raise it after the loop.
                        error_msg = str(e)
                        continue
                vals = vals.astype(cython_dtype, copy=False)
                if needs_2d:
                    vals = vals.reshape((-1, 1))
                func = partial(func, vals)

            func = partial(func, labels)

            if min_count is not None:
                func = partial(func, min_count)

            if needs_mask:
                mask = isna(values).view(np.uint8)
                func = partial(func, mask)

            if needs_ngroups:
                func = partial(func, ngroups)

            func(**kwargs)  # Call func to modify indexer values in place

            if needs_2d:
                result = result.reshape(-1)

            if result_is_index:
                # The Cython routine returned positional indices; gather the
                # actual values they point to.
                result = algorithms.take_nd(values, result)

            if post_processing:
                result = post_processing(result, inferences)

            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        # error_msg is "" on an frame/series with no rows or columns
        if not output and error_msg != "":
            raise TypeError(error_msg)

        if aggregate:
            return self._wrap_aggregated_output(output, index=self.grouper.result_index)
        else:
            return self._wrap_transformed_output(output)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
|
|||
|
"""
|
|||
|
Shift each group by periods observations.
|
|||
|
|
|||
|
If freq is passed, the index will be increased using the periods and the freq.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
periods : int, default 1
|
|||
|
Number of periods to shift.
|
|||
|
freq : str, optional
|
|||
|
Frequency string.
|
|||
|
axis : axis to shift, default 0
|
|||
|
Shift direction.
|
|||
|
fill_value : optional
|
|||
|
The scalar value to use for newly introduced missing values.
|
|||
|
|
|||
|
.. versionadded:: 0.24.0
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Object shifted within each group.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Index.shift : Shift values of Index.
|
|||
|
tshift : Shift the time index, using the index’s frequency
|
|||
|
if available.
|
|||
|
"""
|
|||
|
if freq is not None or axis != 0 or not isna(fill_value):
|
|||
|
return self.apply(lambda x: x.shift(periods, freq, axis, fill_value))
|
|||
|
|
|||
|
return self._get_cythonized_result(
|
|||
|
"group_shift_indexer",
|
|||
|
numeric_only=False,
|
|||
|
cython_dtype=np.dtype(np.int64),
|
|||
|
needs_ngroups=True,
|
|||
|
result_is_index=True,
|
|||
|
periods=periods,
|
|||
|
)
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Appender(_common_see_also)
|
|||
|
def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
|
|||
|
"""
|
|||
|
Calculate pct_change of each value to previous entry in group.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
Percentage changes within each group.
|
|||
|
"""
|
|||
|
if freq is not None or axis != 0:
|
|||
|
return self.apply(
|
|||
|
lambda x: x.pct_change(
|
|||
|
periods=periods,
|
|||
|
fill_method=fill_method,
|
|||
|
limit=limit,
|
|||
|
freq=freq,
|
|||
|
axis=axis,
|
|||
|
)
|
|||
|
)
|
|||
|
if fill_method is None: # GH30463
|
|||
|
fill_method = "pad"
|
|||
|
limit = 0
|
|||
|
filled = getattr(self, fill_method)(limit=limit)
|
|||
|
fill_grp = filled.groupby(self.grouper.codes, axis=self.axis)
|
|||
|
shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
|
|||
|
return (filled / shifted) - 1
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Substitution(see_also=_common_see_also)
|
|||
|
def head(self, n=5):
|
|||
|
"""
|
|||
|
Return first n rows of each group.
|
|||
|
|
|||
|
Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
|
|||
|
from the original DataFrame with original index and order preserved
|
|||
|
(``as_index`` flag is ignored).
|
|||
|
|
|||
|
Does not work for negative values of `n`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
%(see_also)s
|
|||
|
Examples
|
|||
|
--------
|
|||
|
|
|||
|
>>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
|
|||
|
... columns=['A', 'B'])
|
|||
|
>>> df.groupby('A').head(1)
|
|||
|
A B
|
|||
|
0 1 2
|
|||
|
2 5 6
|
|||
|
>>> df.groupby('A').head(-1)
|
|||
|
Empty DataFrame
|
|||
|
Columns: [A, B]
|
|||
|
Index: []
|
|||
|
"""
|
|||
|
self._reset_group_selection()
|
|||
|
mask = self._cumcount_array() < n
|
|||
|
if self.axis == 0:
|
|||
|
return self._selected_obj[mask]
|
|||
|
else:
|
|||
|
return self._selected_obj.iloc[:, mask]
|
|||
|
|
|||
|
@final
|
|||
|
@Substitution(name="groupby")
|
|||
|
@Substitution(see_also=_common_see_also)
|
|||
|
def tail(self, n=5):
|
|||
|
"""
|
|||
|
Return last n rows of each group.
|
|||
|
|
|||
|
Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
|
|||
|
from the original DataFrame with original index and order preserved
|
|||
|
(``as_index`` flag is ignored).
|
|||
|
|
|||
|
Does not work for negative values of `n`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
Series or DataFrame
|
|||
|
%(see_also)s
|
|||
|
Examples
|
|||
|
--------
|
|||
|
|
|||
|
>>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
|
|||
|
... columns=['A', 'B'])
|
|||
|
>>> df.groupby('A').tail(1)
|
|||
|
A B
|
|||
|
1 a 2
|
|||
|
3 b 2
|
|||
|
>>> df.groupby('A').tail(-1)
|
|||
|
Empty DataFrame
|
|||
|
Columns: [A, B]
|
|||
|
Index: []
|
|||
|
"""
|
|||
|
self._reset_group_selection()
|
|||
|
mask = self._cumcount_array(ascending=False) < n
|
|||
|
if self.axis == 0:
|
|||
|
return self._selected_obj[mask]
|
|||
|
else:
|
|||
|
return self._selected_obj.iloc[:, mask]
|
|||
|
|
|||
|
    @final
    def _reindex_output(
        self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN
    ) -> OutputFrameOrSeries:
        """
        If we have categorical groupers, then we might want to make sure that
        we have a fully re-indexed output to the levels. This means expanding
        the output space to accommodate all values in the cartesian product of
        our groups, regardless of whether they were observed in the data or
        not. This will expand the output space if there are missing groups.

        The method returns early without modifying the input if the number of
        groupings is less than 2, self.observed == True or none of the groupers
        are categorical.

        Parameters
        ----------
        output : Series or DataFrame
            Object resulting from grouping and applying an operation.
        fill_value : scalar, default np.NaN
            Value to use for unobserved categories if self.observed is False.

        Returns
        -------
        Series or DataFrame
            Object (potentially) re-indexed to include all possible groups.
        """
        groupings = self.grouper.groupings
        # Early-outs: nothing to expand with 0 or 1 groupings.
        if groupings is None:
            return output
        elif len(groupings) == 1:
            return output

        # if we only care about the observed values
        # we are done
        elif self.observed:
            return output

        # reindexing only applies to a Categorical grouper
        elif not any(
            isinstance(ping.grouper, (Categorical, CategoricalIndex))
            for ping in groupings
        ):
            return output

        # Full cartesian product of all group levels, sorted.
        levels_list = [ping.group_index for ping in groupings]
        index, _ = MultiIndex.from_product(
            levels_list, names=self.grouper.names
        ).sortlevel()

        if self.as_index:
            # Groups live in the result index; a plain reindex suffices.
            d = {
                self.obj._get_axis_name(self.axis): index,
                "copy": False,
                "fill_value": fill_value,
            }
            return output.reindex(**d)

        # GH 13204
        # Here, the categorical in-axis groupers, which need to be fully
        # expanded, are columns in `output`. An idea is to do:
        # output = output.set_index(self.grouper.names)
        #                .reindex(index).reset_index()
        # but special care has to be taken because of possible not-in-axis
        # groupers.
        # So, we manually select and drop the in-axis grouper columns,
        # reindex `output`, and then reset the in-axis grouper columns.

        # Select in-axis groupers
        in_axis_grps = (
            (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
        )
        g_nums, g_names = zip(*in_axis_grps)

        output = output.drop(labels=list(g_names), axis=1)

        # Set a temp index and reindex (possibly expanding)
        output = output.set_index(self.grouper.result_index).reindex(
            index, copy=False, fill_value=fill_value
        )

        # Reset in-axis grouper columns
        # (using level numbers `g_nums` because level names may not be unique)
        output = output.reset_index(level=g_nums)

        return output.reset_index(drop=True)
|
|||
|
|
|||
|
@final
def sample(
    self,
    n: Optional[int] = None,
    frac: Optional[float] = None,
    replace: bool = False,
    weights: Optional[Union[Sequence, Series]] = None,
    random_state=None,
):
    """
    Return a random sample of items from each group.

    You can use `random_state` for reproducibility.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    n : int, optional
        Number of items to return for each group. Cannot be used with
        `frac` and must be no larger than the smallest group unless
        `replace` is True. Default is one if `frac` is None.
    frac : float, optional
        Fraction of items to return. Cannot be used with `n`.
    replace : bool, default False
        Allow or disallow sampling of the same row more than once.
    weights : list-like, optional
        Default None results in equal probability weighting.
        If passed a list-like then values must have the same length as
        the underlying DataFrame or Series object and will be used as
        sampling probabilities after normalization within each group.
        Values must be non-negative with at least one positive element
        within each group.
    random_state : int, array-like, BitGenerator, np.random.RandomState, optional
        If int, array-like, or BitGenerator (NumPy>=1.17), seed for
        random number generator
        If np.random.RandomState, use as numpy RandomState object.

    Returns
    -------
    Series or DataFrame
        A new object of same type as caller containing items randomly
        sampled within each group from the caller object.

    See Also
    --------
    DataFrame.sample: Generate random samples from a DataFrame object.
    numpy.random.choice: Generate a random sample from a given 1-D numpy
        array.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
    ... )
    >>> df
           a  b
    0    red  0
    1    red  1
    2   blue  2
    3   blue  3
    4  black  4
    5  black  5

    Select one row at random for each distinct value in column a. The
    `random_state` argument can be used to guarantee reproducibility:

    >>> df.groupby("a").sample(n=1, random_state=1)
           a  b
    4  black  4
    2   blue  2
    1    red  1

    Set `frac` to sample fixed proportions rather than counts:

    >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
    5    5
    2    2
    0    0
    Name: b, dtype: int64

    Control sample probabilities within groups by setting weights:

    >>> df.groupby("a").sample(
    ...     n=1,
    ...     weights=[1, 1, 1, 0, 0, 1],
    ...     random_state=1,
    ... )
           a  b
    5  black  5
    2   blue  2
    0    red  0
    """
    from pandas.core.reshape.concat import concat

    # Build one weight vector per group, in the same order that iterating
    # over ``self`` yields the groups (``self.indices`` shares that order).
    if weights is None:
        group_weights = [None] * self.ngroups
    else:
        # Align the caller-supplied weights with the selected object so the
        # positional group indices below can slice them.
        aligned = Series(weights, index=self._selected_obj.index)
        group_weights = [aligned[positions] for positions in self.indices.values()]

    if random_state is not None:
        # Normalize int seeds / array-likes / BitGenerators into a
        # np.random.RandomState instance shared by every per-group draw.
        random_state = com.random_state(random_state)

    # Sample each group independently, then stitch the pieces back together.
    pieces = []
    for (_, group_obj), group_w in zip(self, group_weights):
        sampled = group_obj.sample(
            n=n, frac=frac, replace=replace, weights=group_w, random_state=random_state
        )
        pieces.append(sampled)

    return concat(pieces, axis=self.axis)
|
|||
|
|
|||
|
|
|||
|
@doc(GroupBy)
def get_groupby(
    obj: NDFrame,
    by: Optional[_KeysArgType] = None,
    axis: int = 0,
    level=None,
    grouper: "Optional[ops.BaseGrouper]" = None,
    exclusions=None,
    selection=None,
    as_index: bool = True,
    sort: bool = True,
    group_keys: bool = True,
    squeeze: bool = False,
    observed: bool = False,
    mutated: bool = False,
    dropna: bool = True,
) -> GroupBy:
    """
    Instantiate the GroupBy subclass matching the type of ``obj``.

    A ``Series`` produces a ``SeriesGroupBy`` and a ``DataFrame`` a
    ``DataFrameGroupBy``; any other type raises ``TypeError``. All
    remaining arguments are forwarded to the subclass constructor.
    """
    # Imported lazily: pandas.core.groupby.generic itself imports this module.
    from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy

    klass: Type[GroupBy]
    if isinstance(obj, Series):
        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        klass = DataFrameGroupBy
    else:
        raise TypeError(f"invalid type: {obj}")

    return klass(
        obj=obj,
        keys=by,
        axis=axis,
        level=level,
        grouper=grouper,
        exclusions=exclusions,
        selection=selection,
        as_index=as_index,
        sort=sort,
        group_keys=group_keys,
        squeeze=squeeze,
        observed=observed,
        mutated=mutated,
        dropna=dropna,
    )