304 lines
9.3 KiB
Python
304 lines
9.3 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
from typing import (
|
||
|
TYPE_CHECKING,
|
||
|
Iterable,
|
||
|
Literal,
|
||
|
cast,
|
||
|
)
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from pandas._typing import PositionalIndexer
|
||
|
from pandas.util._decorators import (
|
||
|
cache_readonly,
|
||
|
doc,
|
||
|
)
|
||
|
|
||
|
from pandas.core.dtypes.common import (
|
||
|
is_integer,
|
||
|
is_list_like,
|
||
|
)
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
from pandas import (
|
||
|
DataFrame,
|
||
|
Series,
|
||
|
)
|
||
|
from pandas.core.groupby import groupby
|
||
|
|
||
|
|
||
|
class GroupByIndexingMixin:
    """
    Mixin for adding ._positional_selector to GroupBy.

    The host class is expected to provide ``_cumcount_array``, which the
    ``_ascending_count`` / ``_descending_count`` helpers below call.
    """

    @cache_readonly
    def _positional_selector(self) -> GroupByPositionalSelector:
        """
        Return positional selection for each group.

        ``groupby._positional_selector[i:j]`` is similar to
        ``groupby.apply(lambda x: x.iloc[i:j])``
        but much faster and preserves the original index and order.

        ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
        and :meth:`~GroupBy.tail`. For example:

        - ``head(5)``
        - ``_positional_selector[5:-5]``
        - ``tail(5)``

        together return all the rows.

        Allowed inputs for the index are:

        - An integer valued iterable, e.g. ``range(2, 4)``.
        - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.

        The output format is the same as :meth:`~GroupBy.head` and
        :meth:`~GroupBy.tail`, namely
        a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.

        Returns
        -------
        Series
            The filtered subset of the original Series.
        DataFrame
            The filtered subset of the original DataFrame.

        See Also
        --------
        DataFrame.iloc : Purely integer-location based indexing for selection by
            position.
        GroupBy.head : Return first n rows of each group.
        GroupBy.tail : Return last n rows of each group.
        GroupBy.nth : Take the nth row from each group if n is an int, or a
            subset of rows, if n is a list of ints.

        Notes
        -----
        - The slice step cannot be negative.
        - If the index specification results in overlaps, the item is not duplicated.
        - If the index specification changes the order of items, then
          they are returned in their original order.
          By contrast, ``DataFrame.iloc`` can change the row order.
        - ``groupby()`` parameters such as as_index and dropna are ignored.

        The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
        with ``as_index=False`` are:

        - Input to ``_positional_selector`` can include
          one or more slices whereas ``nth``
          just handles an integer or a list of integers.
        - ``_positional_selector`` can accept a slice relative to the
          last row of each group.
        - ``_positional_selector`` does not have an equivalent to the
          ``nth()`` ``dropna`` parameter.

        Examples
        --------
        >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
        ...                   columns=["A", "B"])
        >>> df.groupby("A")._positional_selector[1:2]
           A  B
        1  a  2
        4  b  5

        >>> df.groupby("A")._positional_selector[1, -1]
           A  B
        1  a  2
        2  a  3
        4  b  5
        """
        if TYPE_CHECKING:
            # pylint: disable-next=used-before-assignment
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return GroupByPositionalSelector(groupby_self)

    def _make_mask_from_positional_indexer(
        self,
        arg: PositionalIndexer | tuple,
    ) -> np.ndarray:
        """
        Build a boolean row mask from a positional indexer.

        Parameters
        ----------
        arg : PositionalIndexer | tuple
            An integer, a slice, a list-like of integers, or a list-like
            mixing integers and slices.

        Returns
        -------
        np.ndarray
            Boolean array with the same length as ``_ascending_count``.

        Raises
        ------
        TypeError
            If ``arg`` is not an integer, a slice or list-like.
        ValueError
            If a list-like contains something other than integers and
            slices, or if a slice has a negative step.
        """
        if is_list_like(arg):
            # Materialise once: ``arg`` may be a one-shot iterator (e.g. a
            # generator).  The integer check below would otherwise consume
            # it, leaving nothing (or only a tail) to build the mask from.
            items = list(cast(Iterable, arg))
            if all(is_integer(i) for i in items):
                mask = self._make_mask_from_list(cast(Iterable[int], items))
            else:
                mask = self._make_mask_from_tuple(tuple(items))

        elif isinstance(arg, slice):
            mask = self._make_mask_from_slice(arg)
        elif is_integer(arg):
            mask = self._make_mask_from_int(cast(int, arg))
        else:
            raise TypeError(
                f"Invalid index {type(arg)}. "
                "Must be integer, list-like, slice or a tuple of "
                "integers and slices"
            )

        if isinstance(mask, bool):
            # The helpers return a plain bool when no array comparison was
            # needed; expand it to an array shaped like the counts.
            if mask:
                mask = self._ascending_count >= 0
            else:
                mask = self._ascending_count < 0

        return cast(np.ndarray, mask)

    def _make_mask_from_int(self, arg: int) -> np.ndarray:
        """
        Return the mask selecting a single position in each group.

        Non-negative ``arg`` is matched against ``_ascending_count``;
        negative ``arg`` is matched against ``_descending_count`` so that
        ``-1`` corresponds to a descending count of ``0``.
        """
        if arg >= 0:
            return self._ascending_count == arg
        else:
            return self._descending_count == (-arg - 1)

    def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
        """
        Return the mask selecting any of several integer positions.

        Parameters
        ----------
        args : Iterable[int]
            Integer positions; negative values count from the end.  The
            iterable is traversed exactly once, so one-shot iterators are
            supported.

        Returns
        -------
        bool or np.ndarray
            ``False`` when ``args`` is empty, otherwise a boolean array.
        """
        positive: list[int] = []
        negative: list[int] = []
        # Single pass so that a generator is not exhausted before the
        # negative positions are collected.
        for arg in args:
            if arg >= 0:
                positive.append(arg)
            else:
                # Same translation as in ``_make_mask_from_int``.
                negative.append(-arg - 1)

        mask: bool | np.ndarray = False

        if positive:
            mask |= np.isin(self._ascending_count, positive)

        if negative:
            mask |= np.isin(self._descending_count, negative)

        return mask

    def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
        """
        Return the union of the masks for each integer or slice in ``args``.

        Returns
        -------
        bool or np.ndarray
            ``False`` when ``args`` is empty, otherwise the OR of the
            per-element masks.

        Raises
        ------
        ValueError
            If an element is neither an integer nor a slice.
        """
        mask: bool | np.ndarray = False

        for arg in args:
            if is_integer(arg):
                mask |= self._make_mask_from_int(cast(int, arg))
            elif isinstance(arg, slice):
                mask |= self._make_mask_from_slice(arg)
            else:
                raise ValueError(
                    f"Invalid argument {type(arg)}. Should be int or slice."
                )

        return mask

    def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
        """
        Return the mask selecting the positions covered by a slice.

        Parameters
        ----------
        arg : slice
            ``start`` and ``stop`` may be negative (relative to the end of
            each group); ``step`` must not be negative.

        Returns
        -------
        bool or np.ndarray
            ``True`` when the slice imposes no restriction (e.g.
            ``slice(None)``), otherwise a boolean array.

        Raises
        ------
        ValueError
            If ``step`` is negative.
        """
        start = arg.start
        stop = arg.stop
        step = arg.step

        if step is not None and step < 0:
            raise ValueError(f"Invalid step {step}. Must be non-negative")

        # Start from "everything selected" and narrow down with &=.
        mask: bool | np.ndarray = True

        if step is None:
            step = 1

        if start is None:
            if step > 1:
                mask &= self._ascending_count % step == 0

        elif start >= 0:
            mask &= self._ascending_count >= start

            if step > 1:
                # Stepping is anchored at ``start``.
                mask &= (self._ascending_count - start) % step == 0

        else:
            mask &= self._descending_count < -start

            # Distance of each row from the (negative) start position.
            offset_array = self._descending_count + start + 1
            # NOTE(review): ``ascending + descending + 1`` is presumably the
            # group length, so this flags groups shorter than ``-start``;
            # for those the stepping is anchored at the first row instead.
            limit_array = (
                self._ascending_count + self._descending_count + (start + 1)
            ) < 0
            offset_array = np.where(limit_array, self._ascending_count, offset_array)

            mask &= offset_array % step == 0

        if stop is not None:
            if stop >= 0:
                mask &= self._ascending_count < stop
            else:
                mask &= self._descending_count >= -stop

        return mask

    @cache_readonly
    def _ascending_count(self) -> np.ndarray:
        """
        Cached result of ``_cumcount_array()`` on the host groupby object.

        Used above as each row's position counted from the start of its
        group.
        """
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return groupby_self._cumcount_array()

    @cache_readonly
    def _descending_count(self) -> np.ndarray:
        """
        Cached result of ``_cumcount_array(ascending=False)`` on the host.

        Used above as each row's position counted from the end of its
        group.
        """
        if TYPE_CHECKING:
            groupby_self = cast(groupby.GroupBy, self)
        else:
            groupby_self = self

        return groupby_self._cumcount_array(ascending=False)
|
||
|
|
||
|
|
||
|
# NOTE: no class docstring on purpose -- ``@doc`` builds ``__doc__`` from
# ``GroupByIndexingMixin._positional_selector`` and would append any docstring
# written here to it.
@doc(GroupByIndexingMixin._positional_selector)
class GroupByPositionalSelector:
    def __init__(self, groupby_object: groupby.GroupBy) -> None:
        # The groupby whose rows are selected by ``__getitem__``.
        self.groupby_object = groupby_object

    def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
        """
        Pick rows from every group by their position within the group.

        This is the implementation behind ``GroupBy._positional_selector``.

        Parameters
        ----------
        arg : PositionalIndexer | tuple
            One of:

            - an int
            - an iterable of ints, such as a list or a range
            - a slice whose step is None or positive
            - a tuple mixing integers and slices

        Returns
        -------
        Series
            The filtered subset of the original groupby Series.
        DataFrame
            The filtered subset of the original groupby DataFrame.

        See Also
        --------
        DataFrame.iloc : Integer-location based indexing for selection by position.
        GroupBy.head : Return first n rows of each group.
        GroupBy.tail : Return last n rows of each group.
        GroupBy._positional_selector : Return positional selection for each group.
        GroupBy.nth : Take the nth row from each group if n is an int, or a
            subset of rows, if n is a list of ints.
        """
        grouped = self.groupby_object
        # First turn the indexer into a boolean row mask, then let the
        # groupby apply that mask to its selected object.
        row_mask = grouped._make_mask_from_positional_indexer(arg)
        return grouped._mask_selected_obj(row_mask)
|
||
|
|
||
|
|
||
|
class GroupByNthSelector:
    """
    Dynamically substituted for GroupBy.nth to enable both call and index

    Both ``selector(n, dropna)`` and ``selector[n]`` forward to the wrapped
    groupby object's ``_nth``.
    """

    def __init__(self, groupby_object: groupby.GroupBy) -> None:
        # Object whose ``_nth`` does the actual work.
        self.groupby_object = groupby_object

    def __call__(
        self,
        n: PositionalIndexer | tuple,
        dropna: Literal["any", "all", None] = None,
    ) -> DataFrame | Series:
        """Call form: forward ``n`` and ``dropna`` positionally to ``_nth``."""
        target = self.groupby_object
        return target._nth(n, dropna)

    def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
        """Index form: forward only ``n``; ``_nth`` uses its own ``dropna`` default."""
        target = self.groupby_object
        return target._nth(n)
|