111 lines
3.5 KiB
Python
111 lines
3.5 KiB
Python
from typing import Optional, Tuple
|
|
|
|
import numpy as np
|
|
|
|
from pandas.core.algorithms import unique1d
|
|
from pandas.core.arrays.categorical import (
|
|
Categorical,
|
|
CategoricalDtype,
|
|
recode_for_categories,
|
|
)
|
|
from pandas.core.indexes.api import CategoricalIndex
|
|
|
|
|
|
def recode_for_groupby(
    c: Categorical, sort: bool, observed: bool
) -> Tuple[Categorical, Optional[Categorical]]:
    """
    Prepare a Categorical so that it can be used as a groupby key.

    Three cases are handled:

    * ``observed=True``: build a new Categorical restricted to the
      categories whose codes actually occur in ``c`` and hand back the
      original alongside it.
    * ``observed=False`` and ``sort=True``: ``c`` is returned untouched.
    * ``observed=False`` and ``sort=False``: ``c`` is returned with its
      categories reordered to those reported by ``.unique()``, followed by
      the categories that do not appear in the data.

    The reordering exists so that the categorical index of the GroupBy
    result lists categories in order of appearance in the data (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    observed : bool
        Account only for the observed values.

    Returns
    -------
    Categorical
        The recoded Categorical. If sort=False, the new categories follow
        the order of appearance in codes (unless ordered=True, in which
        case the original order is preserved), followed by any
        unrepresented categories in the original order.
    Categorical or None
        The original categorical when ``observed`` is True, otherwise None.
    """
    if not observed:
        if sort:
            # Groups are already ordered according to c.categories.
            return c, None

        # sort=False should order groups in as-encountered order (GH-8868)
        seen = c.unique()

        # groupby needs every category present, including the ones missing
        # from the data (GH-13179) that .unique() above dropped, so append
        # them after the encountered ones.
        not_seen = ~c.categories.isin(seen.categories)
        complete = seen.add_categories(c.categories[not_seen])

        return c.reorder_categories(complete.categories), None

    # Only observed values matter from here on.
    # In cases with c.ordered, this is equivalent to
    #  return c.remove_unused_categories(), c
    present_codes = unique1d(c.codes)

    # -1 is excluded: it does not index into c.categories
    present_codes = present_codes[present_codes != -1]
    if c.ordered:
        present_codes = np.sort(present_codes)

    # Recode the data against the reduced set of categories.
    observed_categories = c.categories.take(present_codes)
    recoded = recode_for_categories(c.codes, c.categories, observed_categories)

    # Wrap the new codes and categories in a fresh Categorical.
    observed_dtype = CategoricalDtype(observed_categories, ordered=c.ordered)
    return Categorical(recoded, dtype=observed_dtype, fastpath=True), c
|
|
|
|
|
|
def recode_from_groupby(
    c: Categorical, sort: bool, ci: CategoricalIndex
) -> CategoricalIndex:
    """
    Restore the categories of a groupby result index.

    Counterpart to ``recode_for_groupby``: adjusts the categories of ``ci``
    using the categories of ``c``, depending on ``sort``.

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode.

    Returns
    -------
    CategoricalIndex
        With ``sort=True``, ``ci`` with its categories set to
        ``c.categories``; otherwise ``ci`` with the categories of ``c``
        that it lacks appended at the end.
    """
    if not sort:
        # Not sorting: keep ci's category order and add the unobserved
        # categories to the end.
        absent = ~c.categories.isin(ci.categories)
        extra_cats = c.categories[absent]
        # error: "CategoricalIndex" has no attribute "add_categories"
        return ci.add_categories(extra_cats)  # type: ignore[attr-defined]

    # Sorting: re-order to the original category orderings.
    # error: "CategoricalIndex" has no attribute "set_categories"
    return ci.set_categories(c.categories)  # type: ignore[attr-defined]
|