804 lines
24 KiB
Python
804 lines
24 KiB
Python
|
"""
|
||
|
aggregation.py contains utility functions to handle multiple named and lambda
|
||
|
kwarg aggregations in groupby and DataFrame/Series aggregation
|
||
|
"""
|
||
|
|
||
|
from collections import defaultdict
|
||
|
from functools import partial
|
||
|
from typing import (
|
||
|
TYPE_CHECKING,
|
||
|
Any,
|
||
|
Callable,
|
||
|
DefaultDict,
|
||
|
Dict,
|
||
|
Iterable,
|
||
|
List,
|
||
|
Optional,
|
||
|
Sequence,
|
||
|
Tuple,
|
||
|
Union,
|
||
|
cast,
|
||
|
)
|
||
|
|
||
|
from pandas._typing import (
|
||
|
AggFuncType,
|
||
|
AggFuncTypeBase,
|
||
|
AggFuncTypeDict,
|
||
|
AggObjType,
|
||
|
Axis,
|
||
|
FrameOrSeries,
|
||
|
FrameOrSeriesUnion,
|
||
|
Label,
|
||
|
)
|
||
|
|
||
|
from pandas.core.dtypes.cast import is_nested_object
|
||
|
from pandas.core.dtypes.common import is_dict_like, is_list_like
|
||
|
from pandas.core.dtypes.generic import ABCDataFrame, ABCNDFrame, ABCSeries
|
||
|
|
||
|
from pandas.core.base import DataError, SpecificationError
|
||
|
import pandas.core.common as com
|
||
|
from pandas.core.indexes.api import Index
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
from pandas.core.series import Series
|
||
|
|
||
|
|
||
|
def reconstruct_func(
|
||
|
func: Optional[AggFuncType], **kwargs
|
||
|
) -> Tuple[bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]]]:
|
||
|
"""
|
||
|
This is the internal function to reconstruct func given if there is relabeling
|
||
|
or not and also normalize the keyword to get new order of columns.
|
||
|
|
||
|
If named aggregation is applied, `func` will be None, and kwargs contains the
|
||
|
column and aggregation function information to be parsed;
|
||
|
If named aggregation is not applied, `func` is either string (e.g. 'min') or
|
||
|
Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name
|
||
|
and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]})
|
||
|
|
||
|
If relabeling is True, will return relabeling, reconstructed func, column
|
||
|
names, and the reconstructed order of columns.
|
||
|
If relabeling is False, the columns and order will be None.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
func: agg function (e.g. 'min' or Callable) or list of agg functions
|
||
|
(e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
|
||
|
**kwargs: dict, kwargs used in is_multi_agg_with_relabel and
|
||
|
normalize_keyword_aggregation function for relabelling
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
relabelling: bool, if there is relabelling or not
|
||
|
func: normalized and mangled func
|
||
|
columns: list of column names
|
||
|
order: list of columns indices
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> reconstruct_func(None, **{"foo": ("col", "min")})
|
||
|
(True, defaultdict(<class 'list'>, {'col': ['min']}), ('foo',), array([0]))
|
||
|
|
||
|
>>> reconstruct_func("min")
|
||
|
(False, 'min', None, None)
|
||
|
"""
|
||
|
relabeling = func is None and is_multi_agg_with_relabel(**kwargs)
|
||
|
columns: Optional[List[str]] = None
|
||
|
order: Optional[List[int]] = None
|
||
|
|
||
|
if not relabeling:
|
||
|
if isinstance(func, list) and len(func) > len(set(func)):
|
||
|
|
||
|
# GH 28426 will raise error if duplicated function names are used and
|
||
|
# there is no reassigned name
|
||
|
raise SpecificationError(
|
||
|
"Function names must be unique if there is no new column names "
|
||
|
"assigned"
|
||
|
)
|
||
|
elif func is None:
|
||
|
# nicer error message
|
||
|
raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")
|
||
|
|
||
|
if relabeling:
|
||
|
func, columns, order = normalize_keyword_aggregation(kwargs)
|
||
|
|
||
|
return relabeling, func, columns, order
|
||
|
|
||
|
|
||
|
def is_multi_agg_with_relabel(**kwargs) -> bool:
|
||
|
"""
|
||
|
Check whether kwargs passed to .agg look like multi-agg with relabeling.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
**kwargs : dict
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
bool
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> is_multi_agg_with_relabel(a="max")
|
||
|
False
|
||
|
>>> is_multi_agg_with_relabel(a_max=("a", "max"), a_min=("a", "min"))
|
||
|
True
|
||
|
>>> is_multi_agg_with_relabel()
|
||
|
False
|
||
|
"""
|
||
|
return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and (
|
||
|
len(kwargs) > 0
|
||
|
)
|
||
|
|
||
|
|
||
|
def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[int]]:
|
||
|
"""
|
||
|
Normalize user-provided "named aggregation" kwargs.
|
||
|
Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
|
||
|
to the old Dict[str, List[scalar]]].
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
kwargs : dict
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
aggspec : dict
|
||
|
The transformed kwargs.
|
||
|
columns : List[str]
|
||
|
The user-provided keys.
|
||
|
col_idx_order : List[int]
|
||
|
List of columns indices.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> normalize_keyword_aggregation({"output": ("input", "sum")})
|
||
|
(defaultdict(<class 'list'>, {'input': ['sum']}), ('output',), array([0]))
|
||
|
"""
|
||
|
# Normalize the aggregation functions as Mapping[column, List[func]],
|
||
|
# process normally, then fixup the names.
|
||
|
# TODO: aggspec type: typing.Dict[str, List[AggScalar]]
|
||
|
# May be hitting https://github.com/python/mypy/issues/5958
|
||
|
# saying it doesn't have an attribute __name__
|
||
|
aggspec: DefaultDict = defaultdict(list)
|
||
|
order = []
|
||
|
columns, pairs = list(zip(*kwargs.items()))
|
||
|
|
||
|
for name, (column, aggfunc) in zip(columns, pairs):
|
||
|
aggspec[column].append(aggfunc)
|
||
|
order.append((column, com.get_callable_name(aggfunc) or aggfunc))
|
||
|
|
||
|
# uniquify aggfunc name if duplicated in order list
|
||
|
uniquified_order = _make_unique_kwarg_list(order)
|
||
|
|
||
|
# GH 25719, due to aggspec will change the order of assigned columns in aggregation
|
||
|
# uniquified_aggspec will store uniquified order list and will compare it with order
|
||
|
# based on index
|
||
|
aggspec_order = [
|
||
|
(column, com.get_callable_name(aggfunc) or aggfunc)
|
||
|
for column, aggfuncs in aggspec.items()
|
||
|
for aggfunc in aggfuncs
|
||
|
]
|
||
|
uniquified_aggspec = _make_unique_kwarg_list(aggspec_order)
|
||
|
|
||
|
# get the new index of columns by comparison
|
||
|
col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
|
||
|
return aggspec, columns, col_idx_order
|
||
|
|
||
|
|
||
|
def _make_unique_kwarg_list(
|
||
|
seq: Sequence[Tuple[Any, Any]]
|
||
|
) -> Sequence[Tuple[Any, Any]]:
|
||
|
"""
|
||
|
Uniquify aggfunc name of the pairs in the order list
|
||
|
|
||
|
Examples:
|
||
|
--------
|
||
|
>>> kwarg_list = [('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')]
|
||
|
>>> _make_unique_kwarg_list(kwarg_list)
|
||
|
[('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
|
||
|
"""
|
||
|
return [
|
||
|
(pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
|
||
|
if seq.count(pair) > 1
|
||
|
else pair
|
||
|
for i, pair in enumerate(seq)
|
||
|
]
|
||
|
|
||
|
|
||
|
# TODO: Can't use, because mypy doesn't like us setting __name__
|
||
|
# error: "partial[Any]" has no attribute "__name__"
|
||
|
# the type is:
|
||
|
# typing.Sequence[Callable[..., ScalarResult]]
|
||
|
# -> typing.Sequence[Callable[..., ScalarResult]]:
|
||
|
|
||
|
|
||
|
def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
|
||
|
"""
|
||
|
Possibly mangle a list of aggfuncs.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
aggfuncs : Sequence
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
mangled: list-like
|
||
|
A new AggSpec sequence, where lambdas have been converted
|
||
|
to have unique names.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
If just one aggfunc is passed, the name will not be mangled.
|
||
|
"""
|
||
|
if len(aggfuncs) <= 1:
|
||
|
# don't mangle for .agg([lambda x: .])
|
||
|
return aggfuncs
|
||
|
i = 0
|
||
|
mangled_aggfuncs = []
|
||
|
for aggfunc in aggfuncs:
|
||
|
if com.get_callable_name(aggfunc) == "<lambda>":
|
||
|
aggfunc = partial(aggfunc)
|
||
|
aggfunc.__name__ = f"<lambda_{i}>"
|
||
|
i += 1
|
||
|
mangled_aggfuncs.append(aggfunc)
|
||
|
|
||
|
return mangled_aggfuncs
|
||
|
|
||
|
|
||
|
def maybe_mangle_lambdas(agg_spec: Any) -> Any:
|
||
|
"""
|
||
|
Make new lambdas with unique names.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
agg_spec : Any
|
||
|
An argument to GroupBy.agg.
|
||
|
Non-dict-like `agg_spec` are pass through as is.
|
||
|
For dict-like `agg_spec` a new spec is returned
|
||
|
with name-mangled lambdas.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
mangled : Any
|
||
|
Same type as the input.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> maybe_mangle_lambdas('sum')
|
||
|
'sum'
|
||
|
>>> maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP
|
||
|
[<function __main__.<lambda_0>,
|
||
|
<function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
|
||
|
"""
|
||
|
is_dict = is_dict_like(agg_spec)
|
||
|
if not (is_dict or is_list_like(agg_spec)):
|
||
|
return agg_spec
|
||
|
mangled_aggspec = type(agg_spec)() # dict or OrderedDict
|
||
|
|
||
|
if is_dict:
|
||
|
for key, aggfuncs in agg_spec.items():
|
||
|
if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
|
||
|
mangled_aggfuncs = _managle_lambda_list(aggfuncs)
|
||
|
else:
|
||
|
mangled_aggfuncs = aggfuncs
|
||
|
|
||
|
mangled_aggspec[key] = mangled_aggfuncs
|
||
|
else:
|
||
|
mangled_aggspec = _managle_lambda_list(agg_spec)
|
||
|
|
||
|
return mangled_aggspec
|
||
|
|
||
|
|
||
|
def relabel_result(
|
||
|
result: FrameOrSeries,
|
||
|
func: Dict[str, List[Union[Callable, str]]],
|
||
|
columns: Iterable[Label],
|
||
|
order: Iterable[int],
|
||
|
) -> Dict[Label, "Series"]:
|
||
|
"""
|
||
|
Internal function to reorder result if relabelling is True for
|
||
|
dataframe.agg, and return the reordered result in dict.
|
||
|
|
||
|
Parameters:
|
||
|
----------
|
||
|
result: Result from aggregation
|
||
|
func: Dict of (column name, funcs)
|
||
|
columns: New columns name for relabelling
|
||
|
order: New order for relabelling
|
||
|
|
||
|
Examples:
|
||
|
---------
|
||
|
>>> result = DataFrame({"A": [np.nan, 2, np.nan],
|
||
|
... "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}) # doctest: +SKIP
|
||
|
>>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
|
||
|
>>> columns = ("foo", "aab", "bar", "dat")
|
||
|
>>> order = [0, 1, 2, 3]
|
||
|
>>> _relabel_result(result, func, columns, order) # doctest: +SKIP
|
||
|
dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
|
||
|
C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
|
||
|
B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"]))
|
||
|
"""
|
||
|
reordered_indexes = [
|
||
|
pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1])
|
||
|
]
|
||
|
reordered_result_in_dict: Dict[Label, "Series"] = {}
|
||
|
idx = 0
|
||
|
|
||
|
reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1
|
||
|
for col, fun in func.items():
|
||
|
s = result[col].dropna()
|
||
|
|
||
|
# In the `_aggregate`, the callable names are obtained and used in `result`, and
|
||
|
# these names are ordered alphabetically. e.g.
|
||
|
# C2 C1
|
||
|
# <lambda> 1 NaN
|
||
|
# amax NaN 4.0
|
||
|
# max NaN 4.0
|
||
|
# sum 18.0 6.0
|
||
|
# Therefore, the order of functions for each column could be shuffled
|
||
|
# accordingly so need to get the callable name if it is not parsed names, and
|
||
|
# reorder the aggregated result for each column.
|
||
|
# e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is
|
||
|
# [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to
|
||
|
# reorder so that aggregated values map to their functions regarding the order.
|
||
|
|
||
|
# However there is only one column being used for aggregation, not need to
|
||
|
# reorder since the index is not sorted, and keep as is in `funcs`, e.g.
|
||
|
# A
|
||
|
# min 1.0
|
||
|
# mean 1.5
|
||
|
# mean 1.5
|
||
|
if reorder_mask:
|
||
|
fun = [
|
||
|
com.get_callable_name(f) if not isinstance(f, str) else f for f in fun
|
||
|
]
|
||
|
col_idx_order = Index(s.index).get_indexer(fun)
|
||
|
s = s[col_idx_order]
|
||
|
|
||
|
# assign the new user-provided "named aggregation" as index names, and reindex
|
||
|
# it based on the whole user-provided names.
|
||
|
s.index = reordered_indexes[idx : idx + len(fun)]
|
||
|
reordered_result_in_dict[col] = s.reindex(columns, copy=False)
|
||
|
idx = idx + len(fun)
|
||
|
return reordered_result_in_dict
|
||
|
|
||
|
|
||
|
def validate_func_kwargs(
|
||
|
kwargs: dict,
|
||
|
) -> Tuple[List[str], List[Union[str, Callable[..., Any]]]]:
|
||
|
"""
|
||
|
Validates types of user-provided "named aggregation" kwargs.
|
||
|
`TypeError` is raised if aggfunc is not `str` or callable.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
kwargs : dict
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
columns : List[str]
|
||
|
List of user-provied keys.
|
||
|
func : List[Union[str, callable[...,Any]]]
|
||
|
List of user-provided aggfuncs
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> validate_func_kwargs({'one': 'min', 'two': 'max'})
|
||
|
(['one', 'two'], ['min', 'max'])
|
||
|
"""
|
||
|
no_arg_message = "Must provide 'func' or named aggregation **kwargs."
|
||
|
tuple_given_message = "func is expected but received {} in **kwargs."
|
||
|
columns = list(kwargs)
|
||
|
func = []
|
||
|
for col_func in kwargs.values():
|
||
|
if not (isinstance(col_func, str) or callable(col_func)):
|
||
|
raise TypeError(tuple_given_message.format(type(col_func).__name__))
|
||
|
func.append(col_func)
|
||
|
if not columns:
|
||
|
raise TypeError(no_arg_message)
|
||
|
return columns, func
|
||
|
|
||
|
|
||
|
def transform(
|
||
|
obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs
|
||
|
) -> FrameOrSeriesUnion:
|
||
|
"""
|
||
|
Transform a DataFrame or Series
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
obj : DataFrame or Series
|
||
|
Object to compute the transform on.
|
||
|
func : string, function, list, or dictionary
|
||
|
Function(s) to compute the transform with.
|
||
|
axis : {0 or 'index', 1 or 'columns'}
|
||
|
Axis along which the function is applied:
|
||
|
|
||
|
* 0 or 'index': apply function to each column.
|
||
|
* 1 or 'columns': apply function to each row.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
DataFrame or Series
|
||
|
Result of applying ``func`` along the given axis of the
|
||
|
Series or DataFrame.
|
||
|
|
||
|
Raises
|
||
|
------
|
||
|
ValueError
|
||
|
If the transform function fails or does not transform.
|
||
|
"""
|
||
|
is_series = obj.ndim == 1
|
||
|
|
||
|
if obj._get_axis_number(axis) == 1:
|
||
|
assert not is_series
|
||
|
return transform(obj.T, func, 0, *args, **kwargs).T
|
||
|
|
||
|
if is_list_like(func) and not is_dict_like(func):
|
||
|
func = cast(List[AggFuncTypeBase], func)
|
||
|
# Convert func equivalent dict
|
||
|
if is_series:
|
||
|
func = {com.get_callable_name(v) or v: v for v in func}
|
||
|
else:
|
||
|
func = {col: func for col in obj}
|
||
|
|
||
|
if is_dict_like(func):
|
||
|
func = cast(AggFuncTypeDict, func)
|
||
|
return transform_dict_like(obj, func, *args, **kwargs)
|
||
|
|
||
|
# func is either str or callable
|
||
|
func = cast(AggFuncTypeBase, func)
|
||
|
try:
|
||
|
result = transform_str_or_callable(obj, func, *args, **kwargs)
|
||
|
except Exception:
|
||
|
raise ValueError("Transform function failed")
|
||
|
|
||
|
# Functions that transform may return empty Series/DataFrame
|
||
|
# when the dtype is not appropriate
|
||
|
if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty and not obj.empty:
|
||
|
raise ValueError("Transform function failed")
|
||
|
if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
|
||
|
obj.index
|
||
|
):
|
||
|
raise ValueError("Function did not transform")
|
||
|
|
||
|
return result
|
||
|
|
||
|
|
||
|
def transform_dict_like(
|
||
|
obj: FrameOrSeries,
|
||
|
func: AggFuncTypeDict,
|
||
|
*args,
|
||
|
**kwargs,
|
||
|
):
|
||
|
"""
|
||
|
Compute transform in the case of a dict-like func
|
||
|
"""
|
||
|
from pandas.core.reshape.concat import concat
|
||
|
|
||
|
if len(func) == 0:
|
||
|
raise ValueError("No transform functions were provided")
|
||
|
|
||
|
if obj.ndim != 1:
|
||
|
# Check for missing columns on a frame
|
||
|
cols = sorted(set(func.keys()) - set(obj.columns))
|
||
|
if len(cols) > 0:
|
||
|
raise SpecificationError(f"Column(s) {cols} do not exist")
|
||
|
|
||
|
# Can't use func.values(); wouldn't work for a Series
|
||
|
if any(is_dict_like(v) for _, v in func.items()):
|
||
|
# GH 15931 - deprecation of renaming keys
|
||
|
raise SpecificationError("nested renamer is not supported")
|
||
|
|
||
|
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
|
||
|
|
||
|
# if we have a dict of any non-scalars
|
||
|
# eg. {'A' : ['mean']}, normalize all to
|
||
|
# be list-likes
|
||
|
# Cannot use func.values() because arg may be a Series
|
||
|
if any(is_aggregator(x) for _, x in func.items()):
|
||
|
new_func: AggFuncTypeDict = {}
|
||
|
for k, v in func.items():
|
||
|
if not is_aggregator(v):
|
||
|
# mypy can't realize v is not a list here
|
||
|
new_func[k] = [v] # type:ignore[list-item]
|
||
|
else:
|
||
|
new_func[k] = v
|
||
|
func = new_func
|
||
|
|
||
|
results: Dict[Label, FrameOrSeriesUnion] = {}
|
||
|
for name, how in func.items():
|
||
|
colg = obj._gotitem(name, ndim=1)
|
||
|
try:
|
||
|
results[name] = transform(colg, how, 0, *args, **kwargs)
|
||
|
except Exception as err:
|
||
|
if (
|
||
|
str(err) == "Function did not transform"
|
||
|
or str(err) == "No transform functions were provided"
|
||
|
):
|
||
|
raise err
|
||
|
|
||
|
# combine results
|
||
|
if len(results) == 0:
|
||
|
raise ValueError("Transform function failed")
|
||
|
return concat(results, axis=1)
|
||
|
|
||
|
|
||
|
def transform_str_or_callable(
|
||
|
obj: FrameOrSeries, func: AggFuncTypeBase, *args, **kwargs
|
||
|
) -> FrameOrSeriesUnion:
|
||
|
"""
|
||
|
Compute transform in the case of a string or callable func
|
||
|
"""
|
||
|
if isinstance(func, str):
|
||
|
return obj._try_aggregate_string_function(func, *args, **kwargs)
|
||
|
|
||
|
if not args and not kwargs:
|
||
|
f = obj._get_cython_func(func)
|
||
|
if f:
|
||
|
return getattr(obj, f)()
|
||
|
|
||
|
# Two possible ways to use a UDF - apply or call directly
|
||
|
try:
|
||
|
return obj.apply(func, args=args, **kwargs)
|
||
|
except Exception:
|
||
|
return func(obj, *args, **kwargs)
|
||
|
|
||
|
|
||
|
def aggregate(
|
||
|
obj: AggObjType,
|
||
|
arg: AggFuncType,
|
||
|
*args,
|
||
|
**kwargs,
|
||
|
):
|
||
|
"""
|
||
|
Provide an implementation for the aggregators.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
obj : Pandas object to compute aggregation on.
|
||
|
arg : string, dict, function.
|
||
|
*args : args to pass on to the function.
|
||
|
**kwargs : kwargs to pass on to the function.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
tuple of result, how.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
how can be a string describe the required post-processing, or
|
||
|
None if not required.
|
||
|
"""
|
||
|
_axis = kwargs.pop("_axis", None)
|
||
|
if _axis is None:
|
||
|
_axis = getattr(obj, "axis", 0)
|
||
|
|
||
|
if isinstance(arg, str):
|
||
|
return obj._try_aggregate_string_function(arg, *args, **kwargs), None
|
||
|
elif is_dict_like(arg):
|
||
|
arg = cast(AggFuncTypeDict, arg)
|
||
|
return agg_dict_like(obj, arg, _axis), True
|
||
|
elif is_list_like(arg):
|
||
|
# we require a list, but not an 'str'
|
||
|
arg = cast(List[AggFuncTypeBase], arg)
|
||
|
return agg_list_like(obj, arg, _axis=_axis), None
|
||
|
else:
|
||
|
result = None
|
||
|
|
||
|
if callable(arg):
|
||
|
f = obj._get_cython_func(arg)
|
||
|
if f and not args and not kwargs:
|
||
|
return getattr(obj, f)(), None
|
||
|
|
||
|
# caller can react
|
||
|
return result, True
|
||
|
|
||
|
|
||
|
def agg_list_like(
|
||
|
obj: AggObjType,
|
||
|
arg: List[AggFuncTypeBase],
|
||
|
_axis: int,
|
||
|
) -> FrameOrSeriesUnion:
|
||
|
"""
|
||
|
Compute aggregation in the case of a list-like argument.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
obj : Pandas object to compute aggregation on.
|
||
|
arg : list
|
||
|
Aggregations to compute.
|
||
|
_axis : int, 0 or 1
|
||
|
Axis to compute aggregation on.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Result of aggregation.
|
||
|
"""
|
||
|
from pandas.core.reshape.concat import concat
|
||
|
|
||
|
if _axis != 0:
|
||
|
raise NotImplementedError("axis other than 0 is not supported")
|
||
|
|
||
|
if obj._selected_obj.ndim == 1:
|
||
|
selected_obj = obj._selected_obj
|
||
|
else:
|
||
|
selected_obj = obj._obj_with_exclusions
|
||
|
|
||
|
results = []
|
||
|
keys = []
|
||
|
|
||
|
# degenerate case
|
||
|
if selected_obj.ndim == 1:
|
||
|
for a in arg:
|
||
|
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
|
||
|
try:
|
||
|
new_res = colg.aggregate(a)
|
||
|
|
||
|
except TypeError:
|
||
|
pass
|
||
|
else:
|
||
|
results.append(new_res)
|
||
|
|
||
|
# make sure we find a good name
|
||
|
name = com.get_callable_name(a) or a
|
||
|
keys.append(name)
|
||
|
|
||
|
# multiples
|
||
|
else:
|
||
|
for index, col in enumerate(selected_obj):
|
||
|
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
|
||
|
try:
|
||
|
new_res = colg.aggregate(arg)
|
||
|
except (TypeError, DataError):
|
||
|
pass
|
||
|
except ValueError as err:
|
||
|
# cannot aggregate
|
||
|
if "Must produce aggregated value" in str(err):
|
||
|
# raised directly in _aggregate_named
|
||
|
pass
|
||
|
elif "no results" in str(err):
|
||
|
# raised directly in _aggregate_multiple_funcs
|
||
|
pass
|
||
|
else:
|
||
|
raise
|
||
|
else:
|
||
|
results.append(new_res)
|
||
|
keys.append(col)
|
||
|
|
||
|
# if we are empty
|
||
|
if not len(results):
|
||
|
raise ValueError("no results")
|
||
|
|
||
|
try:
|
||
|
return concat(results, keys=keys, axis=1, sort=False)
|
||
|
except TypeError as err:
|
||
|
|
||
|
# we are concatting non-NDFrame objects,
|
||
|
# e.g. a list of scalars
|
||
|
|
||
|
from pandas import Series
|
||
|
|
||
|
result = Series(results, index=keys, name=obj.name)
|
||
|
if is_nested_object(result):
|
||
|
raise ValueError(
|
||
|
"cannot combine transform and aggregation operations"
|
||
|
) from err
|
||
|
return result
|
||
|
|
||
|
|
||
|
def agg_dict_like(
|
||
|
obj: AggObjType,
|
||
|
arg: AggFuncTypeDict,
|
||
|
_axis: int,
|
||
|
) -> FrameOrSeriesUnion:
|
||
|
"""
|
||
|
Compute aggregation in the case of a dict-like argument.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
obj : Pandas object to compute aggregation on.
|
||
|
arg : dict
|
||
|
label-aggregation pairs to compute.
|
||
|
_axis : int, 0 or 1
|
||
|
Axis to compute aggregation on.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Result of aggregation.
|
||
|
"""
|
||
|
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
|
||
|
|
||
|
if _axis != 0: # pragma: no cover
|
||
|
raise ValueError("Can only pass dict with axis=0")
|
||
|
|
||
|
selected_obj = obj._selected_obj
|
||
|
|
||
|
# if we have a dict of any non-scalars
|
||
|
# eg. {'A' : ['mean']}, normalize all to
|
||
|
# be list-likes
|
||
|
if any(is_aggregator(x) for x in arg.values()):
|
||
|
new_arg: AggFuncTypeDict = {}
|
||
|
for k, v in arg.items():
|
||
|
if not isinstance(v, (tuple, list, dict)):
|
||
|
new_arg[k] = [v]
|
||
|
else:
|
||
|
new_arg[k] = v
|
||
|
|
||
|
# the keys must be in the columns
|
||
|
# for ndim=2, or renamers for ndim=1
|
||
|
|
||
|
# ok for now, but deprecated
|
||
|
# {'A': { 'ra': 'mean' }}
|
||
|
# {'A': { 'ra': ['mean'] }}
|
||
|
# {'ra': ['mean']}
|
||
|
|
||
|
# not ok
|
||
|
# {'ra' : { 'A' : 'mean' }}
|
||
|
if isinstance(v, dict):
|
||
|
raise SpecificationError("nested renamer is not supported")
|
||
|
elif isinstance(selected_obj, ABCSeries):
|
||
|
raise SpecificationError("nested renamer is not supported")
|
||
|
elif (
|
||
|
isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns
|
||
|
):
|
||
|
raise KeyError(f"Column '{k}' does not exist!")
|
||
|
|
||
|
arg = new_arg
|
||
|
|
||
|
else:
|
||
|
# deprecation of renaming keys
|
||
|
# GH 15931
|
||
|
keys = list(arg.keys())
|
||
|
if isinstance(selected_obj, ABCDataFrame) and len(
|
||
|
selected_obj.columns.intersection(keys)
|
||
|
) != len(keys):
|
||
|
cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys)))
|
||
|
raise SpecificationError(f"Column(s) {cols} do not exist")
|
||
|
|
||
|
from pandas.core.reshape.concat import concat
|
||
|
|
||
|
if selected_obj.ndim == 1:
|
||
|
# key only used for output
|
||
|
colg = obj._gotitem(obj._selection, ndim=1)
|
||
|
results = {key: colg.agg(how) for key, how in arg.items()}
|
||
|
else:
|
||
|
# key used for column selection and output
|
||
|
results = {key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()}
|
||
|
|
||
|
# set the final keys
|
||
|
keys = list(arg.keys())
|
||
|
|
||
|
# Avoid making two isinstance calls in all and any below
|
||
|
is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]
|
||
|
|
||
|
# combine results
|
||
|
if all(is_ndframe):
|
||
|
keys_to_use = [k for k in keys if not results[k].empty]
|
||
|
# Have to check, if at least one DataFrame is not empty.
|
||
|
keys_to_use = keys_to_use if keys_to_use != [] else keys
|
||
|
axis = 0 if isinstance(obj, ABCSeries) else 1
|
||
|
result = concat({k: results[k] for k in keys_to_use}, axis=axis)
|
||
|
elif any(is_ndframe):
|
||
|
# There is a mix of NDFrames and scalars
|
||
|
raise ValueError(
|
||
|
"cannot perform both aggregation "
|
||
|
"and transformation operations "
|
||
|
"simultaneously"
|
||
|
)
|
||
|
else:
|
||
|
from pandas import Series
|
||
|
|
||
|
# we have a dict of scalars
|
||
|
# GH 36212 use name only if obj is a series
|
||
|
if obj.ndim == 1:
|
||
|
obj = cast("Series", obj)
|
||
|
name = obj.name
|
||
|
else:
|
||
|
name = None
|
||
|
|
||
|
result = Series(results, name=name)
|
||
|
|
||
|
return result
|