391 lines
10 KiB
Python
391 lines
10 KiB
Python
|
from typing import Dict
|
||
|
|
||
|
# Registry of reusable docstring templates, keyed by a short name such as
# "aggregate" or "melt". Every value is a plain ``str``; the entries added
# below carry either ``{klass}``-style or ``%(klass)s``-style placeholders.
# NOTE(review): the code that substitutes the placeholders is not visible in
# this file -- confirm which formatting style each consumer expects.
_shared_docs: Dict[str, str] = {}
|
||
|
|
||
|
# Docstring template for ``aggregate`` / ``agg``.
# Placeholders use brace syntax: {klass}, {axis}, {see_also}, {examples}.
# {axis}, {see_also} and {examples} sit at column 0 on their own lines, so the
# substituted text has to bring its own section headers and indentation.
_shared_docs[
    "aggregate"
] = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list or dict
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
{axis}
*args
    Positional arguments to pass to `func`.
**kwargs
    Keyword arguments to pass to `func`.

Returns
-------
scalar, Series or DataFrame

    The return can be:

    * scalar : when Series.agg is called with a single function
    * Series : when DataFrame.agg is called with a single function
    * DataFrame : when DataFrame.agg is called with several functions

    Return scalar, Series or DataFrame.
{see_also}
Notes
-----
`agg` is an alias for `aggregate`. Use the alias.

A passed user-defined-function will be passed a Series for evaluation.
{examples}"""
|
||
|
|
||
|
# Docstring template for ``compare``.
# Only placeholder is {klass} (brace syntax). The literal braces around the
# ``align_axis`` choices are doubled ({{ ... }}) so they are not mistaken for
# placeholders.
_shared_docs[
    "compare"
] = """
Compare to another {klass} and show the differences.

.. versionadded:: 1.1.0

Parameters
----------
other : {klass}
    Object to compare with.

align_axis : {{0 or 'index', 1 or 'columns'}}, default 1
    Determine which axis to align the comparison on.

    * 0, or 'index' : Resulting differences are stacked vertically
        with rows drawn alternately from self and other.
    * 1, or 'columns' : Resulting differences are aligned horizontally
        with columns drawn alternately from self and other.

keep_shape : bool, default False
    If true, all rows and columns are kept.
    Otherwise, only the ones with different values are kept.

keep_equal : bool, default False
    If true, the result keeps values that are equal.
    Otherwise, equal values are shown as NaNs.
"""
|
||
|
|
||
|
# Docstring template for ``groupby``.
# Placeholders use percent syntax: %(klass)s. Because of that, the single
# braces in the ``axis`` description are literal text and must NOT be doubled.
_shared_docs[
    "groupby"
] = """
Group %(klass)s using a mapper or by a Series of columns.

A groupby operation involves some combination of splitting the
object, applying a function, and combining the results. This can be
used to group large amounts of data and compute operations on these
groups.

Parameters
----------
by : mapping, function, label, or list of labels
    Used to determine the groups for the groupby.
    If ``by`` is a function, it's called on each value of the object's
    index. If a dict or Series is passed, the Series or dict VALUES
    will be used to determine the groups (the Series' values are first
    aligned; see ``.align()`` method). If an ndarray is passed, the
    values are used as-is to determine the groups. A label or list of
    labels may be passed to group by the columns in ``self``. Notice
    that a tuple is interpreted as a (single) key.
axis : {0 or 'index', 1 or 'columns'}, default 0
    Split along rows (0) or columns (1).
level : int, level name, or sequence of such, default None
    If the axis is a MultiIndex (hierarchical), group by a particular
    level or levels.
as_index : bool, default True
    For aggregated output, return object with group labels as the
    index. Only relevant for DataFrame input. as_index=False is
    effectively "SQL-style" grouped output.
sort : bool, default True
    Sort group keys. Get better performance by turning this off.
    Note this does not influence the order of observations within each
    group. Groupby preserves the order of rows within each group.
group_keys : bool, default True
    When calling apply, add group keys to index to identify pieces.
squeeze : bool, default False
    Reduce the dimensionality of the return type if possible,
    otherwise return a consistent type.

    .. deprecated:: 1.1.0

observed : bool, default False
    This only applies if any of the groupers are Categoricals.
    If True: only show observed values for categorical groupers.
    If False: show all values for categorical groupers.
dropna : bool, default True
    If True, and if group keys contain NA values, NA values together
    with row/column will be dropped.
    If False, NA values will also be treated as the key in groups.

    .. versionadded:: 1.1.0

Returns
-------
%(klass)sGroupBy
    Returns a groupby object that contains information about the groups.

See Also
--------
resample : Convenience method for frequency conversion and resampling
    of time series.

Notes
-----
See the `user guide
<https://pandas.pydata.org/pandas-docs/stable/groupby.html>`_ for more.
"""
|
||
|
|
||
|
# Docstring template for ``melt``.
# Placeholders use percent syntax: %(other)s names the sibling method in
# "See Also"; %(caller)s is the call prefix in the examples and is immediately
# followed by the keyword arguments (presumably it expands to something ending
# in an opening parenthesis -- confirm against the consumers).
# Single braces in the examples are literal because %-formatting is used.
_shared_docs[
    "melt"
] = """
Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

This function is useful to massage a DataFrame into a format where one
or more columns are identifier variables (`id_vars`), while all other
columns, considered measured variables (`value_vars`), are "unpivoted" to
the row axis, leaving just two non-identifier columns, 'variable' and
'value'.

Parameters
----------
id_vars : tuple, list, or ndarray, optional
    Column(s) to use as identifier variables.
value_vars : tuple, list, or ndarray, optional
    Column(s) to unpivot. If not specified, uses all columns that
    are not set as `id_vars`.
var_name : scalar
    Name to use for the 'variable' column. If None it uses
    ``frame.columns.name`` or 'variable'.
value_name : scalar, default 'value'
    Name to use for the 'value' column.
col_level : int or str, optional
    If columns are a MultiIndex then use this level to melt.
ignore_index : bool, default True
    If True, original index is ignored. If False, the original index is retained.
    Index labels will be repeated as necessary.

    .. versionadded:: 1.1.0

Returns
-------
DataFrame
    Unpivoted DataFrame.

See Also
--------
%(other)s : Identical method.
pivot_table : Create a spreadsheet-style pivot table as a DataFrame.
DataFrame.pivot : Return reshaped DataFrame organized
    by given index / column values.
DataFrame.explode : Explode a DataFrame from list-like
    columns to long format.

Examples
--------
>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
...                    'B': {0: 1, 1: 3, 2: 5},
...                    'C': {0: 2, 1: 4, 2: 6}})
>>> df
   A  B  C
0  a  1  2
1  b  3  4
2  c  5  6

>>> %(caller)sid_vars=['A'], value_vars=['B'])
   A variable  value
0  a        B      1
1  b        B      3
2  c        B      5

>>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
   A variable  value
0  a        B      1
1  b        B      3
2  c        B      5
3  a        C      2
4  b        C      4
5  c        C      6

The names of 'variable' and 'value' columns can be customized:

>>> %(caller)sid_vars=['A'], value_vars=['B'],
...         var_name='myVarname', value_name='myValname')
   A myVarname  myValname
0  a         B          1
1  b         B          3
2  c         B          5

Original index values can be kept around:

>>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False)
   A variable  value
0  a        B      1
1  b        B      3
2  c        B      5
0  a        C      2
1  b        C      4
2  c        C      6

If you have multi-index columns:

>>> df.columns = [list('ABC'), list('DEF')]
>>> df
   A  B  C
   D  E  F
0  a  1  2
1  b  3  4
2  c  5  6

>>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
   A variable  value
0  a        B      1
1  b        B      3
2  c        B      5

>>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
  (A, D) variable_0 variable_1  value
0      a          B          E      1
1      b          B          E      3
2      c          B          E      5
"""
|
||
|
|
||
|
# Docstring template for ``transform``.
# Placeholders use brace syntax: {klass} and {axis}. Literal braces in the
# example code (dict literals passed to pd.DataFrame) are doubled ({{ / }})
# so they are not treated as placeholders.
_shared_docs[
    "transform"
] = """
Call ``func`` on self producing a {klass} with transformed values.

Produced {klass} will have same axis length as self.

Parameters
----------
func : function, str, list-like or dict-like
    Function to use for transforming the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply. If func
    is both list-like and dict-like, dict-like behavior takes precedence.

    Accepted combinations are:

    - function
    - string function name
    - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
    - dict-like of axis labels -> functions, function names or list-like of such.
{axis}
*args
    Positional arguments to pass to `func`.
**kwargs
    Keyword arguments to pass to `func`.

Returns
-------
{klass}
    A {klass} that must have the same length as self.

Raises
------
ValueError : If the returned {klass} has a different length than self.

See Also
--------
{klass}.agg : Only perform aggregating type operations.
{klass}.apply : Invoke function on a {klass}.

Examples
--------
>>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}})
>>> df
   A  B
0  0  1
1  1  2
2  2  3
>>> df.transform(lambda x: x + 1)
   A  B
0  1  2
1  2  3
2  3  4

Even though the resulting {klass} must have the same length as the
input {klass}, it is possible to provide several input functions:

>>> s = pd.Series(range(3))
>>> s
0    0
1    1
2    2
dtype: int64
>>> s.transform([np.sqrt, np.exp])
       sqrt        exp
0  0.000000   1.000000
1  1.000000   2.718282
2  1.414214   7.389056

You can call transform on a GroupBy object:

>>> df = pd.DataFrame({{
...     "Date": [
...         "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
...         "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"],
...     "Data": [5, 8, 6, 1, 50, 100, 60, 120],
... }})
>>> df
         Date  Data
0  2015-05-08     5
1  2015-05-07     8
2  2015-05-06     6
3  2015-05-05     1
4  2015-05-08    50
5  2015-05-07   100
6  2015-05-06    60
7  2015-05-05   120
>>> df.groupby('Date')['Data'].transform('sum')
0     55
1    108
2     66
3    121
4     55
5    108
6     66
7    121
Name: Data, dtype: int64

>>> df = pd.DataFrame({{
...     "c": [1, 1, 1, 2, 2, 2, 2],
...     "type": ["m", "n", "o", "m", "m", "n", "n"]
... }})
>>> df
   c type
0  1    m
1  1    n
2  1    o
3  2    m
4  2    m
5  2    n
6  2    n
>>> df['size'] = df.groupby('c')['type'].transform(len)
>>> df
   c type size
0  1    m    3
1  1    n    3
2  1    o    3
3  2    m    4
4  2    m    4
5  2    n    4
6  2    n    4
"""
|
||
|
|
||
|
# Parameter-entry fragment for ``storage_options``. It contains no
# placeholders and has no leading or trailing newline; the description lines
# are indented four spaces, presumably so the fragment can be spliced into the
# Parameters section of another docstring -- confirm against the consumers.
_shared_docs[
    "storage_options"
] = """storage_options : dict, optional
    Extra options that make sense for a particular storage connection, e.g.
    host, port, username, password, etc., if using a URL that will
    be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
    will be raised if providing this argument with a non-fsspec URL.
    See the fsspec and backend storage implementation docs for the set of
    allowed keys and values."""
|