# projektAI/venv/Lib/site-packages/pandas/core/window/rolling.py

"""
Provide a generic structure to support window functions,
similar to how we have a Groupby object.
"""
from datetime import timedelta
from functools import partial
import inspect
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
List,
Optional,
Set,
Tuple,
Type,
Union,
)
import warnings
import numpy as np
from pandas._libs.tslibs import BaseOffset, to_offset
import pandas._libs.window.aggregations as window_aggregations
from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
from pandas.core.dtypes.common import (
ensure_float64,
is_bool,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_scalar,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCDatetimeIndex,
ABCPeriodIndex,
ABCSeries,
ABCTimedeltaIndex,
)
from pandas.core.dtypes.missing import notna
from pandas.core.aggregation import aggregate
from pandas.core.base import DataError, SelectionMixin
from pandas.core.construction import extract_array
from pandas.core.groupby.base import GotItemMixin, ShallowMixin
from pandas.core.indexes.api import Index, MultiIndex
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba
from pandas.core.window.common import (
_doc_template,
_shared_docs,
flex_binary_moment,
zsqrt,
)
from pandas.core.window.indexers import (
BaseIndexer,
FixedWindowIndexer,
GroupbyIndexer,
VariableWindowIndexer,
)
from pandas.core.window.numba_ import generate_numba_apply_func
if TYPE_CHECKING:
from pandas import DataFrame, Series
from pandas.core.internals import Block # noqa:F401
class BaseWindow(ShallowMixin, SelectionMixin):
"""Provides utilities for performing windowing operations."""
_attributes: List[str] = [
"window",
"min_periods",
"center",
"win_type",
"axis",
"on",
"closed",
]
exclusions: Set[str] = set()
def __init__(
self,
obj: FrameOrSeries,
window=None,
min_periods: Optional[int] = None,
center: bool = False,
win_type: Optional[str] = None,
axis: Axis = 0,
on: Optional[Union[str, Index]] = None,
closed: Optional[str] = None,
**kwargs,
):
self.__dict__.update(kwargs)
self.obj = obj
self.on = on
self.closed = closed
self.window = window
self.min_periods = min_periods
self.center = center
self.win_type = win_type
self.win_freq = None
self.axis = obj._get_axis_number(axis) if axis is not None else None
self.validate()
@property
def is_datetimelike(self) -> Optional[bool]:
return None
@property
def _on(self):
return None
@property
def is_freq_type(self) -> bool:
return self.win_type == "freq"
def validate(self) -> None:
if self.center is not None and not is_bool(self.center):
raise ValueError("center must be a boolean")
if self.min_periods is not None:
if not is_integer(self.min_periods):
raise ValueError("min_periods must be an integer")
elif self.min_periods < 0:
raise ValueError("min_periods must be >= 0")
elif is_integer(self.window) and self.min_periods > self.window:
raise ValueError(
f"min_periods {self.min_periods} must be <= window {self.window}"
)
if self.closed is not None and self.closed not in [
"right",
"both",
"left",
"neither",
]:
raise ValueError("closed must be 'right', 'left', 'both' or 'neither'")
if not isinstance(self.obj, (ABCSeries, ABCDataFrame)):
raise TypeError(f"invalid type: {type(self)}")
if isinstance(self.window, BaseIndexer):
# Validate that the passed BaseIndexer subclass has
# a get_window_bounds with the correct signature.
get_window_bounds_signature = inspect.signature(
self.window.get_window_bounds
).parameters.keys()
expected_signature = inspect.signature(
BaseIndexer().get_window_bounds
).parameters.keys()
if get_window_bounds_signature != expected_signature:
raise ValueError(
f"{type(self.window).__name__} does not implement "
f"the correct signature for get_window_bounds"
)
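# Illustration only (not part of pandas): a conforming subclass mirrors the
# BaseIndexer.get_window_bounds signature exactly, e.g.
#
#     class ForwardIndexer(BaseIndexer):
#         def get_window_bounds(self, num_values=0, min_periods=None,
#                               center=None, closed=None):
#             # each window spans [i, i + window_size)
#             start = np.arange(num_values, dtype=np.int64)
#             end = np.clip(start + self.window_size, 0, num_values)
#             return start, end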
def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries:
"""
Split data into blocks & return conformed data.
"""
# filter out the on from the object
if self.on is not None and not isinstance(self.on, Index):
if obj.ndim == 2:
obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False)
if self.axis == 1:
# GH: 20649 in case of mixed dtype and axis=1 we have to convert everything
# to float to calculate the complete row at once. We exclude all non-numeric
# dtypes.
obj = obj.select_dtypes(include=["integer", "float"], exclude=["timedelta"])
obj = obj.astype("float64", copy=False)
obj._mgr = obj._mgr.consolidate()
return obj
def _gotitem(self, key, ndim, subset=None):
"""
Sub-classes to define. Return a sliced object.
Parameters
----------
key : str / list of selections
ndim : 1,2
requested ndim of result
subset : object, default None
subset to act on
"""
# create a new object to prevent aliasing
if subset is None:
subset = self.obj
self = self._shallow_copy(subset)
self._reset_cache()
if subset.ndim == 2:
if is_scalar(key) and key in subset or is_list_like(key):
self._selection = key
return self
def __getattr__(self, attr: str):
if attr in self._internal_names_set:
return object.__getattribute__(self, attr)
if attr in self.obj:
return self[attr]
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{attr}'"
)
def _dir_additions(self):
return self.obj._dir_additions()
def _get_cov_corr_window(
self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None
) -> Optional[Union[int, timedelta, BaseOffset, BaseIndexer]]:
"""
Return window length.
Parameters
----------
other :
Used in Expanding
Returns
-------
window : int, timedelta, BaseOffset, or BaseIndexer
"""
return self.window
@property
def _window_type(self) -> str:
return type(self).__name__
def __repr__(self) -> str:
"""
Provide a nice str repr of our rolling object.
"""
attrs_list = (
f"{attr_name}={getattr(self, attr_name)}"
for attr_name in self._attributes
if getattr(self, attr_name, None) is not None
)
attrs = ",".join(attrs_list)
return f"{self._window_type} [{attrs}]"
def __iter__(self):
obj = self._create_data(self._selected_obj)
indexer = self._get_window_indexer()
start, end = indexer.get_window_bounds(
num_values=len(obj),
min_periods=self.min_periods,
center=self.center,
closed=self.closed,
)
# start and end, as returned by get_window_bounds, must be equal in length
assert len(start) == len(end)
for s, e in zip(start, end):
result = obj.iloc[slice(s, e)]
yield result
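# For illustration: pd.Series([1, 2, 3]).rolling(2) iterates as the slices
# [0:1], [0:2], [1:3]; min_periods is not enforced here, so the initial
# partial window is yielded as well.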
def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray:
"""Convert input to numpy arrays for Cython routines"""
if values is None:
values = extract_array(self._selected_obj, extract_numpy=True)
# GH #12373 : rolling functions error on float32 data
# make sure the data is coerced to float64
if is_float_dtype(values.dtype):
values = ensure_float64(values)
elif is_integer_dtype(values.dtype):
values = ensure_float64(values)
elif needs_i8_conversion(values.dtype):
raise NotImplementedError(
f"ops for {self._window_type} for this "
f"dtype {values.dtype} are not implemented"
)
else:
try:
values = ensure_float64(values)
except (ValueError, TypeError) as err:
raise TypeError(f"cannot handle this type -> {values.dtype}") from err
# Convert inf to nan for C funcs
inf = np.isinf(values)
if inf.any():
values = np.where(inf, np.nan, values)
return values
def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"):
# if we have an 'on' column we want to put it back into
# the results in the same location
from pandas import Series
if self.on is not None and not self._on.equals(obj.index):
name = self._on.name
extra_col = Series(self._on, index=self.obj.index, name=name)
if name in result.columns:
# TODO: sure we want to overwrite results?
result[name] = extra_col
elif name in result.index.names:
pass
elif name in self._selected_obj.columns:
# insert in the same location as we had in _selected_obj
old_cols = self._selected_obj.columns
new_cols = result.columns
old_loc = old_cols.get_loc(name)
overlap = new_cols.intersection(old_cols[:old_loc])
new_loc = len(overlap)
result.insert(new_loc, name, extra_col)
else:
# insert at the end
result[name] = extra_col
def _get_roll_func(self, func_name: str) -> Callable[..., Any]:
"""
Wrap rolling function to check values passed.
Parameters
----------
func_name : str
Cython function used to calculate rolling statistics
Returns
-------
func : callable
"""
window_func = getattr(window_aggregations, func_name, None)
if window_func is None:
raise ValueError(
f"we do not support this function in window_aggregations.{func_name}"
)
return window_func
@property
def _index_array(self):
# TODO: why do we get here with e.g. MultiIndex?
if needs_i8_conversion(self._on.dtype):
return self._on.asi8
return None
def _get_window_indexer(self) -> BaseIndexer:
"""
Return an indexer class that will compute the window start and end bounds
"""
if isinstance(self.window, BaseIndexer):
return self.window
if self.is_freq_type:
return VariableWindowIndexer(
index_array=self._index_array, window_size=self.window
)
return FixedWindowIndexer(window_size=self.window)
def _apply_series(
self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None
) -> "Series":
"""
Series version of _apply_blockwise
"""
obj = self._create_data(self._selected_obj)
try:
# GH 12541: Special case for count where we support date-like types
input = obj.values if name != "count" else notna(obj.values).astype(int)
values = self._prep_values(input)
except (TypeError, NotImplementedError) as err:
raise DataError("No numeric types to aggregate") from err
result = homogeneous_func(values)
return obj._constructor(result, index=obj.index, name=obj.name)
def _apply_blockwise(
self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None
) -> FrameOrSeriesUnion:
"""
Apply the given function to the DataFrame broken down into homogeneous
sub-frames.
"""
if self._selected_obj.ndim == 1:
return self._apply_series(homogeneous_func, name)
obj = self._create_data(self._selected_obj)
if name == "count":
# GH 12541: Special case for count where we support date-like types
obj = notna(obj).astype(int)
obj._mgr = obj._mgr.consolidate()
mgr = obj._mgr
def hfunc(bvalues: ArrayLike) -> ArrayLike:
# TODO(EA2D): getattr unnecessary with 2D EAs
values = self._prep_values(getattr(bvalues, "T", bvalues))
res_values = homogeneous_func(values)
return getattr(res_values, "T", res_values)
new_mgr = mgr.apply(hfunc, ignore_failures=True)
out = obj._constructor(new_mgr)
if out.shape[1] == 0 and obj.shape[1] > 0:
raise DataError("No numeric types to aggregate")
elif out.shape[1] == 0:
return obj.astype("float64")
self._insert_on_column(out, obj)
return out
def _apply(
self,
func: Callable[..., Any],
name: Optional[str] = None,
numba_cache_key: Optional[Tuple[Callable, str]] = None,
**kwargs,
):
"""
Rolling statistical measure using supplied function.
Designed to be used with passed-in Cython array-based functions.
Parameters
----------
func : callable function to apply
name : str,
numba_cache_key : tuple
caching key to be used to store a compiled numba func
**kwargs
additional arguments for rolling function and window function
Returns
-------
y : type of input
"""
window_indexer = self._get_window_indexer()
min_periods = (
self.min_periods
if self.min_periods is not None
else window_indexer.window_size
)
def homogeneous_func(values: np.ndarray):
# calculation function
if values.size == 0:
return values.copy()
def calc(x):
start, end = window_indexer.get_window_bounds(
num_values=len(x),
min_periods=min_periods,
center=self.center,
closed=self.closed,
)
return func(x, start, end, min_periods)
with np.errstate(all="ignore"):
if values.ndim > 1:
result = np.apply_along_axis(calc, self.axis, values)
else:
result = calc(values)
result = np.asarray(result)
if numba_cache_key is not None:
NUMBA_FUNC_CACHE[numba_cache_key] = func
return result
return self._apply_blockwise(homogeneous_func, name)
def aggregate(self, func, *args, **kwargs):
result, how = aggregate(self, func, *args, **kwargs)
if result is None:
return self.apply(func, raw=False, args=args, kwargs=kwargs)
return result
agg = aggregate
_shared_docs["sum"] = dedent(
"""
Calculate %(name)s sum of given DataFrame or Series.
Parameters
----------
*args, **kwargs
For compatibility with other %(name)s methods. Has no effect
on the computed value.
Returns
-------
Series or DataFrame
Same type as the input, with the same index, containing the
%(name)s sum.
See Also
--------
pandas.Series.sum : Reducing sum for Series.
pandas.DataFrame.sum : Reducing sum for DataFrame.
Examples
--------
>>> s = pd.Series([1, 2, 3, 4, 5])
>>> s
0 1
1 2
2 3
3 4
4 5
dtype: int64
>>> s.rolling(3).sum()
0 NaN
1 NaN
2 6.0
3 9.0
4 12.0
dtype: float64
>>> s.expanding(3).sum()
0 NaN
1 NaN
2 6.0
3 10.0
4 15.0
dtype: float64
>>> s.rolling(3, center=True).sum()
0 NaN
1 6.0
2 9.0
3 12.0
4 NaN
dtype: float64
For DataFrame, each %(name)s sum is computed column-wise.
>>> df = pd.DataFrame({"A": s, "B": s ** 2})
>>> df
A B
0 1 1
1 2 4
2 3 9
3 4 16
4 5 25
>>> df.rolling(3).sum()
A B
0 NaN NaN
1 NaN NaN
2 6.0 14.0
3 9.0 29.0
4 12.0 50.0
"""
)
_shared_docs["mean"] = dedent(
"""
Calculate the %(name)s mean of the values.
Parameters
----------
*args, **kwargs
For compatibility with other %(name)s methods. Has no effect
on the computed value.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.mean : Equivalent method for Series.
pandas.DataFrame.mean : Equivalent method for DataFrame.
Examples
--------
The below examples will show rolling mean calculations with window sizes of
two and three, respectively.
>>> s = pd.Series([1, 2, 3, 4])
>>> s.rolling(2).mean()
0 NaN
1 1.5
2 2.5
3 3.5
dtype: float64
>>> s.rolling(3).mean()
0 NaN
1 NaN
2 2.0
3 3.0
dtype: float64
"""
)
_shared_docs["var"] = dedent(
"""
Calculate unbiased %(name)s variance.
%(versionadded)s
Normalized by N-1 by default. This can be changed using the `ddof`
argument.
Parameters
----------
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.
*args, **kwargs
For NumPy compatibility. No additional arguments are used.
Returns
-------
Series or DataFrame
Returns the same object type as the caller of the %(name)s calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.var : Equivalent method for Series.
pandas.DataFrame.var : Equivalent method for DataFrame.
numpy.var : Equivalent method for Numpy array.
Notes
-----
The default `ddof` of 1 used in :meth:`Series.var` is different than the
default `ddof` of 0 in :func:`numpy.var`.
A minimum of 1 period is required for the rolling calculation.
Examples
--------
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
>>> s.rolling(3).var()
0 NaN
1 NaN
2 0.333333
3 1.000000
4 1.000000
5 1.333333
6 0.000000
dtype: float64
>>> s.expanding(3).var()
0 NaN
1 NaN
2 0.333333
3 0.916667
4 0.800000
5 0.700000
6 0.619048
dtype: float64
"""
)
_shared_docs["std"] = dedent(
"""
Calculate %(name)s standard deviation.
%(versionadded)s
Normalized by N-1 by default. This can be changed using the `ddof`
argument.
Parameters
----------
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.
*args, **kwargs
For NumPy compatibility. No additional arguments are used.
Returns
-------
Series or DataFrame
Returns the same object type as the caller of the %(name)s calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.std : Equivalent method for Series.
pandas.DataFrame.std : Equivalent method for DataFrame.
numpy.std : Equivalent method for Numpy array.
Notes
-----
The default `ddof` of 1 used in Series.std is different than the default
`ddof` of 0 in numpy.std.
A minimum of one period is required for the rolling calculation.
Examples
--------
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
>>> s.rolling(3).std()
0 NaN
1 NaN
2 0.577350
3 1.000000
4 1.000000
5 1.154701
6 0.000000
dtype: float64
>>> s.expanding(3).std()
0 NaN
1 NaN
2 0.577350
3 0.957427
4 0.894427
5 0.836660
6 0.786796
dtype: float64
"""
)
def dispatch(name: str, *args, **kwargs):
"""
Dispatch to groupby apply.
"""
def outer(self, *args, **kwargs):
def f(x):
x = self._shallow_copy(x, groupby=self._groupby)
return getattr(x, name)(*args, **kwargs)
return self._groupby.apply(f)
outer.__name__ = name
return outer
class BaseWindowGroupby(GotItemMixin, BaseWindow):
"""
Provide the groupby windowing facilities.
"""
def __init__(self, obj, *args, **kwargs):
kwargs.pop("parent", None)
groupby = kwargs.pop("groupby", None)
if groupby is None:
groupby, obj = obj, obj._selected_obj
self._groupby = groupby
self._groupby.mutated = True
self._groupby.grouper.mutated = True
super().__init__(obj, *args, **kwargs)
corr = dispatch("corr", other=None, pairwise=None)
cov = dispatch("cov", other=None, pairwise=None)
def _apply(
self,
func: Callable[..., Any],
name: Optional[str] = None,
numba_cache_key: Optional[Tuple[Callable, str]] = None,
**kwargs,
) -> FrameOrSeries:
result = super()._apply(
func,
name,
numba_cache_key,
**kwargs,
)
# Reconstruct the resulting MultiIndex from tuples
# 1st set of levels = group by labels
# 2nd set of levels = original index
# Ignore the 2nd set of levels if a groupby label includes an index level
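# For illustration (hypothetical frame): df.groupby("g").rolling(2).sum()
# is indexed by ("g", original index); if "g" were itself an index level,
# the original index would not be appended again.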
result_index_names = [
grouping.name for grouping in self._groupby.grouper._groupings
]
grouped_object_index = None
column_keys = [
key
for key in result_index_names
if key not in self.obj.index.names or key is None
]
if len(column_keys) == len(result_index_names):
grouped_object_index = self.obj.index
grouped_index_name = [*grouped_object_index.names]
result_index_names += grouped_index_name
else:
# The result will still have kept the grouping column; drop it
result = result.drop(columns=column_keys, errors="ignore")
codes = self._groupby.grouper.codes
levels = self._groupby.grouper.levels
group_indices = self._groupby.grouper.indices.values()
if group_indices:
indexer = np.concatenate(list(group_indices))
else:
indexer = np.array([], dtype=np.intp)
codes = [c.take(indexer) for c in codes]
# if the index of the original dataframe needs to be preserved, append
# this index (but reordered) to the codes/levels from the groupby
if grouped_object_index is not None:
idx = grouped_object_index.take(indexer)
if not isinstance(idx, MultiIndex):
idx = MultiIndex.from_arrays([idx])
codes.extend(list(idx.codes))
levels.extend(list(idx.levels))
result_index = MultiIndex(
levels, codes, names=result_index_names, verify_integrity=False
)
result.index = result_index
return result
def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries:
"""
Split data into blocks & return conformed data.
"""
# Ensure the object we're rolling over is monotonically sorted relative
# to the groups
# GH 36197
if not obj.empty:
groupby_order = np.concatenate(
list(self._groupby.grouper.indices.values())
).astype(np.int64)
obj = obj.take(groupby_order)
return super()._create_data(obj)
def _gotitem(self, key, ndim, subset=None):
# we are setting the index on the actual object
# here so our index is carried through to the selected obj
# when we do the splitting for the groupby
if self.on is not None:
self.obj = self.obj.set_index(self._on)
self.on = None
return super()._gotitem(key, ndim, subset=subset)
def _validate_monotonic(self):
"""
Validate that "on" is monotonic; already validated at a higher level.
"""
pass
class Window(BaseWindow):
"""
Provide rolling window calculations.
Parameters
----------
window : int, offset, or BaseIndexer subclass
Size of the moving window. This is the number of observations used for
calculating the statistic. Each window will be a fixed size.
If it's an offset then this will be the time period of each window. Each
window will be variably sized based on the observations included in
the time period. This is only valid for datetimelike indexes.
If a BaseIndexer subclass is passed, calculates the window boundaries
based on the defined ``get_window_bounds`` method. Additional rolling
keyword arguments, namely `min_periods`, `center`, and
`closed` will be passed to `get_window_bounds`.
min_periods : int, default None
Minimum number of observations in window required to have a value
(otherwise result is NA). For a window that is specified by an offset,
`min_periods` will default to 1. Otherwise, `min_periods` will default
to the size of the window.
center : bool, default False
Set the labels at the center of the window.
win_type : str, default None
Provide a window type. If ``None``, all points are evenly weighted.
See the notes below for further information.
on : str, optional
For a DataFrame, a datetime-like column or MultiIndex level on which
to calculate the rolling window, rather than the DataFrame's index.
A provided integer column is ignored and excluded from the result, since
an integer index is not used to calculate the rolling window.
axis : int or str, default 0
closed : str, default None
Make the interval closed on the 'right', 'left', 'both' or
'neither' endpoints. Defaults to 'right'.
.. versionchanged:: 1.2.0
The closed parameter with fixed windows is now supported.
Returns
-------
a Window or Rolling sub-classed for the particular operation
See Also
--------
expanding : Provides expanding transformations.
ewm : Provides exponential weighted functions.
Notes
-----
By default, the result is set to the right edge of the window. This can be
changed to the center of the window by setting ``center=True``.
To learn more about the offsets & frequency strings, please see `this link
<https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
If ``win_type=None``, all points are evenly weighted; otherwise, ``win_type``
can accept a string of any `scipy.signal window function
<https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
Certain Scipy window types require additional parameters to be passed
in the aggregation function. The additional parameters must match
the keywords specified in the Scipy window type method signature.
Please see the third example below on how to add the additional parameters.
Examples
--------
>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
>>> df
B
0 0.0
1 1.0
2 2.0
3 NaN
4 4.0
Rolling sum with a window length of 2, using the 'triang'
window type.
>>> df.rolling(2, win_type='triang').sum()
B
0 NaN
1 0.5
2 1.5
3 NaN
4 NaN
Rolling sum with a window length of 2, using the 'gaussian'
window type (note how we need to specify std).
>>> df.rolling(2, win_type='gaussian').sum(std=3)
B
0 NaN
1 0.986207
2 2.958621
3 NaN
4 NaN
Rolling sum with a window length of 2, min_periods defaults
to the window length.
>>> df.rolling(2).sum()
B
0 NaN
1 1.0
2 3.0
3 NaN
4 NaN
Same as above, but explicitly set the min_periods
>>> df.rolling(2, min_periods=1).sum()
B
0 0.0
1 1.0
2 3.0
3 2.0
4 4.0
Same as above, but with forward-looking windows
>>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
>>> df.rolling(window=indexer, min_periods=1).sum()
B
0 1.0
1 3.0
2 2.0
3 4.0
4 4.0
A ragged (meaning not-a-regular frequency), time-indexed DataFrame
>>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
... index = [pd.Timestamp('20130101 09:00:00'),
... pd.Timestamp('20130101 09:00:02'),
... pd.Timestamp('20130101 09:00:03'),
... pd.Timestamp('20130101 09:00:05'),
... pd.Timestamp('20130101 09:00:06')])
>>> df
B
2013-01-01 09:00:00 0.0
2013-01-01 09:00:02 1.0
2013-01-01 09:00:03 2.0
2013-01-01 09:00:05 NaN
2013-01-01 09:00:06 4.0
In contrast to an integer rolling window, this will roll over a variable
length window corresponding to the time period.
The default for min_periods is 1.
>>> df.rolling('2s').sum()
B
2013-01-01 09:00:00 0.0
2013-01-01 09:00:02 1.0
2013-01-01 09:00:03 3.0
2013-01-01 09:00:05 NaN
2013-01-01 09:00:06 4.0
"""
@property
def _constructor(self):
return Window
def validate(self):
super().validate()
if isinstance(self.window, BaseIndexer):
raise NotImplementedError(
"BaseIndexer subclasses not implemented with win_types."
)
elif is_integer(self.window):
if self.window <= 0:
raise ValueError("window must be > 0 ")
sig = import_optional_dependency(
"scipy.signal", extra="Scipy is required to generate window weight."
)
if not isinstance(self.win_type, str):
raise ValueError(f"Invalid win_type {self.win_type}")
if getattr(sig, self.win_type, None) is None:
raise ValueError(f"Invalid win_type {self.win_type}")
else:
raise ValueError(f"Invalid window {self.window}")
def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray:
"""
Center the result in the window for weighted rolling aggregations.
"""
if self.axis > result.ndim - 1:
raise ValueError("Requested axis is larger then no. of argument dimensions")
if offset > 0:
lead_indexer = [slice(None)] * result.ndim
lead_indexer[self.axis] = slice(offset, None)
result = np.copy(result[tuple(lead_indexer)])
return result
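# Illustrative note: for a centered, length-3 weighted window, offset == 1;
# calc() below pads one trailing NaN before aggregating, and _center_window
# then drops the first element so labels line up with the window centers.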
def _apply(
self,
func: Callable[[np.ndarray, int, int], np.ndarray],
name: Optional[str] = None,
numba_cache_key: Optional[Tuple[Callable, str]] = None,
**kwargs,
):
"""
Rolling with weights statistical measure using supplied function.
Designed to be used with passed-in Cython array-based functions.
Parameters
----------
func : callable function to apply
name : str,
numba_cache_key : tuple
unused
**kwargs
additional arguments for scipy windows if necessary
Returns
-------
y : type of input
"""
signal = import_optional_dependency(
"scipy.signal", extra="Scipy is required to generate window weight."
)
assert self.win_type is not None # for mypy
window = getattr(signal, self.win_type)(self.window, **kwargs)
offset = (len(window) - 1) // 2 if self.center else 0
def homogeneous_func(values: np.ndarray):
# calculation function
if values.size == 0:
return values.copy()
def calc(x):
additional_nans = np.array([np.nan] * offset)
x = np.concatenate((x, additional_nans))
return func(x, window, self.min_periods or len(window))
with np.errstate(all="ignore"):
if values.ndim > 1:
result = np.apply_along_axis(calc, self.axis, values)
else:
result = calc(values)
result = np.asarray(result)
if self.center:
result = self._center_window(result, offset)
return result
return self._apply_blockwise(homogeneous_func, name)
_agg_see_also_doc = dedent(
"""
See Also
--------
pandas.DataFrame.aggregate : Similar DataFrame method.
pandas.Series.aggregate : Similar Series method.
"""
)
_agg_examples_doc = dedent(
"""
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
>>> df
A B C
0 1 4 7
1 2 5 8
2 3 6 9
>>> df.rolling(2, win_type="boxcar").agg("mean")
A B C
0 NaN NaN NaN
1 1.5 4.5 7.5
2 2.5 5.5 8.5
"""
)
@doc(
_shared_docs["aggregate"],
see_also=_agg_see_also_doc,
examples=_agg_examples_doc,
klass="Series/DataFrame",
axis="",
)
def aggregate(self, func, *args, **kwargs):
result, how = aggregate(self, func, *args, **kwargs)
if result is None:
# these must apply directly
result = func(self)
return result
agg = aggregate
@Substitution(name="window")
@Appender(_shared_docs["sum"])
def sum(self, *args, **kwargs):
nv.validate_window_func("sum", args, kwargs)
window_func = self._get_roll_func("roll_weighted_sum")
return self._apply(window_func, name="sum", **kwargs)
@Substitution(name="window")
@Appender(_shared_docs["mean"])
def mean(self, *args, **kwargs):
nv.validate_window_func("mean", args, kwargs)
window_func = self._get_roll_func("roll_weighted_mean")
return self._apply(window_func, name="mean", **kwargs)
@Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n")
@Appender(_shared_docs["var"])
def var(self, ddof: int = 1, *args, **kwargs):
nv.validate_window_func("var", args, kwargs)
window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof)
kwargs.pop("name", None)
return self._apply(window_func, name="var", **kwargs)
@Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n")
@Appender(_shared_docs["std"])
def std(self, ddof: int = 1, *args, **kwargs):
nv.validate_window_func("std", args, kwargs)
return zsqrt(self.var(ddof=ddof, name="std", **kwargs))
class RollingAndExpandingMixin(BaseWindow):
_shared_docs["count"] = dedent(
r"""
The %(name)s count of any non-NaN observations inside the window.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.DataFrame.count : Count of the full DataFrame.
Examples
--------
>>> s = pd.Series([2, 3, np.nan, 10])
>>> s.rolling(2).count()
0 1.0
1 2.0
2 1.0
3 1.0
dtype: float64
>>> s.rolling(3).count()
0 1.0
1 2.0
2 2.0
3 2.0
dtype: float64
>>> s.rolling(4).count()
0 1.0
1 2.0
2 2.0
3 3.0
dtype: float64
"""
)
def count(self):
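# name="count" makes _apply_blockwise swap in notna(obj).astype(int) first
# (see above), so roll_sum over ones and zeros yields the per-window count
# of non-NaN observations.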
window_func = self._get_roll_func("roll_sum")
return self._apply(window_func, name="count")
_shared_docs["apply"] = dedent(
r"""
Apply an arbitrary function to each %(name)s window.
Parameters
----------
func : function
Must produce a single value from an ndarray input if ``raw=True``
or a single value from a Series if ``raw=False``. Can also accept a
Numba JIT function with ``engine='numba'`` specified.
.. versionchanged:: 1.0.0
raw : bool, default None
* ``False`` : passes each row or column as a Series to the
function.
* ``True`` : the passed function will receive ndarray
objects instead.
If you are just applying a NumPy reduction function this will
achieve much better performance.
engine : str, default None
* ``'cython'`` : Runs rolling apply through C-extensions from cython.
* ``'numba'`` : Runs rolling apply through JIT compiled code from numba.
Only available when ``raw`` is set to ``True``.
* ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
.. versionadded:: 1.0.0
engine_kwargs : dict, default None
* For ``'cython'`` engine, there are no accepted ``engine_kwargs``
* For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
and ``parallel`` dictionary keys. The values must either be ``True`` or
``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
applied to both the ``func`` and the ``apply`` rolling aggregation.
.. versionadded:: 1.0.0
args : tuple, default None
Positional arguments to be passed into func.
kwargs : dict, default None
Keyword arguments to be passed into func.
Returns
-------
Series or DataFrame
Return type is determined by the caller.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrame data.
pandas.Series.apply : Similar method for Series.
pandas.DataFrame.apply : Similar method for DataFrame.
Notes
-----
See :ref:`window.numba_engine` for extended documentation and performance
considerations for the Numba engine.
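Examples
--------
An illustrative call, summing each window of two observations (equivalent
to ``rolling(2).sum()``):
>>> s = pd.Series([1, 2, 3, 4])
>>> s.rolling(2).apply(lambda x: x.sum(), raw=True)
0    NaN
1    3.0
2    5.0
3    7.0
dtype: float64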
"""
)
def apply(
self,
func: Callable[..., Any],
raw: bool = False,
engine: Optional[str] = None,
engine_kwargs: Optional[Dict[str, bool]] = None,
args: Optional[Tuple[Any, ...]] = None,
kwargs: Optional[Dict[str, Any]] = None,
):
if args is None:
args = ()
if kwargs is None:
kwargs = {}
if not is_bool(raw):
raise ValueError("raw parameter must be `True` or `False`")
numba_cache_key = None
if maybe_use_numba(engine):
if raw is False:
raise ValueError("raw must be `True` when using the numba engine")
apply_func = generate_numba_apply_func(args, kwargs, func, engine_kwargs)
numba_cache_key = (func, "rolling_apply")
elif engine in ("cython", None):
if engine_kwargs is not None:
raise ValueError("cython engine does not accept engine_kwargs")
apply_func = self._generate_cython_apply_func(args, kwargs, raw, func)
else:
raise ValueError("engine must be either 'numba' or 'cython'")
return self._apply(
apply_func,
numba_cache_key=numba_cache_key,
)
def _generate_cython_apply_func(
self,
args: Tuple[Any, ...],
kwargs: Dict[str, Any],
raw: bool,
function: Callable[..., Any],
) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]:
from pandas import Series
window_func = partial(
self._get_roll_func("roll_apply"),
args=args,
kwargs=kwargs,
raw=raw,
function=function,
)
def apply_func(values, begin, end, min_periods, raw=raw):
if not raw:
values = Series(values, index=self.obj.index)
return window_func(values, begin, end, min_periods)
return apply_func
def sum(self, *args, **kwargs):
nv.validate_window_func("sum", args, kwargs)
window_func = self._get_roll_func("roll_sum")
return self._apply(window_func, name="sum", **kwargs)
_shared_docs["max"] = dedent(
"""
Calculate the %(name)s maximum.
Parameters
----------
*args, **kwargs
Arguments and keyword arguments to be passed into func.
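Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
Examples
--------
A minimal illustration with a window of two observations:
>>> s = pd.Series([1, 2, 3, 4])
>>> s.rolling(2).max()
0    NaN
1    2.0
2    3.0
3    4.0
dtype: float64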
"""
)
def max(self, *args, **kwargs):
nv.validate_window_func("max", args, kwargs)
window_func = self._get_roll_func("roll_max")
return self._apply(window_func, name="max", **kwargs)
_shared_docs["min"] = dedent(
"""
Calculate the %(name)s minimum.
Parameters
----------
**kwargs
For compatibility with other %(name)s methods. Has no effect
on the computed minimum.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with a Series.
pandas.DataFrame.%(name)s : Calling object with a DataFrame.
pandas.Series.min : Similar method for Series.
pandas.DataFrame.min : Similar method for DataFrame.
Examples
--------
Performing a rolling minimum with a window size of 3.
>>> s = pd.Series([4, 3, 5, 2, 6])
>>> s.rolling(3).min()
0 NaN
1 NaN
2 3.0
3 2.0
4 2.0
dtype: float64
"""
)
def min(self, *args, **kwargs):
nv.validate_window_func("min", args, kwargs)
window_func = self._get_roll_func("roll_min")
return self._apply(window_func, name="min", **kwargs)
def mean(self, *args, **kwargs):
nv.validate_window_func("mean", args, kwargs)
window_func = self._get_roll_func("roll_mean")
return self._apply(window_func, name="mean", **kwargs)
_shared_docs["median"] = dedent(
"""
Calculate the %(name)s median.
Parameters
----------
**kwargs
For compatibility with other %(name)s methods. Has no effect
on the computed median.
Returns
-------
Series or DataFrame
Returned type is the same as the original object.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.median : Equivalent method for Series.
pandas.DataFrame.median : Equivalent method for DataFrame.
Examples
--------
Compute the rolling median of a series with a window size of 3.
>>> s = pd.Series([0, 1, 2, 3, 4])
>>> s.rolling(3).median()
0 NaN
1 NaN
2 1.0
3 2.0
4 3.0
dtype: float64
"""
)
def median(self, **kwargs):
window_func = self._get_roll_func("roll_median_c")
# GH 32865. Move max window size calculation to
# the median function implementation
return self._apply(window_func, name="median", **kwargs)
def std(self, ddof: int = 1, *args, **kwargs):
nv.validate_window_func("std", args, kwargs)
window_func = self._get_roll_func("roll_var")
def zsqrt_func(values, begin, end, min_periods):
return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof))
return self._apply(
zsqrt_func,
name="std",
**kwargs,
)
def var(self, ddof: int = 1, *args, **kwargs):
nv.validate_window_func("var", args, kwargs)
window_func = partial(self._get_roll_func("roll_var"), ddof=ddof)
return self._apply(
window_func,
name="var",
**kwargs,
)
_shared_docs[
"skew"
] = """
Unbiased %(name)s skewness.
Parameters
----------
**kwargs
Keyword arguments to be passed into func.
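Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
Examples
--------
A minimal illustration; evenly spaced values are symmetric, so each
window's skewness is zero:
>>> s = pd.Series([1, 2, 3, 4])
>>> s.rolling(3).skew()
0    NaN
1    NaN
2    0.0
3    0.0
dtype: float64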
"""
def skew(self, **kwargs):
window_func = self._get_roll_func("roll_skew")
return self._apply(
window_func,
name="skew",
**kwargs,
)
_shared_docs["kurt"] = dedent(
"""
Calculate unbiased %(name)s kurtosis.
This function uses Fisher's definition of kurtosis without bias.
Parameters
----------
**kwargs
For compatibility with other %(name)s methods. Has no effect
on the computed value.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.kurt : Equivalent method for Series.
pandas.DataFrame.kurt : Equivalent method for DataFrame.
scipy.stats.skew : Third moment of a probability density.
scipy.stats.kurtosis : Reference SciPy method.
Notes
-----
A minimum of 4 periods is required for the %(name)s calculation.
"""
)
def sem(self, ddof: int = 1, *args, **kwargs):
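# standard error of the mean: window std divided by sqrt(count - ddof)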
return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5)
_shared_docs["sem"] = dedent(
"""
Compute %(name)s standard error of mean.
Parameters
----------
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.
*args, **kwargs
For NumPy compatibility. No additional arguments are used.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.sem : Equivalent method for Series.
pandas.DataFrame.sem : Equivalent method for DataFrame.
Notes
-----
A minimum of one period is required for the rolling calculation.
Examples
--------
>>> s = pd.Series([0, 1, 2, 3])
>>> s.rolling(2, min_periods=1).sem()
0 NaN
1 0.707107
2 0.707107
3 0.707107
dtype: float64
>>> s.expanding().sem()
0 NaN
1 0.707107
2 0.707107
3 0.745356
dtype: float64
"""
)
def kurt(self, **kwargs):
window_func = self._get_roll_func("roll_kurt")
return self._apply(
window_func,
name="kurt",
**kwargs,
)
_shared_docs["quantile"] = dedent(
"""
Calculate the %(name)s quantile.
Parameters
----------
quantile : float
Quantile to compute. 0 <= quantile <= 1.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
This optional parameter specifies the interpolation method to use,
when the desired quantile lies between two data points `i` and `j`:
* linear: `i + (j - i) * fraction`, where `fraction` is the
fractional part of the index surrounded by `i` and `j`.
* lower: `i`.
* higher: `j`.
* nearest: `i` or `j` whichever is nearest.
* midpoint: (`i` + `j`) / 2.
**kwargs
For compatibility with other %(name)s methods. Has no effect on
the result.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
See Also
--------
pandas.Series.quantile : Computes value at the given quantile over all data
in Series.
pandas.DataFrame.quantile : Computes values at the given quantile over
requested axis in DataFrame.
Examples
--------
>>> s = pd.Series([1, 2, 3, 4])
>>> s.rolling(2).quantile(.4, interpolation='lower')
0 NaN
1 1.0
2 2.0
3 3.0
dtype: float64
>>> s.rolling(2).quantile(.4, interpolation='midpoint')
0 NaN
1 1.5
2 2.5
3 3.5
dtype: float64
"""
)
def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
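# the 0th and 100th percentiles reduce to the window min and max, which
# skips the interpolation machinery entirely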
if quantile == 1.0:
window_func = self._get_roll_func("roll_max")
elif quantile == 0.0:
window_func = self._get_roll_func("roll_min")
else:
window_func = partial(
self._get_roll_func("roll_quantile"),
quantile=quantile,
interpolation=interpolation,
)
return self._apply(window_func, name="quantile", **kwargs)
_shared_docs[
"cov"
] = """
Calculate the %(name)s sample covariance.
Parameters
----------
other : Series, DataFrame, or ndarray, optional
If not supplied then will default to self and produce pairwise
output.
pairwise : bool, default None
If False then only matching columns between self and other will be
used and the output will be a DataFrame.
If True then all pairwise combinations will be calculated and the
output will be a MultiIndexed DataFrame in the case of DataFrame
inputs. In the case of missing elements, only complete pairwise
observations will be used.
ddof : int, default 1
Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of elements.
**kwargs
Keyword arguments to be passed into func.
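Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the %(name)s
calculation.
Examples
--------
A minimal illustration, pairing a series with itself so each window's
covariance equals its sample variance:
>>> s = pd.Series([1, 2, 3, 4])
>>> s.rolling(2).cov(s)
0    NaN
1    0.5
2    0.5
3    0.5
dtype: float64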
"""
def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
if other is None:
other = self._selected_obj
# pairwise defaults to True only when left unset
pairwise = True if pairwise is None else pairwise
other = self._shallow_copy(other)
# GH 32865. We leverage rolling.mean, so we pass
# to the rolling constructors the data used when constructing self:
# window width, frequency data, or a BaseIndexer subclass
# GH 16058: offset window
window = (
self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq
)
def _get_cov(X, Y):
# GH #12373 : rolling functions error on float32 data
# to avoid potential overflow, cast the data to float64
X = X.astype("float64")
Y = Y.astype("float64")
mean = lambda x: x.rolling(
window, self.min_periods, center=self.center
).mean(**kwargs)
count = (
(X + Y)
.rolling(window=window, min_periods=0, center=self.center)
.count(**kwargs)
)
bias_adj = count / (count - ddof)
return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj
return flex_binary_moment(
self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)
)
_shared_docs["corr"] = dedent(
"""
Calculate %(name)s correlation.
Parameters
----------
other : Series, DataFrame, or ndarray, optional
If not supplied then will default to self.
pairwise : bool, default None
Calculate pairwise combinations of columns within a
DataFrame. If `other` is not specified, defaults to `True`,
otherwise defaults to `False`.
Not relevant for :class:`~pandas.Series`.
**kwargs
Unused.
Returns
-------
Series or DataFrame
Returned object type is determined by the caller of the
%(name)s calculation.
See Also
--------
pandas.Series.%(name)s : Calling object with Series data.
pandas.DataFrame.%(name)s : Calling object with DataFrames.
pandas.Series.corr : Equivalent method for Series.
pandas.DataFrame.corr : Equivalent method for DataFrame.
cov : Similar method to calculate covariance.
numpy.corrcoef : NumPy Pearson's correlation calculation.
Notes
-----
This function uses Pearson's definition of correlation
(https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).
When `other` is not specified, the output will be self correlation (e.g.
all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
set to `True`.
Function will return ``NaN`` for correlations of equal valued sequences;
this is the result of a 0/0 division error.
When `pairwise` is set to `False`, only matching columns between `self` and
`other` will be used.
When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
with the original index on the first level, and the `other` DataFrame
columns on the second level.
In the case of missing elements, only complete pairwise observations
will be used.
Examples
--------
The below example shows a rolling calculation with a window size of
four matching the equivalent function call using :meth:`numpy.corrcoef`.
>>> v1 = [3, 3, 3, 5, 8]
>>> v2 = [3, 4, 4, 4, 8]
>>> # numpy returns a 2X2 array, the correlation coefficient
>>> # is the number at entry [0][1]
>>> print(f"{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}")
0.333333
>>> print(f"{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}")
0.916949
>>> s1 = pd.Series(v1)
>>> s2 = pd.Series(v2)
>>> s1.rolling(4).corr(s2)
0 NaN
1 NaN
2 NaN
3 0.333333
4 0.916949
dtype: float64
The below example shows a similar rolling calculation on a
DataFrame using the pairwise option.
>>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\
[46., 31.], [50., 36.]])
>>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7))
[[1. 0.6263001]
[0.6263001 1. ]]
>>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7))
[[1. 0.5553681]
[0.5553681 1. ]]
>>> df = pd.DataFrame(matrix, columns=['X','Y'])
>>> df
X Y
0 51.0 35.0
1 49.0 30.0
2 47.0 32.0
3 46.0 31.0
4 50.0 36.0
>>> df.rolling(4).corr(pairwise=True)
X Y
0 X NaN NaN
Y NaN NaN
1 X NaN NaN
Y NaN NaN
2 X NaN NaN
Y NaN NaN
3 X 1.000000 0.626300
Y 0.626300 1.000000
4 X 1.000000 0.555368
Y 0.555368 1.000000
"""
)
def corr(self, other=None, pairwise=None, **kwargs):
if other is None:
other = self._selected_obj
# pairwise defaults to True only when left unset
pairwise = True if pairwise is None else pairwise
other = self._shallow_copy(other)
# GH 32865. We leverage rolling.cov and rolling.std here, so we pass
# to the rolling constructors the data used when constructing self:
# window width, frequency data, or a BaseIndexer subclass
# GH 16058: offset window
window = (
self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq
)
def _get_corr(a, b):
a = a.rolling(
window=window, min_periods=self.min_periods, center=self.center
)
b = b.rolling(
window=window, min_periods=self.min_periods, center=self.center
)
# GH 31286: By using var instead of std we avoid numerical issues when
# the result of var is within floating point precision while std is not.
return a.cov(b, **kwargs) / (a.var(**kwargs) * b.var(**kwargs)) ** 0.5
return flex_binary_moment(
self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise)
)
class Rolling(RollingAndExpandingMixin):
@cache_readonly
def is_datetimelike(self) -> bool:
return isinstance(
self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
)
@cache_readonly
def _on(self) -> Index:
if self.on is None:
if self.axis == 0:
return self.obj.index
else:
# i.e. self.axis == 1
return self.obj.columns
elif isinstance(self.on, Index):
return self.on
elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns:
return Index(self.obj[self.on])
else:
raise ValueError(
f"invalid on specified as {self.on}, "
"must be a column (of DataFrame), an Index or None"
)
@property
def _constructor(self):
return Rolling
def validate(self):
super().validate()
# we allow rolling on a datetimelike index
if (self.obj.empty or self.is_datetimelike) and isinstance(
self.window, (str, BaseOffset, timedelta)
):
self._validate_monotonic()
# we don't allow center
if self.center:
raise NotImplementedError(
"center is not implemented for "
"datetimelike and offset based windows"
)
# this will raise ValueError on non-fixed freqs
self.win_freq = self.window
self.window = self._determine_window_length()
self.win_type = "freq"
# min_periods must be an integer
if self.min_periods is None:
self.min_periods = 1
elif isinstance(self.window, BaseIndexer):
# Passed BaseIndexer subclass should handle all other rolling kwargs
return
elif not is_integer(self.window):
raise ValueError("window must be an integer")
elif self.window < 0:
raise ValueError("window must be non-negative")
def _determine_window_length(self) -> Union[int, float]:
"""
Calculate the window length for PeriodIndexes based on the Index freq.
Cannot use nanos directly, because asi8 of a PeriodIndex is not in nanos.
"""
freq = self._validate_freq()
if isinstance(self._on, ABCPeriodIndex):
return freq.nanos / (self._on.freq.nanos / self._on.freq.n)
return freq.nanos
def _validate_monotonic(self):
"""
Validate monotonic (increasing or decreasing).
"""
if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing):
self._raise_monotonic_error()
def _raise_monotonic_error(self):
formatted = self.on
if self.on is None:
formatted = "index"
raise ValueError(f"{formatted} must be monotonic")
def _validate_freq(self):
"""
Validate & return window frequency.
"""
try:
return to_offset(self.window)
except (TypeError, ValueError) as err:
raise ValueError(
f"passed window {self.window} is not "
"compatible with a datetimelike index"
) from err
_agg_see_also_doc = dedent(
"""
See Also
--------
pandas.Series.rolling : Calling object with Series data.
pandas.DataFrame.rolling : Calling object with DataFrame data.
"""
)
_agg_examples_doc = dedent(
"""
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
>>> df
A B C
0 1 4 7
1 2 5 8
2 3 6 9
>>> df.rolling(2).sum()
A B C
0 NaN NaN NaN
1 3.0 9.0 15.0
2 5.0 11.0 17.0
>>> df.rolling(2).agg({"A": "sum", "B": "min"})
A B
0 NaN NaN
1 3.0 4.0
2 5.0 5.0
"""
)
@doc(
_shared_docs["aggregate"],
see_also=_agg_see_also_doc,
examples=_agg_examples_doc,
klass="Series/Dataframe",
axis="",
)
def aggregate(self, func, *args, **kwargs):
return super().aggregate(func, *args, **kwargs)
agg = aggregate
@Substitution(name="rolling")
@Appender(_shared_docs["count"])
def count(self):
if self.min_periods is None:
warnings.warn(
(
"min_periods=None will default to the size of window "
"consistent with other methods in a future version. "
"Specify min_periods=0 instead."
),
FutureWarning,
)
self.min_periods = 0
result = super().count()
self.min_periods = None
else:
result = super().count()
return result
@Substitution(name="rolling")
@Appender(_shared_docs["apply"])
def apply(
self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None
):
return super().apply(
func,
raw=raw,
engine=engine,
engine_kwargs=engine_kwargs,
args=args,
kwargs=kwargs,
)
@Substitution(name="rolling")
@Appender(_shared_docs["sum"])
def sum(self, *args, **kwargs):
nv.validate_rolling_func("sum", args, kwargs)
return super().sum(*args, **kwargs)
@Substitution(name="rolling", func_name="max")
@Appender(_doc_template)
@Appender(_shared_docs["max"])
def max(self, *args, **kwargs):
nv.validate_rolling_func("max", args, kwargs)
return super().max(*args, **kwargs)
@Substitution(name="rolling")
@Appender(_shared_docs["min"])
def min(self, *args, **kwargs):
nv.validate_rolling_func("min", args, kwargs)
return super().min(*args, **kwargs)
@Substitution(name="rolling")
@Appender(_shared_docs["mean"])
def mean(self, *args, **kwargs):
nv.validate_rolling_func("mean", args, kwargs)
return super().mean(*args, **kwargs)
@Substitution(name="rolling")
@Appender(_shared_docs["median"])
def median(self, **kwargs):
return super().median(**kwargs)
@Substitution(name="rolling", versionadded="")
@Appender(_shared_docs["std"])
def std(self, ddof=1, *args, **kwargs):
nv.validate_rolling_func("std", args, kwargs)
return super().std(ddof=ddof, **kwargs)
@Substitution(name="rolling", versionadded="")
@Appender(_shared_docs["var"])
def var(self, ddof=1, *args, **kwargs):
nv.validate_rolling_func("var", args, kwargs)
return super().var(ddof=ddof, **kwargs)
@Substitution(name="rolling", func_name="skew")
@Appender(_doc_template)
@Appender(_shared_docs["skew"])
def skew(self, **kwargs):
return super().skew(**kwargs)
@Substitution(name="rolling")
@Appender(_shared_docs["sem"])
def sem(self, ddof=1, *args, **kwargs):
return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5)
_agg_doc = dedent(
"""
Examples
--------
The example below will show a rolling calculation with a window size of
four matching the equivalent function call using `scipy.stats`.
>>> arr = [1, 2, 3, 4, 999]
>>> import scipy.stats
>>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}")
-1.200000
>>> print(f"{scipy.stats.kurtosis(arr[1:], bias=False):.6f}")
3.999946
>>> s = pd.Series(arr)
>>> s.rolling(4).kurt()
0 NaN
1 NaN
2 NaN
3 -1.200000
4 3.999946
dtype: float64
"""
)
@Appender(_agg_doc)
@Substitution(name="rolling")
@Appender(_shared_docs["kurt"])
def kurt(self, **kwargs):
return super().kurt(**kwargs)
@Substitution(name="rolling")
@Appender(_shared_docs["quantile"])
def quantile(self, quantile, interpolation="linear", **kwargs):
return super().quantile(
quantile=quantile, interpolation=interpolation, **kwargs
)
@Substitution(name="rolling", func_name="cov")
@Appender(_doc_template)
@Appender(_shared_docs["cov"])
def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs)
@Substitution(name="rolling")
@Appender(_shared_docs["corr"])
def corr(self, other=None, pairwise=None, **kwargs):
return super().corr(other=other, pairwise=pairwise, **kwargs)
Rolling.__doc__ = Window.__doc__
class RollingGroupby(BaseWindowGroupby, Rolling):
"""
Provide a rolling groupby implementation.
"""
def _get_window_indexer(self) -> GroupbyIndexer:
"""
Return an indexer class that will compute the window start and end bounds
Returns
-------
GroupbyIndexer
"""
rolling_indexer: Type[BaseIndexer]
indexer_kwargs: Optional[Dict[str, Any]] = None
index_array = self._index_array
window = self.window
if isinstance(self.window, BaseIndexer):
rolling_indexer = type(self.window)
indexer_kwargs = self.window.__dict__
assert isinstance(indexer_kwargs, dict) # for mypy
# We'll be using the index of each group later
indexer_kwargs.pop("index_array", None)
window = 0
elif self.is_freq_type:
rolling_indexer = VariableWindowIndexer
else:
rolling_indexer = FixedWindowIndexer
index_array = None
window_indexer = GroupbyIndexer(
index_array=index_array,
window_size=window,
groupby_indicies=self._groupby.indices,
window_indexer=rolling_indexer,
indexer_kwargs=indexer_kwargs,
)
return window_indexer
def _validate_monotonic(self):
"""
Validate that "on" is monotonic; here we only need to check for NaNs,
because monotonicity was already validated at a higher level.
"""
if self._on.hasnans:
self._raise_monotonic_error()