from datetime import datetime import numpy as np import pytest import pandas.util._test_decorators as td from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series import pandas._testing as tm from pandas.core.window.common import flex_binary_moment def _rolling_consistency_cases(): for window in [1, 2, 3, 10, 20]: for min_periods in {0, 1, 2, 3, 4, window}: if min_periods and (min_periods > window): continue for center in [False, True]: yield window, min_periods, center # binary moments def test_rolling_cov(series): A = series B = A + np.random.randn(len(A)) result = A.rolling(window=50, min_periods=25).cov(B) tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_corr(series): A = series B = A + np.random.randn(len(A)) result = A.rolling(window=50, min_periods=25).corr(B) tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) # test for correct bias correction a = tm.makeTimeSeries() b = tm.makeTimeSeries() a[:5] = np.nan b[:10] = np.nan result = a.rolling(window=len(a), min_periods=1).corr(b) tm.assert_almost_equal(result[-1], a.corr(b)) @pytest.mark.parametrize("func", ["cov", "corr"]) def test_rolling_pairwise_cov_corr(func, frame): result = getattr(frame.rolling(window=10, min_periods=5), func)() result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5]) tm.assert_series_equal(result, expected, check_names=False) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_flex_binary_frame(method, frame): series = frame[1] res = getattr(series.rolling(window=10), method)(frame) res2 = getattr(frame.rolling(window=10), method)(series) exp = frame.apply(lambda x: getattr(series.rolling(window=10), method)(x)) tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) frame2 = frame.copy() frame2.values[:] = np.random.randn(*frame2.shape) res3 = getattr(frame.rolling(window=10), method)(frame2) exp 
= DataFrame( {k: getattr(frame[k].rolling(window=10), method)(frame2[k]) for k in frame} ) tm.assert_frame_equal(res3, exp) @pytest.mark.parametrize( "window,min_periods,center", list(_rolling_consistency_cases()) ) @pytest.mark.parametrize("f", [lambda v: Series(v).sum(), np.nansum]) def test_rolling_apply_consistency_sum_nans( consistency_data, window, min_periods, center, f ): x, is_constant, no_nans = consistency_data if f is np.nansum and min_periods == 0: pass else: rolling_f_result = x.rolling( window=window, min_periods=min_periods, center=center ).sum() rolling_apply_f_result = x.rolling( window=window, min_periods=min_periods, center=center ).apply(func=f, raw=True) tm.assert_equal(rolling_f_result, rolling_apply_f_result) @pytest.mark.parametrize( "window,min_periods,center", list(_rolling_consistency_cases()) ) @pytest.mark.parametrize("f", [lambda v: Series(v).sum(), np.nansum, np.sum]) def test_rolling_apply_consistency_sum_no_nans( consistency_data, window, min_periods, center, f ): x, is_constant, no_nans = consistency_data if no_nans: if f is np.nansum and min_periods == 0: pass else: rolling_f_result = x.rolling( window=window, min_periods=min_periods, center=center ).sum() rolling_apply_f_result = x.rolling( window=window, min_periods=min_periods, center=center ).apply(func=f, raw=True) tm.assert_equal(rolling_f_result, rolling_apply_f_result) @pytest.mark.parametrize("window", range(7)) def test_rolling_corr_with_zero_variance(window): # GH 18430 s = Series(np.zeros(20)) other = Series(np.arange(20)) assert s.rolling(window=window).corr(other=other).isna().all() def test_flex_binary_moment(): # GH3155 # don't blow the stack msg = "arguments to moment function must be of type np.ndarray/Series/DataFrame" with pytest.raises(TypeError, match=msg): flex_binary_moment(5, 6, None) def test_corr_sanity(): # GH 3155 df = DataFrame( np.array( [ [0.87024726, 0.18505595], [0.64355431, 0.3091617], [0.92372966, 0.50552513], [0.00203756, 0.04520709], 
[0.84780328, 0.33394331], [0.78369152, 0.63919667], ] ) ) res = df[0].rolling(5, center=True).corr(df[1]) assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) df = DataFrame(np.random.rand(30, 2)) res = df[0].rolling(5, center=True).corr(df[1]) assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) def test_rolling_cov_diff_length(): # GH 7512 s1 = Series([1, 2, 3], index=[0, 1, 2]) s2 = Series([1, 3], index=[0, 2]) result = s1.rolling(window=3, min_periods=2).cov(s2) expected = Series([None, None, 2.0]) tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.rolling(window=3, min_periods=2).cov(s2a) tm.assert_series_equal(result, expected) def test_rolling_corr_diff_length(): # GH 7512 s1 = Series([1, 2, 3], index=[0, 1, 2]) s2 = Series([1, 3], index=[0, 2]) result = s1.rolling(window=3, min_periods=2).corr(s2) expected = Series([None, None, 1.0]) tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.rolling(window=3, min_periods=2).corr(s2a) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "f", [ lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), lambda x: x.rolling(window=10, min_periods=5).mean(), lambda x: x.rolling(window=10, min_periods=5).std(), lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), lambda x: x.rolling(window=10, min_periods=5).quantile(quantile=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), pytest.param( lambda x: 
x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), marks=td.skip_if_no_scipy, ), ], ) def test_rolling_functions_window_non_shrinkage(f): # GH 7764 s = Series(range(4)) s_expected = Series(np.nan, index=s.index) df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) s_result = f(s) tm.assert_series_equal(s_result, s_expected) df_result = f(df) tm.assert_frame_equal(df_result, df_expected) @pytest.mark.parametrize( "f", [ lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), ], ) def test_rolling_functions_window_non_shrinkage_binary(f): # corr/cov return a MI DataFrame df = DataFrame( [[1, 5], [3, 2], [3, 9], [-1, 0]], columns=Index(["A", "B"], name="foo"), index=Index(range(4), name="bar"), ) df_expected = DataFrame( columns=Index(["A", "B"], name="foo"), index=MultiIndex.from_product([df.index, df.columns], names=["bar", "foo"]), dtype="float64", ) df_result = f(df) tm.assert_frame_equal(df_result, df_expected) def test_rolling_skew_edge_cases(): all_nan = Series([np.NaN] * 5) # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5).skew() tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) x = d.rolling(window=2).skew() tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) x = d.rolling(window=4).skew() tm.assert_series_equal(expected, x) def test_rolling_kurt_edge_cases(): all_nan = Series([np.NaN] * 5) # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5).kurt() tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) x = d.rolling(window=3).kurt() tm.assert_series_equal(all_nan, x) # 
yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) x = d.rolling(window=4).kurt() tm.assert_series_equal(expected, x) def test_rolling_skew_eq_value_fperr(): # #18804 all rolling skew for all equal values should return Nan a = Series([1.1] * 15).rolling(window=10).skew() assert np.isnan(a).all() def test_rolling_kurt_eq_value_fperr(): # #18804 all rolling kurt for all equal values should return Nan a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() def test_rolling_max_gh6297(): """Replicate result expected in GH #6297""" indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 2 datapoints on one of the days indices.append(datetime(1975, 1, 3, 6, 0)) series = Series(range(1, 7), index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically series = series.sort_index() expected = Series( [1.0, 2.0, 6.0, 4.0, 5.0], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), ) x = series.resample("D").max().rolling(window=1).max() tm.assert_series_equal(expected, x) def test_rolling_max_resample(): indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) series = Series(list(range(0, 5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically series = series.sort_index() # Default how should be max expected = Series( [0.0, 1.0, 2.0, 3.0, 20.0], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), ) x = series.resample("D").max().rolling(window=1).max() tm.assert_series_equal(expected, x) # Now specify median (10.0) expected = Series( [0.0, 1.0, 2.0, 3.0, 10.0], 
index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), ) x = series.resample("D").median().rolling(window=1).max() tm.assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 v = (4.0 + 10.0 + 20.0) / 3.0 expected = Series( [0.0, 1.0, 2.0, 3.0, v], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), ) x = series.resample("D").mean().rolling(window=1).max() tm.assert_series_equal(expected, x) def test_rolling_min_resample(): indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) series = Series(list(range(0, 5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically series = series.sort_index() # Default how should be min expected = Series( [0.0, 1.0, 2.0, 3.0, 4.0], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), ) r = series.resample("D").min().rolling(window=1) tm.assert_series_equal(expected, r.min()) def test_rolling_median_resample(): indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) series = Series(list(range(0, 5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically series = series.sort_index() # Default how should be median expected = Series( [0.0, 1.0, 2.0, 3.0, 10], index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"), ) x = series.resample("D").median().rolling(window=1).median() tm.assert_series_equal(expected, x) def test_rolling_median_memory_error(): # GH11722 n = 20000 Series(np.random.randn(n)).rolling(window=2, center=False).median() Series(np.random.randn(n)).rolling(window=2, center=False).median() 
@pytest.mark.parametrize(
    "data_type",
    [
        np.dtype(code)
        for code in ("f4", "f8", "u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8")
    ],
)
def test_rolling_min_max_numeric_types(data_type):
    """Rolling max/min over each numeric dtype yields a float64 column."""
    # GH12373
    # Only checks that no exception is raised and that the result dtype is
    # float64; quantitative correctness is covered by other tests.
    for reducer in ("max", "min"):
        windowed = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5)
        result = getattr(windowed, reducer)()
        assert result.dtypes[0] == np.dtype("f8")


@pytest.mark.parametrize(
    "f",
    [
        lambda obj: obj.rolling(window=10, min_periods=0).count(),
        lambda obj: obj.rolling(window=10, min_periods=5).cov(obj, pairwise=False),
        lambda obj: obj.rolling(window=10, min_periods=5).corr(obj, pairwise=False),
        lambda obj: obj.rolling(window=10, min_periods=5).max(),
        lambda obj: obj.rolling(window=10, min_periods=5).min(),
        lambda obj: obj.rolling(window=10, min_periods=5).sum(),
        lambda obj: obj.rolling(window=10, min_periods=5).mean(),
        lambda obj: obj.rolling(window=10, min_periods=5).std(),
        lambda obj: obj.rolling(window=10, min_periods=5).var(),
        lambda obj: obj.rolling(window=10, min_periods=5).skew(),
        lambda obj: obj.rolling(window=10, min_periods=5).kurt(),
        lambda obj: obj.rolling(window=10, min_periods=5).quantile(0.5),
        lambda obj: obj.rolling(window=10, min_periods=5).median(),
        lambda obj: obj.rolling(window=10, min_periods=5).apply(sum, raw=False),
        lambda obj: obj.rolling(window=10, min_periods=5).apply(sum, raw=True),
        pytest.param(
            lambda obj: obj.rolling(
                win_type="boxcar", window=10, min_periods=5
            ).mean(),
            marks=td.skip_if_no_scipy,
        ),
    ],
)
def test_moment_functions_zero_length(f):
    """Each rolling function maps an empty Series/DataFrame to an equal empty object."""
    # GH 8056
    empty_series = Series(dtype=np.float64)
    empty_frame = DataFrame()
    empty_float_frame = DataFrame(columns=["a"])
    empty_float_frame["a"] = empty_float_frame["a"].astype("float64")

    # Each input doubles as its own expected output.
    checks = (
        (empty_series, tm.assert_series_equal),
        (empty_frame, tm.assert_frame_equal),
        (empty_float_frame, tm.assert_frame_equal),
    )
    for obj, assert_same in checks:
        assert_same(f(obj), obj)


@pytest.mark.parametrize(
    "f",
    [
        lambda obj: (obj.rolling(window=10, min_periods=5).cov(obj, pairwise=True)),
        lambda obj: (obj.rolling(window=10, min_periods=5).corr(obj, pairwise=True)),
    ],
)
def test_moment_functions_zero_length_pairwise(f):
    """Pairwise cov/corr of empty frames gives empty MultiIndex-ed frames."""
    plain = DataFrame()
    named = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar"))
    named["a"] = named["a"].astype("float64")

    plain_expected = DataFrame(
        index=MultiIndex.from_product([plain.index, plain.columns]),
        columns=Index([]),
    )
    named_expected = DataFrame(
        index=MultiIndex.from_product(
            [named.index, named.columns], names=["bar", "foo"]
        ),
        columns=Index(["a"], name="foo"),
        dtype="float64",
    )

    tm.assert_frame_equal(f(plain), plain_expected)
    tm.assert_frame_equal(f(named), named_expected)


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
@pytest.mark.parametrize("ddof", [0, 1])
def test_moments_consistency_var(consistency_data, window, min_periods, center, ddof):
    """Rolling var is non-negative; biased var equals mean(x^2) - mean(x)^2."""
    x, is_constant, no_nans = consistency_data
    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}

    rolling_mean = x.rolling(**roll_kwargs).mean()
    rolling_var = x.rolling(**roll_kwargs).var(ddof=ddof)
    assert not (rolling_var < 0).any().any()

    if ddof != 0:
        return

    # check that biased var(x) == mean(x^2) - mean(x)^2
    mean_of_squares = (x * x).rolling(**roll_kwargs).mean()
    tm.assert_equal(rolling_var, mean_of_squares - (rolling_mean * rolling_mean))


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
@pytest.mark.parametrize("ddof", [0, 1])
def test_moments_consistency_var_constant(
    consistency_data, window, min_periods, center, ddof
):
    """For constant data, rolling var is 0 where enough observations exist, else NaN."""
    x, is_constant, no_nans = consistency_data
    if not is_constant:
        return

    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}
    observed = x.rolling(**roll_kwargs).count()
    rolling_var = x.rolling(**roll_kwargs).var(ddof=ddof)

    # check that variance of constant series is identically 0
    assert not (rolling_var > 0).any().any()

    expected = x * np.nan
    expected[observed >= max(min_periods, 1)] = 0.0
    if ddof == 1:
        expected[observed < 2] = np.nan
    tm.assert_equal(rolling_var, expected)


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
@pytest.mark.parametrize("ddof", [0, 1])
def test_rolling_consistency_std(consistency_data, window, min_periods, center, ddof):
    """Rolling var and std are non-negative and var == std ** 2."""
    x, is_constant, no_nans = consistency_data
    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}

    rolling_var = x.rolling(**roll_kwargs).var(ddof=ddof)
    rolling_std = x.rolling(**roll_kwargs).std(ddof=ddof)

    assert not (rolling_var < 0).any().any()
    assert not (rolling_std < 0).any().any()

    # check that var(x) == std(x)^2
    tm.assert_equal(rolling_var, rolling_std * rolling_std)


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
@pytest.mark.parametrize("ddof", [0, 1])
def test_rolling_consistency_cov(consistency_data, window, min_periods, center, ddof):
    """Rolling var(x) equals rolling cov(x, x), and both are non-negative."""
    x, is_constant, no_nans = consistency_data
    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}

    rolling_var = x.rolling(**roll_kwargs).var(ddof=ddof)
    assert not (rolling_var < 0).any().any()

    self_cov = x.rolling(**roll_kwargs).cov(x, ddof=ddof)
    assert not (self_cov < 0).any().any()

    # check that var(x) == cov(x, x)
    tm.assert_equal(rolling_var, self_cov)


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
@pytest.mark.parametrize("ddof", [0, 1])
def test_rolling_consistency_series_cov_corr(
    consistency_data, window, min_periods, center, ddof
):
    """Check cov/corr identities for Series input, using ``x`` as both operands."""
    x, is_constant, no_nans = consistency_data
    if not isinstance(x, Series):
        return

    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}

    var_of_sum = (x + x).rolling(**roll_kwargs).var(ddof=ddof)
    var_first = x.rolling(**roll_kwargs).var(ddof=ddof)
    var_second = x.rolling(**roll_kwargs).var(ddof=ddof)
    cov_pair = x.rolling(**roll_kwargs).cov(x, ddof=ddof)
    # check that cov(x, y) == (var(x+y) - var(x) -
    # var(y)) / 2
    tm.assert_equal(cov_pair, 0.5 * (var_of_sum - var_first - var_second))

    # check that corr(x, y) == cov(x, y) / (std(x) *
    # std(y))
    corr_pair = x.rolling(**roll_kwargs).corr(x)
    std_first = x.rolling(**roll_kwargs).std(ddof=ddof)
    std_second = x.rolling(**roll_kwargs).std(ddof=ddof)
    tm.assert_equal(corr_pair, cov_pair / (std_first * std_second))

    if ddof == 0:
        # check that biased cov(x, y) == mean(x*y) -
        # mean(x)*mean(y)
        mean_first = x.rolling(**roll_kwargs).mean()
        mean_second = x.rolling(**roll_kwargs).mean()
        mean_of_product = (x * x).rolling(**roll_kwargs).mean()
        tm.assert_equal(cov_pair, mean_of_product - (mean_first * mean_second))


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
def test_rolling_consistency_mean(consistency_data, window, min_periods, center):
    """Rolling mean equals rolling sum divided by rolling count."""
    x, is_constant, no_nans = consistency_data
    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}

    result = x.rolling(**roll_kwargs).mean()
    rolling_sum = x.rolling(**roll_kwargs).sum()
    rolling_count = x.rolling(**roll_kwargs).count()
    expected = rolling_sum.divide(rolling_count)
    tm.assert_equal(result, expected.astype("float64"))


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
def test_rolling_consistency_constant(consistency_data, window, min_periods, center):
    """For constant data, rolling mean is the constant and self-correlation is NaN."""
    x, is_constant, no_nans = consistency_data
    if not is_constant:
        return

    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}
    observed = x.rolling(**roll_kwargs).count()
    rolling_mean = x.rolling(**roll_kwargs).mean()
    # check that correlation of a series with itself is either 1 or NaN
    self_corr = x.rolling(**roll_kwargs).corr(x)

    if isinstance(x, Series):
        constant = x.max()
    else:
        constant = x.max().max()

    # check mean of constant series
    expected = x * np.nan
    expected[observed >= max(min_periods, 1)] = constant
    tm.assert_equal(rolling_mean, expected)

    # check correlation of constant series with itself is NaN
    expected[:] = np.nan
    tm.assert_equal(self_corr, expected)


@pytest.mark.parametrize(
    "window,min_periods,center", list(_rolling_consistency_cases())
)
def test_rolling_consistency_var_debiasing_factors(
    consistency_data, window, min_periods, center
):
    """Unbiased var equals biased var times count / (count - 1)."""
    x, is_constant, no_nans = consistency_data
    roll_kwargs = {"window": window, "min_periods": min_periods, "center": center}

    # check variance debiasing factors
    unbiased = x.rolling(**roll_kwargs).var()
    biased = x.rolling(**roll_kwargs).var(ddof=0)

    numerator = x.rolling(**roll_kwargs).count()
    # A zero denominator (count == 1) is replaced by NaN before dividing.
    denominator = (x.rolling(**roll_kwargs).count() - 1.0).replace(0.0, np.nan)
    debiasing_factors = numerator.divide(denominator)

    tm.assert_equal(unbiased, biased * debiasing_factors)