387 lines
12 KiB
Python
387 lines
12 KiB
Python
from datetime import datetime
|
|
|
|
import numpy as np
|
|
from numpy.random import randn
|
|
|
|
from pandas import DataFrame, Series, bdate_range, notna
|
|
import pandas._testing as tm
|
|
|
|
N, K = 100, 10
|
|
|
|
|
|
class Base:
|
|
|
|
_nan_locs = np.arange(20, 40)
|
|
_inf_locs = np.array([])
|
|
|
|
def _create_data(self):
|
|
arr = randn(N)
|
|
arr[self._nan_locs] = np.NaN
|
|
|
|
self.arr = arr
|
|
self.rng = bdate_range(datetime(2009, 1, 1), periods=N)
|
|
self.series = Series(arr.copy(), index=self.rng)
|
|
self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K))
|
|
|
|
|
|
# create the data only once as we are not setting it
|
|
def _create_consistency_data():
|
|
def create_series():
|
|
return [
|
|
Series(dtype=object),
|
|
Series([np.nan]),
|
|
Series([np.nan, np.nan]),
|
|
Series([3.0]),
|
|
Series([np.nan, 3.0]),
|
|
Series([3.0, np.nan]),
|
|
Series([1.0, 3.0]),
|
|
Series([2.0, 2.0]),
|
|
Series([3.0, 1.0]),
|
|
Series(
|
|
[5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan]
|
|
),
|
|
Series(
|
|
[
|
|
np.nan,
|
|
5.0,
|
|
5.0,
|
|
5.0,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
5.0,
|
|
5.0,
|
|
np.nan,
|
|
np.nan,
|
|
]
|
|
),
|
|
Series(
|
|
[
|
|
np.nan,
|
|
np.nan,
|
|
5.0,
|
|
5.0,
|
|
np.nan,
|
|
np.nan,
|
|
np.nan,
|
|
5.0,
|
|
5.0,
|
|
np.nan,
|
|
np.nan,
|
|
]
|
|
),
|
|
Series(
|
|
[
|
|
np.nan,
|
|
3.0,
|
|
np.nan,
|
|
3.0,
|
|
4.0,
|
|
5.0,
|
|
6.0,
|
|
np.nan,
|
|
np.nan,
|
|
7.0,
|
|
12.0,
|
|
13.0,
|
|
14.0,
|
|
15.0,
|
|
]
|
|
),
|
|
Series(
|
|
[
|
|
np.nan,
|
|
5.0,
|
|
np.nan,
|
|
2.0,
|
|
4.0,
|
|
0.0,
|
|
9.0,
|
|
np.nan,
|
|
np.nan,
|
|
3.0,
|
|
12.0,
|
|
13.0,
|
|
14.0,
|
|
15.0,
|
|
]
|
|
),
|
|
Series(
|
|
[
|
|
2.0,
|
|
3.0,
|
|
np.nan,
|
|
3.0,
|
|
4.0,
|
|
5.0,
|
|
6.0,
|
|
np.nan,
|
|
np.nan,
|
|
7.0,
|
|
12.0,
|
|
13.0,
|
|
14.0,
|
|
15.0,
|
|
]
|
|
),
|
|
Series(
|
|
[
|
|
2.0,
|
|
5.0,
|
|
np.nan,
|
|
2.0,
|
|
4.0,
|
|
0.0,
|
|
9.0,
|
|
np.nan,
|
|
np.nan,
|
|
3.0,
|
|
12.0,
|
|
13.0,
|
|
14.0,
|
|
15.0,
|
|
]
|
|
),
|
|
Series(range(10)),
|
|
Series(range(20, 0, -2)),
|
|
]
|
|
|
|
def create_dataframes():
|
|
return [
|
|
DataFrame(),
|
|
DataFrame(columns=["a"]),
|
|
DataFrame(columns=["a", "a"]),
|
|
DataFrame(columns=["a", "b"]),
|
|
DataFrame(np.arange(10).reshape((5, 2))),
|
|
DataFrame(np.arange(25).reshape((5, 5))),
|
|
DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]),
|
|
] + [DataFrame(s) for s in create_series()]
|
|
|
|
def is_constant(x):
|
|
values = x.values.ravel()
|
|
return len(set(values[notna(values)])) == 1
|
|
|
|
def no_nans(x):
|
|
return x.notna().all().all()
|
|
|
|
# data is a tuple(object, is_constant, no_nans)
|
|
data = create_series() + create_dataframes()
|
|
|
|
return [(x, is_constant(x), no_nans(x)) for x in data]
|
|
|
|
|
|
_consistency_data = _create_consistency_data()
|
|
|
|
|
|
class ConsistencyBase(Base):
|
|
base_functions = [
|
|
(lambda v: Series(v).count(), None, "count"),
|
|
(lambda v: Series(v).max(), None, "max"),
|
|
(lambda v: Series(v).min(), None, "min"),
|
|
(lambda v: Series(v).sum(), None, "sum"),
|
|
(lambda v: Series(v).mean(), None, "mean"),
|
|
(lambda v: Series(v).std(), 1, "std"),
|
|
(lambda v: Series(v).cov(Series(v)), None, "cov"),
|
|
(lambda v: Series(v).corr(Series(v)), None, "corr"),
|
|
(lambda v: Series(v).var(), 1, "var"),
|
|
# restore once GH 8086 is fixed
|
|
# lambda v: Series(v).skew(), 3, 'skew'),
|
|
# (lambda v: Series(v).kurt(), 4, 'kurt'),
|
|
# restore once GH 8084 is fixed
|
|
# lambda v: Series(v).quantile(0.3), None, 'quantile'),
|
|
(lambda v: Series(v).median(), None, "median"),
|
|
(np.nanmax, 1, "max"),
|
|
(np.nanmin, 1, "min"),
|
|
(np.nansum, 1, "sum"),
|
|
(np.nanmean, 1, "mean"),
|
|
(lambda v: np.nanstd(v, ddof=1), 1, "std"),
|
|
(lambda v: np.nanvar(v, ddof=1), 1, "var"),
|
|
(np.nanmedian, 1, "median"),
|
|
]
|
|
no_nan_functions = [
|
|
(np.max, None, "max"),
|
|
(np.min, None, "min"),
|
|
(np.sum, None, "sum"),
|
|
(np.mean, None, "mean"),
|
|
(lambda v: np.std(v, ddof=1), 1, "std"),
|
|
(lambda v: np.var(v, ddof=1), 1, "var"),
|
|
(np.median, None, "median"),
|
|
]
|
|
|
|
def _create_data(self):
|
|
super()._create_data()
|
|
self.data = _consistency_data
|
|
|
|
def _test_moments_consistency_mock_mean(self, mean, mock_mean):
|
|
for (x, is_constant, no_nans) in self.data:
|
|
mean_x = mean(x)
|
|
# check that correlation of a series with itself is either 1 or NaN
|
|
|
|
if mock_mean:
|
|
# check that mean equals mock_mean
|
|
expected = mock_mean(x)
|
|
tm.assert_equal(mean_x, expected.astype("float64"))
|
|
|
|
def _test_moments_consistency_is_constant(self, min_periods, count, mean, corr):
|
|
for (x, is_constant, no_nans) in self.data:
|
|
count_x = count(x)
|
|
mean_x = mean(x)
|
|
# check that correlation of a series with itself is either 1 or NaN
|
|
corr_x_x = corr(x, x)
|
|
|
|
if is_constant:
|
|
exp = x.max() if isinstance(x, Series) else x.max().max()
|
|
|
|
# check mean of constant series
|
|
expected = x * np.nan
|
|
expected[count_x >= max(min_periods, 1)] = exp
|
|
tm.assert_equal(mean_x, expected)
|
|
|
|
# check correlation of constant series with itself is NaN
|
|
expected[:] = np.nan
|
|
tm.assert_equal(corr_x_x, expected)
|
|
|
|
def _test_moments_consistency_var_debiasing_factors(
|
|
self, var_biased=None, var_unbiased=None, var_debiasing_factors=None
|
|
):
|
|
for (x, is_constant, no_nans) in self.data:
|
|
if var_unbiased and var_biased and var_debiasing_factors:
|
|
# check variance debiasing factors
|
|
var_unbiased_x = var_unbiased(x)
|
|
var_biased_x = var_biased(x)
|
|
var_debiasing_factors_x = var_debiasing_factors(x)
|
|
tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x)
|
|
|
|
def _test_moments_consistency(
|
|
self,
|
|
min_periods,
|
|
count,
|
|
mean,
|
|
corr,
|
|
var_unbiased=None,
|
|
std_unbiased=None,
|
|
cov_unbiased=None,
|
|
var_biased=None,
|
|
std_biased=None,
|
|
cov_biased=None,
|
|
):
|
|
|
|
for (x, is_constant, no_nans) in self.data:
|
|
count_x = count(x)
|
|
mean_x = mean(x)
|
|
|
|
for (std, var, cov) in [
|
|
(std_biased, var_biased, cov_biased),
|
|
(std_unbiased, var_unbiased, cov_unbiased),
|
|
]:
|
|
|
|
# check that var(x), std(x), and cov(x) are all >= 0
|
|
var_x = var(x)
|
|
std_x = std(x)
|
|
assert not (var_x < 0).any().any()
|
|
assert not (std_x < 0).any().any()
|
|
if cov:
|
|
cov_x_x = cov(x, x)
|
|
assert not (cov_x_x < 0).any().any()
|
|
|
|
# check that var(x) == cov(x, x)
|
|
tm.assert_equal(var_x, cov_x_x)
|
|
|
|
# check that var(x) == std(x)^2
|
|
tm.assert_equal(var_x, std_x * std_x)
|
|
|
|
if var is var_biased:
|
|
# check that biased var(x) == mean(x^2) - mean(x)^2
|
|
mean_x2 = mean(x * x)
|
|
tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x))
|
|
|
|
if is_constant:
|
|
# check that variance of constant series is identically 0
|
|
assert not (var_x > 0).any().any()
|
|
expected = x * np.nan
|
|
expected[count_x >= max(min_periods, 1)] = 0.0
|
|
if var is var_unbiased:
|
|
expected[count_x < 2] = np.nan
|
|
tm.assert_equal(var_x, expected)
|
|
|
|
if isinstance(x, Series):
|
|
for (y, is_constant, no_nans) in self.data:
|
|
if not x.isna().equals(y.isna()):
|
|
# can only easily test two Series with similar
|
|
# structure
|
|
continue
|
|
|
|
# check that cor(x, y) is symmetric
|
|
corr_x_y = corr(x, y)
|
|
corr_y_x = corr(y, x)
|
|
tm.assert_equal(corr_x_y, corr_y_x)
|
|
|
|
if cov:
|
|
# check that cov(x, y) is symmetric
|
|
cov_x_y = cov(x, y)
|
|
cov_y_x = cov(y, x)
|
|
tm.assert_equal(cov_x_y, cov_y_x)
|
|
|
|
# check that cov(x, y) == (var(x+y) - var(x) -
|
|
# var(y)) / 2
|
|
var_x_plus_y = var(x + y)
|
|
var_y = var(y)
|
|
tm.assert_equal(
|
|
cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)
|
|
)
|
|
|
|
# check that corr(x, y) == cov(x, y) / (std(x) *
|
|
# std(y))
|
|
std_y = std(y)
|
|
tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y))
|
|
|
|
if cov is cov_biased:
|
|
# check that biased cov(x, y) == mean(x*y) -
|
|
# mean(x)*mean(y)
|
|
mean_y = mean(y)
|
|
mean_x_times_y = mean(x * y)
|
|
tm.assert_equal(
|
|
cov_x_y, mean_x_times_y - (mean_x * mean_y)
|
|
)
|
|
|
|
def _check_pairwise_moment(self, dispatch, name, **kwargs):
|
|
def get_result(obj, obj2=None):
|
|
return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2)
|
|
|
|
result = get_result(self.frame)
|
|
result = result.loc[(slice(None), 1), 5]
|
|
result.index = result.index.droplevel(1)
|
|
expected = get_result(self.frame[1], self.frame[5])
|
|
tm.assert_series_equal(result, expected, check_names=False)
|
|
|
|
|
|
def ew_func(A, B, com, name, **kwargs):
|
|
return getattr(A.ewm(com, **kwargs), name)(B)
|
|
|
|
|
|
def check_binary_ew(name, A, B):
|
|
|
|
result = ew_func(A=A, B=B, com=20, name=name, min_periods=5)
|
|
assert np.isnan(result.values[:14]).all()
|
|
assert not np.isnan(result.values[14:]).any()
|
|
|
|
|
|
def check_binary_ew_min_periods(name, min_periods, A, B):
|
|
# GH 7898
|
|
result = ew_func(A, B, 20, name=name, min_periods=min_periods)
|
|
# binary functions (ewmcov, ewmcorr) with bias=False require at
|
|
# least two values
|
|
assert np.isnan(result.values[:11]).all()
|
|
assert not np.isnan(result.values[11:]).any()
|
|
|
|
# check series of length 0
|
|
empty = Series([], dtype=np.float64)
|
|
result = ew_func(empty, empty, 50, name=name, min_periods=min_periods)
|
|
tm.assert_series_equal(result, empty)
|
|
|
|
# check series of length 1
|
|
result = ew_func(
|
|
Series([1.0]), Series([1.0]), 50, name=name, min_periods=min_periods
|
|
)
|
|
tm.assert_series_equal(result, Series([np.NaN]))
|