347 lines
12 KiB
Python
347 lines
12 KiB
Python
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
import pandas.util._test_decorators as td
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import DataFrame, Series, isna
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
class TestDataFrameCov:
|
||
|
def test_cov(self, float_frame, float_string_frame):
|
||
|
# min_periods no NAs (corner case)
|
||
|
expected = float_frame.cov()
|
||
|
result = float_frame.cov(min_periods=len(float_frame))
|
||
|
|
||
|
tm.assert_frame_equal(expected, result)
|
||
|
|
||
|
result = float_frame.cov(min_periods=len(float_frame) + 1)
|
||
|
assert isna(result.values).all()
|
||
|
|
||
|
# with NAs
|
||
|
frame = float_frame.copy()
|
||
|
frame["A"][:5] = np.nan
|
||
|
frame["B"][5:10] = np.nan
|
||
|
result = float_frame.cov(min_periods=len(float_frame) - 8)
|
||
|
expected = float_frame.cov()
|
||
|
expected.loc["A", "B"] = np.nan
|
||
|
expected.loc["B", "A"] = np.nan
|
||
|
|
||
|
# regular
|
||
|
float_frame["A"][:5] = np.nan
|
||
|
float_frame["B"][:10] = np.nan
|
||
|
cov = float_frame.cov()
|
||
|
|
||
|
tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"]))
|
||
|
|
||
|
# exclude non-numeric types
|
||
|
result = float_string_frame.cov()
|
||
|
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# Single column frame
|
||
|
df = DataFrame(np.linspace(0.0, 1.0, 10))
|
||
|
result = df.cov()
|
||
|
expected = DataFrame(
|
||
|
np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
df.loc[0] = np.nan
|
||
|
result = df.cov()
|
||
|
expected = DataFrame(
|
||
|
np.cov(df.values[1:].T).reshape((1, 1)),
|
||
|
index=df.columns,
|
||
|
columns=df.columns,
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
|
||
|
def test_cov_ddof(self, test_ddof):
|
||
|
# GH#34611
|
||
|
np_array1 = np.random.rand(10)
|
||
|
np_array2 = np.random.rand(10)
|
||
|
df = DataFrame({0: np_array1, 1: np_array2})
|
||
|
result = df.cov(ddof=test_ddof)
|
||
|
expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
|
||
|
expected = DataFrame(expected_np)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
|
||
|
)
|
||
|
def test_cov_nullable_integer(self, other_column):
|
||
|
# https://github.com/pandas-dev/pandas/issues/33803
|
||
|
data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
|
||
|
result = data.cov()
|
||
|
arr = np.array([[0.5, 0.5], [0.5, 1.0]])
|
||
|
expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameCorr:
|
||
|
# DataFrame.corr(), as opposed to DataFrame.corrwith
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_corr_scipy_method(self, float_frame, method):
|
||
|
float_frame["A"][:5] = np.nan
|
||
|
float_frame["B"][5:10] = np.nan
|
||
|
|
||
|
correls = float_frame.corr(method=method)
|
||
|
expected = float_frame["A"].corr(float_frame["C"], method=method)
|
||
|
tm.assert_almost_equal(correls["A"]["C"], expected)
|
||
|
|
||
|
# ---------------------------------------------------------------------
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_corr_non_numeric(self, float_frame, float_string_frame):
|
||
|
float_frame["A"][:5] = np.nan
|
||
|
float_frame["B"][5:10] = np.nan
|
||
|
|
||
|
# exclude non-numeric types
|
||
|
result = float_string_frame.corr()
|
||
|
expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
@pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
|
||
|
def test_corr_nooverlap(self, meth):
|
||
|
# nothing in common
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": [1, 1.5, 1, np.nan, np.nan, np.nan],
|
||
|
"B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
|
||
|
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
|
||
|
}
|
||
|
)
|
||
|
rs = df.corr(meth)
|
||
|
assert isna(rs.loc["A", "B"])
|
||
|
assert isna(rs.loc["B", "A"])
|
||
|
assert rs.loc["A", "A"] == 1
|
||
|
assert rs.loc["B", "B"] == 1
|
||
|
assert isna(rs.loc["C", "C"])
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
@pytest.mark.parametrize("meth", ["pearson", "spearman"])
|
||
|
def test_corr_constant(self, meth):
|
||
|
# constant --> all NA
|
||
|
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": [1, 1, 1, np.nan, np.nan, np.nan],
|
||
|
"B": [np.nan, np.nan, np.nan, 1, 1, 1],
|
||
|
}
|
||
|
)
|
||
|
rs = df.corr(meth)
|
||
|
assert isna(rs.values).all()
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_corr_int_and_boolean(self):
|
||
|
# when dtypes of pandas series are different
|
||
|
# then ndarray will have dtype=object,
|
||
|
# so it need to be properly handled
|
||
|
df = DataFrame({"a": [True, False], "b": [1, 0]})
|
||
|
|
||
|
expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
|
||
|
for meth in ["pearson", "kendall", "spearman"]:
|
||
|
|
||
|
with warnings.catch_warnings(record=True):
|
||
|
warnings.simplefilter("ignore", RuntimeWarning)
|
||
|
result = df.corr(meth)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_corr_cov_independent_index_column(self):
|
||
|
# GH#14617
|
||
|
df = DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd"))
|
||
|
for method in ["cov", "corr"]:
|
||
|
result = getattr(df, method)()
|
||
|
assert result.index is not result.columns
|
||
|
assert result.index.equals(result.columns)
|
||
|
|
||
|
def test_corr_invalid_method(self):
|
||
|
# GH#22298
|
||
|
df = DataFrame(np.random.normal(size=(10, 2)))
|
||
|
msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.corr(method="____")
|
||
|
|
||
|
def test_corr_int(self):
|
||
|
# dtypes other than float64 GH#1761
|
||
|
df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
|
||
|
|
||
|
df3.cov()
|
||
|
df3.corr()
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
@pytest.mark.parametrize(
|
||
|
"nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"other_column",
|
||
|
[pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
|
||
|
)
|
||
|
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
|
||
|
def test_corr_nullable_integer(self, nullable_column, other_column, method):
|
||
|
# https://github.com/pandas-dev/pandas/issues/33803
|
||
|
data = DataFrame({"a": nullable_column, "b": other_column})
|
||
|
result = data.corr(method=method)
|
||
|
expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_corr_item_cache(self):
|
||
|
# Check that corr does not lead to incorrect entries in item_cache
|
||
|
|
||
|
df = DataFrame({"A": range(10)})
|
||
|
df["B"] = range(10)[::-1]
|
||
|
|
||
|
ser = df["A"] # populate item_cache
|
||
|
assert len(df._mgr.blocks) == 2
|
||
|
|
||
|
_ = df.corr()
|
||
|
|
||
|
# Check that the corr didnt break link between ser and df
|
||
|
ser.values[0] = 99
|
||
|
assert df.loc[0, "A"] == 99
|
||
|
assert df["A"] is ser
|
||
|
assert df.values[0, 0] == 99
|
||
|
|
||
|
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
|
||
|
def test_corr_for_constant_columns(self, length):
|
||
|
# GH: 37448
|
||
|
df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
|
||
|
result = df.corr()
|
||
|
expected = DataFrame(
|
||
|
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_calc_corr_small_numbers(self):
|
||
|
# GH: 37452
|
||
|
df = DataFrame(
|
||
|
{"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]}
|
||
|
)
|
||
|
result = df.corr()
|
||
|
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameCorrWith:
|
||
|
def test_corrwith(self, datetime_frame):
|
||
|
a = datetime_frame
|
||
|
noise = Series(np.random.randn(len(a)), index=a.index)
|
||
|
|
||
|
b = datetime_frame.add(noise, axis=0)
|
||
|
|
||
|
# make sure order does not matter
|
||
|
b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
|
||
|
del b["B"]
|
||
|
|
||
|
colcorr = a.corrwith(b, axis=0)
|
||
|
tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))
|
||
|
|
||
|
rowcorr = a.corrwith(b, axis=1)
|
||
|
tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
|
||
|
|
||
|
dropped = a.corrwith(b, axis=0, drop=True)
|
||
|
tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
|
||
|
assert "B" not in dropped
|
||
|
|
||
|
dropped = a.corrwith(b, axis=1, drop=True)
|
||
|
assert a.index[-1] not in dropped.index
|
||
|
|
||
|
# non time-series data
|
||
|
index = ["a", "b", "c", "d", "e"]
|
||
|
columns = ["one", "two", "three", "four"]
|
||
|
df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
|
||
|
df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
|
||
|
correls = df1.corrwith(df2, axis=1)
|
||
|
for row in index[:4]:
|
||
|
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))
|
||
|
|
||
|
def test_corrwith_with_objects(self):
|
||
|
df1 = tm.makeTimeDataFrame()
|
||
|
df2 = tm.makeTimeDataFrame()
|
||
|
cols = ["A", "B", "C", "D"]
|
||
|
|
||
|
df1["obj"] = "foo"
|
||
|
df2["obj"] = "bar"
|
||
|
|
||
|
result = df1.corrwith(df2)
|
||
|
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
result = df1.corrwith(df2, axis=1)
|
||
|
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_corrwith_series(self, datetime_frame):
|
||
|
result = datetime_frame.corrwith(datetime_frame["A"])
|
||
|
expected = datetime_frame.apply(datetime_frame["A"].corr)
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_corrwith_matches_corrcoef(self):
|
||
|
df1 = DataFrame(np.arange(10000), columns=["a"])
|
||
|
df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])
|
||
|
c1 = df1.corrwith(df2)["a"]
|
||
|
c2 = np.corrcoef(df1["a"], df2["a"])[0][1]
|
||
|
|
||
|
tm.assert_almost_equal(c1, c2)
|
||
|
assert c1 < 1
|
||
|
|
||
|
def test_corrwith_mixed_dtypes(self):
|
||
|
# GH#18570
|
||
|
df = DataFrame(
|
||
|
{"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
|
||
|
)
|
||
|
s = Series([0, 6, 7, 3])
|
||
|
result = df.corrwith(s)
|
||
|
corrs = [df["a"].corr(s), df["b"].corr(s)]
|
||
|
expected = Series(data=corrs, index=["a", "b"])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_corrwith_index_intersection(self):
|
||
|
df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
|
||
|
df2 = DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"])
|
||
|
|
||
|
result = df1.corrwith(df2, drop=True).index.sort_values()
|
||
|
expected = df1.columns.intersection(df2.columns).sort_values()
|
||
|
tm.assert_index_equal(result, expected)
|
||
|
|
||
|
def test_corrwith_index_union(self):
|
||
|
df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
|
||
|
df2 = DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"])
|
||
|
|
||
|
result = df1.corrwith(df2, drop=False).index.sort_values()
|
||
|
expected = df1.columns.union(df2.columns).sort_values()
|
||
|
tm.assert_index_equal(result, expected)
|
||
|
|
||
|
def test_corrwith_dup_cols(self):
|
||
|
# GH#21925
|
||
|
df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
|
||
|
df2 = df1.copy()
|
||
|
df2 = pd.concat((df2, df2[0]), axis=1)
|
||
|
|
||
|
result = df1.corrwith(df2)
|
||
|
expected = Series(np.ones(4), index=[0, 0, 1, 2])
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_corrwith_spearman(self):
|
||
|
# GH#21925
|
||
|
df = DataFrame(np.random.random(size=(100, 3)))
|
||
|
result = df.corrwith(df ** 2, method="spearman")
|
||
|
expected = Series(np.ones(len(result)))
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_corrwith_kendall(self):
|
||
|
# GH#21925
|
||
|
df = DataFrame(np.random.random(size=(100, 3)))
|
||
|
result = df.corrwith(df ** 2, method="kendall")
|
||
|
expected = Series(np.ones(len(result)))
|
||
|
tm.assert_series_equal(result, expected)
|