489 lines
16 KiB
Python
489 lines
16 KiB
Python
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import DataFrame, MultiIndex, Series, date_range
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
def check(result, expected=None):
|
||
|
if expected is not None:
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
result.dtypes
|
||
|
str(result)
|
||
|
|
||
|
|
||
|
class TestDataFrameNonuniqueIndexes:
|
||
|
def test_column_dups_operations(self):
|
||
|
|
||
|
# assignment
|
||
|
# GH 3687
|
||
|
arr = np.random.randn(3, 2)
|
||
|
idx = list(range(2))
|
||
|
df = DataFrame(arr, columns=["A", "A"])
|
||
|
df.columns = idx
|
||
|
expected = DataFrame(arr, columns=idx)
|
||
|
check(df, expected)
|
||
|
|
||
|
idx = date_range("20130101", periods=4, freq="Q-NOV")
|
||
|
df = DataFrame(
|
||
|
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
|
||
|
)
|
||
|
df.columns = idx
|
||
|
expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
|
||
|
check(df, expected)
|
||
|
|
||
|
# insert
|
||
|
df = DataFrame(
|
||
|
[[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
|
||
|
columns=["foo", "bar", "foo", "hello"],
|
||
|
)
|
||
|
df["string"] = "bah"
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
|
||
|
columns=["foo", "bar", "foo", "hello", "string"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
with pytest.raises(ValueError, match="Length of value"):
|
||
|
df.insert(0, "AnotherColumn", range(len(df.index) - 1))
|
||
|
|
||
|
# insert same dtype
|
||
|
df["foo2"] = 3
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
|
||
|
columns=["foo", "bar", "foo", "hello", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# set (non-dup)
|
||
|
df["foo2"] = 4
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
|
||
|
columns=["foo", "bar", "foo", "hello", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
df["foo2"] = 3
|
||
|
|
||
|
# delete (non dup)
|
||
|
del df["bar"]
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
|
||
|
columns=["foo", "foo", "hello", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# try to delete again (its not consolidated)
|
||
|
del df["hello"]
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
|
||
|
columns=["foo", "foo", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# consolidate
|
||
|
df = df._consolidate()
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
|
||
|
columns=["foo", "foo", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# insert
|
||
|
df.insert(2, "new_col", 5.0)
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
|
||
|
columns=["foo", "foo", "new_col", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# insert a dup
|
||
|
with pytest.raises(ValueError, match="cannot insert"):
|
||
|
df.insert(2, "new_col", 4.0)
|
||
|
|
||
|
df.insert(2, "new_col", 4.0, allow_duplicates=True)
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
[1, 1, 4.0, 5.0, "bah", 3],
|
||
|
[1, 2, 4.0, 5.0, "bah", 3],
|
||
|
[2, 3, 4.0, 5.0, "bah", 3],
|
||
|
],
|
||
|
columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# delete (dup)
|
||
|
del df["foo"]
|
||
|
expected = DataFrame(
|
||
|
[[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
|
||
|
columns=["new_col", "new_col", "string", "foo2"],
|
||
|
)
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
# dup across dtypes
|
||
|
df = DataFrame(
|
||
|
[[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
|
||
|
columns=["foo", "bar", "foo", "hello"],
|
||
|
)
|
||
|
check(df)
|
||
|
|
||
|
df["foo2"] = 7.0
|
||
|
expected = DataFrame(
|
||
|
[[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
|
||
|
columns=["foo", "bar", "foo", "hello", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
result = df["foo"]
|
||
|
expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
|
||
|
check(result, expected)
|
||
|
|
||
|
# multiple replacements
|
||
|
df["foo"] = "string"
|
||
|
expected = DataFrame(
|
||
|
[
|
||
|
["string", 1, "string", 5, 7.0],
|
||
|
["string", 1, "string", 5, 7.0],
|
||
|
["string", 1, "string", 5, 7.0],
|
||
|
],
|
||
|
columns=["foo", "bar", "foo", "hello", "foo2"],
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
del df["foo"]
|
||
|
expected = DataFrame(
|
||
|
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
|
||
|
)
|
||
|
check(df, expected)
|
||
|
|
||
|
# values
|
||
|
df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
|
||
|
result = df.values
|
||
|
expected = np.array([[1, 2.5], [3, 4.5]])
|
||
|
assert (result == expected).all().all()
|
||
|
|
||
|
# rename, GH 4403
|
||
|
df4 = DataFrame(
|
||
|
{"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
|
||
|
index=MultiIndex.from_tuples(
|
||
|
[(600809, 20130331)], names=["STK_ID", "RPT_Date"]
|
||
|
),
|
||
|
)
|
||
|
|
||
|
df5 = DataFrame(
|
||
|
{
|
||
|
"RPT_Date": [20120930, 20121231, 20130331],
|
||
|
"STK_ID": [600809] * 3,
|
||
|
"STK_Name": ["饡驦", "饡驦", "饡驦"],
|
||
|
"TClose": [38.05, 41.66, 30.01],
|
||
|
},
|
||
|
index=MultiIndex.from_tuples(
|
||
|
[(600809, 20120930), (600809, 20121231), (600809, 20130331)],
|
||
|
names=["STK_ID", "RPT_Date"],
|
||
|
),
|
||
|
)
|
||
|
|
||
|
k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
|
||
|
result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
|
||
|
str(result)
|
||
|
result.dtypes
|
||
|
|
||
|
expected = DataFrame(
|
||
|
[[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
|
||
|
columns=[
|
||
|
"RT",
|
||
|
"TClose",
|
||
|
"TExg",
|
||
|
"RPT_Date",
|
||
|
"STK_ID",
|
||
|
"STK_Name",
|
||
|
"QT_Close",
|
||
|
],
|
||
|
).set_index(["STK_ID", "RPT_Date"], drop=False)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# reindex is invalid!
|
||
|
df = DataFrame(
|
||
|
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
|
||
|
)
|
||
|
msg = "cannot reindex from a duplicate axis"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.reindex(columns=["bar"])
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.reindex(columns=["bar", "foo"])
|
||
|
|
||
|
# drop
|
||
|
df = DataFrame(
|
||
|
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
|
||
|
)
|
||
|
result = df.drop(["a"], axis=1)
|
||
|
expected = DataFrame([[1], [1], [1]], columns=["bar"])
|
||
|
check(result, expected)
|
||
|
result = df.drop("a", axis=1)
|
||
|
check(result, expected)
|
||
|
|
||
|
# describe
|
||
|
df = DataFrame(
|
||
|
[[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||
|
columns=["bar", "a", "a"],
|
||
|
dtype="float64",
|
||
|
)
|
||
|
result = df.describe()
|
||
|
s = df.iloc[:, 0].describe()
|
||
|
expected = pd.concat([s, s, s], keys=df.columns, axis=1)
|
||
|
check(result, expected)
|
||
|
|
||
|
# check column dups with index equal and not equal to df's index
|
||
|
df = DataFrame(
|
||
|
np.random.randn(5, 3),
|
||
|
index=["a", "b", "c", "d", "e"],
|
||
|
columns=["A", "B", "A"],
|
||
|
)
|
||
|
for index in [df.index, pd.Index(list("edcba"))]:
|
||
|
this_df = df.copy()
|
||
|
expected_ser = Series(index.values, index=this_df.index)
|
||
|
expected_df = DataFrame(
|
||
|
{"A": expected_ser, "B": this_df["B"], "A": expected_ser},
|
||
|
columns=["A", "B", "A"],
|
||
|
)
|
||
|
this_df["A"] = index
|
||
|
check(this_df, expected_df)
|
||
|
|
||
|
# operations
|
||
|
for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
|
||
|
df = DataFrame({"A": np.arange(10), "B": np.random.rand(10)})
|
||
|
expected = getattr(df, op)(df)
|
||
|
expected.columns = ["A", "A"]
|
||
|
df.columns = ["A", "A"]
|
||
|
result = getattr(df, op)(df)
|
||
|
check(result, expected)
|
||
|
|
||
|
# multiple assignments that change dtypes
|
||
|
# the location indexer is a slice
|
||
|
# GH 6120
|
||
|
df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
|
||
|
expected = DataFrame(1.0, index=range(5), columns=["that", "that"])
|
||
|
|
||
|
df["that"] = 1.0
|
||
|
check(df, expected)
|
||
|
|
||
|
df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
|
||
|
expected = DataFrame(1, index=range(5), columns=["that", "that"])
|
||
|
|
||
|
df["that"] = 1
|
||
|
check(df, expected)
|
||
|
|
||
|
def test_column_dups2(self):
|
||
|
|
||
|
# drop buggy GH 6240
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": np.random.randn(5),
|
||
|
"B": np.random.randn(5),
|
||
|
"C": np.random.randn(5),
|
||
|
"D": ["a", "b", "c", "d", "e"],
|
||
|
}
|
||
|
)
|
||
|
|
||
|
expected = df.take([0, 1, 1], axis=1)
|
||
|
df2 = df.take([2, 0, 1, 2, 1], axis=1)
|
||
|
result = df2.drop("C", axis=1)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# dropna
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": np.random.randn(5),
|
||
|
"B": np.random.randn(5),
|
||
|
"C": np.random.randn(5),
|
||
|
"D": ["a", "b", "c", "d", "e"],
|
||
|
}
|
||
|
)
|
||
|
df.iloc[2, [0, 1, 2]] = np.nan
|
||
|
df.iloc[0, 0] = np.nan
|
||
|
df.iloc[1, 1] = np.nan
|
||
|
df.iloc[:, 3] = np.nan
|
||
|
expected = df.dropna(subset=["A", "B", "C"], how="all")
|
||
|
expected.columns = ["A", "A", "B", "C"]
|
||
|
|
||
|
df.columns = ["A", "A", "B", "C"]
|
||
|
|
||
|
result = df.dropna(subset=["A", "C"], how="all")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_getitem_boolean_series_with_duplicate_columns(self):
|
||
|
# boolean indexing
|
||
|
# GH 4879
|
||
|
dups = ["A", "A", "C", "D"]
|
||
|
df = DataFrame(
|
||
|
np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
|
||
|
)
|
||
|
expected = df[df.C > 6]
|
||
|
expected.columns = dups
|
||
|
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
|
||
|
result = df[df.C > 6]
|
||
|
check(result, expected)
|
||
|
|
||
|
def test_getitem_boolean_frame_with_duplicate_columns(self):
|
||
|
dups = ["A", "A", "C", "D"]
|
||
|
|
||
|
# where
|
||
|
df = DataFrame(
|
||
|
np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
|
||
|
)
|
||
|
# `df > 6` is a DataFrame with the same shape+alignment as df
|
||
|
expected = df[df > 6]
|
||
|
expected.columns = dups
|
||
|
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
|
||
|
result = df[df > 6]
|
||
|
check(result, expected)
|
||
|
|
||
|
def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self):
|
||
|
# `df.A > 6` is a DataFrame with a different shape from df
|
||
|
dups = ["A", "A", "C", "D"]
|
||
|
|
||
|
# boolean with the duplicate raises
|
||
|
df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
|
||
|
msg = "cannot reindex from a duplicate axis"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df[df.A > 6]
|
||
|
|
||
|
def test_column_dups_indexing(self):
|
||
|
|
||
|
# dup aligning operations should work
|
||
|
# GH 5185
|
||
|
df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
|
||
|
df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
|
||
|
expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
|
||
|
result = df1.sub(df2)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# equality
|
||
|
df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
|
||
|
df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
|
||
|
|
||
|
# not-comparing like-labelled
|
||
|
msg = "Can only compare identically-labeled DataFrame objects"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df1 == df2
|
||
|
|
||
|
df1r = df1.reindex_like(df2)
|
||
|
result = df1r == df2
|
||
|
expected = DataFrame(
|
||
|
[[False, True], [True, False], [False, False], [True, False]],
|
||
|
columns=["A", "A"],
|
||
|
)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# mixed column selection
|
||
|
# GH 5639
|
||
|
dfbool = DataFrame(
|
||
|
{
|
||
|
"one": Series([True, True, False], index=["a", "b", "c"]),
|
||
|
"two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
|
||
|
"three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
|
||
|
}
|
||
|
)
|
||
|
expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
|
||
|
result = dfbool[["one", "three", "one"]]
|
||
|
check(result, expected)
|
||
|
|
||
|
# multi-axis dups
|
||
|
# GH 6121
|
||
|
df = DataFrame(
|
||
|
np.arange(25.0).reshape(5, 5),
|
||
|
index=["a", "b", "c", "d", "e"],
|
||
|
columns=["A", "B", "C", "D", "E"],
|
||
|
)
|
||
|
z = df[["A", "C", "A"]].copy()
|
||
|
expected = z.loc[["a", "c", "a"]]
|
||
|
|
||
|
df = DataFrame(
|
||
|
np.arange(25.0).reshape(5, 5),
|
||
|
index=["a", "b", "c", "d", "e"],
|
||
|
columns=["A", "B", "C", "D", "E"],
|
||
|
)
|
||
|
z = df[["A", "C", "A"]]
|
||
|
result = z.loc[["a", "c", "a"]]
|
||
|
check(result, expected)
|
||
|
|
||
|
def test_columns_with_dups(self):
|
||
|
# GH 3468 related
|
||
|
|
||
|
# basic
|
||
|
df = DataFrame([[1, 2]], columns=["a", "a"])
|
||
|
df.columns = ["a", "a.1"]
|
||
|
str(df)
|
||
|
expected = DataFrame([[1, 2]], columns=["a", "a.1"])
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"])
|
||
|
df.columns = ["b", "a", "a.1"]
|
||
|
str(df)
|
||
|
expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
# with a dup index
|
||
|
df = DataFrame([[1, 2]], columns=["a", "a"])
|
||
|
df.columns = ["b", "b"]
|
||
|
str(df)
|
||
|
expected = DataFrame([[1, 2]], columns=["b", "b"])
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
# multi-dtype
|
||
|
df = DataFrame(
|
||
|
[[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
|
||
|
columns=["a", "a", "b", "b", "d", "c", "c"],
|
||
|
)
|
||
|
df.columns = list("ABCDEFG")
|
||
|
str(df)
|
||
|
expected = DataFrame(
|
||
|
[[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")
|
||
|
)
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
|
||
|
df.columns = ["a", "a.1", "a.2", "a.3"]
|
||
|
str(df)
|
||
|
expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
# dups across blocks
|
||
|
df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
|
||
|
df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
|
||
|
df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
|
||
|
df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
|
||
|
df_dt = DataFrame(
|
||
|
pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
|
||
|
)
|
||
|
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
|
||
|
|
||
|
assert len(df._mgr.blknos) == len(df.columns)
|
||
|
assert len(df._mgr.blklocs) == len(df.columns)
|
||
|
|
||
|
# testing iloc
|
||
|
for i in range(len(df.columns)):
|
||
|
df.iloc[:, i]
|
||
|
|
||
|
# dup columns across dtype GH 2079/2194
|
||
|
vals = [[1, -1, 2.0], [2, -2, 3.0]]
|
||
|
rs = DataFrame(vals, columns=["A", "A", "B"])
|
||
|
xp = DataFrame(vals)
|
||
|
xp.columns = ["A", "A", "B"]
|
||
|
tm.assert_frame_equal(rs, xp)
|
||
|
|
||
|
def test_set_value_by_index(self):
|
||
|
# See gh-12344
|
||
|
df = DataFrame(np.arange(9).reshape(3, 3).T)
|
||
|
df.columns = list("AAA")
|
||
|
expected = df.iloc[:, 2]
|
||
|
|
||
|
df.iloc[:, 0] = 3
|
||
|
tm.assert_series_equal(df.iloc[:, 2], expected)
|
||
|
|
||
|
df = DataFrame(np.arange(9).reshape(3, 3).T)
|
||
|
df.columns = [2, float(2), str(2)]
|
||
|
expected = df.iloc[:, 1]
|
||
|
|
||
|
df.iloc[:, 0] = 3
|
||
|
tm.assert_series_equal(df.iloc[:, 1], expected)
|