# Inzynierka/Lib/site-packages/pandas/tests/generic/test_duplicate_labels.py
#
# 449 lines
# 16 KiB
# Python
# Raw Permalink Normal View History
#
# 2023-06-02 12:51:02 +02:00
"""Tests dealing with the NDFrame ``flags.allows_duplicate_labels`` setting."""
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
# Shared xfail marker with reason "Not implemented."; used in this module both
# as a decorator and through ``pytest.param(..., marks=not_implemented)``.
not_implemented = pytest.mark.xfail(reason="Not implemented.")
# ----------------------------------------------------------------------------
# Preservation
class TestPreserves:
    """Operations whose result should keep ``allows_duplicate_labels=False``.

    Tests carrying the ``not_implemented`` marker are expected failures.
    """

    @pytest.mark.parametrize(
        "cls, data",
        [
            (pd.Series, np.array([])),
            (pd.Series, [1, 2]),
            (pd.DataFrame, {}),
            (pd.DataFrame, {"A": [1, 2]}),
        ],
    )
    def test_construction_ok(self, cls, data):
        """A fresh object allows duplicates; ``set_flags`` can switch that off."""
        result = cls(data)
        assert result.flags.allows_duplicate_labels is True

        result = cls(data).set_flags(allows_duplicate_labels=False)
        assert result.flags.allows_duplicate_labels is False

    @pytest.mark.parametrize(
        "func",
        [
            operator.itemgetter(["a"]),
            operator.methodcaller("add", 1),
            operator.methodcaller("rename", str.upper),
            operator.methodcaller("rename", "name"),
            operator.methodcaller("abs"),
            np.abs,
        ],
    )
    def test_preserved_series(self, func):
        """The flag stays ``False`` on the result of each parametrized Series op."""
        s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
        assert func(s).flags.allows_duplicate_labels is False

    @pytest.mark.parametrize(
        "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])]
    )
    # TODO: frame
    @not_implemented
    def test_align(self, other):
        """Both outputs of ``Series.align`` should disallow duplicates (xfail)."""
        s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
        a, b = s.align(other)
        assert a.flags.allows_duplicate_labels is False
        assert b.flags.allows_duplicate_labels is False

    def test_preserved_frame(self):
        """Row and column selection through ``.loc`` keeps the flag on a frame."""
        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
            allows_duplicate_labels=False
        )
        assert df.loc[["a"]].flags.allows_duplicate_labels is False
        assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False

    def test_to_frame(self):
        """``Series.to_frame`` keeps the flag."""
        ser = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False)
        assert ser.to_frame().flags.allows_duplicate_labels is False

    @pytest.mark.parametrize("func", ["add", "sub"])
    @pytest.mark.parametrize("frame", [False, True])
    @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")])
    def test_binops(self, func, other, frame):
        """``add``/``sub`` with a scalar or another pandas object keep the flag."""
        obj = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags(
            allows_duplicate_labels=False
        )
        if frame:
            obj = obj.to_frame()
        # Match the operand's dimensionality to the object under test.
        if isinstance(other, pd.Series) and frame:
            other = other.to_frame()
        op = operator.methodcaller(func, other)
        assert obj.flags.allows_duplicate_labels is False
        assert op(obj).flags.allows_duplicate_labels is False

    def test_preserve_getitem(self):
        """``[]`` and ``.loc`` selections of a frame keep the flag."""
        df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
        assert df[["A"]].flags.allows_duplicate_labels is False
        assert df["A"].flags.allows_duplicate_labels is False
        assert df.loc[0].flags.allows_duplicate_labels is False
        assert df.loc[[0]].flags.allows_duplicate_labels is False
        assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False

    def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write):
        """A column fetched after the frame's flag changes should see the new value.

        Marked as an expected failure when copy-on-write is not in use.
        """
        if not using_copy_on_write:
            request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior."))
        # NDFrame.__getitem__ will cache the first df['A']. May need to
        # invalidate that cache? Update the cached entries?
        df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
        assert df["A"].flags.allows_duplicate_labels is False
        df.flags.allows_duplicate_labels = True
        assert df["A"].flags.allows_duplicate_labels is True

    @pytest.mark.parametrize(
        "objs, kwargs",
        [
            # Series
            (
                [
                    pd.Series(1, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=["c", "d"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {},
            ),
            (
                [
                    pd.Series(1, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"ignore_index": True},
            ),
            (
                [
                    pd.Series(1, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"axis": 1},
            ),
            # Frame
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {},
            ),
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"ignore_index": True},
            ),
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"axis": 1},
            ),
            # Series / Frame
            (
                [
                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(
                        [1, 2],
                        index=["a", "b"],
                        name="B",
                    ).set_flags(
                        allows_duplicate_labels=False,
                    ),
                ],
                {"axis": 1},
            ),
        ],
    )
    def test_concat(self, objs, kwargs):
        """``pd.concat`` of flagged inputs yields a flagged result."""
        result = pd.concat(objs, **kwargs)
        assert result.flags.allows_duplicate_labels is False

    @pytest.mark.parametrize(
        "left, right, kwargs, expected",
        [
            # Case comments read: left allows, right allows, expected result.
            # false false false
            pytest.param(
                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
                    allows_duplicate_labels=False
                ),
                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
                    allows_duplicate_labels=False
                ),
                {"left_index": True, "right_index": True},
                False,
                marks=not_implemented,
            ),
            # false true false
            pytest.param(
                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
                    allows_duplicate_labels=False
                ),
                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
                {"left_index": True, "right_index": True},
                False,
                marks=not_implemented,
            ),
            # true true true
            (
                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
                {"left_index": True, "right_index": True},
                True,
            ),
        ],
    )
    def test_merge(self, left, right, kwargs, expected):
        """``pd.merge`` result carries the ``expected`` flag value."""
        result = pd.merge(left, right, **kwargs)
        assert result.flags.allows_duplicate_labels is expected

    @not_implemented
    def test_groupby(self):
        """A groupby aggregation should keep the flag (xfail)."""
        # XXX: This is under tested
        # TODO:
        #  - apply
        #  - transform
        #  - Should passing a grouper that disallows duplicates propagate?
        df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False)
        result = df.groupby([0, 0, 1]).agg("count")
        assert result.flags.allows_duplicate_labels is False

    @pytest.mark.parametrize("frame", [True, False])
    @not_implemented
    def test_window(self, frame):
        """Rolling, ewm and expanding means should keep the flag (xfail)."""
        # The flag is applied through ``set_flags``: the Series constructor has
        # no ``allows_duplicate_labels`` parameter, so passing it there raised
        # before any window method was exercised.
        obj = pd.Series(
            1,
            index=pd.date_range("2000", periods=12),
            name="A",
        ).set_flags(allows_duplicate_labels=False)
        if frame:
            obj = obj.to_frame()
        assert obj.rolling(3).mean().flags.allows_duplicate_labels is False
        assert obj.ewm(3).mean().flags.allows_duplicate_labels is False
        assert obj.expanding(3).mean().flags.allows_duplicate_labels is False
# ----------------------------------------------------------------------------
# Raises
class TestRaises:
    """Operations that must raise when duplicates meet a disallowing flag.

    Tests carrying the ``not_implemented`` marker are expected failures.
    """

    @pytest.mark.parametrize(
        "cls, axes",
        [
            (pd.Series, {"index": ["a", "a"], "dtype": float}),
            (pd.DataFrame, {"index": ["a", "a"]}),
            (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}),
            (pd.DataFrame, {"columns": ["b", "b"]}),
        ],
    )
    def test_set_flags_with_duplicates(self, cls, axes):
        """``set_flags(allows_duplicate_labels=False)`` raises on duplicated axes."""
        result = cls(**axes)
        assert result.flags.allows_duplicate_labels is True

        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            cls(**axes).set_flags(allows_duplicate_labels=False)

    @pytest.mark.parametrize(
        "data",
        [
            pd.Series(index=[0, 0], dtype=float),
            pd.DataFrame(index=[0, 0]),
            pd.DataFrame(columns=[0, 0]),
        ],
    )
    def test_setting_allows_duplicate_labels_raises(self, data):
        """Assigning ``False`` to the flag raises and leaves the flag ``True``."""
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            data.flags.allows_duplicate_labels = False

        # The failed assignment must not have changed the flag.
        assert data.flags.allows_duplicate_labels is True

    def test_series_raises(self):
        """Concatenating Series with overlapping labels raises if one is flagged."""
        a = pd.Series(0, index=["a", "b"])
        b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            pd.concat([a, b])

    @pytest.mark.parametrize(
        "getter, target",
        [
            (operator.itemgetter(["A", "A"]), None),
            # loc
            (operator.itemgetter(["a", "a"]), "loc"),
            (operator.itemgetter(("a", ["A", "A"])), "loc"),
            (operator.itemgetter((["a", "a"], "A")), "loc"),
            # iloc
            (operator.itemgetter([0, 0]), "iloc"),
            (operator.itemgetter((0, [0, 0])), "iloc"),
            (operator.itemgetter(([0, 0], 0)), "iloc"),
        ],
    )
    def test_getitem_raises(self, getter, target):
        """Selecting a label or position twice from a flagged frame raises."""
        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
            allows_duplicate_labels=False
        )
        # ``target`` names the indexer to use (df.loc / df.iloc); a falsy value
        # means the frame itself is indexed.
        target = getattr(df, target) if target else df

        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            getter(target)

    @pytest.mark.parametrize(
        "objs, kwargs",
        [
            (
                [
                    pd.Series(1, index=[0, 1], name="a").set_flags(
                        allows_duplicate_labels=False
                    ),
                    pd.Series(2, index=[0, 1], name="a").set_flags(
                        allows_duplicate_labels=False
                    ),
                ],
                {"axis": 1},
            )
        ],
    )
    def test_concat_raises(self, objs, kwargs):
        """Column-wise concat of two flagged Series sharing a name raises."""
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            pd.concat(objs, **kwargs)

    @not_implemented
    def test_merge_raises(self):
        """Index merge of a flagged frame with a duplicated index should raise (xfail)."""
        a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
            allows_duplicate_labels=False
        )
        b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"])
        msg = "Index has duplicates."
        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
            pd.merge(a, b, left_index=True, right_index=True)
@pytest.mark.parametrize(
    "idx",
    [
        pd.Index([1, 1]),
        pd.Index(["a", "a"]),
        pd.Index([1.1, 1.1]),
        pd.PeriodIndex([pd.Period("2000", "D")] * 2),
        pd.DatetimeIndex([pd.Timestamp("2000")] * 2),
        pd.TimedeltaIndex([pd.Timedelta("1D")] * 2),
        pd.CategoricalIndex(["a", "a"]),
        pd.IntervalIndex([pd.Interval(0, 1)] * 2),
        pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]),
    ],
    ids=lambda x: type(x).__name__,
)
def test_raises_basic(idx):
    """Disallowing duplicates raises for each index type holding a repeated label.

    Checked on a Series index, a DataFrame index and DataFrame columns.
    """
    msg = "Index has duplicates."
    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
        pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False)

    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
        pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False)

    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
        pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False)
def test_format_duplicate_labels_message():
    """``Index._format_duplicate_message`` lists positions of each repeated label."""
    idx = pd.Index(["a", "b", "a", "b", "c"])
    result = idx._format_duplicate_message()
    # "c" occurs once, so only "a" and "b" are expected in the report.
    expected = pd.DataFrame(
        {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label")
    )
    tm.assert_frame_equal(result, expected)
def test_format_duplicate_labels_message_multi():
    """``_format_duplicate_message`` on a MultiIndex reports repeated tuples."""
    idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]])
    result = idx._format_duplicate_message()
    # ("A", "c") occurs once, so only the "a" and "b" tuples are expected.
    expected = pd.DataFrame(
        {"positions": [[0, 2], [1, 3]]},
        index=pd.MultiIndex.from_product([["A"], ["a", "b"]]),
    )
    tm.assert_frame_equal(result, expected)
def test_dataframe_insert_raises():
    """``insert(..., allow_duplicates=True)`` on a flagged frame raises ValueError."""
    df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
    msg = "Cannot specify"
    with pytest.raises(ValueError, match=msg):
        df.insert(0, "A", [3, 4], allow_duplicates=True)
@pytest.mark.parametrize(
    "method, frame_only",
    [
        (operator.methodcaller("set_index", "A", inplace=True), True),
        (operator.methodcaller("reset_index", inplace=True), True),
        (operator.methodcaller("rename", lambda x: x, inplace=True), False),
    ],
)
def test_inplace_raises(method, frame_only):
    """``inplace=True`` methods raise ValueError on objects that disallow duplicates.

    ``frame_only`` marks methods that are checked on the DataFrame alone; the
    others are also checked on a Series.
    """
    df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags(
        allows_duplicate_labels=False
    )
    s = df["A"]
    s.flags.allows_duplicate_labels = False
    msg = "Cannot specify"

    with pytest.raises(ValueError, match=msg):
        method(df)
    if not frame_only:
        with pytest.raises(ValueError, match=msg):
            method(s)
def test_pickle():
    """A pickle round trip returns an equal object, flags included."""
    a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False)
    b = tm.round_trip_pickle(a)
    # check_flags is spelled out because flag preservation is the point here.
    tm.assert_series_equal(a, b, check_flags=True)

    a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False)
    b = tm.round_trip_pickle(a)
    tm.assert_frame_equal(a, b, check_flags=True)