"""Tests dealing with the NDFrame.allows_duplicates.""" import operator import numpy as np import pytest import pandas as pd import pandas._testing as tm not_implemented = pytest.mark.xfail(reason="Not implemented.") # ---------------------------------------------------------------------------- # Preservation class TestPreserves: @pytest.mark.parametrize( "cls, data", [ (pd.Series, np.array([])), (pd.Series, [1, 2]), (pd.DataFrame, {}), (pd.DataFrame, {"A": [1, 2]}), ], ) def test_construction_ok(self, cls, data): result = cls(data) assert result.flags.allows_duplicate_labels is True result = cls(data).set_flags(allows_duplicate_labels=False) assert result.flags.allows_duplicate_labels is False @pytest.mark.parametrize( "func", [ operator.itemgetter(["a"]), operator.methodcaller("add", 1), operator.methodcaller("rename", str.upper), operator.methodcaller("rename", "name"), operator.methodcaller("abs"), np.abs, ], ) def test_preserved_series(self, func): s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) assert func(s).flags.allows_duplicate_labels is False @pytest.mark.parametrize( "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] ) # TODO: frame @not_implemented def test_align(self, other): s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) a, b = s.align(other) assert a.flags.allows_duplicate_labels is False assert b.flags.allows_duplicate_labels is False def test_preserved_frame(self): df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ) assert df.loc[["a"]].flags.allows_duplicate_labels is False assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False def test_to_frame(self): ser = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) assert ser.to_frame().flags.allows_duplicate_labels is False @pytest.mark.parametrize("func", ["add", "sub"]) @pytest.mark.parametrize("frame", [False, True]) @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")]) def test_binops(self, func, other, frame): df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags( allows_duplicate_labels=False ) if frame: df = df.to_frame() if isinstance(other, pd.Series) and frame: other = other.to_frame() func = operator.methodcaller(func, other) assert df.flags.allows_duplicate_labels is False assert func(df).flags.allows_duplicate_labels is False def test_preserve_getitem(self): df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) assert df[["A"]].flags.allows_duplicate_labels is False assert df["A"].flags.allows_duplicate_labels is False assert df.loc[0].flags.allows_duplicate_labels is False assert df.loc[[0]].flags.allows_duplicate_labels is False assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write): if not using_copy_on_write: request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior.")) # NDFrame.__getitem__ will cache the first df['A']. May need to # invalidate that cache? Update the cached entries? df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) assert df["A"].flags.allows_duplicate_labels is False df.flags.allows_duplicate_labels = True assert df["A"].flags.allows_duplicate_labels is True @pytest.mark.parametrize( "objs, kwargs", [ # Series ( [ pd.Series(1, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.Series(2, index=["c", "d"]).set_flags( allows_duplicate_labels=False ), ], {}, ), ( [ pd.Series(1, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.Series(2, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), ], {"ignore_index": True}, ), ( [ pd.Series(1, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.Series(2, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), ], {"axis": 1}, ), # Frame ( [ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags( allows_duplicate_labels=False ), ], {}, ), ( [ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), ], {"ignore_index": True}, ), ( [ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), ], {"axis": 1}, ), # Series / Frame ( [ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.Series( [1, 2], index=["a", "b"], name="B", ).set_flags( allows_duplicate_labels=False, ), ], {"axis": 1}, ), ], ) def test_concat(self, objs, kwargs): result = pd.concat(objs, **kwargs) assert result.flags.allows_duplicate_labels is False @pytest.mark.parametrize( "left, right, kwargs, expected", [ # false false false pytest.param( pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags( allows_duplicate_labels=False ), {"left_index": True, "right_index": True}, False, marks=not_implemented, ), # false true false pytest.param( pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ), pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), {"left_index": True, "right_index": True}, False, marks=not_implemented, ), # true true true ( pd.DataFrame({"A": [0, 1]}, index=["a", "b"]), pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), {"left_index": True, "right_index": True}, True, ), ], ) def test_merge(self, left, right, kwargs, expected): result = pd.merge(left, right, **kwargs) assert result.flags.allows_duplicate_labels is expected @not_implemented def test_groupby(self): # XXX: This is under tested # TODO: # - apply # - transform # - Should passing a grouper that disallows duplicates propagate? df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False) result = df.groupby([0, 0, 1]).agg("count") assert result.flags.allows_duplicate_labels is False @pytest.mark.parametrize("frame", [True, False]) @not_implemented def test_window(self, frame): df = pd.Series( 1, index=pd.date_range("2000", periods=12), name="A", allows_duplicate_labels=False, ) if frame: df = df.to_frame() assert df.rolling(3).mean().flags.allows_duplicate_labels is False assert df.ewm(3).mean().flags.allows_duplicate_labels is False assert df.expanding(3).mean().flags.allows_duplicate_labels is False # ---------------------------------------------------------------------------- # Raises class TestRaises: @pytest.mark.parametrize( "cls, axes", [ (pd.Series, {"index": ["a", "a"], "dtype": float}), (pd.DataFrame, {"index": ["a", "a"]}), (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}), (pd.DataFrame, {"columns": ["b", "b"]}), ], ) def test_set_flags_with_duplicates(self, cls, axes): result = cls(**axes) assert result.flags.allows_duplicate_labels is True msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): cls(**axes).set_flags(allows_duplicate_labels=False) @pytest.mark.parametrize( "data", [ pd.Series(index=[0, 0], dtype=float), pd.DataFrame(index=[0, 0]), pd.DataFrame(columns=[0, 0]), ], ) def test_setting_allows_duplicate_labels_raises(self, data): msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): data.flags.allows_duplicate_labels = False assert data.flags.allows_duplicate_labels is True def test_series_raises(self): a = pd.Series(0, index=["a", "b"]) b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.concat([a, b]) @pytest.mark.parametrize( "getter, target", [ (operator.itemgetter(["A", "A"]), None), # loc (operator.itemgetter(["a", "a"]), "loc"), pytest.param(operator.itemgetter(("a", ["A", "A"])), "loc"), (operator.itemgetter((["a", "a"], "A")), "loc"), # iloc (operator.itemgetter([0, 0]), "iloc"), pytest.param(operator.itemgetter((0, [0, 0])), "iloc"), pytest.param(operator.itemgetter(([0, 0], 0)), "iloc"), ], ) def test_getitem_raises(self, getter, target): df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( allows_duplicate_labels=False ) if target: # df, df.loc, or df.iloc target = getattr(df, target) else: target = df msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): getter(target) @pytest.mark.parametrize( "objs, kwargs", [ ( [ pd.Series(1, index=[0, 1], name="a").set_flags( allows_duplicate_labels=False ), pd.Series(2, index=[0, 1], name="a").set_flags( allows_duplicate_labels=False ), ], {"axis": 1}, ) ], ) def test_concat_raises(self, objs, kwargs): msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.concat(objs, **kwargs) @not_implemented def test_merge_raises(self): a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags( allows_duplicate_labels=False ) b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.merge(a, b, left_index=True, right_index=True) @pytest.mark.parametrize( "idx", [ pd.Index([1, 1]), pd.Index(["a", "a"]), pd.Index([1.1, 1.1]), pd.PeriodIndex([pd.Period("2000", "D")] * 2), pd.DatetimeIndex([pd.Timestamp("2000")] * 2), pd.TimedeltaIndex([pd.Timedelta("1D")] * 2), pd.CategoricalIndex(["a", "a"]), pd.IntervalIndex([pd.Interval(0, 1)] * 2), pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]), ], ids=lambda x: type(x).__name__, ) def test_raises_basic(idx): msg = "Index has duplicates." with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False) with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False) with pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False) def test_format_duplicate_labels_message(): idx = pd.Index(["a", "b", "a", "b", "c"]) result = idx._format_duplicate_message() expected = pd.DataFrame( {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label") ) tm.assert_frame_equal(result, expected) def test_format_duplicate_labels_message_multi(): idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]]) result = idx._format_duplicate_message() expected = pd.DataFrame( {"positions": [[0, 2], [1, 3]]}, index=pd.MultiIndex.from_product([["A"], ["a", "b"]]), ) tm.assert_frame_equal(result, expected) def test_dataframe_insert_raises(): df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) msg = "Cannot specify" with pytest.raises(ValueError, match=msg): df.insert(0, "A", [3, 4], allow_duplicates=True) @pytest.mark.parametrize( "method, frame_only", [ (operator.methodcaller("set_index", "A", inplace=True), True), (operator.methodcaller("reset_index", inplace=True), True), (operator.methodcaller("rename", lambda x: x, inplace=True), False), ], ) def test_inplace_raises(method, frame_only): df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags( allows_duplicate_labels=False ) s = df["A"] s.flags.allows_duplicate_labels = False msg = "Cannot specify" with pytest.raises(ValueError, match=msg): method(df) if not frame_only: with pytest.raises(ValueError, match=msg): method(s) def test_pickle(): a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False) b = tm.round_trip_pickle(a) tm.assert_series_equal(a, b) a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False) b = tm.round_trip_pickle(a) tm.assert_frame_equal(a, b)