268 lines
9.0 KiB
Python
268 lines
9.0 KiB
Python
import numpy as np
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
Categorical,
|
|
Series,
|
|
)
|
|
import pandas._testing as tm
|
|
|
|
|
|
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, False, False, True, True, False])),
        ("last", Series([False, True, True, False, False, False, False])),
        (False, Series([False, True, True, False, True, True, False])),
    ],
)
def test_drop_duplicates(any_numpy_dtype, keep, expected):
    """Check duplicated/drop_duplicates on a numeric-like Series for each ``keep``.

    ``expected`` is the boolean mask ``duplicated(keep=keep)`` must return;
    the de-duplicated result is derived from it as ``ser[~expected]``.
    """
    ser = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype))

    # A bool Series collapses the sample values, so the masks above do not apply.
    if ser.dtype == "bool":
        pytest.skip("tested separately in test_drop_duplicates_bool")

    tm.assert_series_equal(ser.duplicated(keep=keep), expected)

    deduplicated = ser[~expected]
    tm.assert_series_equal(ser.drop_duplicates(keep=keep), deduplicated)

    # The in-place variant must return None and modify only the copy it ran on.
    inplace_target = ser.copy()
    outcome = inplace_target.drop_duplicates(keep=keep, inplace=True)
    assert outcome is None
    tm.assert_series_equal(inplace_target, deduplicated)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", Series([False, False, True, True])),
        ("last", Series([True, True, False, False])),
        (False, Series([True, True, True, True])),
    ],
)
def test_drop_duplicates_bool(keep, expected):
    """Check duplicated/drop_duplicates on a bool Series for each ``keep``.

    ``expected`` is the boolean mask ``duplicated(keep=keep)`` must return.
    """
    ser = Series([True, False, True, False])

    tm.assert_series_equal(ser.duplicated(keep=keep), expected)

    deduplicated = ser[~expected]
    tm.assert_series_equal(ser.drop_duplicates(keep=keep), deduplicated)

    # The in-place variant must return None and modify only the copy it ran on.
    inplace_target = ser.copy()
    outcome = inplace_target.drop_duplicates(keep=keep, inplace=True)
    assert outcome is None
    tm.assert_series_equal(inplace_target, deduplicated)
|
|
|
|
|
|
@pytest.mark.parametrize("values", [[], list(range(5))])
def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
    """Check that a Series without duplicates passes through drop_duplicates intact."""
    ser = Series(values, dtype=np.dtype(any_numpy_dtype))
    all_unique_mask = Series([False] * len(ser), dtype="bool")

    if ser.dtype == "bool":
        # 0 -> False and 1 -> True; any further value would be a duplicate,
        # so only the first two entries are kept for the bool case.
        ser, all_unique_mask = ser[:2], all_unique_mask[:2]

    tm.assert_series_equal(ser.duplicated(keep=keep), all_unique_mask)

    deduplicated = ser.drop_duplicates(keep=keep)
    tm.assert_series_equal(deduplicated, ser)

    # Even with nothing to drop, the result must be a new object rather than
    # the very Series the method was called on.
    assert deduplicated is not ser
|
|
|
|
|
|
class TestSeriesDropDuplicates:
    """Tests for ``Series.duplicated`` / ``Series.drop_duplicates``.

    Covers categorical data (with and without unused categories, bool
    categories, missing values), the ``ignore_index`` option and
    pyarrow-backed dtypes.
    """

    @pytest.fixture(
        params=["int_", "uint", "float64", "str_", "timedelta64[h]", "datetime64[D]"]
    )
    def dtype(self, request):
        """Name of the numpy dtype used to build the categorical fixtures."""
        return request.param

    @pytest.fixture
    def cat_series_unused_category(self, dtype, ordered):
        """Categorical Series whose categories 4 and 5 never appear in the values.

        ``ordered`` is not defined in this file; it is presumably a shared
        fixture from the test suite's conftest.
        """
        # Test case 1
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
        cat = Categorical(input1, categories=cat_array, ordered=ordered)
        tc1 = Series(cat)
        return tc1

    def test_drop_duplicates_categorical_non_bool(self, cat_series_unused_category):
        """Default ``keep``: only the second occurrence of 3 is a duplicate."""
        tc1 = cat_series_unused_category

        expected = Series([False, False, False, True])

        result = tc1.duplicated()
        tm.assert_series_equal(result, expected)

        result = tc1.drop_duplicates()
        tm.assert_series_equal(result, tc1[~expected])

        # The in-place variant must return None and modify only the copy.
        sc = tc1.copy()
        return_value = sc.drop_duplicates(inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc1[~expected])

    def test_drop_duplicates_categorical_non_bool_keeplast(
        self, cat_series_unused_category
    ):
        """``keep="last"``: the first occurrence of 3 is the one flagged."""
        tc1 = cat_series_unused_category

        expected = Series([False, False, True, False])

        result = tc1.duplicated(keep="last")
        tm.assert_series_equal(result, expected)

        result = tc1.drop_duplicates(keep="last")
        tm.assert_series_equal(result, tc1[~expected])

        # The in-place variant must return None and modify only the copy.
        sc = tc1.copy()
        return_value = sc.drop_duplicates(keep="last", inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc1[~expected])

    def test_drop_duplicates_categorical_non_bool_keepfalse(
        self, cat_series_unused_category
    ):
        """``keep=False``: both occurrences of 3 are flagged."""
        tc1 = cat_series_unused_category

        expected = Series([False, False, True, True])

        result = tc1.duplicated(keep=False)
        tm.assert_series_equal(result, expected)

        result = tc1.drop_duplicates(keep=False)
        tm.assert_series_equal(result, tc1[~expected])

        # The in-place variant must return None and modify only the copy.
        sc = tc1.copy()
        return_value = sc.drop_duplicates(keep=False, inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc1[~expected])

    @pytest.fixture
    def cat_series(self, dtype, ordered):
        """Categorical Series in which every one of the five categories is used.

        ``ordered`` is not defined in this file; it is presumably a shared
        fixture from the test suite's conftest.
        """
        # no unused categories, unlike cat_series_unused_category
        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
        cat = Categorical(input2, categories=cat_array, ordered=ordered)
        tc2 = Series(cat)
        return tc2

    def test_drop_duplicates_categorical_non_bool2(self, cat_series):
        """Default ``keep``: the repeated 3 and 2 (positions 4, 5) are flagged."""
        tc2 = cat_series

        expected = Series([False, False, False, False, True, True, False])

        result = tc2.duplicated()
        tm.assert_series_equal(result, expected)

        result = tc2.drop_duplicates()
        tm.assert_series_equal(result, tc2[~expected])

        # The in-place variant must return None and modify only the copy.
        sc = tc2.copy()
        return_value = sc.drop_duplicates(inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc2[~expected])

    def test_drop_duplicates_categorical_non_bool2_keeplast(self, cat_series):
        """``keep="last"``: the earlier 2 and 3 (positions 1, 2) are flagged."""
        tc2 = cat_series

        expected = Series([False, True, True, False, False, False, False])

        result = tc2.duplicated(keep="last")
        tm.assert_series_equal(result, expected)

        result = tc2.drop_duplicates(keep="last")
        tm.assert_series_equal(result, tc2[~expected])

        # The in-place variant must return None and modify only the copy.
        sc = tc2.copy()
        return_value = sc.drop_duplicates(keep="last", inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc2[~expected])

    def test_drop_duplicates_categorical_non_bool2_keepfalse(self, cat_series):
        """``keep=False``: every occurrence of the repeated 2 and 3 is flagged."""
        tc2 = cat_series

        expected = Series([False, True, True, False, True, True, False])

        result = tc2.duplicated(keep=False)
        tm.assert_series_equal(result, expected)

        result = tc2.drop_duplicates(keep=False)
        tm.assert_series_equal(result, tc2[~expected])

        # The in-place variant must return None and modify only the copy.
        sc = tc2.copy()
        return_value = sc.drop_duplicates(keep=False, inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc2[~expected])

    def test_drop_duplicates_categorical_bool(self, ordered):
        """Run all three ``keep`` modes against a categorical of bool values."""
        tc = Series(
            Categorical(
                [True, False, True, False], categories=[True, False], ordered=ordered
            )
        )

        # keep="first" (the default)
        expected = Series([False, False, True, True])
        tm.assert_series_equal(tc.duplicated(), expected)
        tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
        sc = tc.copy()
        return_value = sc.drop_duplicates(inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc[~expected])

        # keep="last"
        expected = Series([True, True, False, False])
        tm.assert_series_equal(tc.duplicated(keep="last"), expected)
        tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected])
        sc = tc.copy()
        return_value = sc.drop_duplicates(keep="last", inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc[~expected])

        # keep=False: every value is repeated, so nothing survives
        expected = Series([True, True, True, True])
        tm.assert_series_equal(tc.duplicated(keep=False), expected)
        tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
        sc = tc.copy()
        return_value = sc.drop_duplicates(keep=False, inplace=True)
        assert return_value is None
        tm.assert_series_equal(sc, tc[~expected])

    def test_drop_duplicates_categorical_bool_na(self, nulls_fixture):
        """A single missing value in a bool categorical is kept as its own entry."""
        # GH#44351
        ser = Series(
            Categorical(
                [True, False, True, False, nulls_fixture],
                categories=[True, False],
                ordered=True,
            )
        )
        result = ser.drop_duplicates()
        expected = Series(
            Categorical([True, False, np.nan], categories=[True, False], ordered=True),
            index=[0, 1, 4],
        )
        tm.assert_series_equal(result, expected)

    def test_drop_duplicates_ignore_index(self):
        """``ignore_index=True`` yields the default 0..n-1 index on the result."""
        # GH#48304
        ser = Series([1, 2, 2, 3])
        result = ser.drop_duplicates(ignore_index=True)
        expected = Series([1, 2, 3])
        tm.assert_series_equal(result, expected)

    def test_duplicated_arrow_dtype(self):
        """drop_duplicates on ``bool[pyarrow]`` keeps one True, one False, one null."""
        # NOTE(review): despite the test name, only drop_duplicates is exercised
        # here; duplicated() itself is not called.
        pytest.importorskip("pyarrow")
        ser = Series([True, False, None, False], dtype="bool[pyarrow]")
        result = ser.drop_duplicates()
        expected = Series([True, False, None], dtype="bool[pyarrow]")
        tm.assert_series_equal(result, expected)

    def test_drop_duplicates_arrow_strings(self):
        """drop_duplicates on a pyarrow string Series keeps the Arrow dtype."""
        # GH#54904
        pa = pytest.importorskip("pyarrow")
        ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string()))
        result = ser.drop_duplicates()
        expected = Series(["a"], dtype=pd.ArrowDtype(pa.string()))
        tm.assert_series_equal(result, expected)
|