projektAI/venv/Lib/site-packages/pandas/tests/frame/methods/test_drop.py
2021-06-06 22:13:05 +02:00

444 lines
16 KiB
Python

import re
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp
import pandas._testing as tm
@pytest.mark.parametrize(
    "msg,labels,level",
    [
        (r"labels \[4\] not found in level", 4, "a"),
        (r"labels \[7\] not found in level", 7, "b"),
    ],
)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    """Check that dropping a label missing from the given level raises.

    GH 8594. The two-level index has level "a" = [1, 2, 3] and level
    "b" = [4, 5, 6]; each parametrized label is absent from the level it is
    paired with, so both ``Series.drop`` and ``DataFrame.drop`` must raise
    ``KeyError`` with a message matching ``msg``.
    """
    two_level = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    values = [10, 20, 30]
    containers = (
        pd.Series(values, index=two_level),
        DataFrame(values, index=two_level),
    )

    # Same expectation for the 1-D and the 2-D container, Series first.
    for obj in containers:
        with pytest.raises(KeyError, match=msg):
            obj.drop(labels, level=level)
@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")])
def test_drop_errors_ignore(labels, level):
    """Check that ``errors="ignore"`` turns a missing-label drop into a no-op.

    GH 8594. Each parametrized label is absent from the level it is paired
    with, so the dropped Series/DataFrame must compare equal to the input.
    """
    two_level = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    ser = pd.Series([10, 20, 30], index=two_level)
    frame = DataFrame([10, 20, 30], index=two_level)

    # Nothing matches, so the result must equal the untouched original.
    tm.assert_series_equal(ser, ser.drop(labels, level=level, errors="ignore"))
    tm.assert_frame_equal(frame, frame.drop(labels, level=level, errors="ignore"))
def test_drop_with_non_unique_datetime_index_and_invalid_keys():
    """Check that invalid labels raise on a non-unique datetime index.

    GH 30399. Row labels "a" and "b" exist only as column names, so dropping
    them along the (default) index axis must raise ``KeyError``.
    """
    # Start from a frame whose datetime index is unique ...
    hourly = pd.date_range("2012", freq="H", periods=5)
    unique_frame = DataFrame(
        np.random.randn(5, 3), columns=["a", "b", "c"], index=hourly
    )

    # ... then repeat position 2 so the index contains a duplicate timestamp.
    duplicated_frame = unique_frame.iloc[[0, 2, 2, 3]].copy()

    with pytest.raises(KeyError, match="not found in axis"):
        duplicated_frame.drop(["a", "b"])  # labels are not present in the index
class TestDataFrameDrop:
def test_drop_names(self):
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
df.index.name, df.columns.name = "first", "second"
df_dropped_b = df.drop("b")
df_dropped_e = df.drop("e", axis=1)
df_inplace_b, df_inplace_e = df.copy(), df.copy()
return_value = df_inplace_b.drop("b", inplace=True)
assert return_value is None
return_value = df_inplace_e.drop("e", axis=1, inplace=True)
assert return_value is None
for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
assert obj.index.name == "first"
assert obj.columns.name == "second"
assert list(df.columns) == ["d", "e", "f"]
msg = r"\['g'\] not found in axis"
with pytest.raises(KeyError, match=msg):
df.drop(["g"])
with pytest.raises(KeyError, match=msg):
df.drop(["g"], 1)
# errors = 'ignore'
dropped = df.drop(["g"], errors="ignore")
expected = Index(["a", "b", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
dropped = df.drop(["b", "g"], errors="ignore")
expected = Index(["a", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
dropped = df.drop(["g"], axis=1, errors="ignore")
expected = Index(["d", "e", "f"], name="second")
tm.assert_index_equal(dropped.columns, expected)
dropped = df.drop(["d", "g"], axis=1, errors="ignore")
expected = Index(["e", "f"], name="second")
tm.assert_index_equal(dropped.columns, expected)
# GH 16398
dropped = df.drop([], errors="ignore")
expected = Index(["a", "b", "c"], name="first")
tm.assert_index_equal(dropped.index, expected)
def test_drop(self):
simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]})
tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]])
tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]])
tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
simple.drop(5)
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
simple.drop("C", 1)
with pytest.raises(KeyError, match=r"\[5\] not found in axis"):
simple.drop([1, 5])
with pytest.raises(KeyError, match=r"\['C'\] not found in axis"):
simple.drop(["A", "C"], 1)
# errors = 'ignore'
tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple)
tm.assert_frame_equal(
simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
)
tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple)
tm.assert_frame_equal(
simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
)
# non-unique - wheee!
nu_df = DataFrame(
list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"]
)
tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]])
tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"])
tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398
nu_df = nu_df.set_index(Index(["X", "Y", "X"]))
nu_df.columns = list("abc")
tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])
# inplace cache issue
# GH#5628
df = DataFrame(np.random.randn(10, 3), columns=list("abc"))
expected = df[~(df.b > 0)]
return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
assert return_value is None
tm.assert_frame_equal(df, expected)
def test_drop_multiindex_not_lexsorted(self):
# GH#11640
# define the lexsorted version
lexsorted_mi = MultiIndex.from_tuples(
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
)
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
assert lexsorted_df.columns.is_lexsorted()
# define the non-lexsorted version
not_lexsorted_df = DataFrame(
columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
)
not_lexsorted_df = not_lexsorted_df.pivot_table(
index="a", columns=["b", "c"], values="d"
)
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns.is_lexsorted()
# compare the results
tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
expected = lexsorted_df.drop("a", axis=1)
with tm.assert_produces_warning(PerformanceWarning):
result = not_lexsorted_df.drop("a", axis=1)
tm.assert_frame_equal(result, expected)
def test_drop_api_equivalence(self):
# equivalence of the labels/axis and index/columns API's (GH#12392)
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
res1 = df.drop("a")
res2 = df.drop(index="a")
tm.assert_frame_equal(res1, res2)
res1 = df.drop("d", 1)
res2 = df.drop(columns="d")
tm.assert_frame_equal(res1, res2)
res1 = df.drop(labels="e", axis=1)
res2 = df.drop(columns="e")
tm.assert_frame_equal(res1, res2)
res1 = df.drop(["a"], axis=0)
res2 = df.drop(index=["a"])
tm.assert_frame_equal(res1, res2)
res1 = df.drop(["a"], axis=0).drop(["d"], axis=1)
res2 = df.drop(index=["a"], columns=["d"])
tm.assert_frame_equal(res1, res2)
msg = "Cannot specify both 'labels' and 'index'/'columns'"
with pytest.raises(ValueError, match=msg):
df.drop(labels="a", index="b")
with pytest.raises(ValueError, match=msg):
df.drop(labels="a", columns="b")
msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
with pytest.raises(ValueError, match=msg):
df.drop(axis=1)
data = [[1, 2, 3], [1, 2, 3]]
@pytest.mark.parametrize(
"actual",
[
DataFrame(data=data, index=["a", "a"]),
DataFrame(data=data, index=["a", "b"]),
DataFrame(data=data, index=["a", "b"]).set_index([0, 1]),
DataFrame(data=data, index=["a", "a"]).set_index([0, 1]),
],
)
def test_raise_on_drop_duplicate_index(self, actual):
# GH#19186
level = 0 if isinstance(actual.index, MultiIndex) else None
msg = re.escape("\"['c'] not found in axis\"")
with pytest.raises(KeyError, match=msg):
actual.drop("c", level=level, axis=0)
with pytest.raises(KeyError, match=msg):
actual.T.drop("c", level=level, axis=1)
expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore")
tm.assert_frame_equal(expected_no_err, actual)
expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore")
tm.assert_frame_equal(expected_no_err.T, actual)
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]])
@pytest.mark.parametrize("drop_labels", [[], [1], [2]])
def test_drop_empty_list(self, index, drop_labels):
# GH#21494
expected_index = [i for i in index if i not in drop_labels]
frame = DataFrame(index=index).drop(drop_labels)
tm.assert_frame_equal(frame, DataFrame(index=expected_index))
@pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]])
@pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]])
def test_drop_non_empty_list(self, index, drop_labels):
# GH# 21494
with pytest.raises(KeyError, match="not found in axis"):
DataFrame(index=index).drop(drop_labels)
def test_mixed_depth_drop(self):
arrays = [
["a", "top", "top", "routine1", "routine1", "routine2"],
["", "OD", "OD", "result1", "result2", "result1"],
["", "wx", "wy", "", "", ""],
]
tuples = sorted(zip(*arrays))
index = MultiIndex.from_tuples(tuples)
df = DataFrame(np.random.randn(4, 6), columns=index)
result = df.drop("a", axis=1)
expected = df.drop([("a", "", "")], axis=1)
tm.assert_frame_equal(expected, result)
result = df.drop(["top"], axis=1)
expected = df.drop([("top", "OD", "wx")], axis=1)
expected = expected.drop([("top", "OD", "wy")], axis=1)
tm.assert_frame_equal(expected, result)
result = df.drop(("top", "OD", "wx"), axis=1)
expected = df.drop([("top", "OD", "wx")], axis=1)
tm.assert_frame_equal(expected, result)
expected = df.drop([("top", "OD", "wy")], axis=1)
expected = df.drop("top", axis=1)
result = df.drop("result1", level=1, axis=1)
expected = df.drop(
[("routine1", "result1", ""), ("routine2", "result1", "")], axis=1
)
tm.assert_frame_equal(expected, result)
def test_drop_multiindex_other_level_nan(self):
# GH#12754
df = (
DataFrame(
{
"A": ["one", "one", "two", "two"],
"B": [np.nan, 0.0, 1.0, 2.0],
"C": ["a", "b", "c", "c"],
"D": [1, 2, 3, 4],
}
)
.set_index(["A", "B", "C"])
.sort_index()
)
result = df.drop("c", level="C")
expected = DataFrame(
[2, 1],
columns=["D"],
index=MultiIndex.from_tuples(
[("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"]
),
)
tm.assert_frame_equal(result, expected)
def test_drop_nonunique(self):
df = DataFrame(
[
["x-a", "x", "a", 1.5],
["x-a", "x", "a", 1.2],
["z-c", "z", "c", 3.1],
["x-a", "x", "a", 4.1],
["x-b", "x", "b", 5.1],
["x-b", "x", "b", 4.1],
["x-b", "x", "b", 2.2],
["y-a", "y", "a", 1.2],
["z-b", "z", "b", 2.1],
],
columns=["var1", "var2", "var3", "var4"],
)
grp_size = df.groupby("var1").size()
drop_idx = grp_size.loc[grp_size == 1]
idf = df.set_index(["var1", "var2", "var3"])
# it works! GH#2101
result = idf.drop(drop_idx.index, level=0).reset_index()
expected = df[-df.var1.isin(drop_idx.index)]
result.index = expected.index
tm.assert_frame_equal(result, expected)
def test_drop_level(self):
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=["first", "second"],
)
frame = DataFrame(
np.random.randn(10, 3),
index=index,
columns=Index(["A", "B", "C"], name="exp"),
)
result = frame.drop(["bar", "qux"], level="first")
expected = frame.iloc[[0, 1, 2, 5, 6]]
tm.assert_frame_equal(result, expected)
result = frame.drop(["two"], level="second")
expected = frame.iloc[[0, 2, 3, 6, 7, 9]]
tm.assert_frame_equal(result, expected)
result = frame.T.drop(["bar", "qux"], axis=1, level="first")
expected = frame.iloc[[0, 1, 2, 5, 6]].T
tm.assert_frame_equal(result, expected)
result = frame.T.drop(["two"], axis=1, level="second")
expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T
tm.assert_frame_equal(result, expected)
def test_drop_level_nonunique_datetime(self):
# GH#12701
idx = Index([2, 3, 4, 4, 5], name="id")
idxdt = pd.to_datetime(
[
"201603231400",
"201603231500",
"201603231600",
"201603231600",
"201603231700",
]
)
df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx)
df["tstamp"] = idxdt
df = df.set_index("tstamp", append=True)
ts = Timestamp("201603231600")
assert df.index.is_unique is False
result = df.drop(ts, level="tstamp")
expected = df.loc[idx != 4]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("box", [Series, DataFrame])
def test_drop_tz_aware_timestamp_across_dst(self, box):
# GH#21761
start = Timestamp("2017-10-29", tz="Europe/Berlin")
end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin")
index = pd.date_range(start, end, freq="15min")
data = box(data=[1] * len(index), index=index)
result = data.drop(start)
expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin")
expected_idx = pd.date_range(expected_start, end, freq="15min")
expected = box(data=[1] * len(expected_idx), index=expected_idx)
tm.assert_equal(result, expected)
def test_drop_preserve_names(self):
index = MultiIndex.from_arrays(
[[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"]
)
df = DataFrame(np.random.randn(6, 3), index=index)
result = df.drop([(0, 2)])
assert result.index.names == ("one", "two")
@pytest.mark.parametrize(
"operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"]
)
@pytest.mark.parametrize("inplace", [False, True])
def test_inplace_drop_and_operation(self, operation, inplace):
# GH#30484
df = DataFrame({"x": range(5)})
expected = df.copy()
df["y"] = range(5)
y = df["y"]
with tm.assert_produces_warning(None):
if inplace:
df.drop("y", axis=1, inplace=inplace)
else:
df = df.drop("y", axis=1, inplace=inplace)
# Perform operation and check result
getattr(y, operation)(1)
tm.assert_frame_equal(df, expected)