699 lines
25 KiB
Python
699 lines
25 KiB
Python
|
"""
|
||
|
See also: test_reindex.py:TestReindexSetIndex
|
||
|
"""
|
||
|
|
||
|
from datetime import (
|
||
|
datetime,
|
||
|
timedelta,
|
||
|
)
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
DataFrame,
|
||
|
DatetimeIndex,
|
||
|
Index,
|
||
|
MultiIndex,
|
||
|
Series,
|
||
|
date_range,
|
||
|
period_range,
|
||
|
to_datetime,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
class TestSetIndex:
|
||
|
def test_set_index_multiindex(self):
|
||
|
# segfault in GH#3308
|
||
|
d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
|
||
|
df = DataFrame(d)
|
||
|
tuples = [(0, 1), (0, 2), (1, 2)]
|
||
|
df["tuples"] = tuples
|
||
|
|
||
|
index = MultiIndex.from_tuples(df["tuples"])
|
||
|
# it works!
|
||
|
df.set_index(index)
|
||
|
|
||
|
def test_set_index_empty_column(self):
|
||
|
# GH#1971
|
||
|
df = DataFrame(
|
||
|
[
|
||
|
{"a": 1, "p": 0},
|
||
|
{"a": 2, "m": 10},
|
||
|
{"a": 3, "m": 11, "p": 20},
|
||
|
{"a": 4, "m": 12, "p": 21},
|
||
|
],
|
||
|
columns=["a", "m", "p", "x"],
|
||
|
)
|
||
|
|
||
|
result = df.set_index(["a", "x"])
|
||
|
|
||
|
expected = df[["m", "p"]]
|
||
|
expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_set_index_empty_dataframe(self):
|
||
|
# GH#38419
|
||
|
df1 = DataFrame(
|
||
|
{"a": Series(dtype="datetime64[ns]"), "b": Series(dtype="int64"), "c": []}
|
||
|
)
|
||
|
|
||
|
df2 = df1.set_index(["a", "b"])
|
||
|
result = df2.index.to_frame().dtypes
|
||
|
expected = df1[["a", "b"]].dtypes
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
def test_set_index_multiindexcolumns(self):
|
||
|
columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)])
|
||
|
df = DataFrame(np.random.randn(3, 3), columns=columns)
|
||
|
|
||
|
result = df.set_index(df.columns[0])
|
||
|
|
||
|
expected = df.iloc[:, 1:]
|
||
|
expected.index = df.iloc[:, 0].values
|
||
|
expected.index.names = [df.columns[0]]
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_set_index_timezone(self):
|
||
|
# GH#12358
|
||
|
# tz-aware Series should retain the tz
|
||
|
idx = DatetimeIndex(["2014-01-01 10:10:10"], tz="UTC").tz_convert("Europe/Rome")
|
||
|
df = DataFrame({"A": idx})
|
||
|
assert df.set_index(idx).index[0].hour == 11
|
||
|
assert DatetimeIndex(Series(df.A))[0].hour == 11
|
||
|
assert df.set_index(df.A).index[0].hour == 11
|
||
|
|
||
|
def test_set_index_cast_datetimeindex(self):
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)],
|
||
|
"B": np.random.randn(1000),
|
||
|
}
|
||
|
)
|
||
|
|
||
|
idf = df.set_index("A")
|
||
|
assert isinstance(idf.index, DatetimeIndex)
|
||
|
|
||
|
def test_set_index_dst(self):
|
||
|
di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific")
|
||
|
|
||
|
df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index()
|
||
|
# single level
|
||
|
res = df.set_index("index")
|
||
|
exp = DataFrame(
|
||
|
data={"a": [0, 1, 2], "b": [3, 4, 5]},
|
||
|
index=Index(di, name="index"),
|
||
|
)
|
||
|
exp.index = exp.index._with_freq(None)
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
|
||
|
# GH#12920
|
||
|
res = df.set_index(["index", "a"])
|
||
|
exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"])
|
||
|
exp = DataFrame({"b": [3, 4, 5]}, index=exp_index)
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
|
||
|
def test_set_index(self, float_string_frame):
|
||
|
df = float_string_frame
|
||
|
idx = Index(np.arange(len(df))[::-1])
|
||
|
|
||
|
df = df.set_index(idx)
|
||
|
tm.assert_index_equal(df.index, idx)
|
||
|
with pytest.raises(ValueError, match="Length mismatch"):
|
||
|
df.set_index(idx[::2])
|
||
|
|
||
|
def test_set_index_names(self):
|
||
|
df = tm.makeDataFrame()
|
||
|
df.index.name = "name"
|
||
|
|
||
|
assert df.set_index(df.index).index.names == ["name"]
|
||
|
|
||
|
mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"])
|
||
|
mi2 = MultiIndex.from_arrays(
|
||
|
df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"]
|
||
|
)
|
||
|
|
||
|
df = df.set_index(["A", "B"])
|
||
|
|
||
|
assert df.set_index(df.index).index.names == ["A", "B"]
|
||
|
|
||
|
# Check that set_index isn't converting a MultiIndex into an Index
|
||
|
assert isinstance(df.set_index(df.index).index, MultiIndex)
|
||
|
|
||
|
# Check actual equality
|
||
|
tm.assert_index_equal(df.set_index(df.index).index, mi)
|
||
|
|
||
|
idx2 = df.index.rename(["C", "D"])
|
||
|
|
||
|
# Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
|
||
|
# than a pair of tuples
|
||
|
assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)
|
||
|
|
||
|
# Check equality
|
||
|
tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
|
||
|
|
||
|
# A has duplicate values, C does not
|
||
|
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
|
||
|
@pytest.mark.parametrize("inplace", [True, False])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):
|
||
|
df = frame_of_index_cols
|
||
|
|
||
|
if isinstance(keys, list):
|
||
|
idx = MultiIndex.from_arrays([df[x] for x in keys], names=keys)
|
||
|
else:
|
||
|
idx = Index(df[keys], name=keys)
|
||
|
expected = df.drop(keys, axis=1) if drop else df
|
||
|
expected.index = idx
|
||
|
|
||
|
if inplace:
|
||
|
result = df.copy()
|
||
|
return_value = result.set_index(keys, drop=drop, inplace=True)
|
||
|
assert return_value is None
|
||
|
else:
|
||
|
result = df.set_index(keys, drop=drop)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# A has duplicate values, C does not
|
||
|
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_append(self, frame_of_index_cols, drop, keys):
|
||
|
df = frame_of_index_cols
|
||
|
|
||
|
keys = keys if isinstance(keys, list) else [keys]
|
||
|
idx = MultiIndex.from_arrays(
|
||
|
[df.index] + [df[x] for x in keys], names=[None] + keys
|
||
|
)
|
||
|
expected = df.drop(keys, axis=1) if drop else df.copy()
|
||
|
expected.index = idx
|
||
|
|
||
|
result = df.set_index(keys, drop=drop, append=True)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# A has duplicate values, C does not
|
||
|
@pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys):
|
||
|
# append to existing multiindex
|
||
|
df = frame_of_index_cols.set_index(["D"], drop=drop, append=True)
|
||
|
|
||
|
keys = keys if isinstance(keys, list) else [keys]
|
||
|
expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True)
|
||
|
|
||
|
result = df.set_index(keys, drop=drop, append=True)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_set_index_after_mutation(self):
|
||
|
# GH#1590
|
||
|
df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]})
|
||
|
expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key"))
|
||
|
|
||
|
df2 = df.loc[df.index.map(lambda indx: indx >= 1)]
|
||
|
result = df2.set_index("key")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# MultiIndex constructor does not work directly on Series -> lambda
|
||
|
# Add list-of-list constructor because list is ambiguous -> lambda
|
||
|
# also test index name if append=True (name is duplicate here for B)
|
||
|
@pytest.mark.parametrize(
|
||
|
"box",
|
||
|
[
|
||
|
Series,
|
||
|
Index,
|
||
|
np.array,
|
||
|
list,
|
||
|
lambda x: [list(x)],
|
||
|
lambda x: MultiIndex.from_arrays([x]),
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)]
|
||
|
)
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_pass_single_array(
|
||
|
self, frame_of_index_cols, drop, append, index_name, box
|
||
|
):
|
||
|
df = frame_of_index_cols
|
||
|
df.index.name = index_name
|
||
|
|
||
|
key = box(df["B"])
|
||
|
if box == list:
|
||
|
# list of strings gets interpreted as list of keys
|
||
|
msg = "['one', 'two', 'three', 'one', 'two']"
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
df.set_index(key, drop=drop, append=append)
|
||
|
else:
|
||
|
# np.array/list-of-list "forget" the name of B
|
||
|
name_mi = getattr(key, "names", None)
|
||
|
name = [getattr(key, "name", None)] if name_mi is None else name_mi
|
||
|
|
||
|
result = df.set_index(key, drop=drop, append=append)
|
||
|
|
||
|
# only valid column keys are dropped
|
||
|
# since B is always passed as array above, nothing is dropped
|
||
|
expected = df.set_index(["B"], drop=False, append=append)
|
||
|
expected.index.names = [index_name] + name if append else name
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# MultiIndex constructor does not work directly on Series -> lambda
|
||
|
# also test index name if append=True (name is duplicate here for A & B)
|
||
|
@pytest.mark.parametrize(
|
||
|
"box", [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"append, index_name",
|
||
|
[(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)],
|
||
|
)
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_pass_arrays(
|
||
|
self, frame_of_index_cols, drop, append, index_name, box
|
||
|
):
|
||
|
df = frame_of_index_cols
|
||
|
df.index.name = index_name
|
||
|
|
||
|
keys = ["A", box(df["B"])]
|
||
|
# np.array/list "forget" the name of B
|
||
|
names = ["A", None if box in [np.array, list, tuple, iter] else "B"]
|
||
|
|
||
|
result = df.set_index(keys, drop=drop, append=append)
|
||
|
|
||
|
# only valid column keys are dropped
|
||
|
# since B is always passed as array above, only A is dropped, if at all
|
||
|
expected = df.set_index(["A", "B"], drop=False, append=append)
|
||
|
expected = expected.drop("A", axis=1) if drop else expected
|
||
|
expected.index.names = [index_name] + names if append else names
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# MultiIndex constructor does not work directly on Series -> lambda
|
||
|
# We also emulate a "constructor" for the label -> lambda
|
||
|
# also test index name if append=True (name is duplicate here for A)
|
||
|
@pytest.mark.parametrize(
|
||
|
"box2",
|
||
|
[
|
||
|
Series,
|
||
|
Index,
|
||
|
np.array,
|
||
|
list,
|
||
|
iter,
|
||
|
lambda x: MultiIndex.from_arrays([x]),
|
||
|
lambda x: x.name,
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"box1",
|
||
|
[
|
||
|
Series,
|
||
|
Index,
|
||
|
np.array,
|
||
|
list,
|
||
|
iter,
|
||
|
lambda x: MultiIndex.from_arrays([x]),
|
||
|
lambda x: x.name,
|
||
|
],
|
||
|
)
|
||
|
@pytest.mark.parametrize(
|
||
|
"append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)]
|
||
|
)
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_pass_arrays_duplicate(
|
||
|
self, frame_of_index_cols, drop, append, index_name, box1, box2
|
||
|
):
|
||
|
df = frame_of_index_cols
|
||
|
df.index.name = index_name
|
||
|
|
||
|
keys = [box1(df["A"]), box2(df["A"])]
|
||
|
result = df.set_index(keys, drop=drop, append=append)
|
||
|
|
||
|
# if either box is iter, it has been consumed; re-read
|
||
|
keys = [box1(df["A"]), box2(df["A"])]
|
||
|
|
||
|
# need to adapt first drop for case that both keys are 'A' --
|
||
|
# cannot drop the same column twice;
|
||
|
# plain == would give ambiguous Boolean error for containers
|
||
|
first_drop = (
|
||
|
False
|
||
|
if (
|
||
|
isinstance(keys[0], str)
|
||
|
and keys[0] == "A"
|
||
|
and isinstance(keys[1], str)
|
||
|
and keys[1] == "A"
|
||
|
)
|
||
|
else drop
|
||
|
)
|
||
|
# to test against already-tested behaviour, we add sequentially,
|
||
|
# hence second append always True; must wrap keys in list, otherwise
|
||
|
# box = list would be interpreted as keys
|
||
|
expected = df.set_index([keys[0]], drop=first_drop, append=append)
|
||
|
expected = expected.set_index([keys[1]], drop=drop, append=True)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize("append", [True, False])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append):
|
||
|
df = frame_of_index_cols
|
||
|
keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"])
|
||
|
|
||
|
result = df.set_index(keys, drop=drop, append=append)
|
||
|
|
||
|
# setting with a MultiIndex will never drop columns
|
||
|
expected = df.set_index(["A", "B"], drop=False, append=append)
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_construction_with_categorical_index(self):
|
||
|
ci = tm.makeCategoricalIndex(10)
|
||
|
ci.name = "B"
|
||
|
|
||
|
# with Categorical
|
||
|
df = DataFrame({"A": np.random.randn(10), "B": ci.values})
|
||
|
idf = df.set_index("B")
|
||
|
tm.assert_index_equal(idf.index, ci)
|
||
|
|
||
|
# from a CategoricalIndex
|
||
|
df = DataFrame({"A": np.random.randn(10), "B": ci})
|
||
|
idf = df.set_index("B")
|
||
|
tm.assert_index_equal(idf.index, ci)
|
||
|
|
||
|
# round-trip
|
||
|
idf = idf.reset_index().set_index("B")
|
||
|
tm.assert_index_equal(idf.index, ci)
|
||
|
|
||
|
def test_set_index_preserve_categorical_dtype(self):
|
||
|
# GH#13743, GH#13854
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"A": [1, 2, 1, 1, 2],
|
||
|
"B": [10, 16, 22, 28, 34],
|
||
|
"C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
|
||
|
"C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
|
||
|
}
|
||
|
)
|
||
|
for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]:
|
||
|
result = df.set_index(cols).reset_index()
|
||
|
result = result.reindex(columns=df.columns)
|
||
|
tm.assert_frame_equal(result, df)
|
||
|
|
||
|
def test_set_index_datetime(self):
|
||
|
# GH#3950
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"label": ["a", "a", "a", "b", "b", "b"],
|
||
|
"datetime": [
|
||
|
"2011-07-19 07:00:00",
|
||
|
"2011-07-19 08:00:00",
|
||
|
"2011-07-19 09:00:00",
|
||
|
"2011-07-19 07:00:00",
|
||
|
"2011-07-19 08:00:00",
|
||
|
"2011-07-19 09:00:00",
|
||
|
],
|
||
|
"value": range(6),
|
||
|
}
|
||
|
)
|
||
|
df.index = to_datetime(df.pop("datetime"), utc=True)
|
||
|
df.index = df.index.tz_convert("US/Pacific")
|
||
|
|
||
|
expected = DatetimeIndex(
|
||
|
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||
|
name="datetime",
|
||
|
)
|
||
|
expected = expected.tz_localize("UTC").tz_convert("US/Pacific")
|
||
|
|
||
|
df = df.set_index("label", append=True)
|
||
|
tm.assert_index_equal(df.index.levels[0], expected)
|
||
|
tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label"))
|
||
|
assert df.index.names == ["datetime", "label"]
|
||
|
|
||
|
df = df.swaplevel(0, 1)
|
||
|
tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label"))
|
||
|
tm.assert_index_equal(df.index.levels[1], expected)
|
||
|
assert df.index.names == ["label", "datetime"]
|
||
|
|
||
|
df = DataFrame(np.random.random(6))
|
||
|
idx1 = DatetimeIndex(
|
||
|
[
|
||
|
"2011-07-19 07:00:00",
|
||
|
"2011-07-19 08:00:00",
|
||
|
"2011-07-19 09:00:00",
|
||
|
"2011-07-19 07:00:00",
|
||
|
"2011-07-19 08:00:00",
|
||
|
"2011-07-19 09:00:00",
|
||
|
],
|
||
|
tz="US/Eastern",
|
||
|
)
|
||
|
idx2 = DatetimeIndex(
|
||
|
[
|
||
|
"2012-04-01 09:00",
|
||
|
"2012-04-01 09:00",
|
||
|
"2012-04-01 09:00",
|
||
|
"2012-04-02 09:00",
|
||
|
"2012-04-02 09:00",
|
||
|
"2012-04-02 09:00",
|
||
|
],
|
||
|
tz="US/Eastern",
|
||
|
)
|
||
|
idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
|
||
|
idx3 = idx3._with_freq(None)
|
||
|
|
||
|
df = df.set_index(idx1)
|
||
|
df = df.set_index(idx2, append=True)
|
||
|
df = df.set_index(idx3, append=True)
|
||
|
|
||
|
expected1 = DatetimeIndex(
|
||
|
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
|
||
|
tz="US/Eastern",
|
||
|
)
|
||
|
expected2 = DatetimeIndex(
|
||
|
["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern"
|
||
|
)
|
||
|
|
||
|
tm.assert_index_equal(df.index.levels[0], expected1)
|
||
|
tm.assert_index_equal(df.index.levels[1], expected2)
|
||
|
tm.assert_index_equal(df.index.levels[2], idx3)
|
||
|
|
||
|
# GH#7092
|
||
|
tm.assert_index_equal(df.index.get_level_values(0), idx1)
|
||
|
tm.assert_index_equal(df.index.get_level_values(1), idx2)
|
||
|
tm.assert_index_equal(df.index.get_level_values(2), idx3)
|
||
|
|
||
|
def test_set_index_period(self):
|
||
|
# GH#6631
|
||
|
df = DataFrame(np.random.random(6))
|
||
|
idx1 = period_range("2011-01-01", periods=3, freq="M")
|
||
|
idx1 = idx1.append(idx1)
|
||
|
idx2 = period_range("2013-01-01 09:00", periods=2, freq="H")
|
||
|
idx2 = idx2.append(idx2).append(idx2)
|
||
|
idx3 = period_range("2005", periods=6, freq="A")
|
||
|
|
||
|
df = df.set_index(idx1)
|
||
|
df = df.set_index(idx2, append=True)
|
||
|
df = df.set_index(idx3, append=True)
|
||
|
|
||
|
expected1 = period_range("2011-01-01", periods=3, freq="M")
|
||
|
expected2 = period_range("2013-01-01 09:00", periods=2, freq="H")
|
||
|
|
||
|
tm.assert_index_equal(df.index.levels[0], expected1)
|
||
|
tm.assert_index_equal(df.index.levels[1], expected2)
|
||
|
tm.assert_index_equal(df.index.levels[2], idx3)
|
||
|
|
||
|
tm.assert_index_equal(df.index.get_level_values(0), idx1)
|
||
|
tm.assert_index_equal(df.index.get_level_values(1), idx2)
|
||
|
tm.assert_index_equal(df.index.get_level_values(2), idx3)
|
||
|
|
||
|
|
||
|
class TestSetIndexInvalid:
|
||
|
def test_set_index_verify_integrity(self, frame_of_index_cols):
|
||
|
df = frame_of_index_cols
|
||
|
|
||
|
with pytest.raises(ValueError, match="Index has duplicate keys"):
|
||
|
df.set_index("A", verify_integrity=True)
|
||
|
# with MultiIndex
|
||
|
with pytest.raises(ValueError, match="Index has duplicate keys"):
|
||
|
df.set_index([df["A"], df["A"]], verify_integrity=True)
|
||
|
|
||
|
@pytest.mark.parametrize("append", [True, False])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
|
||
|
df = frame_of_index_cols
|
||
|
|
||
|
with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
|
||
|
# column names are A-E, as well as one tuple
|
||
|
df.set_index(["foo", "bar", "baz"], drop=drop, append=append)
|
||
|
|
||
|
# non-existent key in list with arrays
|
||
|
with pytest.raises(KeyError, match="X"):
|
||
|
df.set_index([df["A"], df["B"], "X"], drop=drop, append=append)
|
||
|
|
||
|
msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"
|
||
|
# tuples always raise KeyError
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
df.set_index(tuple(df["A"]), drop=drop, append=append)
|
||
|
|
||
|
# also within a list
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append)
|
||
|
|
||
|
@pytest.mark.parametrize("append", [True, False])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
@pytest.mark.parametrize("box", [set], ids=["set"])
|
||
|
def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append):
|
||
|
df = frame_of_index_cols
|
||
|
|
||
|
msg = 'The parameter "keys" may be a column key, .*'
|
||
|
# forbidden type, e.g. set
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
df.set_index(box(df["A"]), drop=drop, append=append)
|
||
|
|
||
|
# forbidden type in list, e.g. set
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append)
|
||
|
|
||
|
# MultiIndex constructor does not work directly on Series -> lambda
|
||
|
@pytest.mark.parametrize(
|
||
|
"box",
|
||
|
[Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])],
|
||
|
ids=["Series", "Index", "np.array", "iter", "MultiIndex"],
|
||
|
)
|
||
|
@pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"])
|
||
|
@pytest.mark.parametrize("append", [True, False])
|
||
|
@pytest.mark.parametrize("drop", [True, False])
|
||
|
def test_set_index_raise_on_len(
|
||
|
self, frame_of_index_cols, box, length, drop, append
|
||
|
):
|
||
|
# GH 24984
|
||
|
df = frame_of_index_cols # has length 5
|
||
|
|
||
|
values = np.random.randint(0, 10, (length,))
|
||
|
|
||
|
msg = "Length mismatch: Expected 5 rows, received array of length.*"
|
||
|
|
||
|
# wrong length directly
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.set_index(box(values), drop=drop, append=append)
|
||
|
|
||
|
# wrong length in list
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.set_index(["A", df.A, box(values)], drop=drop, append=append)
|
||
|
|
||
|
|
||
|
class TestSetIndexCustomLabelType:
|
||
|
def test_set_index_custom_label_type(self):
|
||
|
# GH#24969
|
||
|
|
||
|
class Thing:
|
||
|
def __init__(self, name, color) -> None:
|
||
|
self.name = name
|
||
|
self.color = color
|
||
|
|
||
|
def __str__(self) -> str:
|
||
|
return f"<Thing {repr(self.name)}>"
|
||
|
|
||
|
# necessary for pretty KeyError
|
||
|
__repr__ = __str__
|
||
|
|
||
|
thing1 = Thing("One", "red")
|
||
|
thing2 = Thing("Two", "blue")
|
||
|
df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
|
||
|
expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
|
||
|
|
||
|
# use custom label directly
|
||
|
result = df.set_index(thing2)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# custom label wrapped in list
|
||
|
result = df.set_index([thing2])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# missing key
|
||
|
thing3 = Thing("Three", "pink")
|
||
|
msg = "<Thing 'Three'>"
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
# missing label directly
|
||
|
df.set_index(thing3)
|
||
|
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
# missing label in list
|
||
|
df.set_index([thing3])
|
||
|
|
||
|
def test_set_index_custom_label_hashable_iterable(self):
|
||
|
# GH#24969
|
||
|
|
||
|
# actual example discussed in GH 24984 was e.g. for shapely.geometry
|
||
|
# objects (e.g. a collection of Points) that can be both hashable and
|
||
|
# iterable; using frozenset as a stand-in for testing here
|
||
|
|
||
|
class Thing(frozenset):
|
||
|
# need to stabilize repr for KeyError (due to random order in sets)
|
||
|
def __repr__(self) -> str:
|
||
|
tmp = sorted(self)
|
||
|
joined_reprs = ", ".join(map(repr, tmp))
|
||
|
# double curly brace prints one brace in format string
|
||
|
return f"frozenset({{{joined_reprs}}})"
|
||
|
|
||
|
thing1 = Thing(["One", "red"])
|
||
|
thing2 = Thing(["Two", "blue"])
|
||
|
df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
|
||
|
expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2))
|
||
|
|
||
|
# use custom label directly
|
||
|
result = df.set_index(thing2)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# custom label wrapped in list
|
||
|
result = df.set_index([thing2])
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
# missing key
|
||
|
thing3 = Thing(["Three", "pink"])
|
||
|
msg = r"frozenset\(\{'Three', 'pink'\}\)"
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
# missing label directly
|
||
|
df.set_index(thing3)
|
||
|
|
||
|
with pytest.raises(KeyError, match=msg):
|
||
|
# missing label in list
|
||
|
df.set_index([thing3])
|
||
|
|
||
|
def test_set_index_custom_label_type_raises(self):
|
||
|
# GH#24969
|
||
|
|
||
|
# purposefully inherit from something unhashable
|
||
|
class Thing(set):
|
||
|
def __init__(self, name, color) -> None:
|
||
|
self.name = name
|
||
|
self.color = color
|
||
|
|
||
|
def __str__(self) -> str:
|
||
|
return f"<Thing {repr(self.name)}>"
|
||
|
|
||
|
thing1 = Thing("One", "red")
|
||
|
thing2 = Thing("Two", "blue")
|
||
|
df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])
|
||
|
|
||
|
msg = 'The parameter "keys" may be a column key, .*'
|
||
|
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
# use custom label directly
|
||
|
df.set_index(thing2)
|
||
|
|
||
|
with pytest.raises(TypeError, match=msg):
|
||
|
# custom label wrapped in list
|
||
|
df.set_index([thing2])
|
||
|
|
||
|
def test_set_index_periodindex(self):
|
||
|
# GH#6631
|
||
|
df = DataFrame(np.random.random(6))
|
||
|
idx1 = period_range("2011/01/01", periods=6, freq="M")
|
||
|
idx2 = period_range("2013", periods=6, freq="A")
|
||
|
|
||
|
df = df.set_index(idx1)
|
||
|
tm.assert_index_equal(df.index, idx1)
|
||
|
df = df.set_index(idx2)
|
||
|
tm.assert_index_equal(df.index, idx2)
|