274 lines
8.2 KiB
Python
274 lines
8.2 KiB
Python
![]() |
import random
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas.errors import PerformanceWarning, UnsortedIndexError
|
||
|
|
||
|
from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
def test_sortlevel(idx):
|
||
|
tuples = list(idx)
|
||
|
random.shuffle(tuples)
|
||
|
|
||
|
index = MultiIndex.from_tuples(tuples)
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(0)
|
||
|
expected = MultiIndex.from_tuples(sorted(tuples))
|
||
|
assert sorted_idx.equals(expected)
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(0, ascending=False)
|
||
|
assert sorted_idx.equals(expected[::-1])
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(1)
|
||
|
by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
|
||
|
expected = MultiIndex.from_tuples(by1)
|
||
|
assert sorted_idx.equals(expected)
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(1, ascending=False)
|
||
|
assert sorted_idx.equals(expected[::-1])
|
||
|
|
||
|
|
||
|
def test_sortlevel_not_sort_remaining():
|
||
|
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
|
||
|
sorted_idx, _ = mi.sortlevel("A", sort_remaining=False)
|
||
|
assert sorted_idx.equals(mi)
|
||
|
|
||
|
|
||
|
def test_sortlevel_deterministic():
|
||
|
tuples = [
|
||
|
("bar", "one"),
|
||
|
("foo", "two"),
|
||
|
("qux", "two"),
|
||
|
("foo", "one"),
|
||
|
("baz", "two"),
|
||
|
("qux", "one"),
|
||
|
]
|
||
|
|
||
|
index = MultiIndex.from_tuples(tuples)
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(0)
|
||
|
expected = MultiIndex.from_tuples(sorted(tuples))
|
||
|
assert sorted_idx.equals(expected)
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(0, ascending=False)
|
||
|
assert sorted_idx.equals(expected[::-1])
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(1)
|
||
|
by1 = sorted(tuples, key=lambda x: (x[1], x[0]))
|
||
|
expected = MultiIndex.from_tuples(by1)
|
||
|
assert sorted_idx.equals(expected)
|
||
|
|
||
|
sorted_idx, _ = index.sortlevel(1, ascending=False)
|
||
|
assert sorted_idx.equals(expected[::-1])
|
||
|
|
||
|
|
||
|
def test_numpy_argsort(idx):
|
||
|
result = np.argsort(idx)
|
||
|
expected = idx.argsort()
|
||
|
tm.assert_numpy_array_equal(result, expected)
|
||
|
|
||
|
# these are the only two types that perform
|
||
|
# pandas compatibility input validation - the
|
||
|
# rest already perform separate (or no) such
|
||
|
# validation via their 'values' attribute as
|
||
|
# defined in pandas.core.indexes/base.py - they
|
||
|
# cannot be changed at the moment due to
|
||
|
# backwards compatibility concerns
|
||
|
if isinstance(type(idx), (CategoricalIndex, RangeIndex)):
|
||
|
msg = "the 'axis' parameter is not supported"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
np.argsort(idx, axis=1)
|
||
|
|
||
|
msg = "the 'kind' parameter is not supported"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
np.argsort(idx, kind="mergesort")
|
||
|
|
||
|
msg = "the 'order' parameter is not supported"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
np.argsort(idx, order=("a", "b"))
|
||
|
|
||
|
|
||
|
def test_unsortedindex():
|
||
|
# GH 11897
|
||
|
mi = MultiIndex.from_tuples(
|
||
|
[("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")],
|
||
|
names=["one", "two"],
|
||
|
)
|
||
|
df = DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"])
|
||
|
|
||
|
# GH 16734: not sorted, but no real slicing
|
||
|
result = df.loc(axis=0)["z", "a"]
|
||
|
expected = df.iloc[0]
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
msg = (
|
||
|
"MultiIndex slicing requires the index to be lexsorted: "
|
||
|
r"slicing on levels \[1\], lexsort depth 0"
|
||
|
)
|
||
|
with pytest.raises(UnsortedIndexError, match=msg):
|
||
|
df.loc(axis=0)["z", slice("a")]
|
||
|
df.sort_index(inplace=True)
|
||
|
assert len(df.loc(axis=0)["z", :]) == 2
|
||
|
|
||
|
with pytest.raises(KeyError, match="'q'"):
|
||
|
df.loc(axis=0)["q", :]
|
||
|
|
||
|
|
||
|
def test_unsortedindex_doc_examples():
|
||
|
# https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex
|
||
|
dfm = DataFrame(
|
||
|
{"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)}
|
||
|
)
|
||
|
|
||
|
dfm = dfm.set_index(["jim", "joe"])
|
||
|
with tm.assert_produces_warning(PerformanceWarning):
|
||
|
dfm.loc[(1, "z")]
|
||
|
|
||
|
msg = r"Key length \(2\) was greater than MultiIndex lexsort depth \(1\)"
|
||
|
with pytest.raises(UnsortedIndexError, match=msg):
|
||
|
dfm.loc[(0, "y"):(1, "z")]
|
||
|
|
||
|
assert not dfm.index.is_lexsorted()
|
||
|
assert dfm.index.lexsort_depth == 1
|
||
|
|
||
|
# sort it
|
||
|
dfm = dfm.sort_index()
|
||
|
dfm.loc[(1, "z")]
|
||
|
dfm.loc[(0, "y"):(1, "z")]
|
||
|
|
||
|
assert dfm.index.is_lexsorted()
|
||
|
assert dfm.index.lexsort_depth == 2
|
||
|
|
||
|
|
||
|
def test_reconstruct_sort():
|
||
|
|
||
|
# starts off lexsorted & monotonic
|
||
|
mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
|
||
|
assert mi.is_lexsorted()
|
||
|
assert mi.is_monotonic
|
||
|
|
||
|
recons = mi._sort_levels_monotonic()
|
||
|
assert recons.is_lexsorted()
|
||
|
assert recons.is_monotonic
|
||
|
assert mi is recons
|
||
|
|
||
|
assert mi.equals(recons)
|
||
|
assert Index(mi.values).equals(Index(recons.values))
|
||
|
|
||
|
# cannot convert to lexsorted
|
||
|
mi = MultiIndex.from_tuples(
|
||
|
[("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")],
|
||
|
names=["one", "two"],
|
||
|
)
|
||
|
assert not mi.is_lexsorted()
|
||
|
assert not mi.is_monotonic
|
||
|
|
||
|
recons = mi._sort_levels_monotonic()
|
||
|
assert not recons.is_lexsorted()
|
||
|
assert not recons.is_monotonic
|
||
|
|
||
|
assert mi.equals(recons)
|
||
|
assert Index(mi.values).equals(Index(recons.values))
|
||
|
|
||
|
# cannot convert to lexsorted
|
||
|
mi = MultiIndex(
|
||
|
levels=[["b", "d", "a"], [1, 2, 3]],
|
||
|
codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
|
||
|
names=["col1", "col2"],
|
||
|
)
|
||
|
assert not mi.is_lexsorted()
|
||
|
assert not mi.is_monotonic
|
||
|
|
||
|
recons = mi._sort_levels_monotonic()
|
||
|
assert not recons.is_lexsorted()
|
||
|
assert not recons.is_monotonic
|
||
|
|
||
|
assert mi.equals(recons)
|
||
|
assert Index(mi.values).equals(Index(recons.values))
|
||
|
|
||
|
|
||
|
def test_reconstruct_remove_unused():
|
||
|
# xref to GH 2770
|
||
|
df = DataFrame(
|
||
|
[["deleteMe", 1, 9], ["keepMe", 2, 9], ["keepMeToo", 3, 9]],
|
||
|
columns=["first", "second", "third"],
|
||
|
)
|
||
|
df2 = df.set_index(["first", "second"], drop=False)
|
||
|
df2 = df2[df2["first"] != "deleteMe"]
|
||
|
|
||
|
# removed levels are there
|
||
|
expected = MultiIndex(
|
||
|
levels=[["deleteMe", "keepMe", "keepMeToo"], [1, 2, 3]],
|
||
|
codes=[[1, 2], [1, 2]],
|
||
|
names=["first", "second"],
|
||
|
)
|
||
|
result = df2.index
|
||
|
tm.assert_index_equal(result, expected)
|
||
|
|
||
|
expected = MultiIndex(
|
||
|
levels=[["keepMe", "keepMeToo"], [2, 3]],
|
||
|
codes=[[0, 1], [0, 1]],
|
||
|
names=["first", "second"],
|
||
|
)
|
||
|
result = df2.index.remove_unused_levels()
|
||
|
tm.assert_index_equal(result, expected)
|
||
|
|
||
|
# idempotent
|
||
|
result2 = result.remove_unused_levels()
|
||
|
tm.assert_index_equal(result2, expected)
|
||
|
assert result2.is_(result)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"first_type,second_type", [("int64", "int64"), ("datetime64[D]", "str")]
|
||
|
)
|
||
|
def test_remove_unused_levels_large(first_type, second_type):
|
||
|
# GH16556
|
||
|
|
||
|
# because tests should be deterministic (and this test in particular
|
||
|
# checks that levels are removed, which is not the case for every
|
||
|
# random input):
|
||
|
rng = np.random.RandomState(4) # seed is arbitrary value that works
|
||
|
|
||
|
size = 1 << 16
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"first": rng.randint(0, 1 << 13, size).astype(first_type),
|
||
|
"second": rng.randint(0, 1 << 10, size).astype(second_type),
|
||
|
"third": rng.rand(size),
|
||
|
}
|
||
|
)
|
||
|
df = df.groupby(["first", "second"]).sum()
|
||
|
df = df[df.third < 0.1]
|
||
|
|
||
|
result = df.index.remove_unused_levels()
|
||
|
assert len(result.levels[0]) < len(df.index.levels[0])
|
||
|
assert len(result.levels[1]) < len(df.index.levels[1])
|
||
|
assert result.equals(df.index)
|
||
|
|
||
|
expected = df.reset_index().set_index(["first", "second"]).index
|
||
|
tm.assert_index_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("level0", [["a", "d", "b"], ["a", "d", "b", "unused"]])
|
||
|
@pytest.mark.parametrize(
|
||
|
"level1", [["w", "x", "y", "z"], ["w", "x", "y", "z", "unused"]]
|
||
|
)
|
||
|
def test_remove_unused_nan(level0, level1):
|
||
|
# GH 18417
|
||
|
mi = MultiIndex(levels=[level0, level1], codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]])
|
||
|
|
||
|
result = mi.remove_unused_levels()
|
||
|
tm.assert_index_equal(result, mi)
|
||
|
for level in 0, 1:
|
||
|
assert "unused" not in result.levels[level]
|
||
|
|
||
|
|
||
|
def test_argsort(idx):
|
||
|
result = idx.argsort()
|
||
|
expected = idx.values.argsort()
|
||
|
tm.assert_numpy_array_equal(result, expected)
|