import numpy as np import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( CategoricalDtype, DataFrame, Index, MultiIndex, Series, ) import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype class TestMultiIndexBasic: def test_multiindex_perf_warn(self): df = DataFrame( { "jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4), } ).set_index(["jim", "joe"]) with tm.assert_produces_warning(PerformanceWarning): df.loc[(1, "z")] df = df.iloc[[2, 1, 3, 0]] with tm.assert_produces_warning(PerformanceWarning): df.loc[(0,)] def test_indexing_over_hashtable_size_cutoff(self): n = 10000 old_cutoff = _index._SIZE_CUTOFF _index._SIZE_CUTOFF = 20000 s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) # hai it works! assert s[("a", 5)] == 5 assert s[("a", 6)] == 6 assert s[("a", 7)] == 7 _index._SIZE_CUTOFF = old_cutoff def test_multi_nan_indexing(self): # GH 3588 df = DataFrame( { "a": ["R1", "R2", np.nan, "R4"], "b": ["C1", "C2", "C3", "C4"], "c": [10, 15, np.nan, 20], } ) result = df.set_index(["a", "b"], drop=False) expected = DataFrame( { "a": ["R1", "R2", np.nan, "R4"], "b": ["C1", "C2", "C3", "C4"], "c": [10, 15, np.nan, 20], }, index=[ Index(["R1", "R2", np.nan, "R4"], name="a"), Index(["C1", "C2", "C3", "C4"], name="b"), ], ) tm.assert_frame_equal(result, expected) def test_exclusive_nat_column_indexing(self): # GH 38025 # test multi indexing when one column exclusively contains NaT values df = DataFrame( { "a": [pd.NaT, pd.NaT, pd.NaT, pd.NaT], "b": ["C1", "C2", "C3", "C4"], "c": [10, 15, np.nan, 20], } ) df = df.set_index(["a", "b"]) expected = DataFrame( { "c": [10, 15, np.nan, 20], }, index=[ Index([pd.NaT, pd.NaT, pd.NaT, pd.NaT], name="a"), Index(["C1", "C2", "C3", "C4"], name="b"), ], ) tm.assert_frame_equal(df, expected) def test_nested_tuples_duplicates(self): # GH#30892 dti = pd.to_datetime(["20190101", "20190101", "20190102"]) idx = Index(["a", "a", "c"]) mi = MultiIndex.from_arrays([dti, idx], names=["index1", "index2"]) df = DataFrame({"c1": [1, 2, 3], "c2": [np.nan, np.nan, np.nan]}, index=mi) expected = DataFrame({"c1": df["c1"], "c2": [1.0, 1.0, np.nan]}, index=mi) df2 = df.copy(deep=True) df2.loc[(dti[0], "a"), "c2"] = 1.0 tm.assert_frame_equal(df2, expected) df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) def test_multiindex_with_datatime_level_preserves_freq(self): # https://github.com/pandas-dev/pandas/issues/35563 idx = Index(range(2), name="A") dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B") mi = MultiIndex.from_product([idx, dti]) df = DataFrame(np.random.randn(14, 2), index=mi) result = df.loc[0].index tm.assert_index_equal(result, dti) assert result.freq == dti.freq def test_multiindex_complex(self): # GH#42145 complex_data = [1 + 2j, 4 - 3j, 10 - 1j] non_complex_data = [3, 4, 5] result = DataFrame( { "x": complex_data, "y": non_complex_data, "z": non_complex_data, } ) result.set_index(["x", "y"], inplace=True) expected = DataFrame( {"z": non_complex_data}, index=MultiIndex.from_arrays( [complex_data, non_complex_data], names=("x", "y"), ), ) tm.assert_frame_equal(result, expected) def test_rename_multiindex_with_duplicates(self): # GH 38015 mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")]) df = DataFrame(index=mi) df = df.rename(index={"A": "Apple"}, level=0) mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")]) expected = DataFrame(index=mi2) tm.assert_frame_equal(df, expected) def test_series_align_multiindex_with_nan_overlap_only(self): # GH 38439 mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]]) ser1 = Series([1, 2], index=mi1) ser2 = Series([1, 2], index=mi2) result1, result2 = ser1.align(ser2) mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]]) expected1 = Series([1.0, np.nan, 2.0], index=mi) expected2 = Series([np.nan, 2.0, 1.0], index=mi) tm.assert_series_equal(result1, expected1) tm.assert_series_equal(result2, expected2) def test_series_align_multiindex_with_nan(self): # GH 38439 mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]]) ser1 = Series([1, 2], index=mi1) ser2 = Series([1, 2], index=mi2) result1, result2 = ser1.align(ser2) mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) expected1 = Series([1, 2], index=mi) expected2 = Series([2, 1], index=mi) tm.assert_series_equal(result1, expected1) tm.assert_series_equal(result2, expected2) def test_nunique_smoke(self): # GH 34019 n = DataFrame([[1, 2], [1, 2]]).set_index([0, 1]).index.nunique() assert n == 1 def test_multiindex_repeated_keys(self): # GH19414 tm.assert_series_equal( Series([1, 2], MultiIndex.from_arrays([["a", "b"]])).loc[ ["a", "a", "b", "b"] ], Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])), ) def test_multiindex_with_na_missing_key(self): # GH46173 df = DataFrame.from_dict( { ("foo",): [1, 2, 3], ("bar",): [5, 6, 7], (None,): [8, 9, 0], } ) with pytest.raises(KeyError, match="missing_key"): df[[("missing_key",)]] def test_multiindex_dtype_preservation(self): # GH51261 columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) df = DataFrame(["value"], columns=columns).astype("category") df_no_multiindex = df["A"] assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype) # geopandas 1763 analogue df = DataFrame( [[1, 0], [0, 1]], columns=[ ["foo", "foo"], ["location", "location"], ["x", "y"], ], ).assign(bools=Series([True, False], dtype="boolean")) assert isinstance(df["bools"].dtype, BooleanDtype)