import numpy as np import pytest import pandas as pd from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm AGG_FUNCTIONS = [ "sum", "prod", "min", "max", "median", "mean", "skew", "mad", "std", "var", "sem", ] class TestMultiLevel: def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): # axis=0 ymd = multiindex_year_month_day_dataframe_random_data month_sums = ymd.sum(level="month") result = month_sums.reindex(ymd.index, level=1) expected = ymd.groupby(level="month").transform(np.sum) tm.assert_frame_equal(result, expected) # Series result = month_sums["A"].reindex(ymd.index, level=1) expected = ymd["A"].groupby(level="month").transform(np.sum) tm.assert_series_equal(result, expected, check_names=False) # axis=1 month_sums = ymd.T.sum(axis=1, level="month") result = month_sums.reindex(columns=ymd.index, level=1) expected = ymd.groupby(level="month").transform(np.sum).T tm.assert_frame_equal(result, expected) def test_binops_level(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data def _check_op(opname): op = getattr(DataFrame, opname) month_sums = ymd.sum(level="month") result = op(ymd, month_sums, level="month") broadcasted = ymd.groupby(level="month").transform(np.sum) expected = op(ymd, broadcasted) tm.assert_frame_equal(result, expected) # Series op = getattr(Series, opname) result = op(ymd["A"], month_sums["A"], level="month") broadcasted = ymd["A"].groupby(level="month").transform(np.sum) expected = op(ymd["A"], broadcasted) expected.name = "A" tm.assert_series_equal(result, expected) _check_op("sub") _check_op("add") _check_op("mul") _check_op("div") def test_reindex(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data expected = frame.iloc[[0, 3]] reindexed = frame.loc[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels( self, multiindex_year_month_day_dataframe_random_data ): ymd = multiindex_year_month_day_dataframe_random_data new_index = ymd.index[::10] chunk = ymd.reindex(new_index) assert chunk.index is new_index chunk = ymd.loc[new_index] assert chunk.index is new_index ymdT = ymd.T chunk = ymdT.reindex(columns=new_index) assert chunk.columns is new_index chunk = ymdT.loc[:, new_index] assert chunk.columns is new_index def test_groupby_transform(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data s = frame["A"] grouper = s.index.get_level_values(0) grouped = s.groupby(grouper) applied = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) result = applied.reindex(expected.index) tm.assert_series_equal(result, expected, check_names=False) def test_groupby_corner(self): midx = MultiIndex( levels=[["foo"], ["bar"], ["baz"]], codes=[[0], [0], [0]], names=["one", "two", "three"], ) df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx) # should work df.groupby(level="three") def test_groupby_level_no_obs(self): # #1697 midx = MultiIndex.from_tuples( [ ("f1", "s1"), ("f1", "s2"), ("f2", "s1"), ("f2", "s2"), ("f3", "s1"), ("f3", "s2"), ] ) df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) df1 = df.loc(axis=1)[df.columns.map(lambda u: u[0] in ["f2", "f3"])] grouped = df1.groupby(axis=1, level=0) result = grouped.sum() assert (result.columns == ["f2", "f3"]).all() def test_setitem_with_expansion_multiindex_columns( self, multiindex_year_month_day_dataframe_random_data ): ymd = multiindex_year_month_day_dataframe_random_data df = ymd[:5].T df[2000, 1, 10] = df[2000, 1, 7] assert isinstance(df.columns, MultiIndex) assert (df[2000, 1, 10] == df[2000, 1, 7]).all() def test_alignment(self): x = Series( data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]) ) y = Series( data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]) ) res = x - y exp_index = x.index.union(y.index) exp = x.reindex(exp_index) - y.reindex(exp_index) tm.assert_series_equal(res, exp) # hit non-monotonic code path res = x[::-1] - y[::-1] exp_index = x.index.union(y.index) exp = x.reindex(exp_index) - y.reindex(exp_index) tm.assert_series_equal(res, exp) @pytest.mark.parametrize("op", AGG_FUNCTIONS) @pytest.mark.parametrize("level", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) def test_series_group_min_max( self, op, level, skipna, sort, series_with_multilevel_index ): # GH 17537 ser = series_with_multilevel_index grouped = ser.groupby(level=level, sort=sort) # skipna=True leftside = grouped.agg(lambda x: getattr(x, op)(skipna=skipna)) rightside = getattr(ser, op)(level=level, skipna=skipna) if sort: rightside = rightside.sort_index(level=level) tm.assert_series_equal(leftside, rightside) @pytest.mark.parametrize("op", AGG_FUNCTIONS) @pytest.mark.parametrize("level", [0, 1]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) def test_frame_group_ops( self, op, level, axis, skipna, sort, multiindex_dataframe_random_data ): # GH 17537 frame = multiindex_dataframe_random_data frame.iloc[1, [1, 2]] = np.nan frame.iloc[7, [0, 1]] = np.nan level_name = frame.index.names[level] if axis == 0: frame = frame else: frame = frame.T grouped = frame.groupby(level=level, axis=axis, sort=sort) pieces = [] def aggf(x): pieces.append(x) return getattr(x, op)(skipna=skipna, axis=axis) leftside = grouped.agg(aggf) rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) if sort: rightside = rightside.sort_index(level=level, axis=axis) frame = frame.sort_index(level=level, axis=axis) # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level].rename(level_name) tm.assert_index_equal(leftside._get_axis(axis), level_index) tm.assert_index_equal(rightside._get_axis(axis), level_index) tm.assert_frame_equal(leftside, rightside) def test_std_var_pass_ddof(self): index = MultiIndex.from_arrays( [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] ) df = DataFrame(np.random.randn(len(index), 5), index=index) for meth in ["var", "std"]: ddof = 4 alt = lambda x: getattr(x, meth)(ddof=ddof) result = getattr(df[0], meth)(level=0, ddof=ddof) expected = df[0].groupby(level=0).agg(alt) tm.assert_series_equal(result, expected) result = getattr(df, meth)(level=0, ddof=ddof) expected = df.groupby(level=0).agg(alt) tm.assert_frame_equal(result, expected) def test_agg_multiple_levels( self, multiindex_year_month_day_dataframe_random_data, frame_or_series ): ymd = multiindex_year_month_day_dataframe_random_data if frame_or_series is Series: ymd = ymd["A"] result = ymd.sum(level=["year", "month"]) expected = ymd.groupby(level=["year", "month"]).sum() tm.assert_equal(result, expected) def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data result = ymd.groupby(level=[0, 1]).mean() k1 = ymd.index.get_level_values(0) k2 = ymd.index.get_level_values(1) expected = ymd.groupby([k1, k2]).mean() # TODO groupby with level_values drops names tm.assert_frame_equal(result, expected, check_names=False) assert result.index.names == ymd.index.names[:2] result2 = ymd.groupby(level=ymd.index.names[:2]).mean() tm.assert_frame_equal(result, result2) def test_groupby_multilevel_with_transform(self): pass def test_multilevel_consolidate(self): index = MultiIndex.from_tuples( [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")] ) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) df["Totals", ""] = df.sum(1) df = df._consolidate() def test_level_with_tuples(self): index = MultiIndex( levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) result = series[("foo", "bar", 0)] result2 = series.loc[("foo", "bar", 0)] expected = series[:2] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"): series[("foo", "bar", 0), 2] result = frame.loc[("foo", "bar", 0)] result2 = frame.xs(("foo", "bar", 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) index = MultiIndex( levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) result = series[("foo", "bar")] result2 = series.loc[("foo", "bar")] expected = series[:2] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) result = frame.loc[("foo", "bar")] result2 = frame.xs(("foo", "bar")) expected = frame[:2] expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) def test_reindex_level_partial_selection(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data result = frame.reindex(["foo", "qux"], level=0) expected = frame.iloc[[0, 1, 2, 7, 8, 9]] tm.assert_frame_equal(result, expected) result = frame.T.reindex(["foo", "qux"], axis=1, level=0) tm.assert_frame_equal(result, expected.T) result = frame.loc[["foo", "qux"]] tm.assert_frame_equal(result, expected) result = frame["A"].loc[["foo", "qux"]] tm.assert_series_equal(result, expected["A"]) result = frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) @pytest.mark.parametrize("d", [4, "d"]) def test_empty_frame_groupby_dtypes_consistency(self, d): # GH 20888 group_keys = ["a", "b", "c"] df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]}) g = df[df.a == 2].groupby(group_keys) result = g.first().index expected = MultiIndex( levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"] ) tm.assert_index_equal(result, expected) def test_duplicate_groupby_issues(self): idx_tp = [ ("600809", "20061231"), ("600809", "20070331"), ("600809", "20070630"), ("600809", "20070331"), ] dt = ["demo", "demo", "demo", "demo"] idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"]) s = Series(dt, index=idx) result = s.groupby(s.index).first() assert len(result) == 3 def test_subsets_multiindex_dtype(self): # GH 20757 data = [["x", 1]] columns = [("a", "b", np.nan), ("a", "c", 0.0)] df = DataFrame(data, columns=MultiIndex.from_tuples(columns)) expected = df.dtypes.a.b result = df.a.b.dtypes tm.assert_series_equal(result, expected) class TestSorted: """ everything you wanted to test about sorting """ def test_sort_non_lexsorted(self): # degenerate case where we sort but don't # have a satisfying result :< # GH 15797 idx = MultiIndex( [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]] ) df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64") assert df.index.is_lexsorted() is False assert df.index.is_monotonic is False sorted = df.sort_index() assert sorted.index.is_lexsorted() is True assert sorted.index.is_monotonic is True expected = DataFrame( {"col": [1, 4, 5, 2]}, index=MultiIndex.from_tuples( [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")] ), dtype="int64", ) result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :] tm.assert_frame_equal(result, expected)