3RNN/Lib/site-packages/pandas/tests/reshape/merge/test_join.py

import re

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    bdate_range,
    concat,
    merge,
    option_context,
)
import pandas._testing as tm


def get_test_data(ngroups=8, n=50):
    unique_groups = list(range(ngroups))
    arr = np.asarray(np.tile(unique_groups, n // ngroups))

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[: n - len(arr)])

    np.random.default_rng(2).shuffle(arr)
    return arr


class TestJoin:
    # aggregate multiple columns
    @pytest.fixture
    def df(self):
        df = DataFrame(
            {
                "key1": get_test_data(),
                "key2": get_test_data(),
                "data1": np.random.default_rng(2).standard_normal(50),
                "data2": np.random.default_rng(2).standard_normal(50),
            }
        )

        # exclude a couple keys for fun
        df = df[df["key2"] > 1]
        return df

    @pytest.fixture
    def df2(self):
        return DataFrame(
            {
                "key1": get_test_data(n=10),
                "key2": get_test_data(ngroups=4, n=10),
                "value": np.random.default_rng(2).standard_normal(10),
            }
        )

    @pytest.fixture
    def target_source(self):
        data = {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": bdate_range("1/1/2009", periods=5),
        }
        target = DataFrame(data, index=Index(["a", "b", "c", "d", "e"], dtype=object))

        # Join on string value

        source = DataFrame(
            {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
        )
        return target, source

    def test_left_outer_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2")
        _check_join(df, df2, joined_key2, ["key2"], how="left")

        joined_both = merge(df, df2)
        _check_join(df, df2, joined_both, ["key1", "key2"], how="left")

    def test_right_outer_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2", how="right")
        _check_join(df, df2, joined_key2, ["key2"], how="right")

        joined_both = merge(df, df2, how="right")
        _check_join(df, df2, joined_both, ["key1", "key2"], how="right")

    def test_full_outer_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2", how="outer")
        _check_join(df, df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(df, df2, how="outer")
        _check_join(df, df2, joined_both, ["key1", "key2"], how="outer")

    def test_inner_join(self, df, df2):
        joined_key2 = merge(df, df2, on="key2", how="inner")
        _check_join(df, df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(df, df2, how="inner")
        _check_join(df, df2, joined_both, ["key1", "key2"], how="inner")

    def test_handle_overlap(self, df, df2):
        joined = merge(df, df2, on="key2", suffixes=(".foo", ".bar"))

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self, df, df2):
        joined = merge(
            df,
            df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    @pytest.mark.parametrize(
        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
    )
    def test_join_on(self, target_source, infer_string):
        target, source = target_source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
        tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame(
            {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
        )
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not p resent
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        msg = (
            "You are trying to merge on float64 and object|string columns for key "
            "'A'. If you wish to proceed you should use pd.concat"
        )
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame(
            {
                "a": np.random.default_rng(2).choice(["m", "f"], size=3),
                "b": np.random.default_rng(2).standard_normal(3),
            }
        )
        df2 = DataFrame(
            {
                "a": np.random.default_rng(2).choice(["m", "f"], size=10),
                "b": np.random.default_rng(2).standard_normal(10),
            },
            index=MultiIndex.from_product([range(5), ["A", "B"]]),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {
                "a": np.random.default_rng(2).choice(["m", "f"], size=3),
                "b": np.random.default_rng(2).standard_normal(3),
            },
            index=MultiIndex.from_arrays([range(3), list("abc")]),
        )
        df2 = DataFrame(
            {
                "a": np.random.default_rng(2).choice(["m", "f"], size=10),
                "b": np.random.default_rng(2).standard_normal(10),
            }
        )
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame(
            {
                "a": np.random.default_rng(2).choice(["m", "f"], size=3),
                "b": np.random.default_rng(2).standard_normal(3),
            }
        )
        df2 = DataFrame(
            {
                "a": np.random.default_rng(2).choice(["m", "f"], size=10),
                "b": np.random.default_rng(2).standard_normal(10),
            },
            index=MultiIndex.from_product([range(5), ["A", "B"]]),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue

        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({"a": [1, 1]})
        msg = (
            "Can only merge Series or DataFrame objects, "
            f"a {type(wrong_type)} was passed"
        )
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self, target_source):
        target, source = target_source
        expected = target.join(source, on="C")
        expected = expected.rename(columns={"C": "key_0"})
        expected = expected[["key_0", "A", "B", "D", "MergedA", "MergedD"]]

        join_col = target.pop("C")
        result = target.join(source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self, target_source):
        # nothing to merge
        target, source = target_source
        merged = target.join(source.reindex([]), on="C")
        for col in source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = target.join(source.reindex([]), on="C", how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self, target_source):
        target, source = target_source
        result = target.join(source["MergedA"], on="C")
        expected = target.join(source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {"A": 1.0, "B": 2, "C": "foo", "D": True},
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(
            np.random.default_rng(2).standard_normal((30, 2)), columns=["a", "b"]
        )
        c = Series(np.random.default_rng(2).standard_normal(30))
        a["c"] = c
        d = DataFrame(np.random.default_rng(2).standard_normal((30, 1)), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(
            data=np.random.default_rng(2).standard_normal(6),
            index=index1,
            columns=["var X"],
        )
        df2 = DataFrame(
            data=np.random.default_rng(2).standard_normal(6),
            index=index2,
            columns=["var Y"],
        )

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.default_rng(2).standard_normal(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = lexsorted_two_level_string_multiindex
        to_join = DataFrame(
            np.random.default_rng(2).standard_normal((10, 3)),
            index=index,
            columns=["j_one", "j_two", "j_three"],
        )

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic_increasing
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed_raises(self):
        # GH 2024
        # GH 40993: For raising, enforced in 2.0
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": ["mean", "sum"]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        with pytest.raises(
            pd.errors.MergeError, match="Not allowed to merge between different levels"
        ):
            merge(new_df, other_df, left_index=True, right_index=True)

    def test_join_float64_float32(self):
        a = DataFrame(
            np.random.default_rng(2).standard_normal((10, 2)),
            columns=["a", "b"],
            dtype=np.float64,
        )
        b = DataFrame(
            np.random.default_rng(2).standard_normal((10, 1)),
            columns=["c"],
            dtype=np.float32,
        )
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.default_rng(2).integers(0, 5, 100).astype("int64")
        b = np.random.default_rng(2).random(100).astype("float64")
        c = np.random.default_rng(2).random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(
            np.random.default_rng(2).random(5).astype("float32"), columns=["md"]
        )
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.default_rng(2).standard_normal(8),
                "D": np.random.default_rng(2).standard_normal(8),
            }
        )
        s = Series(
            np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
        )
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    @pytest.mark.parametrize(
        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
    )
    def test_join_sort(self, infer_string):
        with option_context("future.infer_string", infer_string):
            left = DataFrame(
                {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}
            )
            right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])

            joined = left.join(right, on="key", sort=True)
            expected = DataFrame(
                {
                    "key": ["bar", "baz", "foo", "foo"],
                    "value": [2, 3, 1, 4],
                    "value2": ["a", "b", "c", "c"],
                },
                index=[1, 2, 0, 3],
            )
            tm.assert_frame_equal(joined, expected)

            # smoke test
            joined = left.join(right, on="key", sort=False)
            tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame(
            {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
        )
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(
            np.random.default_rng(2).standard_normal((20, 6)),
            columns=["a", "b", "c", "d", "e", "f"],
        )
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        msg = re.escape("agg function failed [how->mean,dtype->")
        with pytest.raises(TypeError, match=msg):
            grouped.mean()
        mn = grouped.mean(numeric_only=True)
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(
            np.random.default_rng(2).standard_normal((10, 6)), columns=list("abcdef")
        )
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(
            np.random.default_rng(2).standard_normal((8, 4)),
            columns=["A", "B", "C", "D"],
        )
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        df = concat(
            [
                DataFrame(
                    np.random.default_rng(2).standard_normal((10, 4)),
                    columns=["A", "A", "B", "B"],
                ),
                DataFrame(
                    np.random.default_rng(2).integers(0, 10, size=20).reshape(10, 2),
                    columns=["A", "C"],
                ),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(
            np.random.default_rng(2).standard_normal((4, 2)), columns=["x", "y"]
        )
        x = DataFrame(
            np.random.default_rng(2).standard_normal((4, 2)), columns=["x", "y"]
        )
        y = DataFrame(
            np.random.default_rng(2).standard_normal((4, 2)), columns=["x", "y"]
        )
        z = DataFrame(
            np.random.default_rng(2).standard_normal((4, 2)), columns=["x", "y"]
        )

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer"
        )
        # GH 40991: As of 2.0 causes duplicate columns
        with pytest.raises(
            pd.errors.MergeError,
            match="Passing 'suffixes' which cause duplicate columns",
        ):
            dta.merge(w, left_index=True, right_index=True)

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
        )
        left = DataFrame({"v1": range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list("abc"), list("xy")], names=["abc", "xy"]
        )
        right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)

        result = left.join(right, on=["abc", "xy"], how=join_type)
        expected = (
            left.reset_index()
            .merge(right.reset_index(), on=["abc", "xy"], how=join_type)
            .set_index(["abc", "xy", "num"])
        )
        tm.assert_frame_equal(expected, result)

        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            left.join(right, on="xy", how=join_type)

        with pytest.raises(ValueError, match=msg):
            right.join(left, on=["abc", "xy"], how=join_type)

    def test_join_on_tz_aware_datetimeindex(self):
        # GH 23931, 26335
        df1 = DataFrame(
            {
                "date": pd.date_range(
                    start="2018-01-01", periods=5, tz="America/Chicago"
                ),
                "vals": list("abcde"),
            }
        )

        df2 = DataFrame(
            {
                "date": pd.date_range(
                    start="2018-01-03", periods=5, tz="America/Chicago"
                ),
                "vals_2": list("tuvwx"),
            }
        )
        result = df1.join(df2.set_index("date"), on="date")
        expected = df1.copy()
        expected["vals_2"] = Series([np.nan] * 2 + list("tuv"))
        tm.assert_frame_equal(result, expected)

    def test_join_datetime_string(self):
        # GH 5647
        dfa = DataFrame(
            [
                ["2012-08-02", "L", 10],
                ["2012-08-02", "J", 15],
                ["2013-04-06", "L", 20],
                ["2013-04-06", "J", 25],
            ],
            columns=["x", "y", "a"],
        )
        dfa["x"] = pd.to_datetime(dfa["x"]).astype("M8[ns]")
        dfb = DataFrame(
            [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
            columns=["x", "y", "z"],
            index=[2, 4],
        )
        dfb["x"] = pd.to_datetime(dfb["x"]).astype("M8[ns]")
        result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
        expected = DataFrame(
            [
                [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
                [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
            ],
            index=[2, 4],
            columns=["x", "y", "z", "a"],
        )
        expected["x"] = expected["x"].astype("M8[ns]")
        tm.assert_frame_equal(result, expected)

    def test_join_with_categorical_index(self):
        # GH47812
        ix = ["a", "b"]
        id1 = pd.CategoricalIndex(ix, categories=ix)
        id2 = pd.CategoricalIndex(reversed(ix), categories=reversed(ix))

        df1 = DataFrame({"c1": ix}, index=id1)
        df2 = DataFrame({"c2": reversed(ix)}, index=id2)
        result = df1.join(df2)
        expected = DataFrame(
            {"c1": ["a", "b"], "c2": ["a", "b"]},
            index=pd.CategoricalIndex(["a", "b"], categories=["a", "b"]),
        )
        tm.assert_frame_equal(result, expected)


def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):
    # some smoke tests
    for c in join_col:
        assert result[c].notna().all()

    left_grouped = left.groupby(join_col)
    right_grouped = right.groupby(join_col)

    for group_key, group in result.groupby(join_col):
        l_joined = _restrict_to_columns(group, left.columns, lsuffix)
        r_joined = _restrict_to_columns(group, right.columns, rsuffix)

        try:
            lgroup = left_grouped.get_group(group_key)
        except KeyError as err:
            if how in ("left", "inner"):
                raise AssertionError(
                    f"key {group_key} should not have been in the join"
                ) from err

            _assert_all_na(l_joined, left.columns, join_col)
        else:
            _assert_same_contents(l_joined, lgroup)

        try:
            rgroup = right_grouped.get_group(group_key)
        except KeyError as err:
            if how in ("right", "inner"):
                raise AssertionError(
                    f"key {group_key} should not have been in the join"
                ) from err

            _assert_all_na(r_joined, right.columns, join_col)
        else:
            _assert_same_contents(r_joined, rgroup)


def _restrict_to_columns(group, columns, suffix):
    found = [
        c for c in group.columns if c in columns or c.replace(suffix, "") in columns
    ]

    # filter
    group = group.loc[:, found]

    # get rid of suffixes, if any
    group = group.rename(columns=lambda x: x.replace(suffix, ""))

    # put in the right order...
    group = group.loc[:, columns]

    return group


def _assert_same_contents(join_chunk, source):
    NA_SENTINEL = -1234567  # drop_duplicates not so NA-friendly...

    jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
    svalues = source.fillna(NA_SENTINEL).drop_duplicates().values

    rows = {tuple(row) for row in jvalues}
    assert len(rows) == len(source)
    assert all(tuple(row) in rows for row in svalues)


def _assert_all_na(join_chunk, source_columns, join_col):
    for c in source_columns:
        if c in join_col:
            continue
        assert join_chunk[c].isna().all()


def _join_by_hand(a, b, how="left"):
    join_index = a.index.join(b.index, how=how)

    a_re = a.reindex(join_index)
    b_re = b.reindex(join_index)

    result_columns = a.columns.append(b.columns)

    for col, s in b_re.items():
        a_re[col] = s
    return a_re.reindex(columns=result_columns)


def test_join_inner_multiindex_deterministic_order():
    # GH: 36910
    left = DataFrame(
        data={"e": 5},
        index=MultiIndex.from_tuples([(1, 2, 4)], names=("a", "b", "d")),
    )
    right = DataFrame(
        data={"f": 6}, index=MultiIndex.from_tuples([(2, 3)], names=("b", "c"))
    )
    result = left.join(right, how="inner")
    expected = DataFrame(
        {"e": [5], "f": [6]},
        index=MultiIndex.from_tuples([(1, 2, 4, 3)], names=("a", "b", "d", "c")),
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
)
def test_join_cross(input_col, output_cols):
    # GH#5401
    left = DataFrame({"a": [1, 3]})
    right = DataFrame({input_col: [3, 4]})
    result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y")
    expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
    tm.assert_frame_equal(result, expected)


def test_join_multiindex_one_level(join_type):
    # GH#36909
    left = DataFrame(
        data={"c": 3}, index=MultiIndex.from_tuples([(1, 2)], names=("a", "b"))
    )
    right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
    result = left.join(right, how=join_type)
    if join_type == "right":
        expected = DataFrame(
            {"c": [3], "d": [4]},
            index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
        )
    else:
        expected = DataFrame(
            {"c": [3], "d": [4]},
            index=MultiIndex.from_tuples([(1, 2)], names=["a", "b"]),
        )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "categories, values",
    [
        (["Y", "X"], ["Y", "X", "X"]),
        ([2, 1], [2, 1, 1]),
        ([2.5, 1.5], [2.5, 1.5, 1.5]),
        (
            [Timestamp("2020-12-31"), Timestamp("2019-12-31")],
            [Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")],
        ),
    ],
)
def test_join_multiindex_not_alphabetical_categorical(categories, values):
    # GH#38502
    left = DataFrame(
        {
            "first": ["A", "A"],
            "second": Categorical(categories, categories=categories),
            "value": [1, 2],
        }
    ).set_index(["first", "second"])
    right = DataFrame(
        {
            "first": ["A", "A", "B"],
            "second": Categorical(values, categories=categories),
            "value": [3, 4, 5],
        }
    ).set_index(["first", "second"])
    result = left.join(right, lsuffix="_left", rsuffix="_right")

    expected = DataFrame(
        {
            "first": ["A", "A"],
            "second": Categorical(categories, categories=categories),
            "value_left": [1, 2],
            "value_right": [3, 4],
        }
    ).set_index(["first", "second"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "left_empty, how, exp",
    [
        (False, "left", "left"),
        (False, "right", "empty"),
        (False, "inner", "empty"),
        (False, "outer", "left"),
        (False, "cross", "empty"),
        (True, "left", "empty"),
        (True, "right", "right"),
        (True, "inner", "empty"),
        (True, "outer", "right"),
        (True, "cross", "empty"),
    ],
)
def test_join_empty(left_empty, how, exp):
    left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
    right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")

    if left_empty:
        left = left.head(0)
    else:
        right = right.head(0)

    result = left.join(right, how=how)

    if exp == "left":
        expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
        expected = expected.set_index("A")
    elif exp == "right":
        expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
        expected = expected.set_index("A")
    elif exp == "empty":
        expected = DataFrame(columns=["B", "C"], dtype="int64")
        if how != "cross":
            expected = expected.rename_axis("A")
    if how == "outer":
        expected = expected.sort_index()

    tm.assert_frame_equal(result, expected)


def test_join_empty_uncomparable_columns():
    # GH 57048
    df1 = DataFrame()
    df2 = DataFrame(columns=["test"])
    df3 = DataFrame(columns=["foo", ("bar", "baz")])

    result = df1 + df2
    expected = DataFrame(columns=["test"])
    tm.assert_frame_equal(result, expected)

    result = df2 + df3
    expected = DataFrame(columns=[("bar", "baz"), "foo", "test"])
    tm.assert_frame_equal(result, expected)

    result = df1 + df3
    expected = DataFrame(columns=[("bar", "baz"), "foo"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "how, values",
    [
        ("inner", [0, 1, 2]),
        ("outer", [0, 1, 2]),
        ("left", [0, 1, 2]),
        ("right", [0, 2, 1]),
    ],
)
def test_join_multiindex_categorical_output_index_dtype(how, values):
    # GH#50906
    df1 = DataFrame(
        {
            "a": Categorical([0, 1, 2]),
            "b": Categorical([0, 1, 2]),
            "c": [0, 1, 2],
        }
    ).set_index(["a", "b"])

    df2 = DataFrame(
        {
            "a": Categorical([0, 2, 1]),
            "b": Categorical([0, 2, 1]),
            "d": [0, 2, 1],
        }
    ).set_index(["a", "b"])

    expected = DataFrame(
        {
            "a": Categorical(values),
            "b": Categorical(values),
            "c": values,
            "d": values,
        }
    ).set_index(["a", "b"])

    result = df1.join(df2, how=how)
    tm.assert_frame_equal(result, expected)