272 lines
8.3 KiB
Python
272 lines
8.3 KiB
Python
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas import (
|
||
|
DataFrame,
|
||
|
Index,
|
||
|
Interval,
|
||
|
MultiIndex,
|
||
|
Series,
|
||
|
StringDtype,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])]
)
def test_join_level(idx, other, join_type):
    """Join a flat Index with a MultiIndex on the level named "second".

    ``idx`` and ``join_type`` are not defined in this file; they are expected
    to be pytest fixtures supplied elsewhere (presumably a conftest).

    The joined result must keep the first level of ``idx`` unchanged, use the
    plain join of ``other`` with ``idx.levels[1]`` as its second level, and
    contain exactly the entries of ``idx`` whose second element is in that
    joined level.
    """
    join_index, lidx, ridx = other.join(
        idx, how=join_type, level="second", return_indexers=True
    )

    exp_level = other.join(idx.levels[1], how=join_type)
    assert join_index.levels[0].equals(idx.levels[0])
    assert join_index.levels[1].equals(exp_level)

    # pare down levels
    mask = np.array([x[1] in exp_level for x in idx], dtype=bool)
    exp_values = idx.values[mask]
    tm.assert_numpy_array_equal(join_index.values, exp_values)

    if join_type in ("outer", "inner"):
        # Repeat the join with caller and argument swapped. The indexers are
        # unpacked in reverse order so each one lines up with its counterpart
        # from the first join.
        join_index2, ridx2, lidx2 = idx.join(
            other, how=join_type, level="second", return_indexers=True
        )

        assert join_index.equals(join_index2)
        tm.assert_numpy_array_equal(lidx, lidx2)
        tm.assert_numpy_array_equal(ridx, ridx2)
        tm.assert_numpy_array_equal(join_index2.values, exp_values)
|
||
|
|
||
|
|
||
|
def test_join_level_corner_case(idx):
    """Check two corner cases of level-based joins involving a MultiIndex.

    ``idx`` is not defined in this file; it is expected to be a pytest
    fixture supplied elsewhere (presumably a conftest).
    """
    # some corner cases
    index = Index(["three", "one", "two"])
    result = index.join(idx, level="second")
    assert isinstance(result, MultiIndex)

    # Passing ``level`` when both sides are the MultiIndex must raise.
    with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"):
        idx.join(idx, level=1)
|
||
|
|
||
|
|
||
|
def test_join_self(idx, join_type):
    """Joining an index with itself must return an equal index.

    ``idx`` and ``join_type`` are not defined in this file; they are expected
    to be pytest fixtures supplied elsewhere (presumably a conftest).
    """
    joined = idx.join(idx, how=join_type)
    tm.assert_index_equal(joined, idx)
|
||
|
|
||
|
|
||
|
def test_join_multi():
    """Join a 4x4 MultiIndex (levels "a", "b") with a flat Index named "b".

    Covers the inner join and the MultiIndex-preserving left/right joins,
    each in both call directions, checking the joined index and both
    indexers.
    """
    # GH 10665
    midx = MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"])
    idx = Index([1, 2, 5], name="b")

    # inner
    jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True)
    exp_idx = MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"])
    exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp)
    exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp)
    tm.assert_index_equal(jidx, exp_idx)
    tm.assert_numpy_array_equal(lidx, exp_lidx)
    tm.assert_numpy_array_equal(ridx, exp_ridx)
    # flip: the indexers are unpacked in swapped order so the same expected
    # arrays can be reused
    jidx, ridx, lidx = idx.join(midx, how="inner", return_indexers=True)
    tm.assert_index_equal(jidx, exp_idx)
    tm.assert_numpy_array_equal(lidx, exp_lidx)
    tm.assert_numpy_array_equal(ridx, exp_ridx)

    # keep MultiIndex
    jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True)
    exp_ridx = np.array(
        [-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp
    )
    tm.assert_index_equal(jidx, midx)
    assert lidx is None
    tm.assert_numpy_array_equal(ridx, exp_ridx)
    # flip
    jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True)
    tm.assert_index_equal(jidx, midx)
    assert lidx is None
    tm.assert_numpy_array_equal(ridx, exp_ridx)
|
||
|
|
||
|
|
||
|
def test_join_self_unique(idx, join_type):
    """Self-join of a unique index must match the original element-wise.

    ``idx`` and ``join_type`` are not defined in this file; they are expected
    to be pytest fixtures supplied elsewhere (presumably a conftest).
    Nothing is asserted when ``idx`` is not unique.
    """
    if idx.is_unique:
        joined = idx.join(idx, how=join_type)
        assert (idx == joined).all()
|
||
|
|
||
|
|
||
|
def test_join_multi_wrong_order():
    """Join two MultiIndexes whose level names are in opposite order.

    With the default ``how``, the result must equal the calling index, the
    left indexer must be ``None`` and every entry of the right indexer must
    be -1.
    """
    # GH 25760
    # GH 28956

    midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
    midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"])

    join_idx, lidx, ridx = midx1.join(midx2, return_indexers=True)

    exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp)

    tm.assert_index_equal(midx1, join_idx)
    assert lidx is None
    tm.assert_numpy_array_equal(ridx, exp_ridx)
|
||
|
|
||
|
|
||
|
def test_join_multi_return_indexers():
    """``return_indexers=False`` must yield only the joined index.

    A three-level MultiIndex is joined with a two-level one that shares the
    level names "a" and "b"; the result must equal the calling index.
    """
    # GH 34074

    midx1 = MultiIndex.from_product([[1, 2], [3, 4], [5, 6]], names=["a", "b", "c"])
    midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])

    result = midx1.join(midx2, return_indexers=False)
    tm.assert_index_equal(result, midx1)
|
||
|
|
||
|
|
||
|
def test_join_overlapping_interval_level():
    """Outer-join two MultiIndexes that have an Interval level.

    Both inputs hold the same six (num, interval) entries in different
    order. The intervals under ``num == 2`` overlap those under ``num == 1``.
    The expected result lists the entries in the order used by ``idx_1``.
    """
    # GH 44096
    idx_1 = MultiIndex.from_tuples(
        [
            (1, Interval(0.0, 1.0)),
            (1, Interval(1.0, 2.0)),
            (1, Interval(2.0, 5.0)),
            (2, Interval(0.0, 1.0)),
            (2, Interval(1.0, 3.0)),  # interval limit is here at 3.0, not at 2.0
            (2, Interval(3.0, 5.0)),
        ],
        names=["num", "interval"],
    )

    idx_2 = MultiIndex.from_tuples(
        [
            (1, Interval(2.0, 5.0)),
            (1, Interval(0.0, 1.0)),
            (1, Interval(1.0, 2.0)),
            (2, Interval(3.0, 5.0)),
            (2, Interval(0.0, 1.0)),
            (2, Interval(1.0, 3.0)),
        ],
        names=["num", "interval"],
    )

    expected = MultiIndex.from_tuples(
        [
            (1, Interval(0.0, 1.0)),
            (1, Interval(1.0, 2.0)),
            (1, Interval(2.0, 5.0)),
            (2, Interval(0.0, 1.0)),
            (2, Interval(1.0, 3.0)),
            (2, Interval(3.0, 5.0)),
        ],
        names=["num", "interval"],
    )
    result = idx_1.join(idx_2, how="outer")

    tm.assert_index_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_join_midx_ea():
    """Inner-join two MultiIndexes built from "Int64" Series.

    The inputs share only the level "a"; the expected result carries the
    levels "a", "b" and "c", all with "Int64" dtype.
    """
    # GH#49277
    midx = MultiIndex.from_arrays(
        [Series([1, 1, 3], dtype="Int64"), Series([1, 2, 3], dtype="Int64")],
        names=["a", "b"],
    )
    midx2 = MultiIndex.from_arrays(
        [Series([1], dtype="Int64"), Series([3], dtype="Int64")], names=["a", "c"]
    )
    result = midx.join(midx2, how="inner")
    expected = MultiIndex.from_arrays(
        [
            Series([1, 1], dtype="Int64"),
            Series([1, 2], dtype="Int64"),
            Series([3, 3], dtype="Int64"),
        ],
        names=["a", "b", "c"],
    )
    tm.assert_index_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_join_midx_string():
    """Inner-join two MultiIndexes built from ``StringDtype`` Series.

    The inputs share only the level "a"; the expected result carries the
    levels "a", "b" and "c", all with ``StringDtype``.
    """
    # GH#49277
    midx = MultiIndex.from_arrays(
        [
            Series(["a", "a", "c"], dtype=StringDtype()),
            Series(["a", "b", "c"], dtype=StringDtype()),
        ],
        names=["a", "b"],
    )
    midx2 = MultiIndex.from_arrays(
        [Series(["a"], dtype=StringDtype()), Series(["c"], dtype=StringDtype())],
        names=["a", "c"],
    )
    result = midx.join(midx2, how="inner")
    expected = MultiIndex.from_arrays(
        [
            Series(["a", "a"], dtype=StringDtype()),
            Series(["a", "b"], dtype=StringDtype()),
            Series(["c", "c"], dtype=StringDtype()),
        ],
        names=["a", "b", "c"],
    )
    tm.assert_index_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_join_multi_with_nan():
    """``DataFrame.join`` on MultiIndexes where the right side has a NaN key.

    The result must keep the rows of ``df1``. "col2" is NaN for the key
    ("A", 1.0) and 2.2 for the key ("A", 2.0).
    """
    # GH29252
    df1 = DataFrame(
        data={"col1": [1.1, 1.2]},
        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
    )
    df2 = DataFrame(
        data={"col2": [2.1, 2.2]},
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]),
    )
    result = df1.join(df2)
    expected = DataFrame(
        data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]},
        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
    )
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("val", [0, 5])
def test_join_dtypes(any_numeric_ea_dtype, val):
    """Outer-join MultiIndexes whose first level uses a numeric EA dtype.

    ``any_numeric_ea_dtype`` is not defined in this file; it is expected to
    be a pytest fixture supplied elsewhere (presumably a conftest).

    The right index contains the entry ``(val, 4)`` twice. The expected
    result is the sorted MultiIndex built from both ``(val, 4)`` entries
    plus ``(1, 3)`` and ``(2, 4)``.
    """
    # GH#49830
    midx = MultiIndex.from_arrays([Series([1, 2], dtype=any_numeric_ea_dtype), [3, 4]])
    midx2 = MultiIndex.from_arrays(
        [Series([1, val, val], dtype=any_numeric_ea_dtype), [3, 4, 4]]
    )
    result = midx.join(midx2, how="outer")
    expected = MultiIndex.from_arrays(
        [Series([val, val, 1, 2], dtype=any_numeric_ea_dtype), [4, 4, 3, 4]]
    ).sort_values()
    tm.assert_index_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_join_dtypes_all_nan(any_numeric_ea_dtype):
    """Outer-join MultiIndexes whose second level contains only NaN.

    ``any_numeric_ea_dtype`` is not defined in this file; it is expected to
    be a pytest fixture supplied elsewhere (presumably a conftest).

    The expected first level values are ``[0, 0, 1, 2]`` in the given EA
    dtype, with NaN throughout the second level.
    """
    # GH#49830
    midx = MultiIndex.from_arrays(
        [Series([1, 2], dtype=any_numeric_ea_dtype), [np.nan, np.nan]]
    )
    midx2 = MultiIndex.from_arrays(
        [Series([1, 0, 0], dtype=any_numeric_ea_dtype), [np.nan, np.nan, np.nan]]
    )
    result = midx.join(midx2, how="outer")
    expected = MultiIndex.from_arrays(
        [
            Series([0, 0, 1, 2], dtype=any_numeric_ea_dtype),
            [np.nan, np.nan, np.nan, np.nan],
        ]
    )
    tm.assert_index_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_join_index_levels():
|
||
|
# GH#53093
|
||
|
midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
|
||
|
midx2 = MultiIndex.from_tuples([("a", "2019-01-31")])
|
||
|
result = midx.join(midx2, how="outer")
|
||
|
expected = MultiIndex.from_tuples(
|
||
|
[("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")]
|
||
|
)
|
||
|
tm.assert_index_equal(result.levels[1], expected.levels[1])
|
||
|
tm.assert_index_equal(result, expected)
|