249 lines
8.0 KiB
Python
249 lines
8.0 KiB
Python
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import (
|
||
|
Series,
|
||
|
date_range,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
from pandas.core.arrays import PeriodArray
|
||
|
|
||
|
|
||
|
class TestSeriesIsIn:
    """Tests for ``Series.isin``, with matching ``Index``/``algorithms`` checks."""

    def test_isin(self):
        """Check plain string membership and the large mixed-type case (GH#16012)."""
        ser = Series(["A", "B", "C", "a", "B", "B", "A", "C"])

        result = ser.isin(["A", "C"])
        expected = Series([True, False, True, False, False, False, True, True])
        tm.assert_series_equal(result, expected)

        # GH#16012
        # This specific issue has to have a series over 1e6 in len, but the
        # comparison array (in_list) must be large enough so that numpy doesn't
        # do a manual masking trick that will avoid this issue altogether
        ser = Series(list("abcdefghijk" * 10**5))
        # If numpy doesn't do the manual comparison/mask, these
        # unorderable mixed types are what cause the exception in numpy
        in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6

        # Only the lowercase "a" and "b" occur in ``ser``, 10**5 times each.
        assert ser.isin(in_list).sum() == 200000

    def test_isin_with_string_scalar(self):
        """A bare string is not list-like, so ``isin`` must raise TypeError."""
        # GH#4763
        ser = Series(["A", "B", "C", "a", "B", "B", "A", "C"])
        msg = (
            r"only list-like objects are allowed to be passed to isin\(\), "
            r"you passed a \[str\]"
        )
        with pytest.raises(TypeError, match=msg):
            ser.isin("a")

        # A multi-character string that equals one element is rejected too.
        ser = Series(["aaa", "b", "c"])
        with pytest.raises(TypeError, match=msg):
            ser.isin("aaa")

    def test_isin_datetimelike_mismatched_reso(self):
        """Match datetime values given at day and second resolution."""
        expected = Series([True, True, False, False, False])

        ser = Series(date_range("jan-01-2013", "jan-05-2013"))

        # fails on dtype conversion in the first place
        day_values = np.asarray(ser[0:2].values).astype("datetime64[D]")
        result = ser.isin(day_values)
        tm.assert_series_equal(result, expected)

        dta = ser[:2]._values.astype("M8[s]")
        result = ser.isin(dta)
        tm.assert_series_equal(result, expected)

    def test_isin_datetimelike_mismatched_reso_list(self):
        """Same as the mismatched-resolution test, but passing a plain list."""
        expected = Series([True, True, False, False, False])

        ser = Series(date_range("jan-01-2013", "jan-05-2013"))

        dta = ser[:2]._values.astype("M8[s]")
        result = ser.isin(list(dta))
        tm.assert_series_equal(result, expected)

    def test_isin_with_i8(self):
        """Check datetime64/timedelta64 membership for several container types."""
        # GH#5021

        expected = Series([True, True, False, False, False])
        expected2 = Series([False, True, False, False, False])

        # datetime64[ns]
        ser = Series(date_range("jan-01-2013", "jan-05-2013"))

        result = ser.isin(ser[0:2])
        tm.assert_series_equal(result, expected)

        result = ser.isin(ser[0:2].values)
        tm.assert_series_equal(result, expected)

        result = ser.isin([ser[1]])
        tm.assert_series_equal(result, expected2)

        result = ser.isin([np.datetime64(ser[1])])
        tm.assert_series_equal(result, expected2)

        result = ser.isin(set(ser[0:2]))
        tm.assert_series_equal(result, expected)

        # timedelta64[ns]
        ser = Series(pd.to_timedelta(range(5), unit="d"))
        result = ser.isin(ser[0:2])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
    def test_isin_empty(self, empty):
        """An empty collection of candidates yields an all-False result."""
        # see GH#16991
        ser = Series(["a", "b"])
        expected = Series([False, False])

        result = ser.isin(empty)
        tm.assert_series_equal(result, expected)

    def test_isin_read_only(self):
        """``isin`` accepts a NumPy array whose write flag is cleared."""
        # https://github.com/pandas-dev/pandas/issues/37174
        arr = np.array([1, 2, 3])
        arr.setflags(write=False)
        ser = Series([1, 2, 3])
        result = ser.isin(arr)
        expected = Series([True, True, True])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [object, None])
    def test_isin_dt64_values_vs_ints(self, dtype):
        """Integer candidates must not match datetime64 values."""
        # GH#36621 dont cast integers to datetimes for isin
        dti = date_range("2013-01-01", "2013-01-05")
        ser = Series(dti)

        # 1356998400000000000 is the nanosecond epoch value of 2013-01-01, so a
        # cast of ints to datetimes would make it collide with the first element.
        comps = np.asarray([1356998400000000000], dtype=dtype)

        res = dti.isin(comps)
        expected = np.array([False] * len(dti), dtype=bool)
        tm.assert_numpy_array_equal(res, expected)

        res = ser.isin(comps)
        tm.assert_series_equal(res, Series(expected))

        res = pd.core.algorithms.isin(ser, comps)
        tm.assert_numpy_array_equal(res, expected)

    def test_isin_tzawareness_mismatch(self):
        """tz-naive values never match their UTC-localized counterparts."""
        dti = date_range("2013-01-01", "2013-01-05")
        ser = Series(dti)

        other = dti.tz_localize("UTC")

        res = dti.isin(other)
        expected = np.array([False] * len(dti), dtype=bool)
        tm.assert_numpy_array_equal(res, expected)

        res = ser.isin(other)
        tm.assert_series_equal(res, Series(expected))

        res = pd.core.algorithms.isin(ser, other)
        tm.assert_numpy_array_equal(res, expected)

    def test_isin_period_freq_mismatch(self):
        """Periods with equal i8 values but a different freq must not match."""
        dti = date_range("2013-01-01", "2013-01-05")
        pi = dti.to_period("M")
        ser = Series(pi)

        # We construct another PeriodIndex with the same i8 values
        # but different dtype
        dtype = dti.to_period("Y").dtype
        other = PeriodArray._simple_new(pi.asi8, dtype=dtype)

        res = pi.isin(other)
        expected = np.array([False] * len(pi), dtype=bool)
        tm.assert_numpy_array_equal(res, expected)

        res = ser.isin(other)
        tm.assert_series_equal(res, Series(expected))

        res = pd.core.algorithms.isin(ser, other)
        tm.assert_numpy_array_equal(res, expected)

    @pytest.mark.parametrize("values", [[-9.0, 0.0], [-9, 0]])
    def test_isin_float_in_int_series(self, values):
        """The non-integral candidate -0.5 must not match the element 0."""
        # GH#19356 GH#21804
        ser = Series(values)
        result = ser.isin([-9, -0.5])
        expected = Series([True, False])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
    @pytest.mark.parametrize(
        "data,values,expected",
        [
            ([0, 1, 0], [1], [False, True, False]),
            ([0, 1, 0], [1, pd.NA], [False, True, False]),
            ([0, pd.NA, 0], [1, 0], [True, False, True]),
            ([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
            ([0, 1, pd.NA], [1, np.nan], [False, True, False]),
            ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
        ],
    )
    def test_isin_masked_types(self, dtype, data, values, expected):
        """Masked dtypes return a "boolean" result; only ``pd.NA`` matches NA."""
        # GH#42405
        ser = Series(data, dtype=dtype)

        result = ser.isin(values)
        expected = Series(expected, dtype="boolean")

        tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
    """A 3,000,000-element numeric Series with NaN matches no string candidates."""
    # https://github.com/pandas-dev/pandas/issues/37094
    # combination of object dtype for the values and > 1_000_000 elements
    ser = Series([1, 2, np.nan] * 1_000_000)
    result = ser.isin({"foo", "bar"})
    # Size the all-False expectation from ``ser`` instead of repeating the
    # literal length; the resulting Series has bool dtype either way.
    expected = Series(np.zeros(len(ser), dtype=bool))
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "array,expected",
    [
        (
            [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j],
            Series([False, True, True, False, True, True, True], dtype=bool),
        )
    ],
)
def test_isin_complex_numbers(array, expected):
    """Complex values are matched against the candidates 1j, 1+1j and 1+2j."""
    # GH 17927
    result = Series(array).isin([1j, 1 + 1j, 1 + 2j])
    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "data,is_in",
    [([1, [2]], [1]), (["simple str", [{"values": 3}]], ["simple str"])],
)
def test_isin_filtering_with_mixed_object_types(data, is_in):
    """A Series holding a nested list still matches its scalar element."""
    # GH 20883

    ser = Series(data)
    result = ser.isin(is_in)
    # In both parametrized cases only the first (scalar) element is a candidate.
    expected = Series([True, False])

    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, 2.0, 3.0]])
@pytest.mark.parametrize("isin", [[1, 2], [1.0, 2.0]])
def test_isin_filtering_on_iterable(data, isin):
    """``isin`` accepts a one-shot generator as the collection of candidates."""
    # GH 50234

    ser = Series(data)
    # The generator expression is deliberate: the candidates must be passed as
    # a lazy iterable rather than as a list.
    result = ser.isin(i for i in isin)
    expected = Series([True, True, False])

    tm.assert_series_equal(result, expected)
|