from datetime import datetime, timedelta import re import numpy as np from numpy.random import randint import pytest from pandas._libs import lib import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna import pandas._testing as tm import pandas.core.strings as strings def assert_series_or_index_equal(left, right): if isinstance(left, Series): tm.assert_series_equal(left, right) else: # Index tm.assert_index_equal(left, right) _any_string_method = [ ("cat", (), {"sep": ","}), ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), ("center", (10,), {}), ("contains", ("a",), {}), ("count", ("a",), {}), ("decode", ("UTF-8",), {}), ("encode", ("UTF-8",), {}), ("endswith", ("a",), {}), ("extract", ("([a-z]*)",), {"expand": False}), ("extract", ("([a-z]*)",), {"expand": True}), ("extractall", ("([a-z]*)",), {}), ("find", ("a",), {}), ("findall", ("a",), {}), ("get", (0,), {}), # because "index" (and "rindex") fail intentionally # if the string is not found, search only for empty string ("index", ("",), {}), ("join", (",",), {}), ("ljust", (10,), {}), ("match", ("a",), {}), ("normalize", ("NFC",), {}), ("pad", (10,), {}), ("partition", (" ",), {"expand": False}), ("partition", (" ",), {"expand": True}), ("repeat", (3,), {}), ("replace", ("a", "z"), {}), ("rfind", ("a",), {}), ("rindex", ("",), {}), ("rjust", (10,), {}), ("rpartition", (" ",), {"expand": False}), ("rpartition", (" ",), {"expand": True}), ("slice", (0, 1), {}), ("slice_replace", (0, 1, "z"), {}), ("split", (" ",), {"expand": False}), ("split", (" ",), {"expand": True}), ("startswith", ("a",), {}), # translating unicode points of "a" to "d" ("translate", ({97: 100},), {}), ("wrap", (2,), {}), ("zfill", (10,), {}), ] + list( zip( [ # methods without positional arguments: zip with empty tuple and empty dict "capitalize", "cat", "get_dummies", "isalnum", "isalpha", "isdecimal", "isdigit", "islower", "isnumeric", "isspace", "istitle", "isupper", "len", "lower", "lstrip", "partition", "rpartition", "rsplit", "rstrip", "slice", "slice_replace", "split", "strip", "swapcase", "title", "upper", "casefold", ], [()] * 100, [{}] * 100, ) ) ids, _, _ = zip(*_any_string_method) # use method name as fixture-id # test that the above list captures all methods of StringMethods missing_methods = { f for f in dir(strings.StringMethods) if not f.startswith("_") } - set(ids) assert not missing_methods @pytest.fixture(params=_any_string_method, ids=ids) def any_string_method(request): """ Fixture for all public methods of `StringMethods` This fixture returns a tuple of the method name and sample arguments necessary to call the method. Returns ------- method_name : str The name of the method in `StringMethods` args : tuple Sample values for the positional arguments kwargs : dict Sample values for the keyword arguments Examples -------- >>> def test_something(any_string_method): ... s = pd.Series(['a', 'b', np.nan, 'd']) ... ... method_name, args, kwargs = any_string_method ... method = getattr(s.str, method_name) ... # will not raise ... method(*args, **kwargs) """ return request.param # subset of the full set from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), ("bytes", [b"a", np.nan, b"c"]), ("empty", [np.nan, np.nan, np.nan]), ("empty", []), ("mixed-integer", ["a", np.nan, 2]), ] ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id @pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) def any_allowed_skipna_inferred_dtype(request): """ Fixture for all (inferred) dtypes allowed in StringMethods.__init__ The covered (inferred) types are: * 'string' * 'empty' * 'bytes' * 'mixed' * 'mixed-integer' Returns ------- inferred_dtype : str The string for the inferred dtype from _libs.lib.infer_dtype values : np.ndarray An array of object dtype that will be inferred to have `inferred_dtype` Examples -------- >>> import pandas._libs.lib as lib >>> >>> def test_something(any_allowed_skipna_inferred_dtype): ... inferred_dtype, values = any_allowed_skipna_inferred_dtype ... # will pass ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype ... ... # constructor for .str-accessor will also pass ... pd.Series(values).str """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting # correctness of inference tested in tests/dtypes/test_inference.py return inferred_dtype, values class TestStringMethods: def test_api(self): # GH 6106, GH 9322 assert Series.str is strings.StringMethods assert isinstance(Series([""]).str, strings.StringMethods) def test_api_mi_raises(self): # GH 23679 mi = MultiIndex.from_arrays([["a", "b", "c"]]) msg = "Can only use .str accessor with Index, not MultiIndex" with pytest.raises(AttributeError, match=msg): mi.str assert not hasattr(mi, "str") @pytest.mark.parametrize("dtype", [object, "category"]) def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype): # one instance of parametrized fixture box = index_or_series inferred_dtype, values = any_skipna_inferred_dtype if dtype == "category" and len(values) and values[1] is pd.NA: pytest.xfail(reason="Categorical does not yet support pd.NA") t = box(values, dtype=dtype) # explicit dtype to avoid casting # TODO: get rid of these xfails if dtype == "category" and inferred_dtype in ["period", "interval"]: pytest.xfail( reason="Conversion to numpy array fails because " "the ._values-attribute is not a numpy array for " "PeriodArray/IntervalArray; see GH 23553" ) types_passing_constructor = [ "string", "unicode", "empty", "bytes", "mixed", "mixed-integer", ] if inferred_dtype in types_passing_constructor: # GH 6106 assert isinstance(t.str, strings.StringMethods) else: # GH 9184, GH 23011, GH 23163 msg = "Can only use .str accessor with string values.*" with pytest.raises(AttributeError, match=msg): t.str assert not hasattr(t, "str") @pytest.mark.parametrize("dtype", [object, "category"]) def test_api_per_method( self, index_or_series, dtype, any_allowed_skipna_inferred_dtype, any_string_method, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others box = index_or_series # one instance of each parametrized fixture inferred_dtype, values = any_allowed_skipna_inferred_dtype method_name, args, kwargs = any_string_method # TODO: get rid of these xfails if ( method_name in ["partition", "rpartition"] and box == Index and inferred_dtype == "empty" ): pytest.xfail(reason="Method cannot deal with empty Index") if ( method_name == "split" and box == Index and values.size == 0 and kwargs.get("expand", None) is not None ): pytest.xfail(reason="Split fails on empty Series when expand=True") if ( method_name == "get_dummies" and box == Index and inferred_dtype == "empty" and (dtype == object or values.size == 0) ): pytest.xfail(reason="Need to fortify get_dummies corner cases") t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. # This could be changed with an 'errors'-kwarg to the `str`-accessor, # see discussion in GH 13877 mixed_allowed = method_name not in ["cat"] allowed_types = ( ["string", "unicode", "empty"] + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) if inferred_dtype in allowed_types: # xref GH 23555, GH 23556 method(*args, **kwargs) # works! else: # GH 23011, GH 23163 msg = ( f"Cannot use .str.{method_name} with values of " f"inferred dtype {repr(inferred_dtype)}." ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) def test_api_for_categorical(self, any_string_method): # https://github.com/pandas-dev/pandas/issues/10661 s = Series(list("aabb")) s = s + " " + s c = s.astype("category") assert isinstance(c.str, strings.StringMethods) method_name, args, kwargs = any_string_method result = getattr(c.str, method_name)(*args, **kwargs) expected = getattr(s.str, method_name)(*args, **kwargs) if isinstance(result, DataFrame): tm.assert_frame_equal(result, expected) elif isinstance(result, Series): tm.assert_series_equal(result, expected) else: # str.cat(others=None) returns string, for example assert result == expected def test_iter(self): # GH3638 strs = "google", "wikimedia", "wikipedia", "wikitravel" ds = Series(strs) with tm.assert_produces_warning(FutureWarning): for s in ds.str: # iter must yield a Series assert isinstance(s, Series) # indices of each yielded Series should be equal to the index of # the original Series tm.assert_index_equal(s.index, ds.index) for el in s: # each element of the series is either a basestring/str or nan assert isinstance(el, str) or isna(el) # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in # this case since 'wikitravel' is the longest string assert s.dropna().values.item() == "l" def test_iter_empty(self): ds = Series([], dtype=object) i, s = 100, 1 with tm.assert_produces_warning(FutureWarning): for i, s in enumerate(ds.str): pass # nothing to iterate over so nothing defined values should remain # unchanged assert i == 100 assert s == 1 def test_iter_single_element(self): ds = Series(["a"]) with tm.assert_produces_warning(FutureWarning): for i, s in enumerate(ds.str): pass assert not i tm.assert_series_equal(ds, s) def test_iter_object_try_string(self): ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(4)]) i, s = 100, "h" with tm.assert_produces_warning(FutureWarning): for i, s in enumerate(ds.str): pass assert i == 100 assert s == "h" @pytest.mark.parametrize("other", [None, Series, Index]) def test_str_cat_name(self, index_or_series, other): # GH 21053 box = index_or_series values = ["a", "b"] if other: other = other(values) else: other = values result = box(values, name="name").str.cat(other, sep=",") assert result.name == "name" def test_str_cat(self, index_or_series): box = index_or_series # test_cat above tests "str_cat" from ndarray; # here testing "str.cat" from Series/Indext to ndarray/list s = box(["a", "a", "b", "b", "c", np.nan]) # single array result = s.str.cat() expected = "aabbc" assert result == expected result = s.str.cat(na_rep="-") expected = "aabbc-" assert result == expected result = s.str.cat(sep="_", na_rep="NA") expected = "a_a_b_b_c_NA" assert result == expected t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) # Series/Index with array result = s.str.cat(t, na_rep="-") assert_series_or_index_equal(result, expected) # Series/Index with list result = s.str.cat(list(t), na_rep="-") assert_series_or_index_equal(result, expected) # errors for incorrect lengths rgx = r"If `others` contains arrays or lists \(or other list-likes.*" z = Series(["1", "2", "3"]) with pytest.raises(ValueError, match=rgx): s.str.cat(z.values) with pytest.raises(ValueError, match=rgx): s.str.cat(list(z)) def test_str_cat_raises_intuitive_error(self, index_or_series): # GH 11334 box = index_or_series s = box(["a", "b", "c", "d"]) message = "Did you mean to supply a `sep` keyword?" with pytest.raises(ValueError, match=message): s.str.cat("|") with pytest.raises(ValueError, match=message): s.str.cat(" ") @pytest.mark.parametrize("sep", ["", None]) @pytest.mark.parametrize("dtype_target", ["object", "category"]) @pytest.mark.parametrize("dtype_caller", ["object", "category"]) def test_str_cat_categorical( self, index_or_series, dtype_caller, dtype_target, sep ): box = index_or_series s = Index(["a", "a", "b", "a"], dtype=dtype_caller) s = s if box == Index else Series(s, index=s) t = Index(["b", "a", "b", "c"], dtype=dtype_target) expected = Index(["ab", "aa", "bb", "ac"]) expected = expected if box == Index else Series(expected, index=s) # Series/Index with unaligned Index -> t.values result = s.str.cat(t.values, sep=sep) assert_series_or_index_equal(result, expected) # Series/Index with Series having matching Index t = Series(t.values, index=s) result = s.str.cat(t, sep=sep) assert_series_or_index_equal(result, expected) # Series/Index with Series.values result = s.str.cat(t.values, sep=sep) assert_series_or_index_equal(result, expected) # Series/Index with Series having different Index t = Series(t.values, index=t.values) expected = Index(["aa", "aa", "aa", "bb", "bb"]) expected = ( expected if box == Index else Series(expected, index=expected.str[:1]) ) result = s.str.cat(t, sep=sep) assert_series_or_index_equal(result, expected) # test integer/float dtypes (inferred by constructor) and mixed @pytest.mark.parametrize( "data", [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]], ids=["integers", "floats", "mixed"], ) # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b'] @pytest.mark.parametrize( "box", [Series, Index, list, lambda x: np.array(x, dtype=object)], ids=["Series", "Index", "list", "np.array"], ) def test_str_cat_wrong_dtype_raises(self, box, data): # GH 22722 s = Series(["a", "b", "c"]) t = box(data) msg = "Concatenation requires list-likes containing only strings.*" with pytest.raises(TypeError, match=msg): # need to use outer and na_rep, as otherwise Index would not raise s.str.cat(t, join="outer", na_rep="-") def test_str_cat_mixed_inputs(self, index_or_series): box = index_or_series s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) t = Series(["A", "B", "C", "D"], index=s.values) d = concat([t, Series(s, index=s)], axis=1) expected = Index(["aAa", "bBb", "cCc", "dDd"]) expected = expected if box == Index else Series(expected.values, index=s.values) # Series/Index with DataFrame result = s.str.cat(d) assert_series_or_index_equal(result, expected) # Series/Index with two-dimensional ndarray result = s.str.cat(d.values) assert_series_or_index_equal(result, expected) # Series/Index with list of Series result = s.str.cat([t, s]) assert_series_or_index_equal(result, expected) # Series/Index with mixed list of Series/array result = s.str.cat([t, s.values]) assert_series_or_index_equal(result, expected) # Series/Index with list of Series; different indexes t.index = ["b", "c", "d", "a"] expected = box(["aDa", "bAb", "cBc", "dCd"]) expected = expected if box == Index else Series(expected.values, index=s.values) result = s.str.cat([t, s]) assert_series_or_index_equal(result, expected) # Series/Index with mixed list; different index result = s.str.cat([t, s.values]) assert_series_or_index_equal(result, expected) # Series/Index with DataFrame; different indexes d.index = ["b", "c", "d", "a"] expected = box(["aDd", "bAa", "cBb", "dCc"]) expected = expected if box == Index else Series(expected.values, index=s.values) result = s.str.cat(d) assert_series_or_index_equal(result, expected) # errors for incorrect lengths rgx = r"If `others` contains arrays or lists \(or other list-likes.*" z = Series(["1", "2", "3"]) e = concat([z, z], axis=1) # two-dimensional ndarray with pytest.raises(ValueError, match=rgx): s.str.cat(e.values) # list of list-likes with pytest.raises(ValueError, match=rgx): s.str.cat([z.values, s.values]) # mixed list of Series/list-like with pytest.raises(ValueError, match=rgx): s.str.cat([z.values, s]) # errors for incorrect arguments in list-like rgx = "others must be Series, Index, DataFrame,.*" # make sure None/NaN do not crash checks in _get_series_list u = Series(["a", np.nan, "c", None]) # mix of string and Series with pytest.raises(TypeError, match=rgx): s.str.cat([u, "u"]) # DataFrame in list with pytest.raises(TypeError, match=rgx): s.str.cat([u, d]) # 2-dim ndarray in list with pytest.raises(TypeError, match=rgx): s.str.cat([u, d.values]) # nested lists with pytest.raises(TypeError, match=rgx): s.str.cat([u, [u, d]]) # forbidden input type: set # GH 23009 with pytest.raises(TypeError, match=rgx): s.str.cat(set(u)) # forbidden input type: set in list # GH 23009 with pytest.raises(TypeError, match=rgx): s.str.cat([u, set(u)]) # other forbidden input type, e.g. int with pytest.raises(TypeError, match=rgx): s.str.cat(1) # nested list-likes with pytest.raises(TypeError, match=rgx): s.str.cat(iter([t.values, list(s)])) @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) def test_str_cat_align_indexed(self, index_or_series, join): # https://github.com/pandas-dev/pandas/issues/18657 box = index_or_series s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) sa, ta = s.align(t, join=join) # result after manual alignment of inputs expected = sa.str.cat(ta, na_rep="-") if box == Index: s = Index(s) sa = Index(sa) expected = Index(expected) result = s.str.cat(t, join=join, na_rep="-") assert_series_or_index_equal(result, expected) @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) def test_str_cat_align_mixed_inputs(self, join): s = Series(["a", "b", "c", "d"]) t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) d = concat([t, t], axis=1) expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"]) expected = expected_outer.loc[s.index.join(t.index, how=join)] # list of Series result = s.str.cat([t, t], join=join, na_rep="-") tm.assert_series_equal(result, expected) # DataFrame result = s.str.cat(d, join=join, na_rep="-") tm.assert_series_equal(result, expected) # mixed list of indexed/unindexed u = np.array(["A", "B", "C", "D"]) expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) # joint index of rhs [t, u]; u will be forced have index of s rhs_idx = t.index & s.index if join == "inner" else t.index | s.index expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] result = s.str.cat([t, u], join=join, na_rep="-") tm.assert_series_equal(result, expected) with pytest.raises(TypeError, match="others must be Series,.*"): # nested lists are forbidden s.str.cat([t, list(u)], join=join) # errors for incorrect lengths rgx = r"If `others` contains arrays or lists \(or other list-likes.*" z = Series(["1", "2", "3"]).values # unindexed object of wrong length with pytest.raises(ValueError, match=rgx): s.str.cat(z, join=join) # unindexed object of wrong length in list with pytest.raises(ValueError, match=rgx): s.str.cat([t, z], join=join) index_or_series2 = [Series, Index] # type: ignore # List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" # See GH#29725 @pytest.mark.parametrize("other", index_or_series2) def test_str_cat_all_na(self, index_or_series, other): # GH 24044 box = index_or_series # check that all NaNs in caller / target work s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) t = other([np.nan] * 4, dtype=object) # add index of s for alignment t = t if other == Index else Series(t, index=s) # all-NA target if box == Series: expected = Series([np.nan] * 4, index=s.index, dtype=object) else: # box == Index expected = Index([np.nan] * 4, dtype=object) result = s.str.cat(t, join="left") assert_series_or_index_equal(result, expected) # all-NA caller (only for Series) if other == Series: expected = Series([np.nan] * 4, dtype=object, index=t.index) result = t.str.cat(s, join="left") tm.assert_series_equal(result, expected) def test_str_cat_special_cases(self): s = Series(["a", "b", "c", "d"]) t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) # iterator of elements with different types expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"]) result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-") tm.assert_series_equal(result, expected) # right-align with different indexes in others expected = Series(["aa-", "d-d"], index=[0, 3]) result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-") tm.assert_series_equal(result, expected) def test_cat_on_filtered_index(self): df = DataFrame( index=MultiIndex.from_product( [[2011, 2012], [1, 2, 3]], names=["year", "month"] ) ) df = df.reset_index() df = df[df.month > 1] str_year = df.year.astype("str") str_month = df.month.astype("str") str_both = str_year.str.cat(str_month, sep=" ") assert str_both.loc[1] == "2011 2" str_multiple = str_year.str.cat([str_month, str_month], sep=" ") assert str_multiple.loc[1] == "2011 2 2" def test_count(self): values = np.array( ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_ ) result = strings.str_count(values, "f[o]+") exp = np.array([1, 2, np.nan, 4]) tm.assert_numpy_array_equal(result, exp) result = Series(values).str.count("f[o]+") exp = Series([1, 2, np.nan, 4]) assert isinstance(result, Series) tm.assert_series_equal(result, exp) # mixed mixed = np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) rs = strings.str_count(mixed, "a") xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.count("a") xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_contains(self): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) pat = "mmm[_]+" result = strings.str_contains(values, pat) expected = np.array([False, np.nan, True, True, False], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, regex=False) expected = np.array([False, np.nan, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) # case insensitive without regex result = strings.str_contains(values, "foo", regex=False, case=False) expected = np.array([True, False, True, False]) tm.assert_numpy_array_equal(result, expected) # mixed mixed = np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) rs = strings.str_contains(mixed, "o") xp = np.array( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], dtype=np.object_, ) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.contains("o") xp = Series( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] ) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode values = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) pat = "mmm[_]+" result = strings.str_contains(values, pat) expected = np.array([False, np.nan, True, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, na=False) expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(result, expected) values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) def test_contains_for_object_category(self): # gh 22158 # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) def test_startswith(self): values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) result = values.str.startswith("foo") exp = Series([False, np.nan, True, False, False, np.nan, True]) tm.assert_series_equal(result, exp) result = values.str.startswith("foo", na=True) tm.assert_series_equal(result, exp.fillna(True).astype(bool)) # mixed mixed = np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=np.object_, ) rs = strings.str_startswith(mixed, "f") xp = np.array( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], dtype=np.object_, ) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.startswith("f") assert isinstance(rs, Series) xp = Series( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] ) tm.assert_series_equal(rs, xp) def test_endswith(self): values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) result = values.str.endswith("foo") exp = Series([False, np.nan, False, False, True, np.nan, True]) tm.assert_series_equal(result, exp) result = values.str.endswith("foo", na=False) tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed mixed = np.array( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) rs = strings.str_endswith(mixed, "f") xp = np.array( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], dtype=np.object_, ) tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.endswith("f") xp = Series( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan] ) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_title(self): values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) result = values.str.title() exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) tm.assert_series_equal(result, exp) # mixed mixed = Series( ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] ) mixed = mixed.str.title() exp = Series( ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] ) tm.assert_almost_equal(mixed, exp) def test_lower_upper(self): values = Series(["om", np.nan, "nom", "nom"]) result = values.str.upper() exp = Series(["OM", np.nan, "NOM", "NOM"]) tm.assert_series_equal(result, exp) result = result.str.lower() tm.assert_series_equal(result, values) # mixed mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) mixed = mixed.str.upper() rs = Series(mixed).str.lower() xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_capitalize(self): values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) result = values.str.capitalize() exp = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"]) tm.assert_series_equal(result, exp) # mixed mixed = Series( ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0] ) mixed = mixed.str.capitalize() exp = Series( ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", np.nan, np.nan, np.nan] ) tm.assert_almost_equal(mixed, exp) def test_swapcase(self): values = Series(["FOO", "BAR", np.nan, "Blah", "blurg"]) result = values.str.swapcase() exp = Series(["foo", "bar", np.nan, "bLAH", "BLURG"]) tm.assert_series_equal(result, exp) # mixed mixed = Series( ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0] ) mixed = mixed.str.swapcase() exp = Series( ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", np.nan, np.nan, np.nan] ) tm.assert_almost_equal(mixed, exp) def test_casemethods(self): values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"] s = Series(values) assert s.str.lower().tolist() == [v.lower() for v in values] assert s.str.upper().tolist() == [v.upper() for v in values] assert s.str.title().tolist() == [v.title() for v in values] assert s.str.capitalize().tolist() == [v.capitalize() for v in values] assert s.str.swapcase().tolist() == [v.swapcase() for v in values] def test_replace(self): values = Series(["fooBAD__barBAD", np.nan]) result = values.str.replace("BAD[_]*", "") exp = Series(["foobar", np.nan]) tm.assert_series_equal(result, exp) result = values.str.replace("BAD[_]*", "", n=1) exp = Series(["foobarBAD", np.nan]) tm.assert_series_equal(result, exp) # mixed mixed = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) rs = Series(mixed).str.replace("BAD[_]*", "") xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE) tm.assert_series_equal(result, exp) # GH 13438 msg = "repl must be a string or callable" for klass in (Series, Index): for repl in (None, 3, {"a": "b"}): for data in (["a", "b", None], ["a", "b", "c", "ad"]): values = klass(data) with pytest.raises(TypeError, match=msg): values.str.replace("a", repl) def test_replace_callable(self): # GH 15055 values = Series(["fooBAD__barBAD", np.nan]) # test with callable repl = lambda m: m.group(0).swapcase() result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) exp = Series(["foObaD__baRbaD", np.nan]) tm.assert_series_equal(result, exp) # test with wrong number of arguments, raising an error p_err = ( r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" ) repl = lambda: None with pytest.raises(TypeError, match=p_err): values.str.replace("a", repl) repl = lambda m, x: None with pytest.raises(TypeError, match=p_err): values.str.replace("a", repl) repl = lambda m, x, y=None: None with pytest.raises(TypeError, match=p_err): values.str.replace("a", repl) # test regex named groups values = Series(["Foo Bar Baz", np.nan]) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl) exp = Series(["bAR", np.nan]) tm.assert_series_equal(result, exp) def test_replace_compiled_regex(self): # GH 15446 values = Series(["fooBAD__barBAD", np.nan]) # test with compiled regex pat = re.compile(r"BAD[_]*") result = values.str.replace(pat, "") exp = Series(["foobar", np.nan]) tm.assert_series_equal(result, exp) result = values.str.replace(pat, "", n=1) exp = Series(["foobarBAD", np.nan]) tm.assert_series_equal(result, exp) # mixed mixed = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) rs = Series(mixed).str.replace(pat, "") xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # flags + unicode values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) result = values.str.replace(pat, ", ") tm.assert_series_equal(result, exp) # case and flags provided to str.replace will have no effect # and will produce warnings values = Series(["fooBAD__barBAD__bad", np.nan]) pat = re.compile(r"BAD[_]*") with pytest.raises(ValueError, match="case and flags cannot be"): result = values.str.replace(pat, "", flags=re.IGNORECASE) with pytest.raises(ValueError, match="case and flags cannot be"): result = values.str.replace(pat, "", case=False) with pytest.raises(ValueError, match="case and flags cannot be"): result = values.str.replace(pat, "", case=True) # test with callable values = Series(["fooBAD__barBAD", np.nan]) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") result = values.str.replace(pat, repl, n=2) exp = Series(["foObaD__baRbaD", np.nan]) tm.assert_series_equal(result, exp) def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) values = Series(["f.o", "foo", np.nan]) exp = Series(["bao", "bao", np.nan]) result = values.str.replace("f.", "ba") tm.assert_series_equal(result, exp) exp = Series(["bao", "foo", np.nan]) result = values.str.replace("f.", "ba", regex=False) tm.assert_series_equal(result, exp) # Cannot do a literal replace if given a callable repl or compiled # pattern callable_repl = lambda m: m.group(0).swapcase() compiled_pat = re.compile("[a-z][A-Z]{2}") msg = "Cannot use a callable replacement when regex=False" with pytest.raises(ValueError, match=msg): values.str.replace("abc", callable_repl, regex=False) msg = "Cannot use a compiled regex as replacement pattern with regex=False" with pytest.raises(ValueError, match=msg): values.str.replace(compiled_pat, "", regex=False) def test_repeat(self): values = Series(["a", "b", np.nan, "c", np.nan, "d"]) result = values.str.repeat(3) exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"]) tm.assert_series_equal(result, exp) result = values.str.repeat([1, 2, 3, 4, 5, 6]) exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"]) tm.assert_series_equal(result, exp) # mixed mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) rs = Series(mixed).str.repeat(3) xp = Series( ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] ) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_repeat_with_null(self): # GH: 31632 values = Series(["a", None], dtype="string") result = values.str.repeat([3, 4]) exp = Series(["aaa", None], dtype="string") tm.assert_series_equal(result, exp) values = Series(["a", "b"], dtype="string") result = values.str.repeat([3, None]) exp = Series(["aaa", None], dtype="string") tm.assert_series_equal(result, exp) def test_match(self): # New match behavior introduced in 0.13 values = Series(["fooBAD__barBAD", np.nan, "foo"]) result = values.str.match(".*(BAD[_]+).*(BAD)") exp = Series([True, np.nan, False]) tm.assert_series_equal(result, exp) values = Series(["fooBAD__barBAD", np.nan, "foo"]) result = values.str.match(".*BAD[_]+.*BAD") exp = Series([True, np.nan, False]) tm.assert_series_equal(result, exp) # mixed mixed = Series( [ "aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0, ] ) rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # na GH #6609 res = Series(["a", 0, np.nan]).str.match("a", na=False) exp = Series([True, False, False]) tm.assert_series_equal(exp, res) res = Series(["a", 0, np.nan]).str.match("a") exp = Series([True, np.nan, np.nan]) tm.assert_series_equal(exp, res) def test_extract_expand_None(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or False"): values.str.extract(".*(BAD[_]+).*(BAD)", expand=None) def test_extract_expand_unspecified(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) result_unspecified = values.str.extract(".*(BAD[_]+).*") assert isinstance(result_unspecified, DataFrame) result_true = values.str.extract(".*(BAD[_]+).*", expand=True) tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. values = Series(["fooBAD__barBAD", np.nan, "foo"]) er = [np.nan, np.nan] # empty row result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # mixed mixed = Series( [ "aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0, ] ) rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # unicode values = Series(["fooBAD__barBAD", np.nan, "foo"]) result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # GH9980 # Index only works with one regex group since # multi-group would expand to a frame idx = Index(["A1", "A2", "A3", "A4", "B5"]) with pytest.raises(ValueError, match="supported"): idx.str.extract("([AB])([123])", expand=False) # these should work for both Series and Index for klass in [Series, Index]: # no groups s_or_idx = klass(["A1", "B2", "C3"]) msg = "pattern contains no capture groups" with pytest.raises(ValueError, match=msg): s_or_idx.str.extract("[ABC][123]", expand=False) # only non-capturing groups with pytest.raises(ValueError, match=msg): s_or_idx.str.extract("(?:[AB]).*", expand=False) # single group renames series/index properly s_or_idx = klass(["A1", "A2"]) result = s_or_idx.str.extract(r"(?PA)\d", expand=False) assert result.name == "uno" exp = klass(["A", "A"], name="uno") if klass == Series: tm.assert_series_equal(result, exp) else: tm.assert_index_equal(result, exp) s = Series(["A1", "B2", "C3"]) # one group, no matches result = s.str.extract("(_)", expand=False) exp = Series([np.nan, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, exp) # two groups, no matches result = s.str.extract("(_)(_)", expand=False) exp = DataFrame( [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object ) tm.assert_frame_equal(result, exp) # one group, some matches result = s.str.extract("([AB])[123]", expand=False) exp = Series(["A", "B", np.nan]) tm.assert_series_equal(result, exp) # two groups, some matches result = s.str.extract("([AB])([123])", expand=False) exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one named group result = s.str.extract("(?P[AB])", expand=False) exp = Series(["A", "B", np.nan], name="letter") tm.assert_series_equal(result, exp) # two named groups result = s.str.extract("(?P[AB])(?P[123])", expand=False) exp = DataFrame( [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=["letter", "number"] ) tm.assert_frame_equal(result, exp) # mix named and unnamed groups result = s.str.extract("([AB])(?P[123])", expand=False) exp = DataFrame( [["A", "1"], ["B", "2"], [np.nan, np.nan]], columns=[0, "number"] ) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group result = s.str.extract("([AB])(?:[123])", expand=False) exp = Series(["A", "B", np.nan]) tm.assert_series_equal(result, exp) # two normal groups, one non-capturing group result = Series(["A11", "B22", "C33"]).str.extract( "([AB])([123])(?:[123])", expand=False ) exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group result = Series(["A1", "B2", "3"]).str.extract( "(?P[AB])?(?P[123])", expand=False ) exp = DataFrame( [["A", "1"], ["B", "2"], [np.nan, "3"]], columns=["letter", "number"] ) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group result = Series(["A1", "B2", "C"]).str.extract( "(?P[ABC])(?P[123])?", expand=False ) exp = DataFrame( [["A", "1"], ["B", "2"], ["C", np.nan]], columns=["letter", "number"] ) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): data = ["A1", "B2", "C"] index = index[: len(data)] s = Series(data, index=index) result = s.str.extract(r"(\d)", expand=False) exp = Series(["1", "2", np.nan], index=index) tm.assert_series_equal(result, exp) result = Series(data, index=index).str.extract( r"(?P\D)(?P\d)?", expand=False ) e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) i_funs = [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex, ] for index in i_funs: check_index(index()) # single_series_name_is_preserved. s = Series(["a3", "b3", "c2"], name="bob") r = s.str.extract(r"(?P[a-z])", expand=False) e = Series(["a", "b", "c"], name="sue") tm.assert_series_equal(r, e) assert r.name == e.name def test_extract_expand_True(self): # Contains tests like those in test_match and some others. values = Series(["fooBAD__barBAD", np.nan, "foo"]) er = [np.nan, np.nan] # empty row result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=True) exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # mixed mixed = Series( [ "aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0, ] ) rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=True) exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # these should work for both Series and Index for klass in [Series, Index]: # no groups s_or_idx = klass(["A1", "B2", "C3"]) msg = "pattern contains no capture groups" with pytest.raises(ValueError, match=msg): s_or_idx.str.extract("[ABC][123]", expand=True) # only non-capturing groups with pytest.raises(ValueError, match=msg): s_or_idx.str.extract("(?:[AB]).*", expand=True) # single group renames series/index properly s_or_idx = klass(["A1", "A2"]) result_df = s_or_idx.str.extract(r"(?PA)\d", expand=True) assert isinstance(result_df, DataFrame) result_series = result_df["uno"] tm.assert_series_equal(result_series, Series(["A", "A"], name="uno")) def test_extract_series(self): # extract should give the same result whether or not the # series has a name. for series_name in None, "series_name": s = Series(["A1", "B2", "C3"], name=series_name) # one group, no matches result = s.str.extract("(_)", expand=True) exp = DataFrame([np.nan, np.nan, np.nan], dtype=object) tm.assert_frame_equal(result, exp) # two groups, no matches result = s.str.extract("(_)(_)", expand=True) exp = DataFrame( [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=object ) tm.assert_frame_equal(result, exp) # one group, some matches result = s.str.extract("([AB])[123]", expand=True) exp = DataFrame(["A", "B", np.nan]) tm.assert_frame_equal(result, exp) # two groups, some matches result = s.str.extract("([AB])([123])", expand=True) exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one named group result = s.str.extract("(?P[AB])", expand=True) exp = DataFrame({"letter": ["A", "B", np.nan]}) tm.assert_frame_equal(result, exp) # two named groups result = s.str.extract("(?P[AB])(?P[123])", expand=True) e_list = [["A", "1"], ["B", "2"], [np.nan, np.nan]] exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # mix named and unnamed groups result = s.str.extract("([AB])(?P[123])", expand=True) exp = DataFrame(e_list, columns=[0, "number"]) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group result = s.str.extract("([AB])(?:[123])", expand=True) exp = DataFrame(["A", "B", np.nan]) tm.assert_frame_equal(result, exp) def test_extract_optional_groups(self): # two normal groups, one non-capturing group result = Series(["A11", "B22", "C33"]).str.extract( "([AB])([123])(?:[123])", expand=True ) exp = DataFrame([["A", "1"], ["B", "2"], [np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group result = Series(["A1", "B2", "3"]).str.extract( "(?P[AB])?(?P[123])", expand=True ) e_list = [["A", "1"], ["B", "2"], [np.nan, "3"]] exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group result = Series(["A1", "B2", "C"]).str.extract( "(?P[ABC])(?P[123])?", expand=True ) e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): data = ["A1", "B2", "C"] index = index[: len(data)] result = Series(data, index=index).str.extract(r"(\d)", expand=True) exp = DataFrame(["1", "2", np.nan], index=index) tm.assert_frame_equal(result, exp) result = Series(data, index=index).str.extract( r"(?P\D)(?P\d)?", expand=True ) e_list = [["A", "1"], ["B", "2"], ["C", np.nan]] exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) i_funs = [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex, ] for index in i_funs: check_index(index()) def test_extract_single_group_returns_frame(self): # GH11386 extract should always return DataFrame, even when # there is only one group. Prior to v0.18.0, extract returned # Series when there was only one group in the regex. s = Series(["a3", "b3", "c2"], name="series_name") r = s.str.extract(r"(?P[a-z])", expand=True) e = DataFrame({"letter": ["a", "b", "c"]}) tm.assert_frame_equal(r, e) def test_extractall(self): subject_list = [ "dave@google.com", "tdhock5@gmail.com", "maudelaperriere@gmail.com", "rob@gmail.com some text steve@gmail.com", "a@b.com some text c@d.com and e@f.com", np.nan, "", ] expected_tuples = [ ("dave", "google", "com"), ("tdhock5", "gmail", "com"), ("maudelaperriere", "gmail", "com"), ("rob", "gmail", "com"), ("steve", "gmail", "com"), ("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"), ] named_pattern = r""" (?P[a-z0-9]+) @ (?P[a-z]+) \. (?P[a-z]{2,4}) """ expected_columns = ["user", "domain", "tld"] S = Series(subject_list) # extractall should return a DataFrame with one row for each # match, indexed by the subject from which the match came. expected_index = MultiIndex.from_tuples( [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)], names=(None, "match"), ) expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = S.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # The index of the input Series should be used to construct # the index of the output DataFrame: series_index = MultiIndex.from_tuples( [ ("single", "Dave"), ("single", "Toby"), ("single", "Maude"), ("multiple", "robAndSteve"), ("multiple", "abcdef"), ("none", "missing"), ("none", "empty"), ] ) Si = Series(subject_list, series_index) expected_index = MultiIndex.from_tuples( [ ("single", "Dave", 0), ("single", "Toby", 0), ("single", "Maude", 0), ("multiple", "robAndSteve", 0), ("multiple", "robAndSteve", 1), ("multiple", "abcdef", 0), ("multiple", "abcdef", 1), ("multiple", "abcdef", 2), ], names=(None, None, "match"), ) expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = Si.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # MultiIndexed subject with names. Sn = Series(subject_list, series_index) Sn.index.names = ("matches", "description") expected_index.names = ("matches", "description", "match") expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = Sn.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # optional groups. subject_list = ["", "A1", "32"] named_pattern = "(?P[AB])?(?P[123])" computed_df = Series(subject_list).str.extractall(named_pattern) expected_index = MultiIndex.from_tuples( [(1, 0), (2, 0), (2, 1)], names=(None, "match") ) expected_df = DataFrame( [("A", "1"), (np.nan, "3"), (np.nan, "2")], expected_index, columns=["letter", "number"], ) tm.assert_frame_equal(computed_df, expected_df) # only one of two groups has a name. pattern = "([AB])?(?P[123])" computed_df = Series(subject_list).str.extractall(pattern) expected_df = DataFrame( [("A", "1"), (np.nan, "3"), (np.nan, "2")], expected_index, columns=[0, "number"], ) tm.assert_frame_equal(computed_df, expected_df) def test_extractall_single_group(self): # extractall(one named group) returns DataFrame with one named # column. s = Series(["a3", "b3", "d4c2"], name="series_name") r = s.str.extractall(r"(?P[a-z])") i = MultiIndex.from_tuples( [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") ) e = DataFrame({"letter": ["a", "b", "d", "c"]}, i) tm.assert_frame_equal(r, e) # extractall(one un-named group) returns DataFrame with one # un-named column. r = s.str.extractall(r"([a-z])") e = DataFrame(["a", "b", "d", "c"], i) tm.assert_frame_equal(r, e) def test_extractall_single_group_with_quantifier(self): # extractall(one un-named group with quantifier) returns # DataFrame with one un-named column (GH13382). s = Series(["ab3", "abc3", "d4cd2"], name="series_name") r = s.str.extractall(r"([a-z]+)") i = MultiIndex.from_tuples( [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") ) e = DataFrame(["ab", "abc", "d", "cd"], i) tm.assert_frame_equal(r, e) @pytest.mark.parametrize( "data, names", [ ([], (None,)), ([], ("i1",)), ([], (None, "i2")), ([], ("i1", "i2")), (["a3", "b3", "d4c2"], (None,)), (["a3", "b3", "d4c2"], ("i1", "i2")), (["a3", "b3", "d4c2"], (None, "i2")), (["a3", "b3", "d4c2"], ("i1", "i2")), ], ) def test_extractall_no_matches(self, data, names): # GH19075 extractall with no matches should return a valid MultiIndex n = len(data) if len(names) == 1: i = Index(range(n), name=names[0]) else: a = (tuple([i] * (n - 1)) for i in range(n)) i = MultiIndex.from_tuples(a, names=names) s = Series(data, name="series_name", index=i, dtype="object") ei = MultiIndex.from_tuples([], names=(names + ("match",))) # one un-named group. r = s.str.extractall("(z)") e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) # two un-named groups. r = s.str.extractall("(z)(z)") e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) # one named group. r = s.str.extractall("(?Pz)") e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) # two named groups. r = s.str.extractall("(?Pz)(?Pz)") e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) # one named, one un-named. r = s.str.extractall("(z)(?Pz)") e = DataFrame(columns=[0, "second"], index=ei) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self): s = Series(["a1a2", "b1", "c1"], name="xxx") res = s.str.extractall(r"[ab](?P\d)") exp_idx = MultiIndex.from_tuples( [(0, 0), (0, 1), (1, 0)], names=[None, "match"] ) exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) # index should return the same result as the default index without name # thus index.name doesn't affect to the result for idx in [ Index(["a1a2", "b1", "c1"]), Index(["a1a2", "b1", "c1"], name="xxx"), ]: res = idx.str.extractall(r"[ab](?P\d)") tm.assert_frame_equal(res, exp) s = Series( ["a1a2", "b1", "c1"], name="s_name", index=Index(["XX", "yy", "zz"], name="idx_name"), ) res = s.str.extractall(r"[ab](?P\d)") exp_idx = MultiIndex.from_tuples( [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"] ) exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) def test_extractall_errors(self): # Does not make sense to use extractall with a regex that has # no capture groups. (it returns DataFrame with one column for # each capture group) s = Series(["a3", "b3", "d4c2"], name="series_name") with pytest.raises(ValueError, match="no capture groups"): s.str.extractall(r"[a-z]") def test_extract_index_one_two_groups(self): s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name") r = s.index.str.extract(r"([A-Z])", expand=True) e = DataFrame(["A", "B", "D"]) tm.assert_frame_equal(r, e) # Prior to v0.18.0, index.str.extract(regex with one group) # returned Index. With more than one group, extract raised an # error (GH9980). Now extract always returns DataFrame. r = s.index.str.extract(r"(?P[A-Z])(?P[0-9])", expand=True) e_list = [("A", "3"), ("B", "3"), ("D", "4")] e = DataFrame(e_list, columns=["letter", "digit"]) tm.assert_frame_equal(r, e) def test_extractall_same_as_extract(self): s = Series(["a3", "b3", "c2"], name="series_name") pattern_two_noname = r"([a-z])([0-9])" extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_multi_index = s.str.extractall(pattern_two_noname) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_multi_index) pattern_two_named = r"(?P[a-z])(?P[0-9])" extract_two_named = s.str.extract(pattern_two_named, expand=True) has_multi_index = s.str.extractall(pattern_two_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_multi_index) pattern_one_named = r"(?P[a-z])" extract_one_named = s.str.extract(pattern_one_named, expand=True) has_multi_index = s.str.extractall(pattern_one_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_multi_index) pattern_one_noname = r"([a-z])" extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_multi_index = s.str.extractall(pattern_one_noname) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_multi_index) def test_extractall_same_as_extract_subject_index(self): # same as above tests, but s has an MultiIndex. i = MultiIndex.from_tuples( [("A", "first"), ("B", "second"), ("C", "third")], names=("capital", "ordinal"), ) s = Series(["a3", "b3", "c2"], i, name="series_name") pattern_two_noname = r"([a-z])([0-9])" extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_match_index = s.str.extractall(pattern_two_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_match_index) pattern_two_named = r"(?P[a-z])(?P[0-9])" extract_two_named = s.str.extract(pattern_two_named, expand=True) has_match_index = s.str.extractall(pattern_two_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_match_index) pattern_one_named = r"(?P[a-z])" extract_one_named = s.str.extract(pattern_one_named, expand=True) has_match_index = s.str.extractall(pattern_one_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_match_index) pattern_one_noname = r"([a-z])" extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_noname, no_match_index) def test_empty_str_methods(self): empty_str = empty = Series(dtype=object) empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) empty_bytes = Series(dtype=object) # GH7241 # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty)) assert "" == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count("a")) tm.assert_series_equal(empty_bool, empty.str.contains("a")) tm.assert_series_equal(empty_bool, empty.str.startswith("a")) tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True) ) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=True) ) tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=False), ) tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_str, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) tm.assert_series_equal(empty_int, empty.str.rfind("a")) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) tm.assert_series_equal(empty_str, empty.str.split("a")) tm.assert_series_equal(empty_str, empty.str.rsplit("a")) tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False)) tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) tm.assert_series_equal(empty_str, empty.str.lstrip()) tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) tm.assert_series_equal(empty_bool, empty.str.isupper()) tm.assert_series_equal(empty_bool, empty.str.istitle()) tm.assert_series_equal(empty_bool, empty.str.isnumeric()) tm.assert_series_equal(empty_bool, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) table = str.maketrans("a", "b") tm.assert_series_equal(empty_str, empty.str.translate(table)) def test_empty_str_methods_to_frame(self): empty = Series(dtype=str) empty_df = DataFrame() tm.assert_frame_equal(empty_df, empty.str.partition("a")) tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) def test_ismethods(self): values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] str_s = Series(values) alnum_e = [True, True, True, True, True, False, True, True, False, False] alpha_e = [True, True, True, False, False, False, True, False, False, False] digit_e = [False, False, False, True, False, False, False, True, False, False] # TODO: unused num_e = [ # noqa False, False, False, True, False, False, False, True, False, False, ] space_e = [False, False, False, False, False, False, False, False, False, True] lower_e = [False, True, False, False, False, False, False, False, False, False] upper_e = [True, False, False, False, True, False, True, False, False, False] title_e = [True, False, True, False, True, False, False, False, False, False] tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e)) tm.assert_series_equal(str_s.str.isspace(), Series(space_e)) tm.assert_series_equal(str_s.str.islower(), Series(lower_e)) tm.assert_series_equal(str_s.str.isupper(), Series(upper_e)) tm.assert_series_equal(str_s.str.istitle(), Series(title_e)) assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values] assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values] assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values] assert str_s.str.isspace().tolist() == [v.isspace() for v in values] assert str_s.str.islower().tolist() == [v.islower() for v in values] assert str_s.str.isupper().tolist() == [v.isupper() for v in values] assert str_s.str.istitle().tolist() == [v.istitle() for v in values] def test_isnumeric(self): # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 values = ["A", "3", "¼", "★", "፸", "3", "four"] s = Series(values) numeric_e = [False, True, True, False, True, True, False] decimal_e = [False, True, False, False, False, True, False] tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] s = Series(values) numeric_e = [False, np.nan, True, False, np.nan, True, False] decimal_e = [False, np.nan, False, False, np.nan, True, False] tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) def test_get_dummies(self): s = Series(["a|b", "a|c", np.nan]) result = s.str.get_dummies("|") expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc")) tm.assert_frame_equal(result, expected) s = Series(["a;b", "a", 7]) result = s.str.get_dummies(";") expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab")) tm.assert_frame_equal(result, expected) # GH9980, GH8028 idx = Index(["a|b", "a|c", "b|c"]) result = idx.str.get_dummies("|") expected = MultiIndex.from_tuples( [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c") ) tm.assert_index_equal(result, expected) def test_get_dummies_with_name_dummy(self): # GH 12180 # Dummies named 'name' should work as expected s = Series(["a", "b,name", "b"]) result = s.str.get_dummies(",") expected = DataFrame( [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"] ) tm.assert_frame_equal(result, expected) idx = Index(["a|b", "name|c", "b|name"]) result = idx.str.get_dummies("|") expected = MultiIndex.from_tuples( [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") ) tm.assert_index_equal(result, expected) def test_join(self): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.split("_").str.join("_") tm.assert_series_equal(values, result) # mixed mixed = Series( [ "a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0, ] ) rs = Series(mixed).str.split("_").str.join("_") xp = Series( [ "a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", np.nan, np.nan, np.nan, ] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_len(self): values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) result = values.str.len() exp = values.map(lambda x: len(x) if notna(x) else np.nan) tm.assert_series_equal(result, exp) # mixed mixed = Series( [ "a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0, ] ) rs = Series(mixed).str.len() xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_findall(self): values = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"]) result = values.str.findall("BAD[_]*") exp = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) tm.assert_almost_equal(result, exp) # mixed mixed = Series( [ "fooBAD__barBAD", np.nan, "foo", True, datetime.today(), "BAD", None, 1, 2.0, ] ) rs = Series(mixed).str.findall("BAD[_]*") xp = Series( [ ["BAD__", "BAD"], np.nan, [], np.nan, np.nan, ["BAD"], np.nan, np.nan, np.nan, ] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_find(self): values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"]) result = values.str.find("EF") tm.assert_series_equal(result, Series([4, 3, 1, 0, -1])) expected = np.array([v.find("EF") for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rfind("EF") tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) expected = np.array([v.rfind("EF") for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.find("EF", 3) tm.assert_series_equal(result, Series([4, 3, 7, 4, -1])) expected = np.array([v.find("EF", 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rfind("EF", 3) tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) expected = np.array([v.rfind("EF", 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.find("EF", 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) expected = np.array([v.find("EF", 3, 6) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rfind("EF", 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) expected = np.array( [v.rfind("EF", 3, 6) for v in values.values], dtype=np.int64 ) tm.assert_numpy_array_equal(result.values, expected) with pytest.raises(TypeError, match="expected a string object, not int"): result = values.str.find(0) with pytest.raises(TypeError, match="expected a string object, not int"): result = values.str.rfind(0) def test_find_nan(self): values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"]) result = values.str.find("EF") tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1])) result = values.str.rfind("EF") tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) result = values.str.find("EF", 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) result = values.str.rfind("EF", 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) result = values.str.find("EF", 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) result = values.str.rfind("EF", 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) def test_index(self): def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result, expected) for klass in [Series, Index]: s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) result = s.str.index("EF") _check(result, klass([4, 3, 1, 0])) expected = np.array([v.index("EF") for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex("EF") _check(result, klass([4, 5, 7, 4])) expected = np.array([v.rindex("EF") for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.index("EF", 3) _check(result, klass([4, 3, 7, 4])) expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex("EF", 3) _check(result, klass([4, 5, 7, 4])) expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.index("E", 4, 8) _check(result, klass([4, 5, 7, 4])) expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) result = s.str.rindex("E", 0, 5) _check(result, klass([4, 3, 1, 4])) expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) with pytest.raises(ValueError, match="substring not found"): result = s.str.index("DE") msg = "expected a string object, not int" with pytest.raises(TypeError, match=msg): result = s.str.index(0) # test with nan s = Series(["abcb", "ab", "bcbe", np.nan]) result = s.str.index("b") tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) result = s.str.rindex("b") tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) result = values.str.pad(5, side="left") exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="right") exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="both") exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) rs = Series(mixed).str.pad(5, side="left") xp = Series( [" a", np.nan, " b", np.nan, np.nan, " ee", np.nan, np.nan, np.nan] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) rs = Series(mixed).str.pad(5, side="right") xp = Series( ["a ", np.nan, "b ", np.nan, np.nan, "ee ", np.nan, np.nan, np.nan] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) mixed = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0]) rs = Series(mixed).str.pad(5, side="both") xp = Series( [" a ", np.nan, " b ", np.nan, np.nan, " ee ", np.nan, np.nan, np.nan] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_pad_fillchar(self): values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) result = values.str.pad(5, side="left", fillchar="X") exp = Series(["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="right", fillchar="X") exp = Series(["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.pad(5, side="both", fillchar="X") exp = Series(["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) msg = "fillchar must be a character, not str" with pytest.raises(TypeError, match=msg): result = values.str.pad(5, fillchar="XY") msg = "fillchar must be a character, not int" with pytest.raises(TypeError, match=msg): result = values.str.pad(5, fillchar=5) @pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"]) def test_pad_width(self, f): # see gh-13598 s = Series(["1", "22", "a", "bb"]) msg = "width must be of integer type, not*" with pytest.raises(TypeError, match=msg): getattr(s.str, f)("f") def test_translate(self): def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result, expected) for klass in [Series, Index]: s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"]) table = str.maketrans("abc", "cde") result = s.str.translate(table) expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"]) _check(result, expected) # Series with non-string values s = Series(["a", "b", "c", 1.2]) expected = Series(["c", "d", "e", np.nan]) result = s.str.translate(table) tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): values = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"]) result = values.str.center(5) exp = Series([" a ", " b ", np.nan, " c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.ljust(5) exp = Series(["a ", "b ", np.nan, "c ", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.rjust(5) exp = Series([" a", " b", np.nan, " c", np.nan, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed mixed = Series( ["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0] ) rs = Series(mixed).str.center(5) xp = Series( [ " a ", np.nan, " b ", np.nan, np.nan, " c ", " eee ", np.nan, np.nan, np.nan, ] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.ljust(5) xp = Series( [ "a ", np.nan, "b ", np.nan, np.nan, "c ", "eee ", np.nan, np.nan, np.nan, ] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rjust(5) xp = Series( [ " a", np.nan, " b", np.nan, np.nan, " c", " eee", np.nan, np.nan, np.nan, ] ) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_center_ljust_rjust_fillchar(self): values = Series(["a", "bb", "cccc", "ddddd", "eeeeee"]) result = values.str.center(5, fillchar="X") expected = Series(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) expected = np.array([v.center(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.ljust(5, fillchar="X") expected = Series(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) expected = np.array([v.ljust(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.rjust(5, fillchar="X") expected = Series(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) expected = np.array([v.rjust(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) # If fillchar is not a charatter, normal str raises TypeError # 'aaa'.ljust(5, 'XY') # TypeError: must be char, not str template = "fillchar must be a character, not {dtype}" with pytest.raises(TypeError, match=template.format(dtype="str")): values.str.center(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="str")): values.str.ljust(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="str")): values.str.rjust(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="int")): values.str.center(5, fillchar=1) with pytest.raises(TypeError, match=template.format(dtype="int")): values.str.ljust(5, fillchar=1) with pytest.raises(TypeError, match=template.format(dtype="int")): values.str.rjust(5, fillchar=1) def test_zfill(self): values = Series(["1", "22", "aaa", "333", "45678"]) result = values.str.zfill(5) expected = Series(["00001", "00022", "00aaa", "00333", "45678"]) tm.assert_series_equal(result, expected) expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.zfill(3) expected = Series(["001", "022", "aaa", "333", "45678"]) tm.assert_series_equal(result, expected) expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) values = Series(["1", np.nan, "aaa", np.nan, "45678"]) result = values.str.zfill(5) expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"]) tm.assert_series_equal(result, expected) def test_split(self): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.split("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) result = values.str.split("__") tm.assert_series_equal(result, exp) result = values.str.split("__", expand=False) tm.assert_series_equal(result, exp) # mixed mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.split("_") exp = Series( [ ["a", "b", "c"], np.nan, ["d", "e", "f"], np.nan, np.nan, np.nan, np.nan, np.nan, ] ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) result = mixed.str.split("_", expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # regex split values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) def test_rsplit(self): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.rsplit("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) result = values.str.rsplit("__") tm.assert_series_equal(result, exp) result = values.str.rsplit("__", expand=False) tm.assert_series_equal(result, exp) # mixed mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.rsplit("_") exp = Series( [ ["a", "b", "c"], np.nan, ["d", "e", "f"], np.nan, np.nan, np.nan, np.nan, np.nan, ] ) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) result = mixed.str.rsplit("_", expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # regex split is not supported by rsplit values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) tm.assert_series_equal(result, exp) # setting max number of splits, make sure it's from reverse values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) tm.assert_series_equal(result, exp) def test_split_blank_string(self): # expand blank split GH 20067 values = Series([""], name="test") result = values.str.split(expand=True) exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame tm.assert_frame_equal(result, exp) values = Series(["a b c", "a b", "", " "], name="test") result = values.str.split(expand=True) exp = DataFrame( [ ["a", "b", "c"], ["a", "b", np.nan], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan], ] ) tm.assert_frame_equal(result, exp) def test_split_noargs(self): # #1859 s = Series(["Wes McKinney", "Travis Oliphant"]) result = s.str.split() expected = ["Travis", "Oliphant"] assert result[1] == expected result = s.str.rsplit() assert result[1] == expected def test_split_maxsplit(self): # re.split 0, str.split -1 s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) result = s.str.split(n=-1) xp = s.str.split() tm.assert_series_equal(result, xp) result = s.str.split(n=0) tm.assert_series_equal(result, xp) xp = s.str.split("asdf") result = s.str.split("asdf", n=0) tm.assert_series_equal(result, xp) result = s.str.split("asdf", n=-1) tm.assert_series_equal(result, xp) def test_split_no_pat_with_nonzero_n(self): s = Series(["split once", "split once too!"]) result = s.str.split(n=1) expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) tm.assert_series_equal(expected, result, check_index_type=False) def test_split_to_dataframe(self): s = Series(["nosplit", "alsonosplit"]) result = s.str.split("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) tm.assert_frame_equal(result, exp) s = Series(["some_equal_splits", "with_no_nans"]) result = s.str.split("_", expand=True) exp = DataFrame( {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} ) tm.assert_frame_equal(result, exp) s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) result = s.str.split("_", expand=True) exp = DataFrame( { 0: ["some", "one"], 1: ["unequal", "of"], 2: ["splits", "these"], 3: [np.nan, "things"], 4: [np.nan, "is"], 5: [np.nan, "not"], } ) tm.assert_frame_equal(result, exp) s = Series(["some_splits", "with_index"], index=["preserve", "me"]) result = s.str.split("_", expand=True) exp = DataFrame( {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] ) tm.assert_frame_equal(result, exp) with pytest.raises(ValueError, match="expand must be"): s.str.split("_", expand="not_a_boolean") def test_split_to_multiindex_expand(self): # https://github.com/pandas-dev/pandas/issues/23677 idx = Index(["nosplit", "alsonosplit", np.nan]) result = idx.str.split("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( [ ("some", "equal", "splits"), ("with", "no", "nans"), [np.nan, np.nan, np.nan], [None, None, None], ] ) tm.assert_index_equal(result, exp) assert result.nlevels == 3 idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( [ ("some", "unequal", "splits", np.nan, np.nan, np.nan), ("one", "of", "these", "things", "is", "not"), (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), (None, None, None, None, None, None), ] ) tm.assert_index_equal(result, exp) assert result.nlevels == 6 with pytest.raises(ValueError, match="expand must be"): idx.str.split("_", expand="not_a_boolean") def test_rsplit_to_dataframe_expand(self): s = Series(["nosplit", "alsonosplit"]) result = s.str.rsplit("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) tm.assert_frame_equal(result, exp) s = Series(["some_equal_splits", "with_no_nans"]) result = s.str.rsplit("_", expand=True) exp = DataFrame( {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} ) tm.assert_frame_equal(result, exp) result = s.str.rsplit("_", expand=True, n=2) exp = DataFrame( {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} ) tm.assert_frame_equal(result, exp) result = s.str.rsplit("_", expand=True, n=1) exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]}) tm.assert_frame_equal(result, exp) s = Series(["some_splits", "with_index"], index=["preserve", "me"]) result = s.str.rsplit("_", expand=True) exp = DataFrame( {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] ) tm.assert_frame_equal(result, exp) def test_rsplit_to_multiindex_expand(self): idx = Index(["nosplit", "alsonosplit"]) result = idx.str.rsplit("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 idx = Index(["some_equal_splits", "with_no_nans"]) result = idx.str.rsplit("_", expand=True) exp = MultiIndex.from_tuples( [("some", "equal", "splits"), ("with", "no", "nans")] ) tm.assert_index_equal(result, exp) assert result.nlevels == 3 idx = Index(["some_equal_splits", "with_no_nans"]) result = idx.str.rsplit("_", expand=True, n=1) exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")]) tm.assert_index_equal(result, exp) assert result.nlevels == 2 def test_split_nan_expand(self): # gh-18450 s = Series(["foo,bar,baz", np.nan]) result = s.str.split(",", expand=True) exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]]) tm.assert_frame_equal(result, exp) # check that these are actually np.nan and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate assert all(np.isnan(x) for x in result.iloc[1]) def test_split_with_name(self): # GH 12617 # should preserve name s = Series(["a,b", "c,d"], name="xxx") res = s.str.split(",") exp = Series([["a", "b"], ["c", "d"]], name="xxx") tm.assert_series_equal(res, exp) res = s.str.split(",", expand=True) exp = DataFrame([["a", "b"], ["c", "d"]]) tm.assert_frame_equal(res, exp) idx = Index(["a,b", "c,d"], name="xxx") res = idx.str.split(",") exp = Index([["a", "b"], ["c", "d"]], name="xxx") assert res.nlevels == 1 tm.assert_index_equal(res, exp) res = idx.str.split(",", expand=True) exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")]) assert res.nlevels == 2 tm.assert_index_equal(res, exp) def test_partition_series(self): # https://github.com/pandas-dev/pandas/issues/23558 values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) result = values.str.partition("_", expand=False) exp = Series( [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h"), None] ) tm.assert_series_equal(result, exp) result = values.str.rpartition("_", expand=False) exp = Series( [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h"), None] ) tm.assert_series_equal(result, exp) # more than one char values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None]) result = values.str.partition("__", expand=False) exp = Series( [ ("a", "__", "b__c"), ("c", "__", "d__e"), np.nan, ("f", "__", "g__h"), None, ] ) tm.assert_series_equal(result, exp) result = values.str.rpartition("__", expand=False) exp = Series( [ ("a__b", "__", "c"), ("c__d", "__", "e"), np.nan, ("f__g", "__", "h"), None, ] ) tm.assert_series_equal(result, exp) # None values = Series(["a b c", "c d e", np.nan, "f g h", None]) result = values.str.partition(expand=False) exp = Series( [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None] ) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) exp = Series( [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None] ) tm.assert_series_equal(result, exp) # Not split values = Series(["abc", "cde", np.nan, "fgh", None]) result = values.str.partition("_", expand=False) exp = Series([("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition("_", expand=False) exp = Series([("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None]) tm.assert_series_equal(result, exp) # unicode values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.partition("_", expand=False) exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")]) tm.assert_series_equal(result, exp) result = values.str.rpartition("_", expand=False) exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")]) tm.assert_series_equal(result, exp) # compare to standard lib values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"]) result = values.str.partition("_", expand=False).tolist() assert result == [v.partition("_") for v in values] result = values.str.rpartition("_", expand=False).tolist() assert result == [v.rpartition("_") for v in values] def test_partition_index(self): # https://github.com/pandas-dev/pandas/issues/23558 values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) result = values.str.partition("_", expand=False) exp = Index( np.array( [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], dtype=object, ) ) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.rpartition("_", expand=False) exp = Index( np.array( [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], dtype=object, ) ) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.partition("_") exp = Index( [ ("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), (np.nan, np.nan, np.nan), (None, None, None), ] ) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 result = values.str.rpartition("_") exp = Index( [ ("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), (np.nan, np.nan, np.nan), (None, None, None), ] ) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 def test_partition_to_dataframe(self): # https://github.com/pandas-dev/pandas/issues/23558 values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) result = values.str.partition("_") exp = DataFrame( { 0: ["a", "c", np.nan, "f", None], 1: ["_", "_", np.nan, "_", None], 2: ["b_c", "d_e", np.nan, "g_h", None], } ) tm.assert_frame_equal(result, exp) result = values.str.rpartition("_") exp = DataFrame( { 0: ["a_b", "c_d", np.nan, "f_g", None], 1: ["_", "_", np.nan, "_", None], 2: ["c", "e", np.nan, "h", None], } ) tm.assert_frame_equal(result, exp) values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None]) result = values.str.partition("_", expand=True) exp = DataFrame( { 0: ["a", "c", np.nan, "f", None], 1: ["_", "_", np.nan, "_", None], 2: ["b_c", "d_e", np.nan, "g_h", None], } ) tm.assert_frame_equal(result, exp) result = values.str.rpartition("_", expand=True) exp = DataFrame( { 0: ["a_b", "c_d", np.nan, "f_g", None], 1: ["_", "_", np.nan, "_", None], 2: ["c", "e", np.nan, "h", None], } ) tm.assert_frame_equal(result, exp) def test_partition_with_name(self): # GH 12617 s = Series(["a,b", "c,d"], name="xxx") res = s.str.partition(",") exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}) tm.assert_frame_equal(res, exp) # should preserve name res = s.str.partition(",", expand=False) exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") tm.assert_series_equal(res, exp) idx = Index(["a,b", "c,d"], name="xxx") res = idx.str.partition(",") exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")]) assert res.nlevels == 3 tm.assert_index_equal(res, exp) # should preserve name res = idx.str.partition(",", expand=False) exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx") assert res.nlevels == 1 tm.assert_index_equal(res, exp) def test_partition_sep_kwarg(self): # GH 22676; depr kwarg "pat" in favor of "sep" values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) expected = values.str.partition(sep="_") result = values.str.partition("_") tm.assert_frame_equal(result, expected) expected = values.str.rpartition(sep="_") result = values.str.rpartition("_") tm.assert_frame_equal(result, expected) def test_pipe_failures(self): # #2119 s = Series(["A|B|C"]) result = s.str.split("|") exp = Series([["A", "B", "C"]]) tm.assert_series_equal(result, exp) result = s.str.replace("|", " ") exp = Series(["A B C"]) tm.assert_series_equal(result, exp) @pytest.mark.parametrize( "start, stop, step, expected", [ (2, 5, None, Series(["foo", "bar", np.nan, "baz"])), (0, 3, -1, Series(["", "", np.nan, ""])), (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])), (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])), (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])), ], ) def test_slice(self, start, stop, step, expected): values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"]) result = values.str.slice(start, stop, step) tm.assert_series_equal(result, expected) # mixed mixed = Series( ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0] ) rs = Series(mixed).str.slice(2, 5) xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.slice(2, 5, -1) xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]) def test_slice_replace(self): values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan]) exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan]) result = values.str.slice_replace(2, 3) tm.assert_series_equal(result, exp) exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]) result = values.str.slice_replace(2, 3, "z") tm.assert_series_equal(result, exp) exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) result = values.str.slice_replace(2, 2, "z") tm.assert_series_equal(result, exp) exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) result = values.str.slice_replace(2, 1, "z") tm.assert_series_equal(result, exp) exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]) result = values.str.slice_replace(-1, None, "z") tm.assert_series_equal(result, exp) exp = Series(["zrt", "zer", "zat", "z", np.nan]) result = values.str.slice_replace(None, -2, "z") tm.assert_series_equal(result, exp) exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]) result = values.str.slice_replace(6, 8, "z") tm.assert_series_equal(result, exp) exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]) result = values.str.slice_replace(-10, 3, "z") tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip(self): values = Series([" aa ", " bb \n", np.nan, "cc "]) result = values.str.strip() exp = Series(["aa", "bb", np.nan, "cc"]) tm.assert_series_equal(result, exp) result = values.str.lstrip() exp = Series(["aa ", "bb \n", np.nan, "cc "]) tm.assert_series_equal(result, exp) result = values.str.rstrip() exp = Series([" aa", " bb", np.nan, "cc"]) tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip_mixed(self): # mixed mixed = Series( [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0] ) rs = Series(mixed).str.strip() xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.lstrip() xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rstrip() xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_strip_lstrip_rstrip_args(self): values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"]) rs = values.str.strip("x") xp = Series(["ABC", " BNSD", "LDFJH "]) tm.assert_series_equal(rs, xp) rs = values.str.lstrip("x") xp = Series(["ABCxx", " BNSD", "LDFJH xx"]) tm.assert_series_equal(rs, xp) rs = values.str.rstrip("x") xp = Series(["xxABC", "xx BNSD", "LDFJH "]) tm.assert_series_equal(rs, xp) def test_wrap(self): # test values are: two words less than width, two words equal to width, # two words greater than width, one word less than width, one word # equal to width, one word greater than width, multiple tokens with # trailing whitespace equal to width values = Series( [ "hello world", "hello world!", "hello world!!", "abcdefabcde", "abcdefabcdef", "abcdefabcdefa", "ab ab ab ab ", "ab ab ab ab a", "\t", ] ) # expected values xp = Series( [ "hello world", "hello world!", "hello\nworld!!", "abcdefabcde", "abcdefabcdef", "abcdefabcdef\na", "ab ab ab ab", "ab ab ab ab\na", "", ] ) rs = values.str.wrap(12, break_long_words=True) tm.assert_series_equal(rs, xp) # test with pre and post whitespace (non-unicode), NaN, and non-ascii # Unicode values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"]) xp = Series([" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"]) rs = values.str.wrap(6) tm.assert_series_equal(rs, xp) def test_get(self): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.split("_").str.get(1) expected = Series(["b", "d", np.nan, "g"]) tm.assert_series_equal(result, expected) # mixed mixed = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) rs = Series(mixed).str.split("_").str.get(1) xp = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # bounds testing values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) # positive index result = values.str.split("_").str.get(2) expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) # negative index result = values.str.split("_").str.get(-3) expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) def test_get_complex(self): # GH 20671, getting value not in dict raising `KeyError` values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]) result = values.str.get(1) expected = Series([2, 2, np.nan, "a"]) tm.assert_series_equal(result, expected) result = values.str.get(-1) expected = Series([3, 3, np.nan, np.nan]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("to_type", [tuple, list, np.array]) def test_get_complex_nested(self, to_type): values = Series([to_type([to_type([1, 2])])]) result = values.str.get(0) expected = Series([to_type([1, 2])]) tm.assert_series_equal(result, expected) result = values.str.get(1) expected = Series([np.nan]) tm.assert_series_equal(result, expected) def test_contains_moar(self): # PR #1179 s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) result = s.str.contains("a") expected = Series( [False, False, False, True, True, False, np.nan, False, False, True] ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( [True, False, False, True, True, False, np.nan, True, False, True] ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False] ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( [False, False, False, True, False, False, np.nan, False, False, False] ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( [False, False, False, True, True, False, np.nan, True, False, False] ) tm.assert_series_equal(result, expected) def test_contains_nan(self): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=np.object_) result = s.str.contains("foo", na=False) expected = Series([False, False, False], dtype=np.bool_) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=np.bool_) tm.assert_series_equal(result, expected) result = s.str.contains("foo", na="foo") expected = Series(["foo", "foo", "foo"], dtype=np.object_) tm.assert_series_equal(result, expected) result = s.str.contains("foo") expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) tm.assert_series_equal(result, expected) def test_replace_moar(self): # PR #1179 s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) result = s.str.replace("A", "YYY") expected = Series( ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"] ) tm.assert_series_equal(result, expected) result = s.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", "", np.nan, "CYYYBYYY", "dog", "cYYYt", ] ) tm.assert_series_equal(result, expected) result = s.str.replace("^.a|dog", "XX-XX ", case=False) expected = Series( [ "A", "B", "C", "XX-XX ba", "XX-XX ca", "", np.nan, "XX-XX BA", "XX-XX ", "XX-XX t", ] ) tm.assert_series_equal(result, expected) def test_string_slice_get_syntax(self): s = Series( [ "YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt", ] ) result = s.str[0] expected = s.str.get(0) tm.assert_series_equal(result, expected) result = s.str[:3] expected = s.str.slice(stop=3) tm.assert_series_equal(result, expected) result = s.str[2::-1] expected = s.str.slice(start=2, step=-1) tm.assert_series_equal(result, expected) def test_string_slice_out_of_bounds(self): s = Series([(1, 2), (1,), (3, 4, 5)]) result = s.str[1] expected = Series([2, np.nan, 4]) tm.assert_series_equal(result, expected) s = Series(["foo", "b", "ba"]) result = s.str[1] expected = Series(["o", np.nan, "a"]) tm.assert_series_equal(result, expected) def test_match_findall_flags(self): data = { "Dave": "dave@google.com", "Steve": "steve@gmail.com", "Rob": "rob@gmail.com", "Wes": np.nan, } data = Series(data) pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] result = data.str.findall(pat, flags=re.IGNORECASE) assert result[0][0] == ("dave", "google", "com") result = data.str.count(pat, flags=re.IGNORECASE) assert result[0] == 1 with tm.assert_produces_warning(UserWarning): result = data.str.contains(pat, flags=re.IGNORECASE) assert result[0] def test_encode_decode(self): base = Series(["a", "b", "a\xe4"]) series = base.str.encode("utf-8") f = lambda x: x.decode("utf-8") result = series.str.decode("utf-8") exp = series.map(f) tm.assert_series_equal(result, exp) def test_encode_decode_errors(self): encodeBase = Series(["a", "b", "a\x9d"]) msg = ( r"'charmap' codec can't encode character '\\x9d' in position 1:" " character maps to " ) with pytest.raises(UnicodeEncodeError, match=msg): encodeBase.str.encode("cp1252") f = lambda x: x.encode("cp1252", "ignore") result = encodeBase.str.encode("cp1252", "ignore") exp = encodeBase.map(f) tm.assert_series_equal(result, exp) decodeBase = Series([b"a", b"b", b"a\x9d"]) msg = ( "'charmap' codec can't decode byte 0x9d in position 1:" " character maps to " ) with pytest.raises(UnicodeDecodeError, match=msg): decodeBase.str.decode("cp1252") f = lambda x: x.decode("cp1252", "ignore") result = decodeBase.str.decode("cp1252", "ignore") exp = decodeBase.map(f) tm.assert_series_equal(result, exp) def test_normalize(self): values = ["ABC", "ABC", "123", np.nan, "アイエ"] s = Series(values, index=["a", "b", "c", "d", "e"]) normed = ["ABC", "ABC", "123", np.nan, "アイエ"] expected = Series(normed, index=["a", "b", "c", "d", "e"]) result = s.str.normalize("NFKC") tm.assert_series_equal(result, expected) expected = Series( ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"] ) result = s.str.normalize("NFC") tm.assert_series_equal(result, expected) with pytest.raises(ValueError, match="invalid normalization form"): s.str.normalize("xxx") s = Index(["ABC", "123", "アイエ"]) expected = Index(["ABC", "123", "アイエ"]) result = s.str.normalize("NFKC") tm.assert_index_equal(result, expected) def test_index_str_accessor_visibility(self): from pandas.core.strings import StringMethods cases = [ (["a", "b"], "string"), (["a", "b", 1], "mixed-integer"), (["a", "b", 1.3], "mixed"), (["a", "b", 1.3, 1], "mixed-integer"), (["aa", datetime(2011, 1, 1)], "mixed"), ] for values, tp in cases: idx = Index(values) assert isinstance(Series(values).str, StringMethods) assert isinstance(idx.str, StringMethods) assert idx.inferred_type == tp for values, tp in cases: idx = Index(values) assert isinstance(Series(values).str, StringMethods) assert isinstance(idx.str, StringMethods) assert idx.inferred_type == tp cases = [ ([1, np.nan], "floating"), ([datetime(2011, 1, 1)], "datetime64"), ([timedelta(1)], "timedelta64"), ] for values, tp in cases: idx = Index(values) message = "Can only use .str accessor with string values" with pytest.raises(AttributeError, match=message): Series(values).str with pytest.raises(AttributeError, match=message): idx.str assert idx.inferred_type == tp # MultiIndex has mixed dtype, but not allow to use accessor idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) assert idx.inferred_type == "mixed" message = "Can only use .str accessor with Index, not MultiIndex" with pytest.raises(AttributeError, match=message): idx.str def test_str_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 s = Series(list("aabbcde")) with pytest.raises(AttributeError, match="You cannot add any new attribute"): s.str.xlabel = "a" def test_method_on_bytes(self): lhs = Series(np.array(list("abc"), "S1").astype(object)) rhs = Series(np.array(list("def"), "S1").astype(object)) with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) def test_casefold(self): # GH25405 expected = Series(["ss", np.nan, "case", "ssd"]) s = Series(["ß", np.nan, "case", "ßd"]) result = s.str.casefold() tm.assert_series_equal(result, expected) def test_string_array(any_string_method): method_name, args, kwargs = any_string_method if method_name == "decode": pytest.skip("decode requires bytes.") data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype="string") expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) if isinstance(expected, Series): if expected.dtype == "object" and lib.is_string_array( expected.dropna().values, ): assert result.dtype == "string" result = result.astype(object) elif expected.dtype == "object" and lib.is_bool_array( expected.values, skipna=True ): assert result.dtype == "boolean" result = result.astype(object) elif expected.dtype == "float" and expected.isna().any(): assert result.dtype == "Int64" result = result.astype("float") elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) @pytest.mark.parametrize( "method,expected", [ ("count", [2, None]), ("find", [0, None]), ("index", [0, None]), ("rindex", [2, None]), ], ) def test_string_array_numeric_integer_array(method, expected): s = Series(["aba", None], dtype="string") result = getattr(s.str, method)("a") expected = Series(expected, dtype="Int64") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "method,expected", [ ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), ("isalnum", [True, None, True]), ("isdigit", [False, None, True]), ], ) def test_string_array_boolean_array(method, expected): s = Series(["a", None, "1"], dtype="string") result = getattr(s.str, method)() expected = Series(expected, dtype="boolean") tm.assert_series_equal(result, expected) def test_string_array_extract(): # https://github.com/pandas-dev/pandas/issues/30969 # Only expand=False & multiple groups was failing a = Series(["a1", "b2", "cc"], dtype="string") b = Series(["a1", "b2", "cc"], dtype="object") pat = r"(\w)(\d)" result = a.str.extract(pat, expand=False) expected = b.str.extract(pat, expand=False) assert all(result.dtypes == "string") result = result.astype(object) tm.assert_equal(result, expected)