from datetime import datetime from io import StringIO import re from typing import Dict, List, Union import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range import pandas._testing as tm @pytest.fixture def mix_ab() -> Dict[str, List[Union[int, str]]]: return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture def mix_abc() -> Dict[str, List[Union[float, str]]]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} class TestDataFrameReplace: def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame["A"][:5] = np.nan datetime_frame["A"][-5:] = np.nan tsframe = datetime_frame.copy() return_value = tsframe.replace(np.nan, 0, inplace=True) assert return_value is None tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) # mixed type mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, 0) expected = float_string_frame.fillna(value=0) tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() return_value = tsframe.replace([np.nan], [0], inplace=True) assert return_value is None tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) def test_regex_replace_scalar(self, mix_ab): obj = {"a": list("ab.."), "b": list("efgh")} dfobj = DataFrame(obj) dfmix = DataFrame(mix_ab) # simplest cases # regex -> value # obj frame res = dfobj.replace(r"\s*\.\s*", np.nan, regex=True) tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.replace(r"\s*\.\s*", np.nan, regex=True) tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) tm.assert_frame_equal(res, expec) # with mixed res = dfmix.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) tm.assert_frame_equal(res, expec) # with mixed res = dfmix.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) res = dfmix.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1") mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) res = dfmix.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1") mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) def test_regex_replace_scalar_inplace(self, mix_ab): obj = {"a": list("ab.."), "b": list("efgh")} dfobj = DataFrame(obj) dfmix = DataFrame(mix_ab) # simplest cases # regex -> value # obj frame res = dfobj.copy() return_value = res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() return_value = res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() return_value = res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) tm.assert_frame_equal(res, expec) # with mixed res = dfmix.copy() return_value = res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.copy() return_value = res.replace( re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True ) assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() return_value = res.replace( re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True ) assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() return_value = res.replace( re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True ) assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) tm.assert_frame_equal(res, expec) # with mixed res = dfmix.copy() return_value = res.replace( re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True ) assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) res = dfobj.copy() return_value = res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() return_value = res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() return_value = res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) tm.assert_frame_equal(res, expec) # with mixed res = dfmix.copy() return_value = res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.copy() return_value = res.replace( regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True ) assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() return_value = res.replace( regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True ) assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() return_value = res.replace( regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True ) assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) tm.assert_frame_equal(res, expec) # with mixed res = dfmix.copy() return_value = res.replace( regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True ) assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) def test_regex_replace_list_obj(self): obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] to_replace_res = [r"\s*\.\s*", r"e|f|g"] values = [np.nan, "crap"] res = dfobj.replace(to_replace_res, values, regex=True) expec = DataFrame( { "a": ["a", "b", np.nan, np.nan], "b": ["crap"] * 3 + ["h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] values = [r"\1\1", r"\1_crap"] res = dfobj.replace(to_replace_res, values, regex=True) expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["e_crap", "f_crap", "g_crap", "h"], "c": ["h", "e_crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] to_replace_res = [r"\s*(\.)\s*", r"e"] values = [r"\1\1", r"crap"] res = dfobj.replace(to_replace_res, values, regex=True) expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["crap", "f", "g", "h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) to_replace_res = [r"\s*(\.)\s*", r"e"] values = [r"\1\1", r"crap"] res = dfobj.replace(value=values, regex=to_replace_res) expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["crap", "f", "g", "h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) def test_regex_replace_list_obj_inplace(self): # same as above with inplace=True # lists of regexes and values obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] to_replace_res = [r"\s*\.\s*", r"e|f|g"] values = [np.nan, "crap"] res = dfobj.copy() return_value = res.replace(to_replace_res, values, inplace=True, regex=True) assert return_value is None expec = DataFrame( { "a": ["a", "b", np.nan, np.nan], "b": ["crap"] * 3 + ["h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] values = [r"\1\1", r"\1_crap"] res = dfobj.copy() return_value = res.replace(to_replace_res, values, inplace=True, regex=True) assert return_value is None expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["e_crap", "f_crap", "g_crap", "h"], "c": ["h", "e_crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] to_replace_res = [r"\s*(\.)\s*", r"e"] values = [r"\1\1", r"crap"] res = dfobj.copy() return_value = res.replace(to_replace_res, values, inplace=True, regex=True) assert return_value is None expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["crap", "f", "g", "h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) to_replace_res = [r"\s*(\.)\s*", r"e"] values = [r"\1\1", r"crap"] res = dfobj.copy() return_value = res.replace(value=values, regex=to_replace_res, inplace=True) assert return_value is None expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["crap", "f", "g", "h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) def test_regex_replace_list_mixed(self, mix_ab): # mixed frame to make sure this doesn't break things dfmix = DataFrame(mix_ab) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] to_replace_res = [r"\s*\.\s*", r"a"] values = [np.nan, "crap"] mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")} dfmix2 = DataFrame(mix2) res = dfmix2.replace(to_replace_res, values, regex=True) expec = DataFrame( { "a": mix2["a"], "b": ["crap", "b", np.nan, np.nan], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] values = [r"\1\1", r"\1_crap"] res = dfmix.replace(to_replace_res, values, regex=True) expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.replace(to_replace_res, values, regex=True) expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.replace(regex=to_replace_res, value=values) expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) def test_regex_replace_list_mixed_inplace(self, mix_ab): dfmix = DataFrame(mix_ab) # the same inplace # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] to_replace_res = [r"\s*\.\s*", r"a"] values = [np.nan, "crap"] res = dfmix.copy() return_value = res.replace(to_replace_res, values, inplace=True, regex=True) assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] values = [r"\1\1", r"\1_crap"] res = dfmix.copy() return_value = res.replace(to_replace_res, values, inplace=True, regex=True) assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() return_value = res.replace(to_replace_res, values, inplace=True, regex=True) assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() return_value = res.replace(regex=to_replace_res, value=values, inplace=True) assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) def test_regex_replace_dict_mixed(self, mix_abc): dfmix = DataFrame(mix_abc) # dicts # single dict {re1: v1}, search the whole frame # need test for this... # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole # frame res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) res2 = dfmix.copy() return_value = res2.replace( {"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True ) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the # whole frame res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) res2 = dfmix.copy() return_value = res2.replace( {"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True ) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) res2 = dfmix.copy() return_value = res2.replace( regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True ) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) # scalar -> dict # to_replace regex, {value: value} expec = DataFrame( {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} ) res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = dfmix.copy() return_value = res2.replace("a", {"b": np.nan}, regex=True, inplace=True) assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = dfmix.copy() return_value = res2.replace(regex="a", value={"b": np.nan}, inplace=True) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) def test_regex_replace_dict_nested(self, mix_abc): # nested dicts will not work until this is implemented for Series dfmix = DataFrame(mix_abc) res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) res2 = dfmix.copy() res4 = dfmix.copy() return_value = res2.replace( {"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True ) assert return_value is None res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) return_value = res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) def test_regex_replace_dict_nested_non_first_character(self): # GH 25259 df = DataFrame({"first": ["abc", "bca", "cab"]}) expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) result = df.replace({"a": "."}, regex=True) tm.assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( { "a": mix_abc["a"], "b": np.array([np.nan] * 4), "c": [np.nan, np.nan, np.nan, "d"], } ) res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() return_value = res2.replace( [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True ) assert return_value is None return_value = res3.replace( regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True ) assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) res = df.replace(r"\s*\.\s*", 0, regex=True) res2 = df.copy() return_value = res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) assert return_value is None res3 = df.copy() return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) res2 = df.copy() return_value = res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) assert return_value is None res3 = df.copy() return_value = res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) def test_regex_replace_series_of_regexes(self, mix_abc): df = DataFrame(mix_abc) s1 = Series({"b": r"\s*\.\s*"}) s2 = Series({"b": np.nan}) res = df.replace(s1, s2, regex=True) res2 = df.copy() return_value = res2.replace(s1, s2, inplace=True, regex=True) assert return_value is None res3 = df.copy() return_value = res3.replace(regex=s1, value=s2, inplace=True) assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} ) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) def test_regex_replace_numeric_to_object_conversion(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]}) res = df.replace(0, "a") tm.assert_frame_equal(res, expec) assert res.a.dtype == np.object_ @pytest.mark.parametrize( "to_replace", [{"": np.nan, ",": ""}, {",": "", "": np.nan}] ) def test_joint_simple_replace_and_regex_replace(self, to_replace): # GH-39338 df = DataFrame( { "col1": ["1,000", "a", "3"], "col2": ["a", "", "b"], "col3": ["a", "b", "c"], } ) result = df.replace(regex=to_replace) expected = DataFrame( { "col1": ["1000", "a", "3"], "col2": ["a", np.nan, "b"], "col3": ["a", "b", "c"], } ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) def test_replace_regex_metachar(self, metachar): df = DataFrame({"a": [metachar, "else"]}) result = df.replace({"a": {metachar: "paren"}}) expected = DataFrame({"a": ["paren", "else"]}) tm.assert_frame_equal(result, expected) def test_replace(self, datetime_frame): datetime_frame["A"][:5] = np.nan datetime_frame["A"][-5:] = np.nan zero_filled = datetime_frame.replace(np.nan, -1e8) tm.assert_frame_equal(zero_filled, datetime_frame.fillna(-1e8)) tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), datetime_frame) datetime_frame["A"][:5] = np.nan datetime_frame["A"][-5:] = np.nan datetime_frame["B"][:5] = -1e8 # empty df = DataFrame(index=["a", "b"]) tm.assert_frame_equal(df, df.replace(5, 7)) # GH 11698 # test for mixed data types. df = DataFrame( [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] ) df1 = df.replace("-", np.nan) expected_df = DataFrame( [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))] ) tm.assert_frame_equal(df1, expected_df) def test_replace_list(self): obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN] to_replace_res = [r".", r"e"] values = [np.nan, "crap"] res = dfobj.replace(to_replace_res, values) expec = DataFrame( { "a": ["a", "b", np.nan, np.nan], "b": ["crap", "f", "g", "h"], "c": ["h", "crap", "l", "o"], } ) tm.assert_frame_equal(res, expec) # list of [v1, v2, ..., vN] -> [v1, v2, .., vN] to_replace_res = [r".", r"f"] values = [r"..", r"crap"] res = dfobj.replace(to_replace_res, values) expec = DataFrame( { "a": ["a", "b", "..", ".."], "b": ["e", "crap", "g", "h"], "c": ["h", "e", "l", "o"], } ) tm.assert_frame_equal(res, expec) def test_replace_with_empty_list(self, frame_or_series): # GH 21977 ser = Series([["a", "b"], [], np.nan, [1]]) obj = DataFrame({"col": ser}) if frame_or_series is Series: obj = ser expected = obj result = obj.replace([], np.nan) tm.assert_equal(result, expected) # GH 19266 msg = ( "NumPy boolean array indexing assignment cannot assign {size} " "input values to the 1 output values where the mask is true" ) with pytest.raises(ValueError, match=msg.format(size=0)): obj.replace({np.nan: []}) with pytest.raises(ValueError, match=msg.format(size=2)): obj.replace({np.nan: ["dummy", "alt"]}) def test_replace_series_dict(self): # from GH 3064 df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) result = df.replace(0, {"zero": 0.5, "one": 1.0}) expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}}) tm.assert_frame_equal(result, expected) result = df.replace(0, df.mean()) tm.assert_frame_equal(result, expected) # series to series/dict df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}}) s = Series({"zero": 0.0, "one": 2.0}) result = df.replace(s, {"zero": 0.5, "one": 1.0}) expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}}) tm.assert_frame_equal(result, expected) result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) m = {"foo": 1, "bar": 2, "bah": 3} rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes tm.assert_series_equal(expec, res) def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, -18) expected = float_string_frame.fillna(value=-18) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) result = float_string_frame.replace(np.nan, -1e8) expected = float_string_frame.fillna(value=-1e8) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) # int block upcasting df = DataFrame( { "A": Series([1.0, 2.0], dtype="float64"), "B": Series([0, 1], dtype="int64"), } ) expected = DataFrame( { "A": Series([1.0, 2.0], dtype="float64"), "B": Series([0.5, 1], dtype="float64"), } ) result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) return_value = df.replace(0, 0.5, inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) # int block splitting df = DataFrame( { "A": Series([1.0, 2.0], dtype="float64"), "B": Series([0, 1], dtype="int64"), "C": Series([1, 2], dtype="int64"), } ) expected = DataFrame( { "A": Series([1.0, 2.0], dtype="float64"), "B": Series([0.5, 1], dtype="float64"), "C": Series([1, 2], dtype="int64"), } ) result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) # to object block upcasting df = DataFrame( { "A": Series([1.0, 2.0], dtype="float64"), "B": Series([0, 1], dtype="int64"), } ) expected = DataFrame( { "A": Series([1, "foo"], dtype="object"), "B": Series([0, 1], dtype="int64"), } ) result = df.replace(2, "foo") tm.assert_frame_equal(result, expected) expected = DataFrame( { "A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object"), } ) result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) # test case from df = DataFrame( {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} ) result = df.replace(3, df.mean().to_dict()) expected = df.copy().astype("float64") m = df.mean() expected.iloc[0, 0] = m[0] expected.iloc[1, 1] = m[1] tm.assert_frame_equal(result, expected) def test_replace_simple_nested_dict(self): df = DataFrame({"col": range(1, 5)}) expected = DataFrame({"col": ["a", 2, 3, "b"]}) result = df.replace({"col": {1: "a", 4: "b"}}) tm.assert_frame_equal(expected, result) # in this case, should be the same as the not nested version result = df.replace({1: "a", 4: "b"}) tm.assert_frame_equal(expected, result) def test_replace_simple_nested_dict_with_nonexistent_value(self): df = DataFrame({"col": range(1, 5)}) expected = DataFrame({"col": ["a", 2, 3, "b"]}) result = df.replace({-1: "-", 1: "a", 4: "b"}) tm.assert_frame_equal(expected, result) result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) tm.assert_frame_equal(expected, result) def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] datetime_frame.iloc[0, 0] = np.nan datetime_frame.iloc[1, 0] = 1 result = datetime_frame.replace(to_replace={np.nan: 0}) expected = datetime_frame.T.replace(to_replace={np.nan: 0}).T tm.assert_frame_equal(result, expected) result = datetime_frame.replace(to_replace={np.nan: 0, 1: -1e8}) tsframe = datetime_frame.copy() tsframe.iloc[0, 0] = 0 tsframe.iloc[1, 0] = -1e8 expected = tsframe tm.assert_frame_equal(expected, result) datetime_frame.iloc[0, 0] = orig_value datetime_frame.iloc[1, 0] = orig2 def test_replace_for_new_dtypes(self, datetime_frame): # dtypes tsframe = datetime_frame.copy().astype(np.float32) tsframe["A"][:5] = np.nan tsframe["A"][-5:] = np.nan zero_filled = tsframe.replace(np.nan, -1e8) tm.assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe) tsframe["A"][:5] = np.nan tsframe["A"][-5:] = np.nan tsframe["B"][:5] = -1e8 b = tsframe["B"] b[b == -1e8] = np.nan tsframe["B"] = b result = tsframe.fillna(method="bfill") tm.assert_frame_equal(result, tsframe.fillna(method="bfill")) @pytest.mark.parametrize( "frame, to_replace, value, expected", [ (DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})), ( DataFrame({"ints": [1, 2, 3]}, dtype=np.int32), 1, 0, DataFrame({"ints": [0, 2, 3]}, dtype=np.int32), ), ( DataFrame({"ints": [1, 2, 3]}, dtype=np.int16), 1, 0, DataFrame({"ints": [0, 2, 3]}, dtype=np.int16), ), ( DataFrame({"bools": [True, False, True]}), False, True, DataFrame({"bools": [True, True, True]}), ), ( DataFrame({"complex": [1j, 2j, 3j]}), 1j, 0, DataFrame({"complex": [0j, 2j, 3j]}), ), ( DataFrame( { "datetime64": Index( [ datetime(2018, 5, 28), datetime(2018, 7, 28), datetime(2018, 5, 28), ] ) } ), datetime(2018, 5, 28), datetime(2018, 7, 28), DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}), ), # GH 20380 ( DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}), "foo", "bar", DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), ), ( DataFrame( { "A": date_range("20130101", periods=3, tz="US/Eastern"), "B": [0, np.nan, 2], } ), Timestamp("20130102", tz="US/Eastern"), Timestamp("20130104", tz="US/Eastern"), DataFrame( { "A": [ Timestamp("20130101", tz="US/Eastern"), Timestamp("20130104", tz="US/Eastern"), Timestamp("20130103", tz="US/Eastern"), ], "B": [0, np.nan, 2], } ), ), # GH 35376 ( DataFrame([[1, 1.0], [2, 2.0]]), 1.0, 5, DataFrame([[5, 5.0], [2, 2.0]]), ), ( DataFrame([[1, 1.0], [2, 2.0]]), 1, 5, DataFrame([[5, 5.0], [2, 2.0]]), ), ( DataFrame([[1, 1.0], [2, 2.0]]), 1.0, 5.0, DataFrame([[5, 5.0], [2, 2.0]]), ), ( DataFrame([[1, 1.0], [2, 2.0]]), 1, 5.0, DataFrame([[5, 5.0], [2, 2.0]]), ), ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): result = getattr(frame, "replace")(to_replace, value) tm.assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): # both dicts to_rep = {"A": np.nan, "B": 0, "C": ""} values = {"A": 0, "B": -1, "C": "missing"} df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} ) filled = df.replace(to_rep, values) expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()} tm.assert_frame_equal(filled, DataFrame(expected)) result = df.replace([0, 2, 5], [5, 2, 0]) expected = DataFrame( {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]} ) tm.assert_frame_equal(result, expected) # scalar to dict values = {"A": 0, "B": -1, "C": "missing"} df = DataFrame( {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} ) filled = df.replace(np.nan, values) expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()} tm.assert_frame_equal(filled, DataFrame(expected)) # list to list to_rep = [np.nan, 0, ""] values = [-2, -1, "missing"] result = df.replace(to_rep, values) expected = df.copy() for i in range(len(to_rep)): return_value = expected.replace(to_rep[i], values[i], inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) msg = r"Replacement lists must match in length\. Expecting 3 got 2" with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} ) # dict to scalar to_rep = {"A": np.nan, "B": 0, "C": ""} filled = df.replace(to_rep, 0) expected = {k: v.replace(to_rep[k], 0) for k, v in df.items()} tm.assert_frame_equal(filled, DataFrame(expected)) msg = "value argument must be scalar, dict, or Series" with pytest.raises(TypeError, match=msg): df.replace(to_rep, [np.nan, 0, ""]) # list to scalar to_rep = [np.nan, 0, ""] result = df.replace(to_rep, -1) expected = df.copy() for i in range(len(to_rep)): return_value = expected.replace(to_rep[i], -1, inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) def test_replace_limit(self): pass def test_replace_dict_no_regex(self): answer = Series( { 0: "Strongly Agree", 1: "Agree", 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", } ) weights = { "Agree": 4, "Disagree": 2, "Neutral": 3, "Strongly Agree": 5, "Strongly Disagree": 1, } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_series_no_regex(self): answer = Series( { 0: "Strongly Agree", 1: "Agree", 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", } ) weights = Series( { "Agree": 4, "Disagree": 2, "Neutral": 3, "Strongly Agree": 5, "Strongly Disagree": 1, } ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): df = DataFrame({"A": [np.nan, 1]}) res1 = df.replace(to_replace={np.nan: 0, 1: -1e8}) res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) expected = DataFrame({"A": [0, -1e8]}) tm.assert_frame_equal(res1, res2) tm.assert_frame_equal(res2, res3) tm.assert_frame_equal(res3, expected) def test_replace_doesnt_replace_without_regex(self): raw = """fol T_opp T_Dir T_Enh 0 1 0 0 vo 1 2 vr 0 0 2 2 0 0 0 3 3 0 bt 0""" df = pd.read_csv(StringIO(raw), sep=r"\s+") res = df.replace({r"\D": 1}) tm.assert_frame_equal(df, res) def test_replace_bool_with_string(self): df = DataFrame({"a": [True, False], "b": list("ab")}) result = df.replace(True, "a") expected = DataFrame({"a": ["a", False], "b": df.b}) tm.assert_frame_equal(result, expected) def test_replace_pure_bool_with_string_no_op(self): df = DataFrame(np.random.rand(2, 2) > 0.5) result = df.replace("asdf", "fdsa") tm.assert_frame_equal(df, result) def test_replace_bool_with_bool(self): df = DataFrame(np.random.rand(2, 2) > 0.5) result = df.replace(False, True) expected = DataFrame(np.ones((2, 2), dtype=bool)) tm.assert_frame_equal(result, expected) def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) result = df.replace({"asdf": "asdb", True: "yes"}) expected = DataFrame({0: ["yes", False], 1: [False, "yes"]}) tm.assert_frame_equal(result, expected) def test_replace_dict_strings_vs_ints(self): # GH#34789 df = DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) result = df.replace({"replace_string": "test"}) tm.assert_frame_equal(result, df) result = df["Y0"].replace({"replace_string": "test"}) tm.assert_series_equal(result, df["Y0"]) def test_replace_truthy(self): df = DataFrame({"a": [True, True]}) r = df.replace([np.inf, -np.inf], np.nan) e = df tm.assert_frame_equal(r, e) def test_nested_dict_overlapping_keys_replace_int(self): # GH 27660 keep behaviour consistent for simple dictionary and # nested dictionary replacement df = DataFrame({"a": list(range(1, 5))}) result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) tm.assert_frame_equal(result, expected) def test_nested_dict_overlapping_keys_replace_str(self): # GH 27660 a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) df = DataFrame({"a": astr}) result = df.replace(dict(zip(astr, bstr))) expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) def test_replace_swapping_bug(self): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) def test_replace_period(self): d = { "fname": { "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), "out_augmented_SUBSIDY_WEEK.json": pd.Period( year=2011, month=4, freq="M" ), "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), } } df = DataFrame( [ "out_augmented_AUG_2012.json", "out_augmented_SEP_2013.json", "out_augmented_SUBSIDY_WEEK.json", "out_augmented_MAY_2012.json", "out_augmented_MAY_2011.json", "out_augmented_AUG_2011.json", "out_augmented_JAN_2011.json", ], columns=["fname"], ) assert set(df.fname.values) == set(d["fname"].keys()) # We don't support converting object -> specialized EA in # replace yet. expected = DataFrame( {"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object ) result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetime(self): d = { "fname": { "out_augmented_AUG_2011.json": Timestamp("2011-08"), "out_augmented_JAN_2011.json": Timestamp("2011-01"), "out_augmented_MAY_2012.json": Timestamp("2012-05"), "out_augmented_SUBSIDY_WEEK.json": Timestamp("2011-04"), "out_augmented_AUG_2012.json": Timestamp("2012-08"), "out_augmented_MAY_2011.json": Timestamp("2011-05"), "out_augmented_SEP_2013.json": Timestamp("2013-09"), } } df = DataFrame( [ "out_augmented_AUG_2012.json", "out_augmented_SEP_2013.json", "out_augmented_SUBSIDY_WEEK.json", "out_augmented_MAY_2012.json", "out_augmented_MAY_2011.json", "out_augmented_AUG_2011.json", "out_augmented_JAN_2011.json", ], columns=["fname"], ) assert set(df.fname.values) == set(d["fname"].keys()) expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetimetz(self): # GH 11326 # behaving poorly when presented with a datetime64[ns, tz] df = DataFrame( { "A": date_range("20130101", periods=3, tz="US/Eastern"), "B": [0, np.nan, 2], } ) result = df.replace(np.nan, 1) expected = DataFrame( { "A": date_range("20130101", periods=3, tz="US/Eastern"), "B": Series([0, 1, 2], dtype="float64"), } ) tm.assert_frame_equal(result, expected) result = df.fillna(1) tm.assert_frame_equal(result, expected) result = df.replace(0, np.nan) expected = DataFrame( { "A": date_range("20130101", periods=3, tz="US/Eastern"), "B": [np.nan, np.nan, 2], } ) tm.assert_frame_equal(result, expected) result = df.replace( Timestamp("20130102", tz="US/Eastern"), Timestamp("20130104", tz="US/Eastern"), ) expected = DataFrame( { "A": [ Timestamp("20130101", tz="US/Eastern"), Timestamp("20130104", tz="US/Eastern"), Timestamp("20130103", tz="US/Eastern"), ], "B": [0, np.nan, 2], } ) tm.assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern")) tm.assert_frame_equal(result, expected) # coerce to object result = df.copy() result.iloc[1, 0] = np.nan result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific")) expected = DataFrame( { "A": [ Timestamp("20130101", tz="US/Eastern"), Timestamp("20130104", tz="US/Pacific"), Timestamp("20130103", tz="US/Eastern"), ], "B": [0, np.nan, 2], } ) tm.assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan result = result.replace({"A": np.nan}, Timestamp("20130104")) expected = DataFrame( { "A": [ Timestamp("20130101", tz="US/Eastern"), Timestamp("20130104"), Timestamp("20130103", tz="US/Eastern"), ], "B": [0, np.nan, 2], } ) tm.assert_frame_equal(result, expected) def test_replace_with_empty_dictlike(self, mix_abc): # GH 15289 df = DataFrame(mix_abc) tm.assert_frame_equal(df, df.replace({})) tm.assert_frame_equal(df, df.replace(Series([], dtype=object))) tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) @pytest.mark.parametrize( "to_replace, method, expected", [ (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), ( np.nan, "bfill", {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]}, ), ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), ( [0, 2], "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, ), ( [1, 2], "pad", {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, ), ( (1, 2), "bfill", {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, ), ( ["b", "c"], "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]}, ), ], ) def test_replace_method(self, to_replace, method, expected): # GH 19632 df = DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}) result = df.replace(to_replace=to_replace, value=None, method=method) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "replace_dict, final_data", [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], ) def test_categorical_replace_with_dict(self, replace_dict, final_data): # GH 26988 df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") final_data = np.array(final_data) a = pd.Categorical(final_data[:, 0], categories=[3, 2]) excat = [3, 2] if replace_dict["b"] == 1 else [1, 3] b = pd.Categorical(final_data[:, 1], categories=excat) expected = DataFrame({"a": a, "b": b}) result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) msg = ( r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" ) with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) return_value = df.replace(replace_dict, 3, inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( "df, to_replace, exp", [ ( {"col1": [1, 2, 3], "col2": [4, 5, 6]}, {4: 5, 5: 6, 6: 7}, {"col1": [1, 2, 3], "col2": [5, 6, 7]}, ), ( {"col1": [1, 2, 3], "col2": ["4", "5", "6"]}, {"4": "5", "5": "6", "6": "7"}, {"col1": [1, 2, 3], "col2": ["5", "6", "7"]}, ), ], ) def test_replace_commutative(self, df, to_replace, exp): # GH 16051 # DataFrame.replace() overwrites when values are non-numeric # also added to data frame whilst issue was for series df = DataFrame(df) expected = DataFrame(exp) result = df.replace(to_replace) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "replacer", [ Timestamp("20170827"), np.int8(1), np.int16(1), np.float32(1), np.float64(1), ], ) def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) tm.assert_frame_equal(result, expected) def test_replace_after_convert_dtypes(self): # GH31517 df = DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64") result = df.replace(1, 10) expected = DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64") tm.assert_frame_equal(result, expected) def test_replace_invalid_to_replace(self): # GH 18634 # API: replace() should raise an exception if invalid argument is given df = DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", "f "]}) msg = ( r"Expecting 'to_replace' to be either a scalar, array-like, " r"dict or None, got invalid type.*" ) with pytest.raises(TypeError, match=msg): df.replace(lambda x: x.strip()) @pytest.mark.parametrize("dtype", ["float", "float64", "int64", "Int64", "boolean"]) @pytest.mark.parametrize("value", [np.nan, pd.NA]) def test_replace_no_replacement_dtypes(self, dtype, value): # https://github.com/pandas-dev/pandas/issues/32988 df = DataFrame(np.eye(2), dtype=dtype) result = df.replace(to_replace=[None, -np.inf, np.inf], value=value) tm.assert_frame_equal(result, df) @pytest.mark.parametrize("replacement", [np.nan, 5]) def test_replace_with_duplicate_columns(self, replacement): # GH 24798 result = DataFrame({"A": [1, 2, 3], "A1": [4, 5, 6], "B": [7, 8, 9]}) result.columns = list("AAB") expected = DataFrame( {"A": [1, 2, 3], "A1": [4, 5, 6], "B": [replacement, 8, 9]} ) expected.columns = list("AAB") result["B"] = result["B"].replace(7, replacement) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("value", [pd.Period("2020-01"), pd.Interval(0, 5)]) def test_replace_ea_ignore_float(self, frame_or_series, value): # GH#34871 obj = DataFrame({"Per": [value] * 3}) if frame_or_series is not DataFrame: obj = obj["Per"] expected = obj.copy() result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained after replace with direct values """ # create input data input_dict = { "col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"], "col3": [1.5, 2.5, 3.5, 4.5], "col4": ["cat1", "cat2", "cat3", "cat4"], "col5": ["obj1", "obj2", "obj3", "obj4"], } # explicitly cast columns as category and order them input_df = DataFrame(data=input_dict).astype( {"col2": "category", "col4": "category"} ) input_df["col2"] = input_df["col2"].cat.reorder_categories( ["a", "b", "c", "d"], ordered=True ) input_df["col4"] = input_df["col4"].cat.reorder_categories( ["cat1", "cat2", "cat3", "cat4"], ordered=True ) # create expected dataframe expected_dict = { "col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "z"], "col3": [1.5, 2.5, 3.5, 4.5], "col4": ["cat1", "catX", "cat3", "cat4"], "col5": ["obj9", "obj2", "obj3", "obj4"], } # explicitly cast columns as category and order them expected = DataFrame(data=expected_dict).astype( {"col2": "category", "col4": "category"} ) expected["col2"] = expected["col2"].cat.reorder_categories( ["a", "b", "c", "z"], ordered=True ) expected["col4"] = expected["col4"].cat.reorder_categories( ["cat1", "catX", "cat3", "cat4"], ordered=True ) # replace values in input dataframe input_df = input_df.replace("d", "z") input_df = input_df.replace("obj1", "obj9") result = input_df.replace("cat2", "catX") tm.assert_frame_equal(result, expected) @pytest.mark.xfail( reason="category dtype gets changed to object type after replace, see #35268", strict=True, ) def test_replace_dict_category_type(self, input_category_df, expected_category_df): """ Test to ensure category dtypes are maintained after replace with dict values """ # create input dataframe input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} # explicitly cast columns as category input_df = DataFrame(data=input_dict).astype( {"col1": "category", "col2": "category", "col3": "category"} ) # create expected dataframe expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]} # explicitly cast columns as category expected = DataFrame(data=expected_dict).astype( {"col1": "category", "col2": "category", "col3": "category"} ) # replace values in input dataframe using a dict result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) def test_replace_with_compiled_regex(self): # https://github.com/pandas-dev/pandas/issues/35680 df = DataFrame(["a", "b", "c"]) regex = re.compile("^a$") result = df.replace({regex: "z"}, regex=True) expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): # GH: 16784 columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}} df1 = DataFrame({"positive": np.ones(3)}) result = df1.replace(columns_values_map) expected = DataFrame({"positive": np.ones(3)}) tm.assert_frame_equal(result, expected) def test_replace_bytes(self, frame_or_series): # GH#38900 obj = frame_or_series(["o"]).astype("|S") expected = obj.copy() obj = obj.replace({None: np.nan}) tm.assert_equal(obj, expected)