import re import numpy as np import pytest import pandas as pd from pandas import ( DataFrame, lreshape, melt, wide_to_long, ) import pandas._testing as tm @pytest.fixture def df(): res = tm.makeTimeDataFrame()[:10] res["id1"] = (res["A"] > 0).astype(np.int64) res["id2"] = (res["B"] > 0).astype(np.int64) return res @pytest.fixture def df1(): res = DataFrame( [ [1.067683, -1.110463, 0.20867], [-1.321405, 0.368915, -1.055342], [-0.807333, 0.08298, -0.873361], ] ) res.columns = [list("ABC"), list("abc")] res.columns.names = ["CAP", "low"] return res @pytest.fixture def var_name(): return "var" @pytest.fixture def value_name(): return "val" class TestMelt: def test_top_level_method(self, df): result = melt(df) assert result.columns.tolist() == ["variable", "value"] def test_method_signatures(self, df, df1, var_name, value_name): tm.assert_frame_equal(df.melt(), melt(df)) tm.assert_frame_equal( df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]), melt(df, id_vars=["id1", "id2"], value_vars=["A", "B"]), ) tm.assert_frame_equal( df.melt(var_name=var_name, value_name=value_name), melt(df, var_name=var_name, value_name=value_name), ) tm.assert_frame_equal(df1.melt(col_level=0), melt(df1, col_level=0)) def test_default_col_names(self, df): result = df.melt() assert result.columns.tolist() == ["variable", "value"] result1 = df.melt(id_vars=["id1"]) assert result1.columns.tolist() == ["id1", "variable", "value"] result2 = df.melt(id_vars=["id1", "id2"]) assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 result4 = df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]) expected4 = DataFrame( { "id1": df["id1"].tolist() * 2, "id2": df["id2"].tolist() * 2, "variable": ["A"] * 10 + ["B"] * 10, "value": (df["A"].tolist() + df["B"].tolist()), }, columns=["id1", "id2", "variable", "value"], ) tm.assert_frame_equal(result4, expected4) @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 expected = DataFrame( { "id1": df["id1"].tolist() * 2, "id2": df["id2"].tolist() * 2, "variable": ["A"] * 10 + ["B"] * 10, "value": (df["A"].tolist() + df["B"].tolist()), }, columns=["id1", "id2", "variable", "value"], ) result = df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B"))) tm.assert_frame_equal(result, expected) def test_vars_work_with_multiindex(self, df1): expected = DataFrame( { ("A", "a"): df1[("A", "a")], "CAP": ["B"] * len(df1), "low": ["b"] * len(df1), "value": df1[("B", "b")], }, columns=[("A", "a"), "CAP", "low", "value"], ) result = df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "id_vars, value_vars, col_level, expected", [ ( ["A"], ["B"], 0, DataFrame( { "A": {0: 1.067683, 1: -1.321405, 2: -0.807333}, "CAP": {0: "B", 1: "B", 2: "B"}, "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, } ), ), ( ["a"], ["b"], 1, DataFrame( { "a": {0: 1.067683, 1: -1.321405, 2: -0.807333}, "low": {0: "b", 1: "b", 2: "b"}, "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, } ), ), ], ) def test_single_vars_work_with_multiindex( self, id_vars, value_vars, col_level, expected, df1 ): result = df1.melt(id_vars, value_vars, col_level=col_level) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "id_vars, value_vars", [ [("A", "a"), [("B", "b")]], [[("A", "a")], ("B", "b")], [("A", "a"), ("B", "b")], ], ) def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): # melt should fail with an informative error message if # the columns have a MultiIndex and a tuple is passed # for id_vars or value_vars. msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex" with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] result6 = df.melt(id_vars=["id1"], var_name=var_name) assert result6.columns.tolist() == ["id1", "var", "value"] result7 = df.melt(id_vars=["id1", "id2"], var_name=var_name) assert result7.columns.tolist() == ["id1", "id2", "var", "value"] result8 = df.melt(id_vars=["id1", "id2"], value_vars="A", var_name=var_name) assert result8.columns.tolist() == ["id1", "id2", "var", "value"] result9 = df.melt( id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=var_name ) expected9 = DataFrame( { "id1": df["id1"].tolist() * 2, "id2": df["id2"].tolist() * 2, var_name: ["A"] * 10 + ["B"] * 10, "value": (df["A"].tolist() + df["B"].tolist()), }, columns=["id1", "id2", var_name, "value"], ) tm.assert_frame_equal(result9, expected9) def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] result11 = df.melt(id_vars=["id1"], value_name=value_name) assert result11.columns.tolist() == ["id1", "variable", "val"] result12 = df.melt(id_vars=["id1", "id2"], value_name=value_name) assert result12.columns.tolist() == ["id1", "id2", "variable", "val"] result13 = df.melt( id_vars=["id1", "id2"], value_vars="A", value_name=value_name ) assert result13.columns.tolist() == ["id1", "id2", "variable", "val"] result14 = df.melt( id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=value_name ) expected14 = DataFrame( { "id1": df["id1"].tolist() * 2, "id2": df["id2"].tolist() * 2, "variable": ["A"] * 10 + ["B"] * 10, value_name: (df["A"].tolist() + df["B"].tolist()), }, columns=["id1", "id2", "variable", value_name], ) tm.assert_frame_equal(result14, expected14) def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] result16 = df.melt(id_vars=["id1"], var_name=var_name, value_name=value_name) assert result16.columns.tolist() == ["id1", "var", "val"] result17 = df.melt( id_vars=["id1", "id2"], var_name=var_name, value_name=value_name ) assert result17.columns.tolist() == ["id1", "id2", "var", "val"] result18 = df.melt( id_vars=["id1", "id2"], value_vars="A", var_name=var_name, value_name=value_name, ) assert result18.columns.tolist() == ["id1", "id2", "var", "val"] result19 = df.melt( id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=var_name, value_name=value_name, ) expected19 = DataFrame( { "id1": df["id1"].tolist() * 2, "id2": df["id2"].tolist() * 2, var_name: ["A"] * 10 + ["B"] * 10, value_name: (df["A"].tolist() + df["B"].tolist()), }, columns=["id1", "id2", var_name, value_name], ) tm.assert_frame_equal(result19, expected19) df20 = df.copy() df20.columns.name = "foo" result20 = df20.melt() assert result20.columns.tolist() == ["foo", "value"] @pytest.mark.parametrize("col_level", [0, "CAP"]) def test_col_level(self, col_level, df1): res = df1.melt(col_level=col_level) assert res.columns.tolist() == ["CAP", "value"] def test_multiindex(self, df1): res = df1.melt() assert res.columns.tolist() == ["CAP", "low", "value"] @pytest.mark.parametrize( "col", [ pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")), pd.Series(["a", "b", "c", "a", "d"], dtype="category"), pd.Series([0, 1, 0, 0, 0]), ], ) def test_pandas_dtypes(self, col): # GH 15785 df = DataFrame( {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col} ) expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True) result = melt( df, id_vars=["klass", "col"], var_name="attribute", value_name="value" ) expected = DataFrame( { 0: list(range(5)) * 2, 1: pd.concat([col] * 2, ignore_index=True), 2: ["attr1"] * 5 + ["attr2"] * 5, 3: expected_value, } ) expected.columns = ["klass", "col", "attribute", "value"] tm.assert_frame_equal(result, expected) def test_preserve_category(self): # GH 15853 data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])}) result = melt(data, ["B"], ["A"]) expected = DataFrame( {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]} ) tm.assert_frame_equal(result, expected) def test_melt_missing_columns_raises(self): # GH-23575 # This test is to ensure that pandas raises an error if melting is # attempted with column names absent from the dataframe # Generate data df = DataFrame(np.random.randn(5, 4), columns=list("abcd")) # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises( KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]") ): df.melt(["a", "b"], ["C", "d"]) # Try to melt with missing `id_vars` column name with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")): df.melt(["A", "b"], ["c", "d"]) # Multiple missing with pytest.raises( KeyError, match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"), ): df.melt(["a", "b", "not_here", "or_there"], ["c", "d"]) # Multiindex melt fails if column is missing from multilevel melt multi = df.copy() multi.columns = [list("ABCD"), list("abcd")] with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")): multi.melt([("E", "a")], [("B", "b")]) # Multiindex fails if column is missing from single level melt with pytest.raises( KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]") ): multi.melt(["A"], ["F"], col_level=0) def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"]) expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"]}) result = melt(df, value_vars=[0, "a"]) expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]}) tm.assert_frame_equal(result, expected) def test_ignore_index(self): # GH 17440 df = DataFrame({"foo": [0], "bar": [1]}, index=["first"]) result = melt(df, ignore_index=False) expected = DataFrame( {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"] ) tm.assert_frame_equal(result, expected) def test_ignore_multiindex(self): # GH 17440 index = pd.MultiIndex.from_tuples( [("first", "second"), ("first", "third")], names=["baz", "foobar"] ) df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index) result = melt(df, ignore_index=False) expected_index = pd.MultiIndex.from_tuples( [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"] ) expected = DataFrame( {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]}, index=expected_index, ) tm.assert_frame_equal(result, expected) def test_ignore_index_name_and_type(self): # GH 17440 index = pd.Index(["foo", "bar"], dtype="category", name="baz") df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index) result = melt(df, ignore_index=False) expected_index = pd.Index(["foo", "bar"] * 2, dtype="category", name="baz") expected = DataFrame( {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]}, index=expected_index, ) tm.assert_frame_equal(result, expected) def test_melt_with_duplicate_columns(self): # GH#41951 df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"]) result = df.melt(id_vars=["a"], value_vars=["b"]) expected = DataFrame( [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"] ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["Int8", "Int64"]) def test_melt_ea_dtype(self, dtype): # GH#41570 df = DataFrame( { "a": pd.Series([1, 2], dtype="Int8"), "b": pd.Series([3, 4], dtype=dtype), } ) result = df.melt() expected = DataFrame( { "variable": ["a", "a", "b", "b"], "value": pd.Series([1, 2, 3, 4], dtype=dtype), } ) tm.assert_frame_equal(result, expected) class TestLreshape: def test_pairs(self): data = { "birthdt": [ "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009", ], "birthwt": [1766, 3301, 1454, 3139, 4133], "id": [101, 102, 103, 104, 105], "sex": ["Male", "Female", "Female", "Female", "Female"], "visitdt1": [ "11jan2009", "22dec2008", "04jan2009", "29dec2008", "20jan2009", ], "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"], "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"], "wt1": [1823, 3338, 1549, 3298, 4306], "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0], "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0], } df = DataFrame(data) spec = { "visitdt": [f"visitdt{i:d}" for i in range(1, 4)], "wt": [f"wt{i:d}" for i in range(1, 4)], } result = lreshape(df, spec) exp_data = { "birthdt": [ "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009", "08jan2009", "30dec2008", "21dec2008", "11jan2009", "08jan2009", "21dec2008", "11jan2009", ], "birthwt": [ 1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, 4133, 1766, 3139, 4133, ], "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105], "sex": [ "Male", "Female", "Female", "Female", "Female", "Male", "Female", "Female", "Female", "Male", "Female", "Female", ], "visitdt": [ "11jan2009", "22dec2008", "04jan2009", "29dec2008", "20jan2009", "21jan2009", "22jan2009", "31dec2008", "03feb2009", "05feb2009", "02jan2009", "15feb2009", ], "wt": [ 1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0, ], } exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) result = lreshape(df, spec, dropna=False) exp_data = { "birthdt": [ "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009", "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009", "08jan2009", "20dec2008", "30dec2008", "21dec2008", "11jan2009", ], "birthwt": [ 1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, 3139, 4133, ], "id": [ 101, 102, 103, 104, 105, 101, 102, 103, 104, 105, 101, 102, 103, 104, 105, ], "sex": [ "Male", "Female", "Female", "Female", "Female", "Male", "Female", "Female", "Female", "Female", "Male", "Female", "Female", "Female", "Female", ], "visitdt": [ "11jan2009", "22dec2008", "04jan2009", "29dec2008", "20jan2009", "21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009", "05feb2009", np.nan, np.nan, "02jan2009", "15feb2009", ], "wt": [ 1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, np.nan, 1892.0, 3338.0, 4575.0, 2293.0, np.nan, np.nan, 3377.0, 4805.0, ], } exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) spec = { "visitdt": [f"visitdt{i:d}" for i in range(1, 3)], "wt": [f"wt{i:d}" for i in range(1, 4)], } msg = "All column lists must be same length" with pytest.raises(ValueError, match=msg): lreshape(df, spec) class TestWideToLong: def test_simple(self): np.random.seed(123) x = np.random.randn(3) df = DataFrame( { "A1970": {0: "a", 1: "b", 2: "c"}, "A1980": {0: "d", 1: "e", 2: "f"}, "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, "X": dict(zip(range(3), x)), } ) df["id"] = df.index exp_data = { "X": x.tolist() + x.tolist(), "A": ["a", "b", "c", "d", "e", "f"], "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], "year": [1970, 1970, 1970, 1980, 1980, 1980], "id": [0, 1, 2, 0, 1, 2], } expected = DataFrame(exp_data) expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_stubs(self): # GH9204 wide_to_long call should not modify 'stubs' list df = DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]]) df.columns = ["id", "inc1", "inc2", "edu1", "edu2"] stubs = ["inc", "edu"] wide_to_long(df, stubs, i="id", j="age") assert stubs == ["inc", "edu"] def test_separating_character(self): # GH14779 np.random.seed(123) x = np.random.randn(3) df = DataFrame( { "A.1970": {0: "a", 1: "b", 2: "c"}, "A.1980": {0: "d", 1: "e", 2: "f"}, "B.1970": {0: 2.5, 1: 1.2, 2: 0.7}, "B.1980": {0: 3.2, 1: 1.3, 2: 0.1}, "X": dict(zip(range(3), x)), } ) df["id"] = df.index exp_data = { "X": x.tolist() + x.tolist(), "A": ["a", "b", "c", "d", "e", "f"], "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], "year": [1970, 1970, 1970, 1980, 1980, 1980], "id": [0, 1, 2, 0, 1, 2], } expected = DataFrame(exp_data) expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") tm.assert_frame_equal(result, expected) def test_escapable_characters(self): np.random.seed(123) x = np.random.randn(3) df = DataFrame( { "A(quarterly)1970": {0: "a", 1: "b", 2: "c"}, "A(quarterly)1980": {0: "d", 1: "e", 2: "f"}, "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7}, "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1}, "X": dict(zip(range(3), x)), } ) df["id"] = df.index exp_data = { "X": x.tolist() + x.tolist(), "A(quarterly)": ["a", "b", "c", "d", "e", "f"], "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], "year": [1970, 1970, 1970, 1980, 1980, 1980], "id": [0, 1, 2, 0, 1, 2], } expected = DataFrame(exp_data) expected = expected.set_index(["id", "year"])[ ["X", "A(quarterly)", "B(quarterly)"] ] result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_unbalanced(self): # test that we can have a varying amount of time variables df = DataFrame( { "A2010": [1.0, 2.0], "A2011": [3.0, 4.0], "B2010": [5.0, 6.0], "X": ["X1", "X2"], } ) df["id"] = df.index exp_data = { "X": ["X1", "X2", "X1", "X2"], "A": [1.0, 2.0, 3.0, 4.0], "B": [5.0, 6.0, np.nan, np.nan], "id": [0, 1, 0, 1], "year": [2010, 2010, 2011, 2011], } expected = DataFrame(exp_data) expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_character_overlap(self): # Test we handle overlapping characters in both id_vars and value_vars df = DataFrame( { "A11": ["a11", "a22", "a33"], "A12": ["a21", "a22", "a23"], "B11": ["b11", "b12", "b13"], "B12": ["b21", "b22", "b23"], "BB11": [1, 2, 3], "BB12": [4, 5, 6], "BBBX": [91, 92, 93], "BBBZ": [91, 92, 93], } ) df["id"] = df.index expected = DataFrame( { "BBBX": [91, 92, 93, 91, 92, 93], "BBBZ": [91, 92, 93, 91, 92, 93], "A": ["a11", "a22", "a33", "a21", "a22", "a23"], "B": ["b11", "b12", "b13", "b21", "b22", "b23"], "BB": [1, 2, 3, 4, 5, 6], "id": [0, 1, 2, 0, 1, 2], "year": [11, 11, 11, 12, 12, 12], } ) expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]] result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_invalid_separator(self): # if an invalid separator is supplied a empty data frame is returned sep = "nope!" df = DataFrame( { "A2010": [1.0, 2.0], "A2011": [3.0, 4.0], "B2010": [5.0, 6.0], "X": ["X1", "X2"], } ) df["id"] = df.index exp_data = { "X": "", "A2010": [], "A2011": [], "B2010": [], "id": [], "year": [], "A": [], "B": [], } expected = DataFrame(exp_data).astype({"year": np.int64}) expected = expected.set_index(["id", "year"])[ ["X", "A2010", "A2011", "B2010", "A", "B"] ] expected.index = expected.index.set_levels([0, 1], level=0) result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep) tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_num_string_disambiguation(self): # Test that we can disambiguate number value_vars from # string value_vars df = DataFrame( { "A11": ["a11", "a22", "a33"], "A12": ["a21", "a22", "a23"], "B11": ["b11", "b12", "b13"], "B12": ["b21", "b22", "b23"], "BB11": [1, 2, 3], "BB12": [4, 5, 6], "Arating": [91, 92, 93], "Arating_old": [91, 92, 93], } ) df["id"] = df.index expected = DataFrame( { "Arating": [91, 92, 93, 91, 92, 93], "Arating_old": [91, 92, 93, 91, 92, 93], "A": ["a11", "a22", "a33", "a21", "a22", "a23"], "B": ["b11", "b12", "b13", "b21", "b22", "b23"], "BB": [1, 2, 3, 4, 5, 6], "id": [0, 1, 2, 0, 1, 2], "year": [11, 11, 11, 12, 12, 12], } ) expected = expected.set_index(["id", "year"])[ ["Arating", "Arating_old", "A", "B", "BB"] ] result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_invalid_suffixtype(self): # If all stubs names end with a string, but a numeric suffix is # assumed, an empty data frame is returned df = DataFrame( { "Aone": [1.0, 2.0], "Atwo": [3.0, 4.0], "Bone": [5.0, 6.0], "X": ["X1", "X2"], } ) df["id"] = df.index exp_data = { "X": "", "Aone": [], "Atwo": [], "Bone": [], "id": [], "year": [], "A": [], "B": [], } expected = DataFrame(exp_data).astype({"year": np.int64}) expected = expected.set_index(["id", "year"]) expected.index = expected.index.set_levels([0, 1], level=0) result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_multiple_id_columns(self): # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm df = DataFrame( { "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], } ) expected = DataFrame( { "ht": [ 2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9, ], "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], } ) expected = expected.set_index(["famid", "birth", "age"])[["ht"]] result = wide_to_long(df, "ht", i=["famid", "birth"], j="age") tm.assert_frame_equal(result, expected) def test_non_unique_idvars(self): # GH16382 # Raise an error message if non unique id vars (i) are passed df = DataFrame( {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]} ) msg = "the id variables need to uniquely identify each row" with pytest.raises(ValueError, match=msg): wide_to_long(df, ["A_A", "B_B"], i="x", j="colname") def test_cast_j_int(self): df = DataFrame( { "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"], "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"], "actor_fb_likes_1": [1000.0, 40000.0, 11000.0], "actor_fb_likes_2": [936.0, 5000.0, 393.0], "title": ["Avatar", "Pirates of the Caribbean", "Spectre"], } ) expected = DataFrame( { "actor": [ "CCH Pounder", "Johnny Depp", "Christoph Waltz", "Joel David Moore", "Orlando Bloom", "Rory Kinnear", ], "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0], "num": [1, 1, 1, 2, 2, 2], "title": [ "Avatar", "Pirates of the Caribbean", "Spectre", "Avatar", "Pirates of the Caribbean", "Spectre", ], } ).set_index(["title", "num"]) result = wide_to_long( df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_" ) tm.assert_frame_equal(result, expected) def test_identical_stubnames(self): df = DataFrame( { "A2010": [1.0, 2.0], "A2011": [3.0, 4.0], "B2010": [5.0, 6.0], "A": ["X1", "X2"], } ) msg = "stubname can't be identical to a column name" with pytest.raises(ValueError, match=msg): wide_to_long(df, ["A", "B"], i="A", j="colname") def test_nonnumeric_suffix(self): df = DataFrame( { "treatment_placebo": [1.0, 2.0], "treatment_test": [3.0, 4.0], "result_placebo": [5.0, 6.0], "A": ["X1", "X2"], } ) expected = DataFrame( { "A": ["X1", "X2", "X1", "X2"], "colname": ["placebo", "placebo", "test", "test"], "result": [5.0, 6.0, np.nan, np.nan], "treatment": [1.0, 2.0, 3.0, 4.0], } ) expected = expected.set_index(["A", "colname"]) result = wide_to_long( df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_" ) tm.assert_frame_equal(result, expected) def test_mixed_type_suffix(self): df = DataFrame( { "A": ["X1", "X2"], "result_1": [0, 9], "result_foo": [5.0, 6.0], "treatment_1": [1.0, 2.0], "treatment_foo": [3.0, 4.0], } ) expected = DataFrame( { "A": ["X1", "X2", "X1", "X2"], "colname": ["1", "1", "foo", "foo"], "result": [0.0, 9.0, 5.0, 6.0], "treatment": [1.0, 2.0, 3.0, 4.0], } ).set_index(["A", "colname"]) result = wide_to_long( df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_" ) tm.assert_frame_equal(result, expected) def test_float_suffix(self): df = DataFrame( { "treatment_1.1": [1.0, 2.0], "treatment_2.1": [3.0, 4.0], "result_1.2": [5.0, 6.0], "result_1": [0, 9], "A": ["X1", "X2"], } ) expected = DataFrame( { "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"], "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1], "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan], "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0], } ) expected = expected.set_index(["A", "colname"]) result = wide_to_long( df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_" ) tm.assert_frame_equal(result, expected) def test_col_substring_of_stubname(self): # GH22468 # Don't raise ValueError when a column name is a substring # of a stubname that's been passed as a string wide_data = { "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81}, "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6}, "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67}, "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}, } wide_df = DataFrame.from_dict(wide_data) expected = wide_to_long(wide_df, stubnames=["PA"], i=["node_id", "A"], j="time") result = wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") tm.assert_frame_equal(result, expected) def test_raise_of_column_name_value(self): # GH34731, enforced in 2.0 # raise a ValueError if the resultant value column name matches # a name in the dataframe already (default name is "value") df = DataFrame({"col": list("ABC"), "value": range(10, 16, 2)}) with pytest.raises( ValueError, match=re.escape("value_name (value) cannot match") ): df.melt(id_vars="value", value_name="value") @pytest.mark.parametrize("dtype", ["O", "string"]) def test_missing_stubname(self, dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) df = df.astype({"id": dtype}) result = wide_to_long( df, stubnames=["a", "b"], i="id", j="num", sep="-", ) index = pd.Index( [("1", 1), ("2", 1), ("1", 2), ("2", 2)], name=("id", "num"), ) expected = DataFrame( {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) new_level = expected.index.levels[0].astype(dtype) expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected)