import json import numpy as np import pytest from pandas import ( DataFrame, Index, Series, json_normalize, ) import pandas._testing as tm from pandas.io.json._normalize import nested_to_record @pytest.fixture def deep_nested(): # deeply nested data return [ { "country": "USA", "states": [ { "name": "California", "cities": [ {"name": "San Francisco", "pop": 12345}, {"name": "Los Angeles", "pop": 12346}, ], }, { "name": "Ohio", "cities": [ {"name": "Columbus", "pop": 1234}, {"name": "Cleveland", "pop": 1236}, ], }, ], }, { "country": "Germany", "states": [ {"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]}, { "name": "Nordrhein-Westfalen", "cities": [ {"name": "Duesseldorf", "pop": 1238}, {"name": "Koeln", "pop": 1239}, ], }, ], }, ] @pytest.fixture def state_data(): return [ { "counties": [ {"name": "Dade", "population": 12345}, {"name": "Broward", "population": 40000}, {"name": "Palm Beach", "population": 60000}, ], "info": {"governor": "Rick Scott"}, "shortname": "FL", "state": "Florida", }, { "counties": [ {"name": "Summit", "population": 1234}, {"name": "Cuyahoga", "population": 1337}, ], "info": {"governor": "John Kasich"}, "shortname": "OH", "state": "Ohio", }, ] @pytest.fixture def author_missing_data(): return [ {"info": None}, { "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"}, "author_name": {"first": "Jane", "last_name": "Doe"}, }, ] @pytest.fixture def missing_metadata(): return [ { "name": "Alice", "addresses": [ { "number": 9562, "street": "Morris St.", "city": "Massillon", "state": "OH", "zip": 44646, } ], "previous_residences": {"cities": [{"city_name": "Foo York City"}]}, }, { "addresses": [ { "number": 8449, "street": "Spring St.", "city": "Elizabethton", "state": "TN", "zip": 37643, } ], "previous_residences": {"cities": [{"city_name": "Barmingham"}]}, }, ] @pytest.fixture def max_level_test_input_data(): """ input data to test json_normalize with max_level param """ return [ { "CreatedBy": {"Name": "User001"}, "Lookup": { "TextField": "Some text", "UserField": {"Id": "ID001", "Name": "Name001"}, }, "Image": {"a": "b"}, } ] class TestJSONNormalize: def test_simple_records(self): recs = [ {"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}, {"a": 7, "b": 8, "c": 9}, {"a": 10, "b": 11, "c": 12}, ] result = json_normalize(recs) expected = DataFrame(recs) tm.assert_frame_equal(result, expected) def test_simple_normalize(self, state_data): result = json_normalize(state_data[0], "counties") expected = DataFrame(state_data[0]["counties"]) tm.assert_frame_equal(result, expected) result = json_normalize(state_data, "counties") expected = [] for rec in state_data: expected.extend(rec["counties"]) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) result = json_normalize(state_data, "counties", meta="state") expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2]) tm.assert_frame_equal(result, expected) def test_empty_array(self): result = json_normalize([]) expected = DataFrame() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "data, record_path, exception_type", [ ([{"a": 0}, {"a": 1}], None, None), ({"a": [{"a": 0}, {"a": 1}]}, "a", None), ('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError), (None, None, NotImplementedError), ], ) def test_accepted_input(self, data, record_path, exception_type): if exception_type is not None: with pytest.raises(exception_type, match=tm.EMPTY_STRING_PATTERN): json_normalize(data, record_path=record_path) else: result = json_normalize(data, record_path=record_path) expected = DataFrame([0, 1], columns=["a"]) tm.assert_frame_equal(result, expected) def test_simple_normalize_with_separator(self, deep_nested): # GH 14883 result = json_normalize({"A": {"A": 1, "B": 2}}) expected = DataFrame([[1, 2]], columns=["A.A", "A.B"]) tm.assert_frame_equal(result.reindex_like(expected), expected) result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_") expected = DataFrame([[1, 2]], columns=["A_A", "A_B"]) tm.assert_frame_equal(result.reindex_like(expected), expected) result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3") expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"]) tm.assert_frame_equal(result.reindex_like(expected), expected) result = json_normalize( deep_nested, ["states", "cities"], meta=["country", ["states", "name"]], sep="_", ) expected = Index(["name", "pop", "country", "states_name"]).sort_values() assert result.columns.sort_values().equals(expected) def test_normalize_with_multichar_separator(self): # GH #43831 data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}} result = json_normalize(data, sep="__") expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"]) tm.assert_frame_equal(result, expected) def test_value_array_record_prefix(self): # GH 21536 result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.") expected = DataFrame([[1], [2]], columns=["Prefix.0"]) tm.assert_frame_equal(result, expected) def test_nested_object_record_path(self): # GH 22706 data = { "state": "Florida", "info": { "governor": "Rick Scott", "counties": [ {"name": "Dade", "population": 12345}, {"name": "Broward", "population": 40000}, {"name": "Palm Beach", "population": 60000}, ], }, } result = json_normalize(data, record_path=["info", "counties"]) expected = DataFrame( [["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]], columns=["name", "population"], ) tm.assert_frame_equal(result, expected) def test_more_deeply_nested(self, deep_nested): result = json_normalize( deep_nested, ["states", "cities"], meta=["country", ["states", "name"]] ) ex_data = { "country": ["USA"] * 4 + ["Germany"] * 3, "states.name": [ "California", "California", "Ohio", "Ohio", "Bayern", "Nordrhein-Westfalen", "Nordrhein-Westfalen", ], "name": [ "San Francisco", "Los Angeles", "Columbus", "Cleveland", "Munich", "Duesseldorf", "Koeln", ], "pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239], } expected = DataFrame(ex_data, columns=result.columns) tm.assert_frame_equal(result, expected) def test_shallow_nested(self): data = [ { "state": "Florida", "shortname": "FL", "info": {"governor": "Rick Scott"}, "counties": [ {"name": "Dade", "population": 12345}, {"name": "Broward", "population": 40000}, {"name": "Palm Beach", "population": 60000}, ], }, { "state": "Ohio", "shortname": "OH", "info": {"governor": "John Kasich"}, "counties": [ {"name": "Summit", "population": 1234}, {"name": "Cuyahoga", "population": 1337}, ], }, ] result = json_normalize( data, "counties", ["state", "shortname", ["info", "governor"]] ) ex_data = { "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"], "state": ["Florida"] * 3 + ["Ohio"] * 2, "shortname": ["FL", "FL", "FL", "OH", "OH"], "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2, "population": [12345, 40000, 60000, 1234, 1337], } expected = DataFrame(ex_data, columns=result.columns) tm.assert_frame_equal(result, expected) def test_nested_meta_path_with_nested_record_path(self, state_data): # GH 27220 result = json_normalize( data=state_data, record_path=["counties"], meta=["state", "shortname", ["info", "governor"]], errors="ignore", ) ex_data = { "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"], "population": [12345, 40000, 60000, 1234, 1337], "state": ["Florida"] * 3 + ["Ohio"] * 2, "shortname": ["FL"] * 3 + ["OH"] * 2, "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2, } expected = DataFrame(ex_data) tm.assert_frame_equal(result, expected) def test_meta_name_conflict(self): data = [ { "foo": "hello", "bar": "there", "data": [ {"foo": "something", "bar": "else"}, {"foo": "something2", "bar": "else2"}, ], } ] msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix" with pytest.raises(ValueError, match=msg): json_normalize(data, "data", meta=["foo", "bar"]) result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta") for val in ["metafoo", "metabar", "foo", "bar"]: assert val in result def test_meta_parameter_not_modified(self): # GH 18610 data = [ { "foo": "hello", "bar": "there", "data": [ {"foo": "something", "bar": "else"}, {"foo": "something2", "bar": "else2"}, ], } ] COLUMNS = ["foo", "bar"] result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta") assert COLUMNS == ["foo", "bar"] for val in ["metafoo", "metabar", "foo", "bar"]: assert val in result def test_record_prefix(self, state_data): result = json_normalize(state_data[0], "counties") expected = DataFrame(state_data[0]["counties"]) tm.assert_frame_equal(result, expected) result = json_normalize( state_data, "counties", meta="state", record_prefix="county_" ) expected = [] for rec in state_data: expected.extend(rec["counties"]) expected = DataFrame(expected) expected = expected.rename(columns=lambda x: "county_" + x) expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2]) tm.assert_frame_equal(result, expected) def test_non_ascii_key(self): testjson = ( b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]' ).decode("utf8") testdata = { b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1], "sub.A": [1, 3], "sub.B": [2, 4], } expected = DataFrame(testdata) result = json_normalize(json.loads(testjson)) tm.assert_frame_equal(result, expected) def test_missing_field(self, author_missing_data): # GH20030: result = json_normalize(author_missing_data) ex_data = [ { "info": np.nan, "info.created_at": np.nan, "info.last_updated": np.nan, "author_name.first": np.nan, "author_name.last_name": np.nan, }, { "info": None, "info.created_at": "11/08/1993", "info.last_updated": "26/05/2012", "author_name.first": "Jane", "author_name.last_name": "Doe", }, ] expected = DataFrame(ex_data) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "max_level,expected", [ ( 0, [ { "TextField": "Some text", "UserField": {"Id": "ID001", "Name": "Name001"}, "CreatedBy": {"Name": "User001"}, "Image": {"a": "b"}, }, { "TextField": "Some text", "UserField": {"Id": "ID001", "Name": "Name001"}, "CreatedBy": {"Name": "User001"}, "Image": {"a": "b"}, }, ], ), ( 1, [ { "TextField": "Some text", "UserField.Id": "ID001", "UserField.Name": "Name001", "CreatedBy": {"Name": "User001"}, "Image": {"a": "b"}, }, { "TextField": "Some text", "UserField.Id": "ID001", "UserField.Name": "Name001", "CreatedBy": {"Name": "User001"}, "Image": {"a": "b"}, }, ], ), ], ) def test_max_level_with_records_path(self, max_level, expected): # GH23843: Enhanced JSON normalize test_input = [ { "CreatedBy": {"Name": "User001"}, "Lookup": [ { "TextField": "Some text", "UserField": {"Id": "ID001", "Name": "Name001"}, }, { "TextField": "Some text", "UserField": {"Id": "ID001", "Name": "Name001"}, }, ], "Image": {"a": "b"}, "tags": [ {"foo": "something", "bar": "else"}, {"foo": "something2", "bar": "else2"}, ], } ] result = json_normalize( test_input, record_path=["Lookup"], meta=[["CreatedBy"], ["Image"]], max_level=max_level, ) expected_df = DataFrame(data=expected, columns=result.columns.values) tm.assert_equal(expected_df, result) def test_nested_flattening_consistent(self): # see gh-21537 df1 = json_normalize([{"A": {"B": 1}}]) df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy") # They should be the same. tm.assert_frame_equal(df1, df2) def test_nonetype_record_path(self, nulls_fixture): # see gh-30148 # should not raise TypeError result = json_normalize( [ {"state": "Texas", "info": nulls_fixture}, {"state": "Florida", "info": [{"i": 2}]}, ], record_path=["info"], ) expected = DataFrame({"i": 2}, index=[0]) tm.assert_equal(result, expected) @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"']) def test_non_list_record_path_errors(self, value): # see gh-30148, GH 26284 parsed_value = json.loads(value) test_input = {"state": "Texas", "info": parsed_value} test_path = "info" msg = ( f"{test_input} has non list value {parsed_value} for path {test_path}. " "Must be list or null." ) with pytest.raises(TypeError, match=msg): json_normalize([test_input], record_path=[test_path]) def test_meta_non_iterable(self): # GH 31507 data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]""" result = json_normalize(json.loads(data), record_path=["data"], meta=["id"]) expected = DataFrame( {"one": [1], "two": [2], "id": np.array([99], dtype=object)} ) tm.assert_frame_equal(result, expected) def test_generator(self, state_data): # GH35923 Fix pd.json_normalize to not skip the first element of a # generator input def generator_data(): yield from state_data[0]["counties"] result = json_normalize(generator_data()) expected = DataFrame(state_data[0]["counties"]) tm.assert_frame_equal(result, expected) def test_top_column_with_leading_underscore(self): # 49861 data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4} result = json_normalize(data, sep="_") expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"]) tm.assert_frame_equal(result, expected) class TestNestedToRecord: def test_flat_stays_flat(self): recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}] result = nested_to_record(recs) expected = recs assert result == expected def test_one_level_deep_flattens(self): data = {"flat1": 1, "dict1": {"c": 1, "d": 2}} result = nested_to_record(data) expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1} assert result == expected def test_nested_flattens(self): data = { "flat1": 1, "dict1": {"c": 1, "d": 2}, "nested": {"e": {"c": 1, "d": 2}, "d": 2}, } result = nested_to_record(data) expected = { "dict1.c": 1, "dict1.d": 2, "flat1": 1, "nested.d": 2, "nested.e.c": 1, "nested.e.d": 2, } assert result == expected def test_json_normalize_errors(self, missing_metadata): # GH14583: # If meta keys are not always present a new option to set # errors='ignore' has been implemented msg = ( "Key 'name' not found. To replace missing values of " "'name' with np.nan, pass in errors='ignore'" ) with pytest.raises(KeyError, match=msg): json_normalize( data=missing_metadata, record_path="addresses", meta="name", errors="raise", ) def test_missing_meta(self, missing_metadata): # GH25468 # If metadata is nullable with errors set to ignore, the null values # should be numpy.nan values result = json_normalize( data=missing_metadata, record_path="addresses", meta="name", errors="ignore" ) ex_data = [ [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"], [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan], ] columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) def test_missing_nested_meta(self): # GH44312 # If errors="ignore" and nested metadata is null, we should return nan data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]} result = json_normalize( data, record_path="value", meta=["meta", ["nested_meta", "leaf"]], errors="ignore", ) ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]] columns = ["rec", "meta", "nested_meta.leaf"] expected = DataFrame(ex_data, columns=columns).astype( {"nested_meta.leaf": object} ) tm.assert_frame_equal(result, expected) # If errors="raise" and nested metadata is null, we should raise with the # key of the first missing level with pytest.raises(KeyError, match="'leaf' not found"): json_normalize( data, record_path="value", meta=["meta", ["nested_meta", "leaf"]], errors="raise", ) def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata): # GH41876 # Ensure errors='raise' works as intended even when a record_path of length # greater than one is passed in msg = ( "Key 'name' not found. To replace missing values of " "'name' with np.nan, pass in errors='ignore'" ) with pytest.raises(KeyError, match=msg): json_normalize( data=missing_metadata, record_path=["previous_residences", "cities"], meta="name", errors="raise", ) def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata): # GH41876 # Ensure errors='ignore' works as intended even when a record_path of length # greater than one is passed in result = json_normalize( data=missing_metadata, record_path=["previous_residences", "cities"], meta="name", errors="ignore", ) ex_data = [ ["Foo York City", "Alice"], ["Barmingham", np.nan], ] columns = ["city_name", "name"] expected = DataFrame(ex_data, columns=columns) tm.assert_frame_equal(result, expected) def test_donot_drop_nonevalues(self): # GH21356 data = [ {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}}, { "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"}, "author_name": {"first": "Jane", "last_name": "Doe"}, }, ] result = nested_to_record(data) expected = [ { "info": None, "author_name.first": "Smith", "author_name.last_name": "Appleseed", }, { "author_name.first": "Jane", "author_name.last_name": "Doe", "info.created_at": "11/08/1993", "info.last_updated": "26/05/2012", }, ] assert result == expected def test_nonetype_top_level_bottom_level(self): # GH21158: If inner level json has a key with a null value # make sure it does not do a new_d.pop twice and except data = { "id": None, "location": { "country": { "state": { "id": None, "town.info": { "id": None, "region": None, "x": 49.151580810546875, "y": -33.148521423339844, "z": 27.572303771972656, }, } } }, } result = nested_to_record(data) expected = { "id": None, "location.country.state.id": None, "location.country.state.town.info.id": None, "location.country.state.town.info.region": None, "location.country.state.town.info.x": 49.151580810546875, "location.country.state.town.info.y": -33.148521423339844, "location.country.state.town.info.z": 27.572303771972656, } assert result == expected def test_nonetype_multiple_levels(self): # GH21158: If inner level json has a key with a null value # make sure it does not do a new_d.pop twice and except data = { "id": None, "location": { "id": None, "country": { "id": None, "state": { "id": None, "town.info": { "region": None, "x": 49.151580810546875, "y": -33.148521423339844, "z": 27.572303771972656, }, }, }, }, } result = nested_to_record(data) expected = { "id": None, "location.id": None, "location.country.id": None, "location.country.state.id": None, "location.country.state.town.info.region": None, "location.country.state.town.info.x": 49.151580810546875, "location.country.state.town.info.y": -33.148521423339844, "location.country.state.town.info.z": 27.572303771972656, } assert result == expected @pytest.mark.parametrize( "max_level, expected", [ ( None, [ { "CreatedBy.Name": "User001", "Lookup.TextField": "Some text", "Lookup.UserField.Id": "ID001", "Lookup.UserField.Name": "Name001", "Image.a": "b", } ], ), ( 0, [ { "CreatedBy": {"Name": "User001"}, "Lookup": { "TextField": "Some text", "UserField": {"Id": "ID001", "Name": "Name001"}, }, "Image": {"a": "b"}, } ], ), ( 1, [ { "CreatedBy.Name": "User001", "Lookup.TextField": "Some text", "Lookup.UserField": {"Id": "ID001", "Name": "Name001"}, "Image.a": "b", } ], ), ], ) def test_with_max_level(self, max_level, expected, max_level_test_input_data): # GH23843: Enhanced JSON normalize output = nested_to_record(max_level_test_input_data, max_level=max_level) assert output == expected def test_with_large_max_level(self): # GH23843: Enhanced JSON normalize max_level = 100 input_data = [ { "CreatedBy": { "user": { "name": {"firstname": "Leo", "LastName": "Thomson"}, "family_tree": { "father": { "name": "Father001", "father": { "Name": "Father002", "father": { "name": "Father003", "father": {"Name": "Father004"}, }, }, } }, } } } ] expected = [ { "CreatedBy.user.name.firstname": "Leo", "CreatedBy.user.name.LastName": "Thomson", "CreatedBy.user.family_tree.father.name": "Father001", "CreatedBy.user.family_tree.father.father.Name": "Father002", "CreatedBy.user.family_tree.father.father.father.name": "Father003", "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004", # noqa: E501 } ] output = nested_to_record(input_data, max_level=max_level) assert output == expected def test_series_non_zero_index(self): # GH 19020 data = { 0: {"id": 1, "name": "Foo", "elements": {"a": 1}}, 1: {"id": 2, "name": "Bar", "elements": {"b": 2}}, 2: {"id": 3, "name": "Baz", "elements": {"c": 3}}, } s = Series(data) s.index = [1, 2, 3] result = json_normalize(s) expected = DataFrame( { "id": [1, 2, 3], "name": ["Foo", "Bar", "Baz"], "elements.a": [1.0, np.nan, np.nan], "elements.b": [np.nan, 2.0, np.nan], "elements.c": [np.nan, np.nan, 3.0], } ) tm.assert_frame_equal(result, expected)