359 lines
12 KiB
Python
359 lines
12 KiB
Python
|
# ---------------------------------------------------------------------
|
||
|
# JSON normalization routines
|
||
|
|
||
|
from collections import defaultdict
|
||
|
import copy
|
||
|
from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from pandas._libs.writers import convert_json_to_lines
|
||
|
from pandas._typing import Scalar
|
||
|
from pandas.util._decorators import deprecate
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import DataFrame
|
||
|
|
||
|
|
||
|
def convert_to_line_delimits(s):
|
||
|
"""
|
||
|
Helper function that converts JSON lists to line delimited JSON.
|
||
|
"""
|
||
|
# Determine we have a JSON list to turn to lines otherwise just return the
|
||
|
# json object, only lists can
|
||
|
if not s[0] == "[" and s[-1] == "]":
|
||
|
return s
|
||
|
s = s[1:-1]
|
||
|
|
||
|
return convert_json_to_lines(s)
|
||
|
|
||
|
|
||
|
def nested_to_record(
|
||
|
ds,
|
||
|
prefix: str = "",
|
||
|
sep: str = ".",
|
||
|
level: int = 0,
|
||
|
max_level: Optional[int] = None,
|
||
|
):
|
||
|
"""
|
||
|
A simplified json_normalize
|
||
|
|
||
|
Converts a nested dict into a flat dict ("record"), unlike json_normalize,
|
||
|
it does not attempt to extract a subset of the data.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
ds : dict or list of dicts
|
||
|
prefix: the prefix, optional, default: ""
|
||
|
sep : str, default '.'
|
||
|
Nested records will generate names separated by sep,
|
||
|
e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
|
||
|
level: int, optional, default: 0
|
||
|
The number of levels in the json string.
|
||
|
|
||
|
max_level: int, optional, default: None
|
||
|
The max depth to normalize.
|
||
|
|
||
|
.. versionadded:: 0.25.0
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
d - dict or list of dicts, matching `ds`
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
|
||
|
nested=dict(e=dict(c=1,d=2),d=2)))
|
||
|
Out[52]:
|
||
|
{'dict1.c': 1,
|
||
|
'dict1.d': 2,
|
||
|
'flat1': 1,
|
||
|
'nested.d': 2,
|
||
|
'nested.e.c': 1,
|
||
|
'nested.e.d': 2}
|
||
|
"""
|
||
|
singleton = False
|
||
|
if isinstance(ds, dict):
|
||
|
ds = [ds]
|
||
|
singleton = True
|
||
|
new_ds = []
|
||
|
for d in ds:
|
||
|
new_d = copy.deepcopy(d)
|
||
|
for k, v in d.items():
|
||
|
# each key gets renamed with prefix
|
||
|
if not isinstance(k, str):
|
||
|
k = str(k)
|
||
|
if level == 0:
|
||
|
newkey = k
|
||
|
else:
|
||
|
newkey = prefix + sep + k
|
||
|
|
||
|
# flatten if type is dict and
|
||
|
# current dict level < maximum level provided and
|
||
|
# only dicts gets recurse-flattened
|
||
|
# only at level>1 do we rename the rest of the keys
|
||
|
if not isinstance(v, dict) or (
|
||
|
max_level is not None and level >= max_level
|
||
|
):
|
||
|
if level != 0: # so we skip copying for top level, common case
|
||
|
v = new_d.pop(k)
|
||
|
new_d[newkey] = v
|
||
|
continue
|
||
|
else:
|
||
|
v = new_d.pop(k)
|
||
|
new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
|
||
|
new_ds.append(new_d)
|
||
|
|
||
|
if singleton:
|
||
|
return new_ds[0]
|
||
|
return new_ds
|
||
|
|
||
|
|
||
|
def _json_normalize(
|
||
|
data: Union[Dict, List[Dict]],
|
||
|
record_path: Optional[Union[str, List]] = None,
|
||
|
meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
|
||
|
meta_prefix: Optional[str] = None,
|
||
|
record_prefix: Optional[str] = None,
|
||
|
errors: str = "raise",
|
||
|
sep: str = ".",
|
||
|
max_level: Optional[int] = None,
|
||
|
) -> "DataFrame":
|
||
|
"""
|
||
|
Normalize semi-structured JSON data into a flat table.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
data : dict or list of dicts
|
||
|
Unserialized JSON objects.
|
||
|
record_path : str or list of str, default None
|
||
|
Path in each object to list of records. If not passed, data will be
|
||
|
assumed to be an array of records.
|
||
|
meta : list of paths (str or list of str), default None
|
||
|
Fields to use as metadata for each record in resulting table.
|
||
|
meta_prefix : str, default None
|
||
|
If True, prefix records with dotted (?) path, e.g. foo.bar.field if
|
||
|
meta is ['foo', 'bar'].
|
||
|
record_prefix : str, default None
|
||
|
If True, prefix records with dotted (?) path, e.g. foo.bar.field if
|
||
|
path to records is ['foo', 'bar'].
|
||
|
errors : {'raise', 'ignore'}, default 'raise'
|
||
|
Configures error handling.
|
||
|
|
||
|
* 'ignore' : will ignore KeyError if keys listed in meta are not
|
||
|
always present.
|
||
|
* 'raise' : will raise KeyError if keys listed in meta are not
|
||
|
always present.
|
||
|
sep : str, default '.'
|
||
|
Nested records will generate names separated by sep.
|
||
|
e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
|
||
|
max_level : int, default None
|
||
|
Max number of levels(depth of dict) to normalize.
|
||
|
if None, normalizes all levels.
|
||
|
|
||
|
.. versionadded:: 0.25.0
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
frame : DataFrame
|
||
|
Normalize semi-structured JSON data into a flat table.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
|
||
|
... {'name': {'given': 'Mose', 'family': 'Regner'}},
|
||
|
... {'id': 2, 'name': 'Faye Raker'}]
|
||
|
>>> pd.json_normalize(data)
|
||
|
id name.first name.last name.given name.family name
|
||
|
0 1.0 Coleen Volk NaN NaN NaN
|
||
|
1 NaN NaN NaN Mose Regner NaN
|
||
|
2 2.0 NaN NaN NaN NaN Faye Raker
|
||
|
|
||
|
>>> data = [{'id': 1,
|
||
|
... 'name': "Cole Volk",
|
||
|
... 'fitness': {'height': 130, 'weight': 60}},
|
||
|
... {'name': "Mose Reg",
|
||
|
... 'fitness': {'height': 130, 'weight': 60}},
|
||
|
... {'id': 2, 'name': 'Faye Raker',
|
||
|
... 'fitness': {'height': 130, 'weight': 60}}]
|
||
|
>>> pd.json_normalize(data, max_level=0)
|
||
|
id name fitness
|
||
|
0 1.0 Cole Volk {'height': 130, 'weight': 60}
|
||
|
1 NaN Mose Reg {'height': 130, 'weight': 60}
|
||
|
2 2.0 Faye Raker {'height': 130, 'weight': 60}
|
||
|
|
||
|
Normalizes nested data up to level 1.
|
||
|
|
||
|
>>> data = [{'id': 1,
|
||
|
... 'name': "Cole Volk",
|
||
|
... 'fitness': {'height': 130, 'weight': 60}},
|
||
|
... {'name': "Mose Reg",
|
||
|
... 'fitness': {'height': 130, 'weight': 60}},
|
||
|
... {'id': 2, 'name': 'Faye Raker',
|
||
|
... 'fitness': {'height': 130, 'weight': 60}}]
|
||
|
>>> pd.json_normalize(data, max_level=1)
|
||
|
id name fitness.height fitness.weight
|
||
|
0 1.0 Cole Volk 130 60
|
||
|
1 NaN Mose Reg 130 60
|
||
|
2 2.0 Faye Raker 130 60
|
||
|
|
||
|
>>> data = [{'state': 'Florida',
|
||
|
... 'shortname': 'FL',
|
||
|
... 'info': {'governor': 'Rick Scott'},
|
||
|
... 'counties': [{'name': 'Dade', 'population': 12345},
|
||
|
... {'name': 'Broward', 'population': 40000},
|
||
|
... {'name': 'Palm Beach', 'population': 60000}]},
|
||
|
... {'state': 'Ohio',
|
||
|
... 'shortname': 'OH',
|
||
|
... 'info': {'governor': 'John Kasich'},
|
||
|
... 'counties': [{'name': 'Summit', 'population': 1234},
|
||
|
... {'name': 'Cuyahoga', 'population': 1337}]}]
|
||
|
>>> result = pd.json_normalize(data, 'counties', ['state', 'shortname',
|
||
|
... ['info', 'governor']])
|
||
|
>>> result
|
||
|
name population state shortname info.governor
|
||
|
0 Dade 12345 Florida FL Rick Scott
|
||
|
1 Broward 40000 Florida FL Rick Scott
|
||
|
2 Palm Beach 60000 Florida FL Rick Scott
|
||
|
3 Summit 1234 Ohio OH John Kasich
|
||
|
4 Cuyahoga 1337 Ohio OH John Kasich
|
||
|
|
||
|
>>> data = {'A': [1, 2]}
|
||
|
>>> pd.json_normalize(data, 'A', record_prefix='Prefix.')
|
||
|
Prefix.0
|
||
|
0 1
|
||
|
1 2
|
||
|
|
||
|
Returns normalized data with columns prefixed with the given string.
|
||
|
"""
|
||
|
|
||
|
def _pull_field(
|
||
|
js: Dict[str, Any], spec: Union[List, str]
|
||
|
) -> Union[Scalar, Iterable]:
|
||
|
"""Internal function to pull field"""
|
||
|
result = js
|
||
|
if isinstance(spec, list):
|
||
|
for field in spec:
|
||
|
result = result[field]
|
||
|
else:
|
||
|
result = result[spec]
|
||
|
return result
|
||
|
|
||
|
def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:
|
||
|
"""
|
||
|
Internal function to pull field for records, and similar to
|
||
|
_pull_field, but require to return list. And will raise error
|
||
|
if has non iterable value.
|
||
|
"""
|
||
|
result = _pull_field(js, spec)
|
||
|
|
||
|
# GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
|
||
|
# null, otherwise return an empty list
|
||
|
if not isinstance(result, list):
|
||
|
if pd.isnull(result):
|
||
|
result = []
|
||
|
else:
|
||
|
raise TypeError(
|
||
|
f"{js} has non list value {result} for path {spec}. "
|
||
|
"Must be list or null."
|
||
|
)
|
||
|
return result
|
||
|
|
||
|
if isinstance(data, list) and not data:
|
||
|
return DataFrame()
|
||
|
|
||
|
# A bit of a hackjob
|
||
|
if isinstance(data, dict):
|
||
|
data = [data]
|
||
|
|
||
|
if record_path is None:
|
||
|
if any([isinstance(x, dict) for x in y.values()] for y in data):
|
||
|
# naive normalization, this is idempotent for flat records
|
||
|
# and potentially will inflate the data considerably for
|
||
|
# deeply nested structures:
|
||
|
# {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
|
||
|
#
|
||
|
# TODO: handle record value which are lists, at least error
|
||
|
# reasonably
|
||
|
data = nested_to_record(data, sep=sep, max_level=max_level)
|
||
|
return DataFrame(data)
|
||
|
elif not isinstance(record_path, list):
|
||
|
record_path = [record_path]
|
||
|
|
||
|
if meta is None:
|
||
|
meta = []
|
||
|
elif not isinstance(meta, list):
|
||
|
meta = [meta]
|
||
|
|
||
|
_meta = [m if isinstance(m, list) else [m] for m in meta]
|
||
|
|
||
|
# Disastrously inefficient for now
|
||
|
records: List = []
|
||
|
lengths = []
|
||
|
|
||
|
meta_vals: DefaultDict = defaultdict(list)
|
||
|
meta_keys = [sep.join(val) for val in _meta]
|
||
|
|
||
|
def _recursive_extract(data, path, seen_meta, level=0):
|
||
|
if isinstance(data, dict):
|
||
|
data = [data]
|
||
|
if len(path) > 1:
|
||
|
for obj in data:
|
||
|
for val, key in zip(_meta, meta_keys):
|
||
|
if level + 1 == len(val):
|
||
|
seen_meta[key] = _pull_field(obj, val[-1])
|
||
|
|
||
|
_recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
|
||
|
else:
|
||
|
for obj in data:
|
||
|
recs = _pull_records(obj, path[0])
|
||
|
recs = [
|
||
|
nested_to_record(r, sep=sep, max_level=max_level)
|
||
|
if isinstance(r, dict)
|
||
|
else r
|
||
|
for r in recs
|
||
|
]
|
||
|
|
||
|
# For repeating the metadata later
|
||
|
lengths.append(len(recs))
|
||
|
for val, key in zip(_meta, meta_keys):
|
||
|
if level + 1 > len(val):
|
||
|
meta_val = seen_meta[key]
|
||
|
else:
|
||
|
try:
|
||
|
meta_val = _pull_field(obj, val[level:])
|
||
|
except KeyError as e:
|
||
|
if errors == "ignore":
|
||
|
meta_val = np.nan
|
||
|
else:
|
||
|
raise KeyError(
|
||
|
"Try running with errors='ignore' as key "
|
||
|
f"{e} is not always present"
|
||
|
) from e
|
||
|
meta_vals[key].append(meta_val)
|
||
|
records.extend(recs)
|
||
|
|
||
|
_recursive_extract(data, record_path, {}, level=0)
|
||
|
|
||
|
result = DataFrame(records)
|
||
|
|
||
|
if record_prefix is not None:
|
||
|
result = result.rename(columns=lambda x: f"{record_prefix}{x}")
|
||
|
|
||
|
# Data types, a problem
|
||
|
for k, v in meta_vals.items():
|
||
|
if meta_prefix is not None:
|
||
|
k = meta_prefix + k
|
||
|
|
||
|
if k in result:
|
||
|
raise ValueError(
|
||
|
f"Conflicting metadata name {k}, need distinguishing prefix "
|
||
|
)
|
||
|
result[k] = np.array(v, dtype=object).repeat(lengths)
|
||
|
return result
|
||
|
|
||
|
|
||
|
json_normalize = deprecate(
|
||
|
"pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize"
|
||
|
)
|