PCQRSCANER/venv/Lib/site-packages/textract/parsers/json_parser.py
2019-12-22 21:51:47 +01:00

39 lines
1.2 KiB
Python

import json
import six
from .utils import BaseParser
class Parser(BaseParser):
    """Extract all of the string values of a json file (no keys as those
    are, in some sense, markup). This is useful for parsing content
    from mongodb dumps, for example.
    """

    def extract(self, filename, **kwargs):
        """Return the concatenated string values of the JSON file ``filename``.

        The file is parsed with ``json`` and the result is flattened by
        :meth:`get_text`. ``**kwargs`` is accepted but not used here.
        """
        # Read raw bytes and decode explicitly instead of relying on the
        # text-mode default, which on Python 3 is the platform locale
        # encoding and corrupts or rejects non-ASCII UTF-8 documents on
        # non-UTF-8 systems. 'utf-8-sig' also drops a leading BOM, which
        # the json decoder would otherwise refuse.
        with open(filename, 'rb') as raw:
            text = raw.read().decode('utf-8-sig')
        deserialized_json = json.loads(text)
        return self.get_text(deserialized_json)

    def get_text(self, deserialized_json):
        """Recursively get text from subcomponents of a deserialized json. To
        enforce the same order on the documents, make sure to read keys of
        deserialized_json in a consistent (alphabetical) order.

        Dict values (visited in sorted key order) and list items are
        flattened recursively, and each one is followed by a single space.
        Strings are returned unchanged. Any other value (numbers, booleans,
        ``None``) contributes an empty string.
        """
        if isinstance(deserialized_json, dict):
            # Keys are markup and are only used to fix the traversal order.
            return ''.join(
                self.get_text(deserialized_json[key]) + ' '
                for key in sorted(deserialized_json)
            )
        if isinstance(deserialized_json, list):
            return ''.join(
                self.get_text(item) + ' ' for item in deserialized_json
            )
        if isinstance(deserialized_json, six.string_types):
            return deserialized_json
        return ''