mass-scraper/parishwebsites/view_raw_data.py

29 lines
690 B
Python
Raw Normal View History

2018-03-30 22:04:14 +02:00
#!/usr/bin/env python3
import jsonlines
import sys
import html2text
import pprint
def convert_html_to_text(parish, text_maker):
    """Replace the HTML stored under ``parish['content']`` with plain text.

    Args:
        parish: A record with a ``'content'`` entry holding HTML markup.
            It is modified in place.
        text_maker: An object exposing ``handle(html)`` that returns the
            text rendering of the markup (``main`` passes an
            ``html2text.HTML2Text`` instance).

    Returns:
        The same ``parish`` object, with ``'content'`` now holding the
        value returned by ``text_maker.handle``.
    """
    html = parish['content']
    text = text_maker.handle(html)
    # Overwrite in place: the caller gets back the very same record object.
    parish['content'] = text
    return parish
def main():
    """Pretty-print every record of a JSON Lines file with HTML as text.

    The path of the input file is taken from the first command-line
    argument. Each record is passed through ``convert_html_to_text`` and
    then written to standard output with ``pprint``.

    Raises:
        SystemExit: If no input path was given on the command line.
    """
    if len(sys.argv) < 2:
        # Fail with a readable hint instead of an IndexError traceback.
        sys.exit('usage: {} <parishes.jsonl>'.format(sys.argv[0]))

    text_maker = html2text.HTML2Text()
    # Drop link targets and images so only the readable text remains.
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    # Further html2text options, currently left disabled:
    # text_maker.wrap_links = False
    # text_maker.strong_mark = ''

    with jsonlines.open(sys.argv[1]) as reader:
        for parish in reader:
            parish = convert_html_to_text(parish, text_maker)
            pprint.pprint(parish)
# Run the viewer only when executed as a script, not when imported.
if __name__ == '__main__':
    main()