mass-scraper/utils/iterator.py
siulkilulki 1f6b1e6ffe Working utterances getting/pickling
Working converting parishes from html2text.
Add makefile parish2text goal.
Change to non-html(text) parishes in extract_rule_based and get_utterances
Enhance find_hours.py
Wrap render_template in make_response in webapp/app.py
2018-05-14 01:51:40 +02:00

26 lines
796 B
Python

import os
import jsonlines
import random
from parishwebsites.parish2text import Parish2Text
def parish_path_iterator(directory):
    """Yield paths of all non-empty files found under *directory*.

    The tree is walked top-down. Both the sub-directories and the files of
    each directory are visited in sorted name order, so the sequence of
    yielded paths is reproducible regardless of the order in which the
    filesystem happens to list entries.

    Args:
        directory: Root directory to search recursively.

    Yields:
        Path (``root`` joined with the file name) of every file whose size
        is greater than zero bytes.
    """
    for root, dirs, files in os.walk(directory):
        # os.walk (top-down) honours in-place changes to ``dirs``; sorting
        # it fixes the order in which sub-directories are descended into.
        dirs.sort()
        for fname in sorted(files):
            filepath = os.path.join(root, fname)
            # Zero-byte files are skipped.
            if os.path.getsize(filepath) > 0:
                yield filepath
def parish_page_iterator(filepath, html=True):
    """Yield the pages stored in a parish jsonlines file.

    Pages whose ``'content'`` value contains the marker text
    ``'Maximum execution time of 30 seconds exceeded in'`` are skipped.
    NOTE(review): that text looks like a server-side timeout error captured
    instead of real page content -- confirm against the scraped data.

    Args:
        filepath: Path of the jsonlines file to read.
        html: When true, each page is passed through
            ``Parish2Text().convert`` and the result is yielded; when
            false, the page object is yielded exactly as read.

    Yields:
        One item per page that was not skipped.
    """
    timeout_marker = 'Maximum execution time of 30 seconds exceeded in'
    with jsonlines.open(filepath) as reader:
        for page in reader:
            if timeout_marker in page['content']:
                continue
            if not html:
                yield page
                continue
            # A new converter is created for every page, as before.
            converter = Parish2Text()
            yield converter.convert(page)