siulkilulki
1f6b1e6ffe
Working converting parishes from html2text. Add makefile parish2text goal. Change to non-html(text) parishes in extract_rule_based and get_utterances Enhance find_hours.py Wrap render_template in make_response in webapp/app.py
52 lines
1.6 KiB
Python
Executable File
52 lines
1.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import jsonlines
|
|
import sys
|
|
import html2text
|
|
import pprint
|
|
import re
|
|
import logging
|
|
|
|
|
|
class Parish2Text():
|
|
def __init__(self):
|
|
'''Don't use this object for long period of time, because convertion
|
|
will slowdown. Destroy it after every convertion.'''
|
|
self.text_maker = html2text.HTML2Text()
|
|
self.text_maker.ignore_links = True
|
|
self.text_maker.ignore_images = True
|
|
self.text_maker.images_to_alt = True
|
|
self.text_maker.strong_mark = ''
|
|
self.text_maker.ul_item_mark = ''
|
|
self.text_maker.emphasis_mark = ''
|
|
self.text_maker.ignore_tables = True
|
|
|
|
def convert(self, parish):
|
|
parish['content'] = self.text_maker.handle(parish['content'])
|
|
parish['button_text'] = self.text_maker.handle(parish['button_text'])
|
|
parish['button_text'] = ' '.join(
|
|
re.sub('[\W_]+', ' ', parish['button_text']).split())
|
|
return parish
|
|
|
|
|
|
def main():
|
|
writer = jsonlines.Writer(sys.stdout)
|
|
# text_maker.wrap_links = False
|
|
reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
|
|
for page_nr, parish_page in enumerate(reader):
|
|
parish2text = Parish2Text()
|
|
try:
|
|
parish_page = parish2text.convert(parish_page)
|
|
except Exception:
|
|
logging.warning('page: {},url: {}'.format(page_nr,
|
|
parish_page['url']))
|
|
continue
|
|
writer.write(parish_page)
|
|
# parish_content = parish_page.pop('content')
|
|
# pprint.pprint(parish_page)
|
|
# print(parish_content)
|
|
reader.close()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|