Add a converter that turns the content field of JSON Lines records from HTML into text.
This commit is contained in:
parent
3027e1e7cc
commit
63c4a71812
29
parishwebsites/convert_content2text.py
Executable file
29
parishwebsites/convert_content2text.py
Executable file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python3
|
||||
import jsonlines
|
||||
import sys
|
||||
import html2text
|
||||
|
||||
|
||||
def convert_html_to_text(parish, text_maker):
    """Replace the HTML stored under ``parish['content']`` with text.

    Args:
        parish: Dict for a single record. Its ``'content'`` value, when
            present and not ``None``, is handed to ``text_maker.handle``.
        text_maker: Object exposing ``handle(html)`` that returns the
            converted text.

    Returns:
        The same ``parish`` object. It is updated in place, not copied.
    """
    html = parish.get('content')
    if html is None:
        # A record without usable content has nothing to convert; return it
        # untouched so one incomplete record does not abort the whole run.
        return parish
    parish['content'] = text_maker.handle(html)
    return parish
|
||||
|
||||
|
||||
def main():
    """Convert the ``content`` field of each record in a JSON Lines file.

    Opens the file named by ``sys.argv[1]`` with ``jsonlines``, passes every
    record through ``convert_html_to_text`` and writes the result to
    standard output, one JSON document per record.

    Raises:
        SystemExit: If no input path was supplied on the command line.
    """
    # Check the command line before doing any work, so a missing argument
    # produces a usage message rather than an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('usage: {} INPUT_JSONL'.format(sys.argv[0]))

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True

    # Both objects are context managers: the reader and the writer are
    # closed even when converting a record raises.
    with jsonlines.open(sys.argv[1]) as reader, \
            jsonlines.Writer(sys.stdout) as writer:
        for parish in reader:
            writer.write(convert_html_to_text(parish, text_maker))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the conversion only when executed as a script, not when imported.
    main()
|
Loading…
Reference in New Issue
Block a user