paragraph splitting in parsers

This commit is contained in:
Karolina Boczoń 2020-06-04 21:11:05 +00:00
parent 017e560cbf
commit 7e153db9ca
2 changed files with 93 additions and 86 deletions

View File

@@ -3,6 +3,7 @@
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re
@@ -13,7 +14,7 @@ parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()
# make a soup
with open(args.filename) as forum:
with open(args.filename, 'rb') as forum:
soup = BeautifulSoup(forum, "xml")
# put json together
@@ -27,11 +28,13 @@ pid = did + 1
for d in soup.group.find_all('table')[4::2]:
posts = []
for p in d.find_all('row'):
text = html.unescape(p.find_all('string')[1].text)
paragraphs = [render_bbcode(x) for x in text.splitlines()]
posts.append({
'id': pid,
'parent': pid - 1,
'author': p.find_all('string')[2].text,
'message': render_bbcode(p.find_all('string')[1].text)
'message': [x for x in paragraphs if x]
})
pid = pid + 1
out['discussions'].append({
@@ -43,5 +46,5 @@ for d in soup.group.find_all('table')[4::2]:
did = did + 50
pid = did + 1
with open('parsed.json', 'w') as outfile:
with open('parsed.json', 'w', encoding='utf-8') as outfile:
json.dump(out, outfile, ensure_ascii=False, indent=2)

View File

@@ -2,6 +2,8 @@
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
@@ -22,11 +24,13 @@ out['discussions'] = []
for d in soup.forum.find_all('discussion'):
posts = []
for p in d.find_all('post'):
post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
posts.append({
'id': p.get('id'),
'parent': p.find('parent').text,
'author': p.userid.text,
'message': p.message.get_text()
'message': [x for x in paragraphs if x]
})
out['discussions'].append({
'id': d.get('id'),