better forum message parser

This commit is contained in:
Karolin 2020-12-28 21:39:53 +01:00
parent 63dba04890
commit c1b41a7bd5

View File

@ -2,9 +2,9 @@
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re
# arguments
@ -12,26 +12,65 @@ parser = argparse.ArgumentParser(description='Process some xml files.')
# Single positional argument: the path of the XML forum file to read.
parser.add_argument('filename', help='xml forum file')
# Parse the command line; the path is then available as args.filename.
args = parser.parse_args()
# Sentence boundary used to cut over-long paragraphs: a full stop, one space,
# then an upper-case ASCII letter.  Compiled once instead of once per line.
_SENTENCE_END = re.compile(r'\. [A-Z]')

# Lines that are glued onto the previous paragraph (bullet items, links,
# continuation lines): they start with '-', '•', a word character, "http",
# or a space followed by a lower-case letter.
_CONTINUATION = re.compile(r'^([-•\w]|http| [a-z])')

# Paragraphs longer than this many characters are cut at sentence boundaries.
_MAX_PARAGRAPH = 500

# A short paragraph ending in ':' (shorter than this) is treated as the
# introduction of whatever line follows it.
_MAX_INTRO = 90


def _sentence_break(text, start):
    """Return the index just after the first sentence-ending '.' found at or
    after position *start* in *text*, or 0 when there is none."""
    found = _SENTENCE_END.search(text, start)
    return found.start() + 1 if found else 0


def _split_long(par):
    """Cut an over-long paragraph into two pieces, or into four when each
    half is itself still longer than the limit.

    Every cut is placed at the first sentence boundary at or after the
    midpoint of the text being cut.  When no boundary exists the cut index
    is 0, which yields an empty first piece (dropped later by the caller).
    """
    middle = _sentence_break(par, len(par) // 2)
    head, tail = par[:middle], par[middle:]
    if len(par) // 2 <= _MAX_PARAGRAPH:
        return [head, tail]
    first_cut = _sentence_break(head, len(head) // 2)
    # The search offset must be derived from the tail's own length.  Using
    # the head's length here started the search at or beyond the tail's
    # midpoint (often past its end), so the tail was rarely split.
    second_cut = _sentence_break(tail, len(tail) // 2)
    return [head[:first_cut], head[first_cut:],
            tail[:second_cut], tail[second_cut:]]


def split_message(p):
    """Turn the markup of a forum message into a list of text paragraphs.

    The argument is converted with ``str()`` and HTML entities are unescaped,
    so it may be a parsed tag or a plain string of (escaped) HTML.  Line
    breaks are taken from ``<br />``, ``</p>`` and ``</div>``; newlines in
    the markup itself are treated as plain spaces.  The remaining markup is
    stripped with BeautifulSoup and the resulting lines are then regrouped:

    * lines longer than 500 characters are cut at sentence boundaries;
    * lines that look like list items or continuations and do not end
      with '.' are appended to the previous paragraph, separated by a
      newline;
    * any line following a short paragraph that ends with ':' is appended
      to that paragraph in the same way;
    * every other line starts a new paragraph.

    Returns the paragraphs with surrounding whitespace stripped; empty
    paragraphs are left out.  ``None`` (no message present) gives an empty
    list rather than a paragraph containing the text "None".
    """
    if p is None:
        return []

    message = html.unescape(str(p))
    # Newlines in the source are only markup formatting; real paragraph
    # breaks are reintroduced from the closing/break tags below.
    message = message.replace('\n', ' ')
    message = re.sub('<br />|</p>|</div>', '\n', message)
    lines = BeautifulSoup(message, "lxml").get_text().split('\n')

    # Start with one empty paragraph so that "append to the previous
    # paragraph" is always possible; it is filtered out at the end if unused.
    paragraphs = [""]
    for line in lines:
        par = line.rstrip()
        if len(par) > _MAX_PARAGRAPH:
            # long paragraphs
            paragraphs += _split_long(par)
        elif _CONTINUATION.match(par) and not par.endswith('.'):
            # bullet lists
            paragraphs[-1] += '\n' + par
        elif paragraphs[-1].endswith(':') and len(paragraphs[-1]) < _MAX_INTRO:
            # line introduced by a short "...:" paragraph
            paragraphs[-1] += '\n' + par
        else:
            # regular paragraphs
            paragraphs.append(par)
    return [par.strip() for par in paragraphs if par]
# Read the forum file named on the command line and parse it as XML.
with open(args.filename) as forum:
    soup = BeautifulSoup(forum, "xml")

# Assemble the JSON document: forum-level information first, followed by
# the list of discussions, which starts out empty.
out = {
    'id': soup.forum.get('id'),
    'name': soup.forum.find('name').text,
    'intro': soup.forum.intro.text,
    'discussions': [],
}
for d in soup.forum.find_all('discussion'):
posts = []
for p in d.find_all('post'):
post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
posts.append({
'id': p.get('id'),
'parent': p.find('parent').text,
'author': p.userid.text,
'message': [x for x in paragraphs if x]
'message': split_message(p.message) #[x for x in paragraphs if x]
})
out['discussions'].append({
'id': d.get('id'),
'title': d.find('name').text,