better forum message parser
This commit is contained in:
parent 63dba04890
commit c1b41a7bd5
@@ -2,9 +2,9 @@
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re


# arguments
@@ -12,26 +12,65 @@ parser = argparse.ArgumentParser(description='Process some xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()


def split_message(p):
    message = html.unescape(str(p))
    message = re.sub('\n', ' ', message)
    message = re.sub('<br />|</p>|</div>', '\n', message)
    message = BeautifulSoup(message, "lxml").get_text().split('\n')
    # split or connect
    paragraphs = [""]
    for par in message:
        par = par.rstrip()
        # long paragraphs: cut roughly in half at a sentence boundary
        if len(par) > 500:
            point = re.compile(r'\. [A-Z]')
            i = 0
            if point.search(par, len(par)//2):
                i = point.search(par, len(par)//2).start() + 1
            # still very long: cut each half again
            if len(par)//2 > 500:
                j = 0
                if point.search(par[:i], i//2):
                    j = point.search(par[:i], i//2).start() + 1
                k = 0
                if point.search(par[i:], i//2):
                    k = point.search(par[i:], i//2).start() + 1
                paragraphs += [par[:j], par[j:i], par[i:i+k], par[i+k:]]
            else:
                paragraphs += [par[:i], par[i:]]
        # bullet lists and other continuation lines
        elif re.match(r'^([-•\w]|http| [a-z])', par) and not par.endswith('.'):
            paragraphs[-1] += '\n' + par
        elif paragraphs[-1].endswith(':') and len(paragraphs[-1]) < 90:
            paragraphs[-1] += '\n' + par
        # regular paragraphs
        else:
            paragraphs.append(par)
    return [par.strip() for par in paragraphs if par]
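
# Illustrative only, not part of this commit: for a post whose escaped message
# decodes to
#   <p>First point.</p><p>Second point.</p><p>- apples</p><p>- pears</p>
# split_message() returns
#   ['First point.', 'Second point.\n- apples\n- pears']
# i.e. sentence-ending paragraphs stay separate, while list-style lines are
# glued onto the preceding paragraph.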


# make a soup
with open(args.filename) as forum:
    soup = BeautifulSoup(forum, "xml")
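
# Illustrative only, not part of this commit: the lookups below assume a forum
# export shaped roughly like
#   <forum id="...">
#     <name>...</name>
#     <intro>...</intro>
#     <discussion id="...">
#       <name>...</name>
#       <post id="...">
#         <parent>...</parent>
#         <userid>...</userid>
#         <message>...</message>
#       </post>
#     </discussion>
#   </forum>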

# put json together
out = {}
# basic information about the forum
out['id'] = soup.forum.get('id')
out['name'] = soup.forum.find('name').text
out['intro'] = soup.forum.intro.text
# discussions
out['discussions'] = []
for d in soup.forum.find_all('discussion'):
    posts = []
    for p in d.find_all('post'):
        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
        posts.append({
            'id': p.get('id'),
            'parent': p.find('parent').text,
            'author': p.userid.text,
            'message': split_message(p.message)  # [x for x in paragraphs if x]
        })

    out['discussions'].append({
        'id': d.get('id'),
        'title': d.find('name').text,
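
# Illustrative only, not part of this commit: `json` is imported at the top but
# never used in the hunks shown here, so the tail of the script presumably
# serializes `out`. One plausible ending (a sketch, not the author's code):
#
#     print(json.dumps(out, ensure_ascii=False, indent=2))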