82 lines
2.3 KiB
Python
82 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
from bs4 import BeautifulSoup
|
|
import html
|
|
import json
|
|
import re
|
|
|
|
|
|
# Command-line interface: a single positional argument naming the XML forum
# file to read.  Parsing happens at module level, so `args` is available to
# the rest of the script.
parser = argparse.ArgumentParser(
    description="Process some xml files.",
)
parser.add_argument(
    "filename",
    help="xml forum file",
)
args = parser.parse_args()
|
|
|
|
|
|
# Sentence boundary used to cut over-long paragraphs: a full stop, a space,
# then an ASCII capital letter.
_SENTENCE_END = re.compile(r'\. [A-Z]')
# A line that is glued onto the previous paragraph: it starts with '-', '•',
# a word character, 'http', or a space followed by a lowercase letter.
_CONTINUATION = re.compile(r'^([-•\w]|http| [a-z])')


def _sentence_cut(text):
    """Return the index just after the first sentence-ending full stop found
    at or past the midpoint of *text*, or 0 when there is none."""
    found = _SENTENCE_END.search(text, len(text) // 2)
    return found.start() + 1 if found else 0


def split_message(p):
    """Split a forum post's HTML message into plain-text paragraphs.

    The message is stringified, HTML entities are unescaped, existing
    newlines are flattened to spaces, and ``<br />``, ``</p>`` and ``</div>``
    are turned into line breaks before the remaining markup is stripped.
    The resulting lines are then regrouped:

    * a line longer than 500 characters is cut at the first sentence
      boundary at or after its midpoint; when half its length still exceeds
      500, each half is cut once more at its own midpoint;
    * a line matching the continuation pattern (see ``_CONTINUATION``) that
      does not end with a full stop is appended to the previous paragraph;
    * any line following a paragraph that ends with ``:`` and is shorter
      than 90 characters is appended to that paragraph;
    * every other line starts a new paragraph.

    Args:
        p: The message element; its ``str()`` is taken as the message HTML.

    Returns:
        A list of non-empty paragraphs with surrounding whitespace removed.
    """
    message = html.unescape(str(p))
    message = message.replace('\n', ' ')
    message = re.sub('<br />|</p>|</div>', '\n', message)
    lines = BeautifulSoup(message, "lxml").get_text().split('\n')

    # Seed with an empty paragraph so "append to previous" always has a target.
    paragraphs = [""]
    for par in lines:
        par = par.rstrip()
        if len(par) > 500:
            # Long paragraph: cut in two (a cut index of 0 leaves it whole).
            i = _sentence_cut(par)
            head, tail = par[:i], par[i:]
            if len(par) // 2 > 500:
                # Very long paragraph: cut each half again.  Each half is
                # searched from its *own* midpoint; the tail is shorter than
                # the head, so reusing the head's midpoint would start the
                # search too late or past the end of the tail.
                j = _sentence_cut(head)
                k = _sentence_cut(tail)
                paragraphs += [head[:j], head[j:], tail[:k], tail[k:]]
            else:
                paragraphs += [head, tail]
        elif _CONTINUATION.match(par) and not par.endswith('.'):
            # Bullet-list item or other continuation line.
            paragraphs[-1] += '\n' + par
        elif paragraphs[-1].endswith(':') and len(paragraphs[-1]) < 90:
            # Short lead-in ending with a colon: keep what follows attached.
            paragraphs[-1] += '\n' + par
        else:
            # Regular paragraph.
            paragraphs.append(par)

    # Strip first, then filter, so nothing that is blank after stripping
    # survives as an empty entry.
    stripped = (par.strip() for par in paragraphs)
    return [par for par in stripped if par]
|
|
|
|
|
|
# Parse the forum XML.  The file is opened in binary mode so that the XML
# parser decodes it according to the document itself rather than the
# platform's default text encoding.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# Assemble the JSON document, starting with basic information about the
# forum.  find('name') is used instead of attribute access because `.name`
# on a BeautifulSoup tag is the tag's own name, not a child element.
out = {
    'id': soup.forum.get('id'),
    'name': soup.forum.find('name').text,
    'intro': soup.forum.intro.text,
    'discussions': [],
}

# One entry per discussion, each carrying its posts with the message already
# split into paragraphs.
for d in soup.forum.find_all('discussion'):
    posts = [
        {
            'id': p.get('id'),
            # find('parent') for the same reason as find('name') above:
            # `.parent` on a tag is its parent node in the tree.
            'parent': p.find('parent').text,
            'author': p.userid.text,
            'message': split_message(p.message),
        }
        for p in d.find_all('post')
    ]
    out['discussions'].append({
        'id': d.get('id'),
        'title': d.find('name').text,
        'first_post': d.firstpost.text,
        'posts': posts,
    })

# ensure_ascii=False writes non-ASCII characters verbatim, so the output
# encoding is pinned to UTF-8 instead of depending on the locale default.
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)