PRI_2020-FE/backend/edumaticParser.py

51 lines
1.3 KiB
Python

#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re
# Command-line interface: one positional argument naming the XML forum export.
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup: the file is read as bytes and parsed with the "xml" parser.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# Discussion ids advance in steps of ID_STEP; the posts of a discussion are
# numbered consecutively, starting right after the discussion's own id.
ID_STEP = 50

# The forum id is the number that follows "ID: " in the group title.
id_match = re.search(r'ID: (\d+)', soup.group.title.text)
if id_match is None:
    # Exit with a readable message instead of an AttributeError on None.
    parser.error('no "ID: <number>" found in the group title of ' + args.filename)

# put json together
out = {}
out['id'] = id_match.group(1)
# The forum name is the first <string> inside the group's first <table>.
out['name'] = soup.group.table.find('string').text
out['discussions'] = []

did = ID_STEP
pid = did + 1
# we ignore first table, and then rules
# (the slice starts at the fifth <table> and takes every second one; each
# selected table is treated as one discussion)
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        # Look the row's <string> elements up once: index 1 holds the
        # message body, index 2 the author.
        strings = p.find_all('string')
        text = html.unescape(strings[1].text)
        # Each line of the message is rendered from BBCode separately.
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            # Posts form a chain: each one points at the previous id, and
            # the first post points at the discussion id itself.
            'parent': pid - 1,
            'author': strings[2].text,
            # Lines that rendered to an empty value are dropped.
            'message': [x for x in paragraphs if x],
        })
        pid = pid + 1
    out['discussions'].append({
        'id': did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts,
    })
    # Move on to the next discussion id. `pid` is now the first unused id, so
    # keep stepping until the new discussion id is not one already handed to
    # a post; a discussion with ID_STEP or more posts would otherwise reuse
    # ids of the following discussion and its posts.
    did = did + ID_STEP
    while did < pid:
        did = did + ID_STEP
    pid = did + 1

# Write the result as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)