#!/usr/bin/env python3
"""Convert a forum XML file into a JSON summary written to ``parsed.json``.

The script takes one positional argument, the path of the XML file. It reads
the ``<forum>`` element, walks its ``<discussion>`` and ``<post>`` elements,
and writes the collected structure to ``parsed.json`` in the current working
directory.
"""
import argparse
import html
import json

from bs4 import BeautifulSoup
from postmarkup import render_bbcode

OUTPUT_PATH = 'parsed.json'


def _parse_args(argv=None):
    """Parse the command line; ``argv=None`` means use ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser(description='Process some xml files.')
    parser.add_argument('filename', help='xml forum file')
    return parser.parse_args(argv)


def _load_soup(path):
    """Parse the XML file at *path* and return the resulting soup."""
    # Binary mode hands the raw bytes to the parser so it can pick the
    # encoding from the document itself instead of the locale default.
    with open(path, 'rb') as forum:
        return BeautifulSoup(forum, "xml")


def _message_paragraphs(post):
    """Return the non-empty rendered paragraphs of one ``<post>`` element.

    The serialized ``<message>`` element has its character references
    unescaped and is then parsed as HTML; the text of every ``<p>`` found is
    passed through ``render_bbcode``. Paragraphs that render to an empty
    value are dropped.
    """
    post_soup = BeautifulSoup(html.unescape(str(post.message)), "lxml")
    rendered = (render_bbcode(p.text) for p in post_soup.find_all('p'))
    return [text for text in rendered if text]


def _post_to_dict(post):
    """Build the JSON-ready dict for one ``<post>`` element."""
    return {
        'id': post.get('id'),
        'parent': post.find('parent').text,
        'author': post.userid.text,
        'message': _message_paragraphs(post),
    }


def _discussion_to_dict(discussion):
    """Build the JSON-ready dict for one ``<discussion>`` element."""
    return {
        'id': discussion.get('id'),
        # find('name') is required: the ``.name`` attribute of a tag is the
        # tag's own name, not a child element.
        'title': discussion.find('name').text,
        'first_post': discussion.firstpost.text,
        'posts': [_post_to_dict(p) for p in discussion.find_all('post')],
    }


def _forum_to_dict(soup):
    """Build the top-level JSON-ready dict from the parsed forum document."""
    forum = soup.forum
    return {
        'id': forum.get('id'),
        'name': forum.find('name').text,
        'discussions': [
            _discussion_to_dict(d) for d in forum.find_all('discussion')
        ],
    }


def main(argv=None):
    """Read the XML file named on the command line and write ``parsed.json``."""
    args = _parse_args(argv)
    out = _forum_to_dict(_load_soup(args.filename))

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened with an explicit encoding that can represent them.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as outfile:
        json.dump(out, outfile, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()