2020-04-23 20:05:51 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line interface: one required positional argument naming the
# XML forum file to read.
parser = argparse.ArgumentParser(
    description='Process some xml files.',
)
parser.add_argument(
    'filename',
    help='xml forum file',
)
args = parser.parse_args()
|
|
|
|
|
|
|
|
# Parse the forum file named on the command line into a BeautifulSoup tree.
# The file is opened in binary mode so the XML parser can honour the
# encoding named in the document's own XML declaration, instead of the
# text layer decoding it with the platform's locale default (which can
# garble or reject non-ASCII content).
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")
|
|
|
|
|
|
|
|
# Build the JSON-serialisable structure: the forum's id and name plus one
# entry per <discussion>, each carrying the list of its <post> elements.
forum_tag = soup.forum
out = {
    'id': forum_tag.get('id'),
    # find('name') is used because attribute access `.name` on a tag yields
    # the tag's own name rather than a child element called <name>.
    'name': forum_tag.find('name').text,
    'discussions': [],
}

for discussion in forum_tag.find_all('discussion'):
    post_records = [
        {
            'id': post.get('id'),
            'parent': post.find('parent').text,
            'author': post.userid.text,
            'message': post.message.get_text(),
        }
        for post in discussion.find_all('post')
    ]
    discussion_record = {
        'id': discussion.get('id'),
        'title': discussion.find('name').text,
        'first_post': discussion.firstpost.text,
        'posts': post_records,
    }
    out['discussions'].append(discussion_record)
|
|
|
|
|
|
|
|
# Write the assembled structure to parsed.json in the working directory.
# ensure_ascii=False emits non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8 rather than with the locale default encoding,
# which could raise UnicodeEncodeError or yield a non-UTF-8 JSON file.
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
|