Patch: store forum post messages as lists of rendered paragraphs.

Both parsers now HTML-unescape each post message, run render_bbcode over it
piece by piece (per text line in edumaticParser.py, per <p> element in
xmlParser.py) and keep only the non-empty results as a list under 'message'.

Review notes:
  * xmlParser.py now opens its input in binary mode and writes parsed.json
    with encoding='utf-8', the same way this patch already does for
    edumaticParser.py. With ensure_ascii=False the JSON contains raw
    non-ASCII characters, so a locale-default output encoding can fail.
  * The post-image blob id on the xmlParser.py "index" line predates that
    adjustment and is therefore stale.
  * Line structure was restored from a whitespace-collapsed copy of this
    patch. Indentation is ASSUMED to be four spaces per level; confirm it
    against the pre-image files before applying.

diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py
index db1c3ee..f2e2775 100644
--- a/backend/edumaticParser.py
+++ b/backend/edumaticParser.py
@@ -1,47 +1,50 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-from postmarkup import render_bbcode
-import json
-import re
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-    soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
-out['name'] = soup.group.table.find('string').text
-out['discussions'] = []
-did = 50
-pid = did + 1
-# we ignore first table, and then rules
-for d in soup.group.find_all('table')[4::2]:
-    posts = []
-    for p in d.find_all('row'):
-        posts.append({
-            'id': pid,
-            'parent': pid - 1,
-            'author': p.find_all('string')[2].text,
-            'message': render_bbcode(p.find_all('string')[1].text)
-        })
-        pid = pid + 1
-    out['discussions'].append({
-        'id' : did,
-        'title': d.row.find('string').text,
-        'first_post': did + 1,
-        'posts': posts
-    })
-    did = did + 50
-    pid = did + 1
-
-with open('parsed.json', 'w') as outfile:
-    json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
+import json
+import re
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename, 'rb') as forum:
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
+out['name'] = soup.group.table.find('string').text
+out['discussions'] = []
+did = 50
+pid = did + 1
+# we ignore first table, and then rules
+for d in soup.group.find_all('table')[4::2]:
+    posts = []
+    for p in d.find_all('row'):
+        text = html.unescape(p.find_all('string')[1].text)
+        paragraphs = [render_bbcode(x) for x in text.splitlines()]
+        posts.append({
+            'id': pid,
+            'parent': pid - 1,
+            'author': p.find_all('string')[2].text,
+            'message': [x for x in paragraphs if x]
+        })
+        pid = pid + 1
+    out['discussions'].append({
+        'id' : did,
+        'title': d.row.find('string').text,
+        'first_post': did + 1,
+        'posts': posts
+    })
+    did = did + 50
+    pid = did + 1
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:
+    json.dump(out, outfile, ensure_ascii=False, indent=2)
diff --git a/backend/xmlParser.py b/backend/xmlParser.py
index b475d14..deb404f 100644
--- a/backend/xmlParser.py
+++ b/backend/xmlParser.py
@@ -1,39 +1,43 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-import json
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-    soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = soup.forum.get('id')
-out['name'] = soup.forum.find('name').text
-out['discussions'] = []
-for d in soup.forum.find_all('discussion'):
-    posts = []
-    for p in d.find_all('post'):
-        posts.append({
-            'id': p.get('id'),
-            'parent': p.find('parent').text,
-            'author': p.userid.text,
-            'message': p.message.get_text()
-        })
-    out['discussions'].append({
-        'id': d.get('id'),
-        'title': d.find('name').text,
-        'first_post': d.firstpost.text,
-        'posts': posts
-    })
-
-with open('parsed.json', 'w') as outfile:
-    json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
+import json
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename, 'rb') as forum:  # bytes in: the parser picks the encoding, not the locale
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = soup.forum.get('id')
+out['name'] = soup.forum.find('name').text
+out['discussions'] = []
+for d in soup.forum.find_all('discussion'):
+    posts = []
+    for p in d.find_all('post'):
+        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+        posts.append({
+            'id': p.get('id'),
+            'parent': p.find('parent').text,
+            'author': p.userid.text,
+            'message': [x for x in paragraphs if x]
+        })
+    out['discussions'].append({
+        'id': d.get('id'),
+        'title': d.find('name').text,
+        'first_post': d.firstpost.text,
+        'posts': posts
+    })
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:  # ensure_ascii=False writes raw non-ASCII
+    json.dump(out, outfile, ensure_ascii=False, indent=2)
\ No newline at end of file