From 017e560cbf0d6a609550b3d2484028cbe0e42218 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karolina=20Boczo=C5=84?=
Date: Wed, 27 May 2020 17:36:56 +0000
Subject: [PATCH 1/2] edumatic parser added; HTML tags are back in messages

---
Notes (text between the separator above and the diff is not part of the
commit message):

* backend/edumaticParser.py is a new command-line script. It parses the XML
  file named on the command line with BeautifulSoup and writes parsed.json.
  Ids are synthesised: discussions get 50, 100, 150, ...; the posts of a
  discussion get did + 1, did + 2, ...; every post's parent is the previous
  id, so the first post's parent is the discussion id itself.
* backend/xmlParser.py no longer re-parses the message text through a second
  BeautifulSoup instance; it stores p.message.get_text() directly.

 backend/edumaticParser.py | 47 +++++++++++++++++++++++++++++++++++++++
 backend/xmlParser.py      |  3 +--
 2 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 backend/edumaticParser.py

diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py
new file mode 100644
index 0000000..db1c3ee
--- /dev/null
+++ b/backend/edumaticParser.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import json
+import re
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename) as forum:
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
+out['name'] = soup.group.table.find('string').text
+out['discussions'] = []
+did = 50
+pid = did + 1
+# we ignore first table, and then rules
+for d in soup.group.find_all('table')[4::2]:
+    posts = []
+    for p in d.find_all('row'):
+        posts.append({
+            'id': pid,
+            'parent': pid - 1,
+            'author': p.find_all('string')[2].text,
+            'message': render_bbcode(p.find_all('string')[1].text)
+        })
+        pid = pid + 1
+    out['discussions'].append({
+        'id' : did,
+        'title': d.row.find('string').text,
+        'first_post': did + 1,
+        'posts': posts
+    })
+    did = did + 50
+    pid = did + 1
+
+with open('parsed.json', 'w') as outfile:
+    json.dump(out, outfile, ensure_ascii=False, indent=2)
diff --git a/backend/xmlParser.py b/backend/xmlParser.py
index 1cdde2b..b475d14 100644
--- a/backend/xmlParser.py
+++ b/backend/xmlParser.py
@@ -22,12 +22,11 @@ out['discussions'] = []
 for d in soup.forum.find_all('discussion'):
     posts = []
     for p in d.find_all('post'):
-        message_soup = BeautifulSoup(p.message.get_text(), "xml")
         posts.append({
             'id': p.get('id'),
             'parent': p.find('parent').text,
             'author': p.userid.text,
-            'message': message_soup.get_text()
+            'message': p.message.get_text()
         })
     out['discussions'].append({
         'id': d.get('id'),
From 7e153db9caa6ec575544f62cd6c2f3d974cf7658 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karolina=20Boczo=C5=84?=
Date: Thu, 4 Jun 2020 21:11:05 +0000
Subject: [PATCH 2/2] paragraph splitting in parsers

---
Notes (text between the separator above and the diff is not part of the
commit message):

* Both parsers now emit 'message' as a list of rendered, non-empty
  paragraphs instead of a single string. edumaticParser.py splits the
  unescaped text on line breaks; xmlParser.py takes the <p> elements of the
  unescaped message markup.
* Review changes relative to the patch as submitted (all confined to added
  lines, so hunk sizes and the diffstat are unchanged):
  - xmlParser.py opens its input with 'rb' and writes parsed.json with
    encoding='utf-8', matching edumaticParser.py. json.dump is called with
    ensure_ascii=False, so the output encoding must not depend on the
    locale default.
  - xmlParser.py now ends with a newline.
  - edumaticParser.py: "'id' : did" spacing normalised and the comment on
    the [4::2] slice clarified.
* The post-image blob ids on the "index" lines are those of the submitted
  patch and no longer match the edited content.
* NOTE(review): both files are rewritten in full by this patch, presumably
  because of a line-ending change - confirm against the repository.
* NOTE(review): re.search(...) in edumaticParser.py returns None when the
  title carries no "ID: <digits>", and .group(1) then raises
  AttributeError; left as is.

 backend/edumaticParser.py | 97 ++++++++++++++++++++-------------------
 backend/xmlParser.py      | 82 +++++++++++++++++----------------
 2 files changed, 93 insertions(+), 86 deletions(-)

diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py
index db1c3ee..f2e2775 100644
--- a/backend/edumaticParser.py
+++ b/backend/edumaticParser.py
@@ -1,47 +1,50 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-from postmarkup import render_bbcode
-import json
-import re
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-    soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
-out['name'] = soup.group.table.find('string').text
-out['discussions'] = []
-did = 50
-pid = did + 1
-# we ignore first table, and then rules
-for d in soup.group.find_all('table')[4::2]:
-    posts = []
-    for p in d.find_all('row'):
-        posts.append({
-            'id': pid,
-            'parent': pid - 1,
-            'author': p.find_all('string')[2].text,
-            'message': render_bbcode(p.find_all('string')[1].text)
-        })
-        pid = pid + 1
-    out['discussions'].append({
-        'id' : did,
-        'title': d.row.find('string').text,
-        'first_post': did + 1,
-        'posts': posts
-    })
-    did = did + 50
-    pid = did + 1
-
-with open('parsed.json', 'w') as outfile:
-    json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
+import json
+import re
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename, 'rb') as forum:
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
+out['name'] = soup.group.table.find('string').text
+out['discussions'] = []
+did = 50
+pid = did + 1
+# [4::2] starts at the fifth table and takes every second one (skips the first table, then the rules)
+for d in soup.group.find_all('table')[4::2]:
+    posts = []
+    for p in d.find_all('row'):
+        text = html.unescape(p.find_all('string')[1].text)
+        paragraphs = [render_bbcode(x) for x in text.splitlines()]
+        posts.append({
+            'id': pid,
+            'parent': pid - 1,
+            'author': p.find_all('string')[2].text,
+            'message': [x for x in paragraphs if x]
+        })
+        pid = pid + 1
+    out['discussions'].append({
+        'id': did,
+        'title': d.row.find('string').text,
+        'first_post': did + 1,
+        'posts': posts
+    })
+    did = did + 50
+    pid = did + 1
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:
+    json.dump(out, outfile, ensure_ascii=False, indent=2)
diff --git a/backend/xmlParser.py b/backend/xmlParser.py
index b475d14..deb404f 100644
--- a/backend/xmlParser.py
+++ b/backend/xmlParser.py
@@ -1,39 +1,43 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-import json
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-    soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = soup.forum.get('id')
-out['name'] = soup.forum.find('name').text
-out['discussions'] = []
-for d in soup.forum.find_all('discussion'):
-    posts = []
-    for p in d.find_all('post'):
-        posts.append({
-            'id': p.get('id'),
-            'parent': p.find('parent').text,
-            'author': p.userid.text,
-            'message': p.message.get_text()
-        })
-    out['discussions'].append({
-        'id': d.get('id'),
-        'title': d.find('name').text,
-        'first_post': d.firstpost.text,
-        'posts': posts
-    })
-
-with open('parsed.json', 'w') as outfile:
-    json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
+import json
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename, 'rb') as forum:
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = soup.forum.get('id')
+out['name'] = soup.forum.find('name').text
+out['discussions'] = []
+for d in soup.forum.find_all('discussion'):
+    posts = []
+    for p in d.find_all('post'):
+        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+        posts.append({
+            'id': p.get('id'),
+            'parent': p.find('parent').text,
+            'author': p.userid.text,
+            'message': [x for x in paragraphs if x]
+        })
+    out['discussions'].append({
+        'id': d.get('id'),
+        'title': d.find('name').text,
+        'first_post': d.firstpost.text,
+        'posts': posts
+    })
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:
+    json.dump(out, outfile, ensure_ascii=False, indent=2)