From 017e560cbf0d6a609550b3d2484028cbe0e42218 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karolina=20Boczo=C5=84?= <karboc3@st.amu.edu.pl>
Date: Wed, 27 May 2020 17:36:56 +0000
Subject: [PATCH 1/2] edumatic parser added; HTML tags are back in messages

---
 backend/edumaticParser.py | 47 +++++++++++++++++++++++++++++++++++++++
 backend/xmlParser.py      |  3 +--
 2 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 backend/edumaticParser.py

diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py
new file mode 100644
index 0000000..db1c3ee
--- /dev/null
+++ b/backend/edumaticParser.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import json
+import re
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename) as forum:
+  soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
+out['name'] = soup.group.table.find('string').text
+out['discussions'] = []
+did = 50
+pid = did + 1
+# we ignore first table, and then rules 
+for d in soup.group.find_all('table')[4::2]:
+  posts = []
+  for p in d.find_all('row'):
+    posts.append({
+      'id': pid,
+      'parent': pid - 1,
+      'author': p.find_all('string')[2].text,
+      'message': render_bbcode(p.find_all('string')[1].text)
+    })
+    pid = pid + 1 
+  out['discussions'].append({
+    'id' : did,
+    'title':  d.row.find('string').text,
+    'first_post': did + 1,
+    'posts': posts
+  })
+  did = did + 50
+  pid = did + 1
+
+with open('parsed.json', 'w') as outfile:
+  json.dump(out, outfile, ensure_ascii=False, indent=2)
diff --git a/backend/xmlParser.py b/backend/xmlParser.py
index 1cdde2b..b475d14 100644
--- a/backend/xmlParser.py
+++ b/backend/xmlParser.py
@@ -22,12 +22,11 @@ out['discussions'] = []
 for d in soup.forum.find_all('discussion'):
   posts = []
   for p in d.find_all('post'):
-    message_soup = BeautifulSoup(p.message.get_text(), "xml")
     posts.append({
       'id': p.get('id'),
       'parent': p.find('parent').text,
       'author': p.userid.text,
-      'message': message_soup.get_text()
+      'message': p.message.get_text()
     })
   out['discussions'].append({
     'id':         d.get('id'),

From 7e153db9caa6ec575544f62cd6c2f3d974cf7658 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karolina=20Boczo=C5=84?= <karboc3@st.amu.edu.pl>
Date: Thu, 4 Jun 2020 21:11:05 +0000
Subject: [PATCH 2/2] paragraph splitting in parsers

---
 backend/edumaticParser.py | 97 ++++++++++++++++++++-------------------
 backend/xmlParser.py      | 82 +++++++++++++++++----------------
 2 files changed, 93 insertions(+), 86 deletions(-)

diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py
index db1c3ee..f2e2775 100644
--- a/backend/edumaticParser.py
+++ b/backend/edumaticParser.py
@@ -1,47 +1,50 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-from postmarkup import render_bbcode
-import json
-import re
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-  soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
-out['name'] = soup.group.table.find('string').text
-out['discussions'] = []
-did = 50
-pid = did + 1
-# we ignore first table, and then rules 
-for d in soup.group.find_all('table')[4::2]:
-  posts = []
-  for p in d.find_all('row'):
-    posts.append({
-      'id': pid,
-      'parent': pid - 1,
-      'author': p.find_all('string')[2].text,
-      'message': render_bbcode(p.find_all('string')[1].text)
-    })
-    pid = pid + 1 
-  out['discussions'].append({
-    'id' : did,
-    'title':  d.row.find('string').text,
-    'first_post': did + 1,
-    'posts': posts
-  })
-  did = did + 50
-  pid = did + 1
-
-with open('parsed.json', 'w') as outfile:
-  json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+"""Convert an edumatic XML forum export (path given as argument) into parsed.json."""
+import argparse
+import html
+import json
+import re
+
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+
+# command line: the only argument is the path of the XML export to convert
+parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# binary mode: hand raw bytes to BeautifulSoup so the parser detects the encoding itself
+with open(args.filename, 'rb') as forum:
+  soup = BeautifulSoup(forum, "xml")
+
+# put json together; the forum id is the number following "ID: " in the group title
+out = {'id': re.search(r'ID: (\d+)', soup.group.title.text).group(1)}
+out['name'] = soup.group.table.find('string').text
+out['discussions'] = []
+did = 50  # ids are synthesized: discussions sit on multiples of 50, their posts follow
+pid = did + 1
+# [4::2] skips the first four tables, then takes every second one (we ignore first table, and then rules)
+for d in soup.group.find_all('table')[4::2]:
+  posts = []
+  for p in d.find_all('row'):
+    strings = p.find_all('string')  # looked up once: [1] is the message body, [2] the author
+    text = html.unescape(strings[1].text)
+    paragraphs = [render_bbcode(x) for x in text.splitlines()]
+    posts.append({
+      'id': pid,
+      'parent': pid - 1,
+      'author': strings[2].text,
+      'message': [x for x in paragraphs if x]
+    })
+    pid = pid + 1
+  out['discussions'].append({
+    'id' : did,
+    'title':  d.row.find('string').text,
+    'first_post': did + 1,
+    'posts': posts
+  })
+  did = ((pid - 1) // 50 + 1) * 50  # next multiple of 50 above the last post id, so 50+ posts never collide
+  pid = did + 1
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:
+  json.dump(out, outfile, ensure_ascii=False, indent=2)
diff --git a/backend/xmlParser.py b/backend/xmlParser.py
index b475d14..deb404f 100644
--- a/backend/xmlParser.py
+++ b/backend/xmlParser.py
@@ -1,39 +1,43 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-import json
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-  soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = soup.forum.get('id')
-out['name'] = soup.forum.find('name').text
-out['discussions'] = []
-for d in soup.forum.find_all('discussion'):
-  posts = []
-  for p in d.find_all('post'):
-    posts.append({
-      'id': p.get('id'),
-      'parent': p.find('parent').text,
-      'author': p.userid.text,
-      'message': p.message.get_text()
-    })
-  out['discussions'].append({
-    'id':         d.get('id'),
-    'title':      d.find('name').text,
-    'first_post': d.firstpost.text,
-    'posts': posts
-  })
-
-with open('parsed.json', 'w') as outfile:
-  json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+"""Convert an XML forum export (forum > discussion > post) into parsed.json."""
+import argparse
+import html
+import json
+
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+
+# command line: the only argument is the path of the XML export to convert
+parser = argparse.ArgumentParser(description='Process some xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# binary mode: hand raw bytes to BeautifulSoup instead of decoding with the locale default
+with open(args.filename, 'rb') as forum:
+  soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {'id': soup.forum.get('id')}
+out['name'] = soup.forum.find('name').text
+out['discussions'] = []
+for d in soup.forum.find_all('discussion'):
+  posts = []
+  for p in d.find_all('post'):
+    # unescape the message markup, re-parse it as HTML and render one entry per <p>
+    post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+    paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+    posts.append({
+      'id': p.get('id'),
+      'parent': p.find('parent').text,
+      'author': p.userid.text,
+      'message': [x for x in paragraphs if x]
+    })
+  out['discussions'].append({
+    'id':         d.get('id'),
+    'title':      d.find('name').text,
+    'first_post': d.firstpost.text,
+    'posts': posts
+  })
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:
+  json.dump(out, outfile, ensure_ascii=False, indent=2)
\ No newline at end of file