paragraph splitting in parsers

Karolina Boczoń 2020-06-04 21:11:05 +00:00
parent 017e560cbf
commit 7e153db9ca
2 changed files with 93 additions and 86 deletions

@@ -1,47 +1,50 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import json
import re

# arguments
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename) as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []

did = 50
pid = did + 1

# we ignore first table, and then rules
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        posts.append({
            'id': pid,
            'parent': pid - 1,
            'author': p.find_all('string')[2].text,
            'message': render_bbcode(p.find_all('string')[1].text)
        })
        pid = pid + 1
    out['discussions'].append({
        'id' : did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    did = did + 50
    pid = did + 1

with open('parsed.json', 'w') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re

# arguments
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []

did = 50
pid = did + 1

# we ignore first table, and then rules
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        text = html.unescape(p.find_all('string')[1].text)
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            'parent': pid - 1,
            'author': p.find_all('string')[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid = pid + 1
    out['discussions'].append({
        'id' : did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    did = did + 50
    pid = did + 1

with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
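
As a quick illustration of the new behaviour (not part of the commit): the message text is unescaped, split on line breaks, each line is rendered as BBCode, and empty results are dropped, so a post's 'message' becomes a list of paragraph strings instead of one blob. A minimal sketch with invented sample text:

from postmarkup import render_bbcode

# hypothetical two-line forum message (sample text, not taken from the data)
text = "[b]Hello[/b] everyone\n\nSee the [i]first[/i] post."
paragraphs = [render_bbcode(x) for x in text.splitlines()]
message = [x for x in paragraphs if x]  # blank lines render to '' and are filtered out
# message now holds one entry per non-empty line,
# roughly ['<strong>Hello</strong> everyone', 'See the <em>first</em> post.']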

@@ -1,39 +1,43 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
import json

# arguments
parser = argparse.ArgumentParser(description='Process some xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename) as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = soup.forum.get('id')
out['name'] = soup.forum.find('name').text
out['discussions'] = []

for d in soup.forum.find_all('discussion'):
    posts = []
    for p in d.find_all('post'):
        posts.append({
            'id': p.get('id'),
            'parent': p.find('parent').text,
            'author': p.userid.text,
            'message': p.message.get_text()
        })
    out['discussions'].append({
        'id': d.get('id'),
        'title': d.find('name').text,
        'first_post': d.firstpost.text,
        'posts': posts
    })

with open('parsed.json', 'w') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json

# arguments
parser = argparse.ArgumentParser(description='Process some xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename) as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = soup.forum.get('id')
out['name'] = soup.forum.find('name').text
out['discussions'] = []

for d in soup.forum.find_all('discussion'):
    posts = []
    for p in d.find_all('post'):
        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
        posts.append({
            'id': p.get('id'),
            'parent': p.find('parent').text,
            'author': p.userid.text,
            'message': [x for x in paragraphs if x]
        })
    out['discussions'].append({
        'id': d.get('id'),
        'title': d.find('name').text,
        'first_post': d.firstpost.text,
        'posts': posts
    })

with open('parsed.json', 'w') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
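
For this second parser the same idea leans on the HTML paragraphs already present in the export: the escaped markup is unescaped, re-parsed with lxml, and each <p> element becomes one rendered paragraph, with empty results dropped. A minimal sketch with invented sample markup, mirroring what the loop above does per post:

import html
from bs4 import BeautifulSoup
from postmarkup import render_bbcode

# hypothetical escaped message body (sample markup, not taken from the data)
raw = '&lt;p&gt;First paragraph&lt;/p&gt;&lt;p&gt;Second [b]one[/b]&lt;/p&gt;'
post_soup = BeautifulSoup(html.unescape(raw), "lxml")
paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
print([x for x in paragraphs if x])
# expected to print something like ['First paragraph', 'Second <strong>one</strong>']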