add corpus making script

This commit is contained in:
Karolin 2021-01-03 21:04:40 +01:00
parent c1b41a7bd5
commit bc20d5dbab
2 changed files with 60 additions and 50 deletions

View File

@@ -1,50 +0,0 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re
# ---- command-line arguments ----
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# ---- parse the forum export ----
# The export is XML; bs4's "xml" backend keeps tag names case-sensitive.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# ---- assemble the JSON structure ----
out = {}
# The numeric forum id is embedded in the <title> text as "ID: <number>".
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []
# Id scheme: discussion ids advance in steps of 50, and the posts of a
# discussion take consecutive ids right after it (first post = did + 1).
# NOTE(review): assumes fewer than 50 posts per discussion, otherwise pid
# would run into the next discussion's id range — confirm against the data.
did = 50
pid = did + 1
# we ignore first table, and then rules
# (hence starting at index 4 and taking every second <table>)
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        # Second <string> of a row holds the HTML-escaped BBCode body.
        text = html.unescape(p.find_all('string')[1].text)
        # Render each source line separately; blank results are dropped below.
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            # Posts form a simple chain: each replies to the previous id.
            'parent': pid - 1,
            # Third <string> of a row is the author name.
            'author': p.find_all('string')[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid = pid + 1
    out['discussions'].append({
        'id' : did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    # Jump to the next discussion's id block and reset the post counter.
    did = did + 50
    pid = did + 1

# ---- write the result ----
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)

60
classifier/make_corp.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
import re
from postmarkup import render_bbcode
# Heuristic sentence boundary: a full stop followed by a space and a capital
# letter; used to cut over-long paragraphs near their middle.
# Raw strings: '\.' and '\w' are invalid escapes in a plain string literal
# (SyntaxWarning on modern Python) — regex patterns must be raw.
FULL_STOP = re.compile(r'\. [A-Z]')
# Lines that look like list items: dash/en-dash/bullet/'>'/'*', a URL, or an
# enumerator such as "a)" or "1." — these get merged into the previous
# paragraph instead of starting a new one.
BULLET_FULL_STOP = re.compile(r'^([-–•>*]|http|\w[).])')
def split_message(message):
    """Split one BBCode forum message into a list of clean text paragraphs.

    The message is rendered to HTML, block quotes are removed, and the text
    is split on ``<br/>``.  Paragraphs longer than 500 characters are cut
    near sentence boundaries (``FULL_STOP``); bullet-list lines and
    continuations of a short "header:" line are merged into the previous
    paragraph.  Empty paragraphs are dropped.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')
    # remove quotes, they may be multiline
    if soup.blockquote:
        soup.blockquote.decompose()
    # Remove remaining tags by cooking the soup twice; the explicit parser
    # avoids bs4's GuessedAtParserWarning without changing the result.
    m = [BeautifulSoup(p, 'html.parser').get_text()
         for p in str(soup).split('<br/>')]
    pars = [""]
    for par in m:
        par = par.rstrip()
        # Long paragraphs: cut at a sentence boundary near the middle.
        if len(par) > 500:
            mid = FULL_STOP.search(par, len(par)//2 - 50)
            i = mid.start() + 1 if mid else 0
            if len(par)//2 > 500:
                # Still too long: cut each half near its own middle.
                halfi = i//2 - 50
                left = FULL_STOP.search(par[:i], halfi)
                j = left.start() + 1 if left else 0
                # NOTE(review): halfi is computed from the first half's
                # length; for the second half it lands before its midpoint
                # when i != len(par)//2 — confirm this is intended.
                right = FULL_STOP.search(par[i:], halfi)
                k = right.start() + 1 if right else 0
                pars += [par[:j], par[j:i], par[i:i+k], par[i+k:]]
            else:
                pars += [par[:i], par[i:]]
        # Bullet lists stay attached to the paragraph that introduced them.
        # Bug fix: the original referenced BULLET_POINT, which is undefined
        # (NameError); the compiled pattern is named BULLET_FULL_STOP.
        elif BULLET_FULL_STOP.match(par):
            pars[-1] += ' ' + par
        # Continuation of a short "header:" line.
        elif pars[-1].endswith(':') and len(pars[-1]) < 90:
            pars[-1] += ' ' + par
        # regular paragraphs
        else:
            pars.append(par)
    return [par.strip() for par in pars if par]
# ---- command-line interface ----
arg_parser = argparse.ArgumentParser(
    description="Makes corpus out of forum "
                "messages. Each message should be terminated by `<end>` symbol.")
arg_parser.add_argument('filename')
cli = arg_parser.parse_args()

# Read the raw dump once, then emit one line per paragraph in the form
# "<message index>-<paragraph index>\t<paragraph text>".
with open(cli.filename, 'r') as raw:
    messages = raw.read().split('<end>')
for msg_no, message in enumerate(messages):
    for par_no, paragraph in enumerate(split_message(message)):
        print(f"{msg_no}-{par_no}\t{paragraph}")