From bc20d5dbabaf2e28279e60076698dcd9ace0b456 Mon Sep 17 00:00:00 2001 From: Karolin Date: Sun, 3 Jan 2021 21:04:40 +0100 Subject: [PATCH] add corpus making script --- backend/edumaticParser.py | 50 -------------------------------- classifier/make_corp.py | 60 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 50 deletions(-) delete mode 100644 backend/edumaticParser.py create mode 100644 classifier/make_corp.py diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py deleted file mode 100644 index f2e2775..0000000 --- a/backend/edumaticParser.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -from bs4 import BeautifulSoup -from postmarkup import render_bbcode -import html -import json -import re - - -# arguments -parser = argparse.ArgumentParser(description='Process some edumatic xml files.') -parser.add_argument('filename', help='xml forum file') -args = parser.parse_args() - -# make a soup -with open(args.filename, 'rb') as forum: - soup = BeautifulSoup(forum, "xml") - -# put json together -out = {} -out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1) -out['name'] = soup.group.table.find('string').text -out['discussions'] = [] -did = 50 -pid = did + 1 -# we ignore first table, and then rules -for d in soup.group.find_all('table')[4::2]: - posts = [] - for p in d.find_all('row'): - text = html.unescape(p.find_all('string')[1].text) - paragraphs = [render_bbcode(x) for x in text.splitlines()] - posts.append({ - 'id': pid, - 'parent': pid - 1, - 'author': p.find_all('string')[2].text, - 'message': [x for x in paragraphs if x] - }) - pid = pid + 1 - out['discussions'].append({ - 'id' : did, - 'title': d.row.find('string').text, - 'first_post': did + 1, - 'posts': posts - }) - did = did + 50 - pid = did + 1 - -with open('parsed.json', 'w', encoding='utf-8') as outfile: - json.dump(out, outfile, ensure_ascii=False, indent=2) diff --git a/classifier/make_corp.py b/classifier/make_corp.py new 
#!/usr/bin/env python3
"""Build a plain-text corpus from exported forum messages.

Reads a file of messages separated by MESSAGE_TERMINATOR, splits each
message into paragraph-sized chunks and prints one chunk per line as
"<message_index>-<paragraph_index>\t<text>".
"""

import argparse
import re

from bs4 import BeautifulSoup
from postmarkup import render_bbcode

# ". X" — a sentence boundary, used to break up over-long paragraphs.
FULL_STOP = re.compile(r'\. [A-Z]')
# Start-of-line markers for bullet/list items: dashes, bullets, '>', '*',
# bare URLs, or enumerations like "a)" / "1.".
BULLET_POINT = re.compile(r'^([-–•>*]|http|\w[).])')
# NOTE(review): the original terminator character was lost in extraction
# (the help text read "terminated by `` symbol", i.e. an unprintable glyph).
# '\x00' is a placeholder — confirm the real separator emitted by the
# exporter before running on production data.
MESSAGE_TERMINATOR = '\x00'
# Paragraphs longer than this many characters get split at sentence ends.
MAX_PAR_LEN = 500


def split_message(message):
    """Render *message* (BBCode) and return its cleaned paragraphs.

    Quoted text (<blockquote>) is dropped, remaining HTML tags are
    stripped, over-long paragraphs are split near sentence boundaries,
    and short continuation lines (bullet items, or any line following a
    short lead-in that ends with ':') are merged into the preceding
    paragraph.  Empty paragraphs are filtered out.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')

    # Remove quotes; they may span multiple lines.
    if soup.blockquote:
        soup.blockquote.decompose()

    # Strip any remaining tags by cooking the soup a second time,
    # one rendered line at a time.
    lines = [BeautifulSoup(chunk, 'html.parser').get_text()
             for chunk in str(soup).split('\n')]
    pars = [""]

    for par in lines:
        par = par.rstrip()
        if len(par) > MAX_PAR_LEN:
            # Split roughly in half, at the first sentence boundary
            # found after the midpoint (minus a small back-off).
            i = 0
            mid = FULL_STOP.search(par, len(par) // 2 - 50)
            if mid:
                i = mid.start() + 1
            if len(par) // 2 > MAX_PAR_LEN:
                # Each half is still too long: split both halves again
                # near their own midpoints.
                halfi = i // 2 - 50
                j = 0
                left = FULL_STOP.search(par[:i], halfi)
                if left:
                    j = left.start() + 1
                k = 0
                right = FULL_STOP.search(par[i:], halfi)
                if right:
                    k = right.start() + 1
                pars += [par[:j], par[j:i], par[i:i + k], par[i + k:]]
            else:
                pars += [par[:i], par[i:]]
        # Bullet-list items continue the current paragraph.
        # (Original code referenced the undefined name BULLET_POINT while
        # binding the pattern as BULLET_FULL_STOP — fixed here.)
        elif BULLET_POINT.match(par):
            pars[-1] += ' ' + par
        # A short lead-in ending with ':' absorbs the following line.
        elif pars[-1].endswith(':') and len(pars[-1]) < 90:
            pars[-1] += ' ' + par
        # Regular paragraphs.
        else:
            pars.append(par)
    return [par.strip() for par in pars if par]


def main():
    """Parse arguments, split the input file, and print the corpus."""
    parser = argparse.ArgumentParser(
        description="Makes corpus out of forum messages. Each message "
                    "should be terminated by the MESSAGE_TERMINATOR symbol.")
    parser.add_argument('filename')
    args = parser.parse_args()

    with open(args.filename, 'r', encoding='utf-8') as raw:
        for i, message in enumerate(raw.read().split(MESSAGE_TERMINATOR)):
            for j, par in enumerate(split_message(message)):
                print(i, '-', j, '\t', par, sep='')


if __name__ == '__main__':
    main()