add corpus making script

This commit is contained in:
Karolin 2021-01-03 21:04:40 +01:00
parent c1b41a7bd5
commit bc20d5dbab
2 changed files with 60 additions and 50 deletions

View File

@@ -1,50 +0,0 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re
# ---- command-line arguments ----
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# ---- parse the forum export ----
# The export is XML; bs4's "xml" backend keeps tag names case-sensitive.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# ---- assemble the JSON structure ----
out = {}
# The numeric forum id is embedded in the <title> text as "ID: <number>".
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []
# Id scheme: discussion ids advance in steps of 50, and the posts of a
# discussion take consecutive ids right after it (first post = did + 1).
# NOTE(review): assumes fewer than 50 posts per discussion, otherwise pid
# would run into the next discussion's id range — confirm against the data.
did = 50
pid = did + 1
# we ignore first table, and then rules
# (hence starting at index 4 and taking every second <table>)
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        # Second <string> of a row holds the HTML-escaped BBCode body.
        text = html.unescape(p.find_all('string')[1].text)
        # Render each source line separately; blank results are dropped below.
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            # Posts form a simple chain: each replies to the previous id.
            'parent': pid - 1,
            # Third <string> of a row is the author name.
            'author': p.find_all('string')[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid = pid + 1
    out['discussions'].append({
        'id' : did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    # Jump to the next discussion's id block and reset the post counter.
    did = did + 50
    pid = did + 1

# ---- write the result ----
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)

60
classifier/make_corp.py Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
import re
from postmarkup import render_bbcode
# Heuristic sentence boundary: a full stop followed by a space and a capital
# letter; used to cut over-long paragraphs near their middle.
# Raw strings: '\.' and '\w' are invalid escapes in a plain string literal
# (SyntaxWarning on modern Python) — regex patterns must be raw.
FULL_STOP = re.compile(r'\. [A-Z]')
# Lines that look like list items: dash/en-dash/bullet/'>'/'*', a URL, or an
# enumerator such as "a)" or "1." — these get merged into the previous
# paragraph instead of starting a new one.
BULLET_FULL_STOP = re.compile(r'^([-–•>*]|http|\w[).])')
def split_message(message):
    """Split one BBCode forum message into a list of clean text paragraphs.

    The message is rendered to HTML, block quotes are removed, and the text
    is split on ``<br/>``.  Paragraphs longer than 500 characters are cut
    near sentence boundaries (``FULL_STOP``); bullet-list lines and
    continuations of a short "header:" line are merged into the previous
    paragraph.  Empty paragraphs are dropped.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')
    # remove quotes, they may be multiline
    if soup.blockquote:
        soup.blockquote.decompose()
    # Remove remaining tags by cooking the soup twice; the explicit parser
    # avoids bs4's GuessedAtParserWarning without changing the result.
    m = [BeautifulSoup(p, 'html.parser').get_text()
         for p in str(soup).split('<br/>')]
    pars = [""]
    for par in m:
        par = par.rstrip()
        # Long paragraphs: cut at a sentence boundary near the middle.
        if len(par) > 500:
            mid = FULL_STOP.search(par, len(par)//2 - 50)
            i = mid.start() + 1 if mid else 0
            if len(par)//2 > 500:
                # Still too long: cut each half near its own middle.
                halfi = i//2 - 50
                left = FULL_STOP.search(par[:i], halfi)
                j = left.start() + 1 if left else 0
                # NOTE(review): halfi is computed from the first half's
                # length; for the second half it lands before its midpoint
                # when i != len(par)//2 — confirm this is intended.
                right = FULL_STOP.search(par[i:], halfi)
                k = right.start() + 1 if right else 0
                pars += [par[:j], par[j:i], par[i:i+k], par[i+k:]]
            else:
                pars += [par[:i], par[i:]]
        # Bullet lists stay attached to the paragraph that introduced them.
        # Bug fix: the original referenced BULLET_POINT, which is undefined
        # (NameError); the compiled pattern is named BULLET_FULL_STOP.
        elif BULLET_FULL_STOP.match(par):
            pars[-1] += ' ' + par
        # Continuation of a short "header:" line.
        elif pars[-1].endswith(':') and len(pars[-1]) < 90:
            pars[-1] += ' ' + par
        # regular paragraphs
        else:
            pars.append(par)
    return [par.strip() for par in pars if par]
# ---- command-line interface ----
arg_parser = argparse.ArgumentParser(
    description="Makes corpus out of forum "
                "messages. Each message should be terminated by `<end>` symbol.")
arg_parser.add_argument('filename')
cli = arg_parser.parse_args()

# Read the raw dump once, then emit one line per paragraph in the form
# "<message index>-<paragraph index>\t<paragraph text>".
with open(cli.filename, 'r') as raw:
    messages = raw.read().split('<end>')
for msg_no, message in enumerate(messages):
    for par_no, paragraph in enumerate(split_message(message)):
        print(f"{msg_no}-{par_no}\t{paragraph}")