PRI_2020-FE/classifier/make_corp.py
2021-01-03 21:05:00 +01:00

61 lines
1.8 KiB
Python

#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
import re
from postmarkup import render_bbcode
# Sentence boundary: a period, one space, then an uppercase ASCII letter.
# The match starts on the period itself.
FULL_STOP = re.compile(r'\. [A-Z]')
# Start of a line that looks like a list item or a bare link: a bullet-like
# character, the literal text "http", or one word character followed by
# ")" or "." (e.g. "1)" or "a.").
BULLET_FULL_STOP = re.compile(r'^([-–•>*]|http|\w[).])')
def _index_after_full_stop(text, pos):
    """Return the index just past the first ``FULL_STOP`` period found from ``pos``.

    Returns 0 when ``FULL_STOP`` does not match; callers slice with the
    result, so 0 yields an empty leading piece, i.e. "no split".
    """
    match = FULL_STOP.search(text, pos)
    return match.start() + 1 if match else 0


def split_message(message):
    """Split one BBCode forum message into a list of paragraph strings.

    The message is rendered to HTML with ``render_bbcode``, block quotes are
    dropped, and the remaining markup is cut at ``<br/>`` into raw lines.
    Lines are then regrouped:

    * a line longer than 500 characters is cut in two at a sentence boundary
      searched from 50 characters before its middle (in four when half its
      length still exceeds 500);
    * a line that matches ``BULLET_FULL_STOP`` is appended to the previous
      paragraph;
    * any line following a paragraph that ends with ``:`` and is shorter
      than 90 characters is appended to that paragraph;
    * every other line starts a new paragraph.

    Args:
        message: BBCode source of a single message.

    Returns:
        The non-empty paragraphs, each stripped of surrounding whitespace.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')
    # remove quotes, they may be multiline; a message may contain several,
    # so keep going until no <blockquote> is left
    while soup.blockquote is not None:
        soup.blockquote.decompose()
    # remove other tags, cooking the soup twice (same parser both times so
    # the result does not depend on which optional parsers are installed)
    lines = [BeautifulSoup(chunk, 'html.parser').get_text()
             for chunk in str(soup).split('<br/>')]
    pars = [""]
    for par in lines:
        par = par.rstrip()
        # long paragraphs
        if len(par) > 500:
            i = _index_after_full_stop(par, len(par) // 2 - 50)
            if len(par) // 2 > 500:
                # Each half is still long: try to halve both once more.
                # halfi is negative when i == 0 (no middle split found).
                # NOTE(review): re appears to treat a negative pos as 0 --
                # confirm; the original code relied on the same behaviour.
                halfi = i // 2 - 50
                j = _index_after_full_stop(par[:i], halfi)
                k = _index_after_full_stop(par[i:], halfi)
                pars += [par[:j], par[j:i], par[i:i + k], par[i + k:]]
            else:
                pars += [par[:i], par[i:]]
        # bullet lists
        elif BULLET_FULL_STOP.match(par):
            pars[-1] += ' ' + par
        # continuation of a short lead-in such as "Options:"
        elif pars[-1].endswith(':') and len(pars[-1]) < 90:
            pars[-1] += ' ' + par
        # regular paragraphs
        else:
            pars.append(par)
    # empty pieces (initial seed, failed splits, blank lines) are dropped
    return [par.strip() for par in pars if par]
def main():
    """Print the corpus built from the message dump named on the command line.

    The input file is read whole and cut at every ``<end>`` marker. Each
    resulting message is passed to ``split_message`` and every paragraph is
    printed on its own line as ``<message index>-<paragraph index>``, a tab,
    then the paragraph text.
    """
    # arguments
    parser = argparse.ArgumentParser(
        description="Makes corpus out of forum messages. "
                    "Each message should be terminated by `<end>` symbol.")
    parser.add_argument('filename')
    args = parser.parse_args()

    with open(args.filename, 'r') as raw:
        messages = raw.read().split('<end>')

    for i, m in enumerate(messages):
        for j, p in enumerate(split_message(m)):
            print(i, '-', j, '\t', p, sep='')


# Run only when executed as a script, so importing this module for
# split_message does not parse sys.argv or read any file.
if __name__ == '__main__':
    main()