#!/usr/bin/env python3
|
||
|
|
||
|
import argparse
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
import re
|
||
|
from postmarkup import render_bbcode
|
||
|
|
||
|
# Sentence boundary: a full stop followed by a space and a capital letter.
# Raw string avoids the invalid '\.' escape warning of the non-raw original.
FULL_STOP = re.compile(r'\. [A-Z]')

# Start of a bullet-list item: a bullet character (-, –, •, >, *), a URL,
# or a single word character followed by ')' or '.' (e.g. "a)" or "1.").
BULLET_FULL_STOP = re.compile(r'^([-–•>*]|http|\w[).])')
|
def split_message(message):
    """Split one BBCode forum message into a list of paragraph strings.

    Renders the BBCode to HTML, drops quoted blocks, splits on ``<br/>``
    line breaks, then post-processes the paragraphs:

    * paragraphs longer than 500 chars are cut at sentence boundaries
      near the middle (into two, or four pieces for very long ones);
    * bullet-list items and continuations of a short "lead-in:" line are
      merged into the preceding paragraph.

    Returns a list of non-empty, stripped paragraph strings.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')

    # remove quotes, they may be multiline
    if soup.blockquote:
        soup.blockquote.decompose()

    # Remove other tags, cooking the soup twice.
    # BUGFIX: pin 'html.parser' here too — the original omitted it, so the
    # result depended on whichever parser bs4 picked as "best" installed.
    m = [BeautifulSoup(p, 'html.parser').get_text()
         for p in str(soup).split('<br/>')]
    pars = [""]

    for par in m:
        par = par.rstrip()
        # long paragraphs: cut at a sentence boundary near the middle
        if len(par) > 500:
            i = 0
            if FULL_STOP.search(par, len(par)//2 - 50):
                i = FULL_STOP.search(par, len(par)//2 - 50).start() + 1
            if len(par)//2 > 500:
                # still too long after one cut: split each half again
                # NOTE(review): halfi is derived from i, while the guard
                # uses len(par) — looks intentional but worth confirming
                halfi = i//2 - 50
                j = 0
                if FULL_STOP.search(par[:i], halfi):
                    j = FULL_STOP.search(par[:i], halfi).start() + 1
                k = 0
                if FULL_STOP.search(par[i:], halfi):
                    k = FULL_STOP.search(par[i:], halfi).start() + 1
                pars += [par[:j], par[j:i], par[i:i+k], par[i+k:]]
            else:
                pars += [par[:i], par[i:]]
        # bullet lists: keep list items attached to the preceding paragraph
        # BUGFIX: the original referenced BULLET_POINT, which is undefined —
        # the module-level constant is named BULLET_FULL_STOP.
        elif BULLET_FULL_STOP.match(par):
            pars[-1] += ' ' + par
        # a short "lead-in:" line absorbs its continuation
        elif pars[-1].endswith(':') and len(pars[-1]) < 90:
            pars[-1] += ' ' + par
        # regular paragraphs
        else:
            pars.append(par)

    return [par.strip() for par in pars if par]
|
# Command-line interface: one positional argument, the raw forum dump.
arg_parser = argparse.ArgumentParser(
    description="Makes corpus out of forum "
                "messages. Each message should be terminated by `<end>` symbol.")
arg_parser.add_argument('filename')
cli_args = arg_parser.parse_args()

# Emit one line per paragraph, formatted as "<msg#>-<par#>\t<paragraph>".
with open(cli_args.filename, 'r') as dump:
    for msg_no, message in enumerate(dump.read().split('<end>')):
        for par_no, paragraph in enumerate(split_message(message)):
            print(msg_no, '-', par_no, '\t', paragraph, sep='')