#!/usr/bin/env python3
"""Turn a dump of BBCode forum messages into a paragraph-per-line corpus.

Each message is rendered to HTML, stripped of quotes and markup and cut into
paragraphs.  Every paragraph is printed on its own line as
``<message index>-<paragraph index><TAB><text>``.
"""
import argparse
import re

from bs4 import BeautifulSoup
from postmarkup import render_bbcode

# Sentence boundary: a full stop, one space, then a capital letter.
FULL_STOP = re.compile(r'\. [A-Z]')

# Start of a line that continues the previous paragraph: a bullet character,
# a link starting with "http", or an enumerator such as "a)" or "1.".
BULLET_POINT = re.compile(r'^([-–•>*]|http|\w[).])')
# The pattern was originally defined under this name while the code looked it
# up as BULLET_POINT (a NameError at runtime); both names are kept.
BULLET_FULL_STOP = BULLET_POINT

# Line separators inside the rendered message.
# NOTE(review): the separator literal was garbled in the reviewed copy of this
# file (a raw line break inside the quotes).  Presumably it was the "<br/>" tag
# that render_bbcode emits for newlines -- confirm.  Both forms are accepted.
LINE_BREAK = re.compile(r'<br\s*/?>|\n')

# NOTE(review): the character that terminates each message was lost from the
# reviewed copy (the code read ``split('')``, which always raises ValueError).
# The ASCII record separator is an assumed default -- confirm against the real
# input files, or pass --separator on the command line.
MESSAGE_SEPARATOR = '\x1e'

_LONG_PARAGRAPH = 500   # paragraphs longer than this many characters are cut
_SEARCH_MARGIN = 50     # how far before a midpoint the sentence search starts
_SHORT_HEADING = 90     # a "...:" paragraph shorter than this absorbs the next


def _sentence_break(text, pos):
    """Return the index right after the first full stop at or after *pos*.

    Returns 0 when *text* contains no sentence boundary from *pos* onwards.
    """
    match = FULL_STOP.search(text, pos)
    return match.start() + 1 if match else 0


def _split_long_paragraph(par):
    """Cut an over-long paragraph at sentence ends near its middle.

    The paragraph is cut in two; when each half would still be longer than
    the limit, both halves are cut again, giving four pieces.  Pieces may be
    empty when no sentence boundary is found.
    """
    middle = _sentence_break(par, len(par) // 2 - _SEARCH_MARGIN)
    if len(par) // 2 <= _LONG_PARAGRAPH:
        return [par[:middle], par[middle:]]

    # The same offset is used for both halves, measured from each half's start.
    quarter = middle // 2 - _SEARCH_MARGIN
    first = _sentence_break(par[:middle], quarter)
    second = _sentence_break(par[middle:], quarter)
    return [par[:first], par[first:middle],
            par[middle:middle + second], par[middle + second:]]


def split_message(message):
    """Split one BBCode forum message into a list of plain-text paragraphs.

    The message is rendered to HTML, every ``<blockquote>`` is dropped, and
    the remaining markup is cut at line breaks and reduced to text.  Lines
    are then regrouped:

    * a line longer than 500 characters is cut at sentence boundaries;
    * a line that looks like a list item or link is appended to the
      previous paragraph;
    * any line following a short paragraph that ends with ``:`` is appended
      to that paragraph;
    * every other line starts a new paragraph.

    Returns the paragraphs in order, stripped of surrounding whitespace;
    empty paragraphs are left out.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')

    # Remove quotes, they may be multiline.  Looking the tag up again after
    # each removal drops every quote, not only the first one, and never
    # touches a nested quote that was already destroyed with its parent.
    quote = soup.find('blockquote')
    while quote is not None:
        quote.decompose()
        quote = soup.find('blockquote')

    # Remove the other tags by parsing each line a second time.  The parser
    # is named explicitly so both passes use the same one.
    lines = [BeautifulSoup(piece, 'html.parser').get_text()
             for piece in LINE_BREAK.split(str(soup))]

    pars = ['']
    for line in lines:
        par = line.rstrip()
        if len(par) > _LONG_PARAGRAPH:
            pars.extend(_split_long_paragraph(par))
        elif BULLET_POINT.match(par):
            # bullet lists stay with the paragraph that introduces them
            pars[-1] += ' ' + par
        elif pars[-1].endswith(':') and len(pars[-1]) < _SHORT_HEADING:
            pars[-1] += ' ' + par
        else:
            pars.append(par)
    return [par.strip() for par in pars if par]


def main():
    """Read the file named on the command line and print its paragraphs."""
    parser = argparse.ArgumentParser(
        description="Makes corpus out of forum messages. Each message "
                    "should be terminated by the separator symbol.")
    parser.add_argument('filename')
    parser.add_argument(
        '--separator', default=MESSAGE_SEPARATOR,
        help="symbol that terminates each message (default: %(default)r)")
    args = parser.parse_args()
    if not args.separator:
        # str.split('') raises ValueError; report it as a usage error instead.
        parser.error('--separator must not be empty')

    with open(args.filename, 'r') as raw:
        messages = raw.read().split(args.separator)

    for i, message in enumerate(messages):
        for j, par in enumerate(split_message(message)):
            print(i, '-', j, '\t', par, sep='')


if __name__ == '__main__':
    main()