# PRI_2020-FE/classifier/make_corp.py

#!/usr/bin/env python3

import argparse

from bs4 import BeautifulSoup
import re
from postmarkup import render_bbcode

# End of a sentence inside running text: a full stop, one space, then an
# upper-case ASCII letter. Raw strings keep the regex escapes from being
# treated as (invalid) Python string escapes.
FULL_STOP = re.compile(r'\. [A-Z]')
# Start of a line that continues the previous paragraph: a bullet character
# (-, –, •, >, *), a URL beginning with "http", or a one-character
# enumerator followed by ")" or "." (e.g. "1)" or "a.").
BULLET_FULL_STOP = re.compile(r'^([-–•>*]|http|\w[).])')
# Descriptive alias for the same pattern; it matches bullet-list openers,
# not full stops.
BULLET_POINT = BULLET_FULL_STOP

def _sentence_break(text, start):
  """Return the index just past the first FULL_STOP match at or after `start`.

  A negative `start` is treated as 0. Returns 0 when there is no match, so
  that slicing at the result leaves the whole text in the second piece.
  """
  found = FULL_STOP.search(text, max(start, 0))
  return found.start() + 1 if found else 0

def _split_long_paragraph(par):
  """Cut an over-long paragraph into two or four pieces at sentence ends.

  The paragraph is first cut at the first sentence end found from 50
  characters before its middle. Paragraphs longer than about 1000 characters
  get each half cut once more. Pieces may be empty when no sentence end is
  found; the caller filters those out.
  """
  middle = _sentence_break(par, len(par)//2 - 50)
  if len(par)//2 <= 500:
    return [par[:middle], par[middle:]]

  head, tail = par[:middle], par[middle:]
  # NOTE(review): both halves are searched from the same offset, which is
  # derived from the length of the first half only.
  offset = middle//2 - 50
  head_cut = _sentence_break(head, offset)
  tail_cut = _sentence_break(tail, offset)
  return [head[:head_cut], head[head_cut:], tail[:tail_cut], tail[tail_cut:]]

def split_message(message):
  """Split one forum message into a list of plain-text paragraphs.

  The message is passed through render_bbcode and parsed as HTML, quoted
  blocks are dropped, and the remaining markup is split on `<br/>`. Each
  resulting line is then handled as follows:

  * longer than 500 characters: cut into 2 or 4 pieces at sentence ends;
  * starts like a bullet item or URL (BULLET_FULL_STOP): glued to the
    previous paragraph;
  * follows a short (< 90 characters) paragraph ending in ':': glued to it;
  * anything else: starts a new paragraph.

  Returns the non-empty paragraphs with surrounding whitespace stripped.
  """
  soup = BeautifulSoup(render_bbcode(message), 'html.parser')

  # remove every quote, not just the first one; they may be multiline
  while soup.blockquote is not None:
    soup.blockquote.decompose()

  # remove other tags, cooking the soup twice; the parser is named explicitly
  # so the result does not depend on which parsers happen to be installed
  lines = [BeautifulSoup(chunk, 'html.parser').get_text()
           for chunk in str(soup).split('<br/>')]
  pars = [""]

  for par in lines:
    par = par.rstrip()
    if len(par) > 500:
      # long paragraphs
      pars += _split_long_paragraph(par)
    elif BULLET_FULL_STOP.match(par):
      # bullet lists
      pars[-1] += '  ' + par
    elif pars[-1].endswith(':') and len(pars[-1]) < 90:
      # a short lead-in line ("...:") introduces what follows
      pars[-1] += '  ' + par
    else:
      # regular paragraphs
      pars.append(par)
  return [par.strip() for par in pars if par]

# command line: a single positional argument naming the input file
parser = argparse.ArgumentParser(description="Makes corpus out of forum \
	messages. Each message should be terminated by `<end>` symbol.")
parser.add_argument('filename')
args = parser.parse_args()

# Print one line per paragraph: "<message index>-<paragraph index>", a tab,
# then the paragraph text. Messages are separated by the `<end>` marker.
with open(args.filename, 'r') as raw:
  messages = raw.read().split('<end>')
  for msg_no, message in enumerate(messages):
    for par_no, paragraph in enumerate(split_message(message)):
      print(f'{msg_no}-{par_no}\t{paragraph}')