#!/usr/bin/env python3
|
||
|
|
||
|
import argparse
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
import re
|
||
|
from postmarkup import render_bbcode
|
||
|
|
||
|
# Sentence boundary: a full stop followed by a space and a capital letter.
# Raw string avoids the invalid '\.' escape warning of the non-raw original.
FULL_STOP = re.compile(r'\. [A-Z]')

# Start of a bullet-list item: a bullet character (-, –, •, >, *), a URL,
# or a single word character followed by ')' or '.' (e.g. "a)" or "1.").
BULLET_FULL_STOP = re.compile(r'^([-–•>*]|http|\w[).])')
|
def split_message(message):
    """Split one BBCode forum message into a list of paragraph strings.

    Renders the BBCode to HTML, drops quoted blocks, splits on ``<br/>``
    line breaks, then post-processes the paragraphs:

    * paragraphs longer than 500 chars are cut at sentence boundaries
      near the middle (into two, or four pieces for very long ones);
    * bullet-list items and continuations of a short "lead-in:" line are
      merged into the preceding paragraph.

    Returns a list of non-empty, stripped paragraph strings.
    """
    soup = BeautifulSoup(render_bbcode(message), 'html.parser')

    # remove quotes, they may be multiline
    if soup.blockquote:
        soup.blockquote.decompose()

    # Remove other tags, cooking the soup twice.
    # BUGFIX: pin 'html.parser' here too — the original omitted it, so the
    # result depended on whichever parser bs4 picked as "best" installed.
    m = [BeautifulSoup(p, 'html.parser').get_text()
         for p in str(soup).split('<br/>')]
    pars = [""]

    for par in m:
        par = par.rstrip()
        # long paragraphs: cut at a sentence boundary near the middle
        if len(par) > 500:
            i = 0
            if FULL_STOP.search(par, len(par)//2 - 50):
                i = FULL_STOP.search(par, len(par)//2 - 50).start() + 1
            if len(par)//2 > 500:
                # still too long after one cut: split each half again
                # NOTE(review): halfi is derived from i, while the guard
                # uses len(par) — looks intentional but worth confirming
                halfi = i//2 - 50
                j = 0
                if FULL_STOP.search(par[:i], halfi):
                    j = FULL_STOP.search(par[:i], halfi).start() + 1
                k = 0
                if FULL_STOP.search(par[i:], halfi):
                    k = FULL_STOP.search(par[i:], halfi).start() + 1
                pars += [par[:j], par[j:i], par[i:i+k], par[i+k:]]
            else:
                pars += [par[:i], par[i:]]
        # bullet lists: keep list items attached to the preceding paragraph
        # BUGFIX: the original referenced BULLET_POINT, which is undefined —
        # the module-level constant is named BULLET_FULL_STOP.
        elif BULLET_FULL_STOP.match(par):
            pars[-1] += ' ' + par
        # a short "lead-in:" line absorbs its continuation
        elif pars[-1].endswith(':') and len(pars[-1]) < 90:
            pars[-1] += ' ' + par
        # regular paragraphs
        else:
            pars.append(par)

    return [par.strip() for par in pars if par]
|
# Command-line interface: one positional argument, the raw forum dump.
arg_parser = argparse.ArgumentParser(
    description="Makes corpus out of forum "
                "messages. Each message should be terminated by `<end>` symbol.")
arg_parser.add_argument('filename')
cli_args = arg_parser.parse_args()

# Emit one line per paragraph, formatted as "<msg#>-<par#>\t<paragraph>".
with open(cli_args.filename, 'r') as dump:
    for msg_no, message in enumerate(dump.read().split('<end>')):
        for par_no, paragraph in enumerate(split_message(message)):
            print(msg_no, '-', par_no, '\t', paragraph, sep='')