kleister-nda/main.ipynb
Iwona Christop 5dc80126c0 Add main
2022-05-02 14:33:16 +02:00

13 KiB

import lzma

# Read the compressed input file with lzma.
# Each element of NDAs is one decoded line of train/in.tsv.xz, kept with its
# trailing newline so character offsets refer to the raw line.
NDAs = []

with lzma.open('train/in.tsv.xz') as f:
    for line in f:
        NDAs.append(line.decode('utf-8'))

# Read expected information.
# Each element of expected is the list of space-separated fields found on the
# corresponding line of train/expected.tsv.
expected = []

# Decode explicitly as UTF-8 (like the input above) instead of relying on the
# platform default encoding.
with open('train/expected.tsv', encoding='utf-8') as f:
    for line in f:
        expected.append(line.replace('\n', '').split(' '))
import re

# Maps a zero-padded month number ('01' .. '12') to the English month name.
months = {
    '%02d' % position: name
    for position, name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

def dayToWord(day):
    """Return the day of the month as an English ordinal, e.g. '05' -> '5th'.

    Args:
        day: Day number as an int or a (possibly zero-padded) numeric string.

    Returns:
        The number without leading zeros followed by 'st', 'nd', 'rd' or 'th'.

    Raises:
        ValueError: If ``day`` cannot be converted to an integer.
    """
    day = int(day)
    # 11, 12 and 13 always take 'th'; otherwise the last digit decides
    # (21st, 22nd, 23rd, 31st, ...).
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix

def numToWord(number):
    """Spell out an integer between 1 and 99 in English words.

    Args:
        number: An int or a numeric string such as '2' or '42'.

    Returns:
        The spelled-out number ('two', 'forty-two') for values from 1 to 99.
        Integers outside that range are returned as a digit string, so the
        result can always be concatenated with other text. Input that cannot
        be converted to an integer is returned unchanged.
    """
    try:
        value = int(number)
    except (TypeError, ValueError):
        # Not a number at all: hand the input back untouched.
        return number

    d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
        19 : 'nineteen', 20 : 'twenty',
        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }

    # Direct hit: 1-20 and the round tens.
    if value in d:
        return d[value]
    # Remaining two-digit numbers are '<tens>-<ones>'.
    if 20 < value < 100:
        tens, ones = divmod(value, 10)
        return d[tens * 10] + '-' + d[ones]
    # Out of the supported range (0, negatives, 100+): fall back to digits.
    return str(value)

def labelJurisdiction(text, jurisdiction):
    """Find every occurrence of a jurisdiction name in ``text``.

    Args:
        text: The text to search.
        jurisdiction: The jurisdiction name with underscores in place of
            spaces, e.g. 'New_York'.

    Returns:
        A list of ``(start, end, 'jurisdiction')`` tuples, one per
        case-sensitive occurrence, in order of appearance. Empty when the
        name is empty.
    """
    phrase = jurisdiction.replace('_', ' ')
    if not phrase:
        # An empty pattern would match at every position with zero width.
        return []
    # The name is literal text, not a regular expression.
    pattern = re.escape(phrase)
    return [(match.start(), match.end(), 'jurisdiction')
            for match in re.finditer(pattern, text)]

def labelEffectiveDate(text, date):
    """Find occurrences of a date in ``text`` in several common spellings.

    Numeric spellings are month/day/year with the month and day both with
    and without a leading zero and the year in four- or two-digit form.
    Textual spellings are '<ordinal> of <Month>, <year>',
    '<ordinal> day of <Month>, <year>' and '<Month> <day>, <year>'.

    Args:
        text: The text to search.
        date: The date as 'YYYY-MM-DD' with a zero-padded month.

    Returns:
        A list of ``(start, end, 'effective_date')`` tuples, grouped by
        spelling. Matching is case-insensitive.

    Raises:
        ValueError: If ``date`` does not consist of three '-'-separated
            numeric parts.
        KeyError: If the month part is not a key of ``months``.
    """
    year, month, day = date.split('-')
    shortYear = year[-2:]
    monthName = months[month]
    ordinalDay = dayToWord(day)

    # Padded and unpadded variants; int() strips the leading zero correctly
    # for every value ('05' -> '5', '12' -> '12'). dict.fromkeys keeps order
    # and drops the duplicate when both variants are the same.
    monthForms = list(dict.fromkeys((month, str(int(month)))))
    dayForms = list(dict.fromkeys((day, str(int(day)))))

    dateFormats = [m + '/' + d + '/' + y
                   for m in monthForms
                   for d in dayForms
                   for y in (year, shortYear)]
    dateFormats.append(ordinalDay + ' of ' + monthName + ', ' + year)
    dateFormats.append(ordinalDay + ' day of ' + monthName + ', ' + year)
    dateFormats.extend(monthName + ' ' + d + ', ' + year for d in dayForms)

    dates = []
    for dateFormat in dict.fromkeys(dateFormats):
        # The digit guards stop a short spelling from matching inside a
        # longer one (e.g. '1/5/20' inside '11/5/2020'), which would
        # produce overlapping spans.
        pattern = r'(?<!\d)' + re.escape(dateFormat) + r'(?!\d)'
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            dates.append((match.start(), match.end(), 'effective_date'))

    return dates

def labelParties(text, party):
    """Find every occurrence of a party name in ``text``.

    Args:
        text: The text to search.
        party: The party name with underscores in place of spaces, e.g.
            'Acme_Widgets_Inc.'.

    Returns:
        A list of ``(start, end, 'party')`` tuples, one per case-insensitive
        occurrence, in order of appearance. Empty when the name is empty.
    """
    if 'Inc' in party:
        # Names containing 'Inc' are often written with a comma before the
        # suffix ("Acme, Inc."), so allow whitespace and commas between
        # words. The gap is bounded: an unbounded '.*' would stretch the
        # match to the end of the line.
        words = [re.escape(word) for word in party.split('_') if word]
        pattern = r'[\s,]+'.join(words)
    else:
        # The name is literal text, not a regular expression.
        pattern = re.escape(party.replace('_', ' '))

    if not pattern:
        # An empty pattern would match at every position with zero width.
        return []

    return [(match.start(), match.end(), 'party')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]

def labelTerms(text, term):
    """Find every occurrence of a contract term such as 'two years'.

    Args:
        text: The text to search.
        term: The term as '<number>_<units>', e.g. '2_years'. The number is
            converted with ``numToWord`` before searching.

    Returns:
        A list of ``(start, end, 'term')`` tuples, one per case-insensitive
        occurrence of '<number> <units>', in order of appearance. Empty when
        ``term`` has no number or no units part.
    """
    parts = term.split('_')
    if len(parts) < 2 or not parts[0] or not parts[1]:
        # Nothing meaningful to search for without both number and units.
        return []

    # str() guards the concatenation in case numToWord returns a non-string.
    number = str(numToWord(parts[0]))
    units = parts[1]
    # The phrase is literal text, not a regular expression.
    pattern = re.escape(number + ' ' + units)
    return [(match.start(), match.end(), 'term')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
# For every line of expected, build the list of (label, value) pairs parsed
# from its 'label=value' fields.
expectEntities = []

for expect in expected:
    entities = []
    for field in expect:
        # Split on the first '=' only, so a value containing '=' stays whole.
        label, separator, entity = field.partition('=')
        if not separator:
            # Blank fields (empty line, doubled or trailing space) carry no
            # annotation; skip them instead of failing to unpack.
            continue
        entities.append((label, entity))
    expectEntities.append(entities)
# Build the training set: one (text, {'entities': [...]}) pair per document,
# where the entity list holds every span found for each expected value.
trainData = []

# Labelling function per entity type; any other label is treated as a term.
labelers = {
    'effective_date': labelEffectiveDate,
    'jurisdiction': labelJurisdiction,
    'party': labelParties,
}

for i, documentEntities in enumerate(expectEntities):
    listOfEntities = []
    for label, value in documentEntities:
        findSpans = labelers.get(label, labelTerms)
        listOfEntities.extend(findSpans(NDAs[i], value))
    trainData.append((NDAs[i], {'entities': listOfEntities}))
import spacy
from spacy.tokens import DocBin

# Path or name of an existing spaCy model to continue from; None starts from
# a blank English pipeline.
model = None
# Number of passes over the training data.
nIter = 100

if model is None:
    nlp = spacy.blank('en')
    print('Created blank "en" model')
else:
    nlp = spacy.load(model)
    print('Loaded model')

# Reuse the pipeline's NER component when present, otherwise append one.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.add_pipe('ner', last=True)
Created blank "en" model
# Register the label of every annotated span with the NER component.
for _text, annotations in trainData:
    for _start, _end, label in annotations.get('entities'):
        ner.add_label(label)

# Names of all pipeline components except NER.
otherPipes = [name for name in nlp.pipe_names if name != 'ner']
# import random
from tqdm import tqdm

from spacy.training.example import Example

# Train only the NER component: every other pipe is disabled for the
# duration of the loop.
with nlp.disable_pipes(*otherPipes):
    optimizer = nlp.begin_training()
    for itn in range(nIter):
        # random.shuffle(trainData)
        losses = {}
        skipped = 0
        for text, annotations in tqdm(trainData):
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
            except Exception:
                # Best effort: a document whose annotations cannot be used
                # is skipped, but counted so the loss below can be judged.
                # Exception (not a bare except) keeps Ctrl-C working.
                skipped += 1
        print(losses)
        if skipped:
            print('Skipped %d of %d documents' % (skipped, len(trainData)))
  1%|          | 3/254 [00:00<01:11,  3.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "03efbda01358533c167ca9b1e6d72051.pdf	effective_dat..." with entities "[(7513, 7521, 'effective_date'), (15032, 15040, 'e...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  2%|▏         | 4/254 [00:01<02:28,  1.68it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "03fd0e629b617da00c54794a8a78b24d.pdf	effective_dat..." with entities "[(287, 300, 'effective_date'), (25276, 25289, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  2%|▏         | 6/254 [00:04<04:11,  1.01s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "04bf0791804e8487c91ab84eaa47a335.pdf	effective_dat..." with entities "[(198, 216, 'effective_date'), (22663, 22681, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  3%|▎         | 8/254 [00:07<04:37,  1.13s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0587275477c6ad6d0d72419383e04b88.pdf	effective_dat..." with entities "[(4528, 4536, 'jurisdiction'), (4604, 4612, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  4%|▎         | 9/254 [00:12<09:04,  2.22s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "05947711a24a5b7ce401911d31e19c91.pdf	effective_dat..." with entities "[(18271, 18279, 'jurisdiction'), (18507, 18515, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  6%|▌         | 14/254 [00:18<04:18,  1.08s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0859334b76224ff82c1312ae7b2b5da1.pdf	effective_dat..." with entities "[(279, 296, 'effective_date'), (22981, 22998, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  7%|▋         | 17/254 [00:20<03:29,  1.13it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0c3ab1d0c8bb3b1c2f7a64f3ab584368.pdf	effective_dat..." with entities "[(243, 259, 'effective_date'), (35225, 35241, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  7%|▋         | 18/254 [00:23<04:38,  1.18s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0c7b90701575b147c4ac245ca478ee7c.pdf	effective_dat..." with entities "[(10058, 10065, 'jurisdiction'), (10252, 10259, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  7%|▋         | 19/254 [00:25<05:25,  1.39s/it]