In [1]:
import lzma

# RTead file with lzma
NDAs = []

with lzma.open('train/in.tsv.xz') as f:
    for line in f:
        NDAs.append(line.decode('utf-8'))

In [2]:
# Read expected information
expected = []

with open('train/expected.tsv') as f:
    for line in f:
        expected.append(line.replace('\n', '').split(' '))

In [3]:
import re

months = {'01': 'January', '02': 'February', '03': 'March', 
    '04': 'April', '05': 'May', '06': 'June',
    '07': 'July', '08': 'August', '09': 'September',
    '10': 'October', '11': 'November', '12': 'December'}

def dayToWord(day):
    day = int(day)
    if day > 3:
        return str(day) + 'th'
    elif day == 3:
        return str(day) + 'rd'
    elif day == 2 :
        return str(day) + 'nd'
    else: return str(day) + 'st'

def numToWord(number):
    try:
        number = int(number)
        d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
            6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
            11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
            15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
            19 : 'nineteen', 20 : 'twenty',
            30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
            70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }
        if number < 20:
            return d[number]
        else:
            if number % 10 == 0: return d[number]
            else: return d[number // 10 * 10] + '-' + d[number % 10]
    except:
        return number

def labelJurisdiction(text, jurisdiction):
    jurisdictions = []
    jurisdiction = jurisdiction.replace('_', ' ')
    for match in re.finditer(jurisdiction, text):
        tup = (match.start(), match.end(), 'jurisdiction')
        jurisdictions.append(tup)
    return jurisdictions

def labelEffectiveDate(text, date):
    dates = []
    year, month, day = date.split('-')
    
    dateFormats = [month + '/' + day + '/' + year,
        month + '/' + day + '/' + year[-2:], 
        month[1] + '/' + day + '/' + year, 
        month[1] + '/' + day[1] + '/' + year, 
        month[1] + '/' + day + '/' + year[-2:], 
        month[1] + '/' + day[1] + '/' + year[-2:],
        dayToWord(day) + ' of ' + months[month] + ', ' + year,
        dayToWord(day) + ' day of ' + months[month] + ', ' + year,
        months[month] + ' ' + day + ', ' + year ]

    for format in dateFormats:
        for match in re.finditer(format, text, flags=re.IGNORECASE):
            tup = (match.start(), match.end(), 'effective_date')
            dates.append(tup)

    return dates

def labelParties(text, party):
    parties = []
    if 'Inc' in party:
        regular = ''
        for word in party.split('_'):
            regular += word + '(.*)'
        party = regular
    party = party.replace('_', ' ')
    for match in re.finditer(party, text, flags=re.IGNORECASE):
        tup = (match.start(), match.end(), 'party')
        parties.append(tup)
    return parties

def labelTerms(text, term):
    terms = []
    term = term.split('_')
    number = numToWord(term[0])
    units = term[1]
    for match in re.finditer(number + ' ' + units, text, flags=re.IGNORECASE):
        tup = (match.start(), match.end(), 'term')
        terms.append(tup)
    return terms

In [4]:
expectEntities = []

for expect in expected:
    # expect = expect.split()
    entities = []
    for e in expect:
        label, entity = e.split('=')
        entities.append((label, entity))
    expectEntities.append(entities)


In [5]:
trainData =[]

for i in range(len(expectEntities)):
    listOfEntities = []
    for entity in expectEntities[i]:
        if entity[0] == 'effective_date':
            listOfEntities.append(labelEffectiveDate(NDAs[i], entity[1]))
        elif entity[0] == 'jurisdiction':
            listOfEntities.append(labelJurisdiction(NDAs[i], entity[1]))
        elif entity[0] == 'party':
            listOfEntities.append(labelParties(NDAs[i], entity[1]))
        else: listOfEntities.append(labelTerms(NDAs[i], entity[1]))
    listOfEntities = [item for sublist in listOfEntities for item in sublist]
    trainData.append((NDAs[i], {'entities': listOfEntities}))

In [6]:
import spacy
# from spacy.tokens import DocBin

model = None
nIter = 100

if model is not None:
    nlp = spacy.load(model)
    print('Loaded model')
else:
    nlp = spacy.blank('en')
    print('Created blank "en" model')

if 'ner' not in nlp.pipe_names:
    # ner = nlp.create_pipe('ner')
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

Created blank "en" model


In [7]:
for data in trainData:
    for ent in data[1].get('entities'):
        ner.add_label(ent[2])

In [8]:
otherPipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [9]:
# import random
from tqdm import tqdm

from spacy.training.example import Example

with nlp.disable_pipes(*otherPipes):
    optimizer = nlp.begin_training()
    for itn in range(nIter):
        # random.shuffle(trainData)
        losses = {}
        for text, annotations in tqdm(trainData):
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
            except:
                pass
        print(losses)

100%|██████████| 254/254 [03:49<00:00,  1.11it/s]


{'ner': 121367.40999465056}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 3064.6157787348466}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 2713.940178823442}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 2308.1697566876524}


100%|██████████| 254/254 [03:39<00:00,  1.16it/s]


{'ner': 2081.409584430913}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 2703.6273422304675}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 1873.849181939688}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 2970.6824737787374}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 1536.9394466317704}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 1567.1744353484294}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 12201.587548598394}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 1475.4802474584865}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 1637.554888988012}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 3338.4118373711162}


100%|██████████| 254/254 [03:46<00:00,  1.12it/s]


{'ner': 3628.8529922539665}


100%|██████████| 254/254 [03:45<00:00,  1.12it/s]


{'ner': 1122.9326577445718}


100%|██████████| 254/254 [03:47<00:00,  1.12it/s]


{'ner': 1056.6042950392734}


100%|██████████| 254/254 [03:38<00:00,  1.16it/s]


{'ner': 1494.7422995005986}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 1093.2139526897322}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 1161.3516471813246}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 923.4026813110729}


100%|██████████| 254/254 [03:39<00:00,  1.15it/s]


{'ner': 6822.4873693648}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 965.8564889386865}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 1895.9738231120605}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 4759.545841583874}


100%|██████████| 254/254 [03:41<00:00,  1.14it/s]


{'ner': 849.9946473153236}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 13133.735760338292}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 1132.05477693737}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 727.2229101967696}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 712.5904286187064}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 709.1340100197092}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 677.6860686316803}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 467.8078318010227}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 577.8532304094363}


100%|██████████| 254/254 [03:40<00:00,  1.15it/s]


{'ner': 593.7090201894682}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 545.9148195413269}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 5190.76835287416}


100%|██████████| 254/254 [03:39<00:00,  1.16it/s]


{'ner': 10175.43726500724}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 525.8841523092841}


100%|██████████| 254/254 [03:47<00:00,  1.12it/s]


{'ner': 652.4439105603246}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 12232.87218168916}


100%|██████████| 254/254 [03:47<00:00,  1.12it/s]


{'ner': 584.2010901025561}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 11239.083920775602}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 479.70122952070625}


100%|██████████| 254/254 [03:48<00:00,  1.11it/s]


{'ner': 432.9402811895728}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 351.3015391408119}


100%|██████████| 254/254 [03:46<00:00,  1.12it/s]


{'ner': 314.5821106558259}


100%|██████████| 254/254 [03:48<00:00,  1.11it/s]


{'ner': 674.1040463482614}


100%|██████████| 254/254 [03:51<00:00,  1.10it/s]


{'ner': 685.0549399011828}


100%|██████████| 254/254 [03:48<00:00,  1.11it/s]


{'ner': 346.72634777803324}


100%|██████████| 254/254 [03:46<00:00,  1.12it/s]


{'ner': 282.34050623250107}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 528.4151804189036}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 207.02574239415026}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 4602.387233055954}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 1327.8107613080297}


100%|██████████| 254/254 [03:46<00:00,  1.12it/s]


{'ner': 203.08545199977465}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 432.95780566657976}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 247.29635213365103}


100%|██████████| 254/254 [03:41<00:00,  1.14it/s]


{'ner': 11141.300520583844}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 196.54009389246215}


100%|██████████| 254/254 [03:40<00:00,  1.15it/s]


{'ner': 245.53696564695727}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 214.63265869778368}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 335.24755693518506}


100%|██████████| 254/254 [03:40<00:00,  1.15it/s]


{'ner': 1845.3869408136584}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 244.5266526305163}


100%|██████████| 254/254 [03:39<00:00,  1.16it/s]


{'ner': 324.677549649568}


100%|██████████| 254/254 [03:40<00:00,  1.15it/s]


{'ner': 193.7757545725294}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 214.0988065129845}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 211.30440763873858}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 311.3926866805724}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]


{'ner': 152.8372567465775}


100%|██████████| 254/254 [03:41<00:00,  1.14it/s]


{'ner': 167.19798372224142}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 269.8072879107703}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 172.78895767745234}


100%|██████████| 254/254 [03:45<00:00,  1.12it/s]


{'ner': 152.2928964526185}


100%|██████████| 254/254 [03:40<00:00,  1.15it/s]


{'ner': 241.40016408906854}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 174.4806328902659}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 430.183177284666}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 212.76146605464587}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 4264.537137806244}


100%|██████████| 254/254 [03:40<00:00,  1.15it/s]


{'ner': 138.65115249654542}


100%|██████████| 254/254 [03:41<00:00,  1.14it/s]


{'ner': 169.38315583287215}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 199.71923073314906}


100%|██████████| 254/254 [03:39<00:00,  1.16it/s]


{'ner': 141.07872416249504}


100%|██████████| 254/254 [03:39<00:00,  1.16it/s]


{'ner': 3239.6032942208617}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 125.13759326328476}


100%|██████████| 254/254 [03:39<00:00,  1.16it/s]


{'ner': 166.43782435874667}


100%|██████████| 254/254 [03:41<00:00,  1.14it/s]


{'ner': 193.75167109626358}


100%|██████████| 254/254 [03:37<00:00,  1.17it/s]


{'ner': 210.71080058930684}


100%|██████████| 254/254 [03:38<00:00,  1.16it/s]


{'ner': 94.89951866488516}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 135.40197065974007}


100%|██████████| 254/254 [03:41<00:00,  1.15it/s]


{'ner': 113.31096857552745}


100%|██████████| 254/254 [03:43<00:00,  1.14it/s]


{'ner': 115.2365843483141}


100%|██████████| 254/254 [03:42<00:00,  1.14it/s]


{'ner': 1149.9128505764934}


100%|██████████| 254/254 [03:45<00:00,  1.13it/s]


{'ner': 110.24805298311857}


100%|██████████| 254/254 [03:47<00:00,  1.12it/s]


{'ner': 122.23640644764889}


100%|██████████| 254/254 [03:50<00:00,  1.10it/s]


{'ner': 122.02146472314264}


100%|██████████| 254/254 [03:46<00:00,  1.12it/s]


{'ner': 150.26241508963884}


100%|██████████| 254/254 [03:46<00:00,  1.12it/s]


{'ner': 135.2006315147592}


100%|██████████| 254/254 [03:44<00:00,  1.13it/s]

{'ner': 160.26129673476012}





In [10]:
# Test trained model
# for text, _ in trainData[:1]:
#     doc = nlp(text)
    #print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

In [11]:
# Save the model to path
nlp.to_disk('NER')

In [13]:
NDAs[0]

'00a1d238e37ac225b8045a97953e845d.pdf\teffective_date jurisdiction party term\tEX-10.23 5 dex1023.htm COVENANT NOT TO COMPETE AND NON-DISCLOSURE AGREEMENT\\nExhibit 10.23\\nCOVENANT NOT TO COMPETE\\nAND NON-DISCLOSURE AGREEMENT\\nPARTIES:\\nEric Dean Sprunk (“EMPLOYEE”)\\nand\\nNIKE, Inc., divisions, subsidiaries\\nand affiliates. (“NIKE”):\\nRECITALS:\\nA. This Covenant Not to Compete and Non-Disclosure Agreement is executed upon initial employment or upon the EMPLOYEE’s\\nadvancement with NIKE and is a condition of such employment or advancement.\\nB. Over the course of EMPLOYEE’s employment with NIKE, EMPLOYEE will be or has been exposed to and/or is in a position to\\ndevelop confidential information peculiar to NIKE’s business and not generally known to the public as defined below (“Protected Information”). It is\\nanticipated that EMPLOYEE will continue to be exposed to Protected Information of greater sensitivity as EMPLOYEE advances in the company.\\nC. The nature of NIKE’s bus