kleister-nda/main.ipynb
2022-05-02 18:54:19 +02:00

45 KiB

import lzma

# Read the compressed input file with lzma, keeping one entry per line.
NDAs = []

with lzma.open('train/in.tsv.xz') as f:
    for line in f:
        # The archive is read in binary mode, so lines are split on '\n'
        # only and each one is decoded explicitly as UTF-8.
        NDAs.append(line.decode('utf-8'))
# Read expected information: one line per document, split on single spaces.
expected = []

# Decode as UTF-8 explicitly so the labels are read the same way as the
# documents above, instead of depending on the platform default encoding.
with open('train/expected.tsv', encoding='utf-8') as f:
    for line in f:
        expected.append(line.replace('\n', '').split(' '))
import re

# Maps a zero-padded month number ('01'..'12') to its English month name.
months = {
    '%02d' % position: name
    for position, name in enumerate(
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
        start=1)
}

def dayToWord(day):
    """Return the day number followed by its English ordinal suffix.

    Args:
        day: Day of the month as an int or a numeric string; a leading
            zero is dropped ('05' -> '5th').

    Returns:
        A string such as '1st', '2nd', '3rd', '11th', '21st' or '22nd'.

    Raises:
        ValueError: If ``day`` cannot be converted to an integer.
    """
    day = int(day)
    # The teens (11th, 12th, 13th) take 'th' even though they end in 1-3.
    if 10 <= day % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix

def numToWord(number):
    """Spell out an integer from 1 to 99 in English words.

    Args:
        number: An int or a numeric string.

    Returns:
        The spelled-out number ('five', 'twenty-one', ...). A value that
        converts to an integer outside 1..99 is returned as its decimal
        string, so callers can always concatenate the result. Input that
        cannot be converted to an integer at all is returned unchanged.
    """
    original = number
    try:
        number = int(number)
    except (TypeError, ValueError):
        return original

    d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
        19 : 'nineteen', 20 : 'twenty',
        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }

    if number in d:
        return d[number]
    if 20 < number < 100:
        # Compose tens and units, e.g. 42 -> 'forty-two'.
        return d[number // 10 * 10] + '-' + d[number % 10]
    # Outside the supported range: fall back to digits, but as a string.
    return str(number)

def labelJurisdiction(text, jurisdiction):
    """Find every literal occurrence of a jurisdiction name in ``text``.

    Args:
        text: The document text to search.
        jurisdiction: The expected jurisdiction, with underscores standing
            in for spaces (e.g. 'New_York').

    Returns:
        A list of ``(start, end, 'jurisdiction')`` character-offset tuples
        in order of appearance. The search is case-sensitive. An empty
        jurisdiction yields an empty list.
    """
    jurisdiction = jurisdiction.replace('_', ' ')
    if not jurisdiction:
        # An empty pattern would match at every position in the text.
        return []
    # Escape the name so characters such as '.' or '(' are matched literally.
    pattern = re.escape(jurisdiction)
    return [(match.start(), match.end(), 'jurisdiction')
            for match in re.finditer(pattern, text)]

def labelEffectiveDate(text, date):
    """Find occurrences of an effective date written in common US formats.

    The date is searched for as numeric ``M/D/Y`` variants (with and without
    leading zeros, with a four- or two-digit year) and as written forms
    ('5th of January, 2012', '5th day of January, 2012',
    'January 5, 2012' / 'January 05, 2012'). Matching is case-insensitive.

    Args:
        text: The document text to search.
        date: The expected date as 'YYYY-MM-DD'.

    Returns:
        A list of ``(start, end, 'effective_date')`` character-offset
        tuples, grouped by format in the order the formats are tried.

    Raises:
        ValueError: If ``date`` does not have three '-'-separated parts or
            the day is not numeric.
        KeyError: If the month part is not one of '01'..'12'.
    """
    year, month, day = date.split('-')

    monthName = months[month]
    ordinal = dayToWord(day)
    # int() drops a leading zero without truncating two-digit values
    # (indexing with [1] would turn '10' into '0' and '25' into '5').
    monthForms = [month, str(int(month))]
    dayForms = [day, str(int(day))]
    yearForms = [year, year[-2:]]

    dateFormats = [m + '/' + d + '/' + y
                   for m in monthForms
                   for d in dayForms
                   for y in yearForms]
    dateFormats += [ordinal + ' of ' + monthName + ', ' + year,
                    ordinal + ' day of ' + monthName + ', ' + year]
    dateFormats += [monthName + ' ' + d + ', ' + year for d in dayForms]

    dates = []
    # dict.fromkeys removes duplicate formats (e.g. when the month has no
    # leading zero) while keeping their order.
    for dateFormat in dict.fromkeys(dateFormats):
        # The digit guards stop a short form from matching inside a longer
        # one ('1/05/2012' within '01/05/2012', '5th' within '15th'), which
        # would otherwise produce overlapping spans.
        pattern = r'(?<!\d)' + re.escape(dateFormat) + r'(?!\d)'
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            dates.append((match.start(), match.end(), 'effective_date'))

    return dates

def labelParties(text, party):
    """Find occurrences of a party name in ``text`` (case-insensitive).

    Args:
        text: The document text to search.
        party: The expected party name, with underscores standing in for
            spaces (e.g. 'Acme_Corp_Inc.').

    Returns:
        A list of ``(start, end, 'party')`` character-offset tuples in
        order of appearance. An empty party name yields an empty list.
    """
    if 'Inc' in party:
        # Names containing 'Inc' are often written with extra punctuation
        # ("Acme Corp, Inc."), so allow any run of non-word characters
        # between the words. The gap is bounded, so the span ends at the
        # last word of the name instead of running to the end of the line.
        words = [re.escape(word) for word in party.split('_') if word]
        pattern = r'\W+'.join(words)
    else:
        # Escape so that '.', '(' etc. in the name are matched literally.
        pattern = re.escape(party.replace('_', ' '))

    if not pattern:
        # An empty pattern would match at every position in the text.
        return []

    return [(match.start(), match.end(), 'party')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]

def labelTerms(text, term):
    """Find occurrences of a contract term such as 'two years' in ``text``.

    Args:
        text: The document text to search.
        term: The expected term as '<number>_<units>' (e.g. '2_years').
            Only the first two '_'-separated parts are used.

    Returns:
        A list of ``(start, end, 'term')`` character-offset tuples in order
        of appearance (case-insensitive search). A term without both a
        number and a unit yields an empty list.
    """
    parts = term.split('_')
    if len(parts) < 2 or not parts[0] or not parts[1]:
        # Nothing sensible to search for without both number and units.
        return []

    # numToWord may hand back a non-string for values it cannot spell out;
    # coerce so the concatenation below cannot raise TypeError.
    number = str(numToWord(parts[0]))
    units = parts[1]
    pattern = re.escape(number + ' ' + units)
    return [(match.start(), match.end(), 'term')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
# Parse every expected line into a list of (label, value) pairs, keeping one
# list per document so indices stay aligned with the documents.
expectEntities = []

for expect in expected:
    entities = []
    for e in expect:
        # Skip tokens without '=' (e.g. the empty token a blank line
        # produces) instead of failing on tuple unpacking.
        if '=' not in e:
            continue
        # Split on the first '=' only so a value containing '=' stays whole.
        label, _, entity = e.partition('=')
        entities.append((label, entity))
    expectEntities.append(entities)
# Build the training set: one (text, {'entities': [...]}) pair per document,
# with the spans located by the label-specific finder functions.
trainData = []

for docIndex, docEntities in enumerate(expectEntities):
    document = NDAs[docIndex]
    spans = []
    for label, value in docEntities:
        # Any label other than the three named ones is handled as a term.
        finder = {
            'effective_date': labelEffectiveDate,
            'jurisdiction': labelJurisdiction,
            'party': labelParties,
        }.get(label, labelTerms)
        spans.extend(finder(document, value))
    trainData.append((document, {'entities': spans}))
import spacy
# from spacy.tokens import DocBin

# Name/path of an existing pipeline to continue from, or None to start blank.
model = None
# Number of passes over the training data.
nIter = 3

if model is None:
    nlp = spacy.blank('en')
    print('Created blank "en" model')
else:
    nlp = spacy.load(model)
    print('Loaded model')

# Reuse the pipeline's NER component when present; otherwise append one.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.add_pipe('ner', last=True)
Created blank "en" model
# Register every entity label that occurs in the training annotations.
for _, annotations in trainData:
    for span in annotations.get('entities'):
        ner.add_label(span[2])
# Names of all pipeline components except the NER component.
otherPipes = [name for name in nlp.pipe_names if name != 'ner']
# import random
from tqdm import tqdm

from spacy.training.example import Example

# Train only the NER component; all other pipes are disabled for the
# duration of the block.
with nlp.disable_pipes(*otherPipes):
    optimizer = nlp.begin_training()
    for itn in range(nIter):
        # Documents are visited in file order on every pass (no shuffling).
        losses = {}
        skipped = 0
        for text, annotations in tqdm(trainData):
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
            except Exception:
                # Best effort: a document whose annotations cannot be
                # turned into a training example is skipped. Catching
                # Exception (not a bare except) lets KeyboardInterrupt
                # stop the run.
                skipped += 1
        print(losses)
        if skipped:
            # Report the skips so silent data loss is visible.
            print('Skipped %d of %d documents in iteration %d'
                  % (skipped, len(trainData), itn))
  1%|          | 3/254 [00:00<01:13,  3.43it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "03fd0e629b617da00c54794a8a78b24d.pdf	effective_dat..." with entities "[(287, 300, 'effective_date'), (25276, 25289, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  2%|▏         | 6/254 [00:03<03:03,  1.35it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "04bf0791804e8487c91ab84eaa47a335.pdf	effective_dat..." with entities "[(198, 216, 'effective_date'), (22663, 22681, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  3%|▎         | 8/254 [00:06<04:03,  1.01it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0587275477c6ad6d0d72419383e04b88.pdf	effective_dat..." with entities "[(4528, 4536, 'jurisdiction'), (4604, 4612, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  4%|▎         | 9/254 [00:13<10:42,  2.62s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "05947711a24a5b7ce401911d31e19c91.pdf	effective_dat..." with entities "[(18271, 18279, 'jurisdiction'), (18507, 18515, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  6%|▌         | 14/254 [00:18<04:28,  1.12s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0859334b76224ff82c1312ae7b2b5da1.pdf	effective_dat..." with entities "[(279, 296, 'effective_date'), (22981, 22998, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  7%|▋         | 17/254 [00:21<03:21,  1.18it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0c3ab1d0c8bb3b1c2f7a64f3ab584368.pdf	effective_dat..." with entities "[(243, 259, 'effective_date'), (35225, 35241, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  7%|▋         | 18/254 [00:23<05:00,  1.27s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0c7b90701575b147c4ac245ca478ee7c.pdf	effective_dat..." with entities "[(10058, 10065, 'jurisdiction'), (10252, 10259, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  8%|▊         | 20/254 [00:26<04:32,  1.17s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0f446b4ed10d8d40824270d746511cca.pdf	jurisdiction ..." with entities "[(261, 268, 'jurisdiction'), (901, 908, 'jurisdict...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  8%|▊         | 21/254 [00:27<05:02,  1.30s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1058cd8d541c0622ad959facd34235ea.pdf	effective_dat..." with entities "[(21973, 21981, 'jurisdiction'), (46056, 46064, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  9%|▉         | 23/254 [00:30<05:02,  1.31s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "15398fb3b5f357981a8be88dc4bb376e.pdf	effective_dat..." with entities "[(579, 591, 'jurisdiction'), (17167, 17179, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
  9%|▉         | 24/254 [00:35<08:26,  2.20s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "154d30f607c74aa8a5f582bf84f7a5e2.pdf	effective_dat..." with entities "[(379, 387, 'jurisdiction'), (22505, 22513, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 13%|█▎        | 32/254 [00:39<01:50,  2.02it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1a5847e0b968e25ddcf41ac9c6fc63b4.pdf	effective_dat..." with entities "[(210, 227, 'effective_date'), (708, 725, 'effecti...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 14%|█▍        | 36/254 [00:43<02:24,  1.51it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1c1705ebb86fb8c9ddd2c765d1d59486.pdf	effective_dat..." with entities "[(356, 373, 'effective_date'), (14632, 14649, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 15%|█▍        | 37/254 [00:45<03:27,  1.05it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1c36bbc314ee3f0cbe059d15d4fdd36a.pdf	effective_dat..." with entities "[(250, 267, 'effective_date'), (31244, 31261, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 17%|█▋        | 43/254 [00:51<02:11,  1.61it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "22526e24107177141dc9b66afed7106d.pdf	effective_dat..." with entities "[(265, 273, 'jurisdiction'), (12609, 12617, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 18%|█▊        | 45/254 [00:52<02:14,  1.55it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "232b3bee703427df8e9893e4a52d5d60.pdf	effective_dat..." with entities "[(16031, 16039, 'jurisdiction'), (16220, 16228, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 19%|█▉        | 48/254 [00:55<02:04,  1.65it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "247166e0245431dcf97ee884f1f07e35.pdf	effective_dat..." with entities "[(156, 170, 'effective_date'), (508, 522, 'effecti...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 20%|█▉        | 50/254 [00:56<01:53,  1.80it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "2632c4c1238356489cab88d58e1a5fb0.pdf	effective_dat..." with entities "[(15689, 15697, 'effective_date'), (15719, 15727, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 22%|██▏       | 56/254 [00:59<01:19,  2.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "2ab67f26bc51d57492e3f27b244fae3e.pdf	effective_dat..." with entities "[(22097, 22111, 'effective_date'), (44207, 44221, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 24%|██▎       | 60/254 [01:05<03:01,  1.07it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "2ce3bbe2d6836d8b023c55883294fa63.pdf	effective_dat..." with entities "[(9413, 9422, 'jurisdiction'), (9806, 9815, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 29%|██▉       | 74/254 [01:11<00:50,  3.57it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "376f9746de69416a9561e92517c356ee.pdf	effective_dat..." with entities "[(306, 330, 'effective_date'), (9197, 9221, 'effec...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 30%|███       | 77/254 [01:12<00:54,  3.23it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "39610c6bf605fdd8d0d9bcb2aacb5e74.pdf	effective_dat..." with entities "[(51002, 51012, 'jurisdiction'), (51132, 51142, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 31%|███       | 79/254 [01:22<06:30,  2.23s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "3acc6f6bdad6eaaf7ab21faea5ea95fa.pdf	effective_dat..." with entities "[(251, 265, 'effective_date'), (23669, 23683, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 31%|███▏      | 80/254 [01:25<06:45,  2.33s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "3c19cab83f40f722fc8c1432299d7655.pdf	effective_dat..." with entities "[(290, 312, 'effective_date'), (30391, 30413, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 37%|███▋      | 94/254 [01:32<00:52,  3.07it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "480fcdb1b3d02989c11ace2c69bc9ba6.pdf	effective_dat..." with entities "[(22255, 22269, 'effective_date'), (44472, 44486, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 38%|███▊      | 96/254 [01:35<02:11,  1.20it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "495f7d16921a1c8531be0844db0828a4.pdf	effective_dat..." with entities "[(130, 155, 'effective_date'), (19409, 19434, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 40%|███▉      | 101/254 [01:36<00:59,  2.56it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "4dc5c39e601cd476f4c2def0e6b96915.pdf	effective_dat..." with entities "[(329, 343, 'jurisdiction'), (29204, 29218, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 41%|████▏     | 105/254 [01:41<01:30,  1.64it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "4f0e455a90c53f8e40e09d324aab4ea3.pdf	effective_dat..." with entities "[(533, 550, 'effective_date'), (2868, 2885, 'effec...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 42%|████▏     | 106/254 [01:44<03:16,  1.33s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "4fd432d8ce6796dabc17d3838d8539a2.pdf	effective_dat..." with entities "[(162, 176, 'effective_date'), (15065, 15079, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 44%|████▎     | 111/254 [01:46<01:15,  1.91it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "5b070e9583099dfdcddc9c9c811b7d44.pdf	effective_dat..." with entities "[(10864, 10876, 'jurisdiction'), (23351, 23363, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 44%|████▍     | 112/254 [01:48<01:52,  1.26it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "5d18471dc0cb8c824fe86d5899aeb24b.pdf	effective_dat..." with entities "[(25946, 25952, 'effective_date'), (52332, 52338, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 48%|████▊     | 122/254 [01:53<00:35,  3.71it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "6c0e2103cb185f28b0c1e9109c674836.pdf	effective_dat..." with entities "[(20591, 20598, 'effective_date'), (20707, 20714, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 50%|████▉     | 126/254 [01:56<00:50,  2.56it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "6ecf1846ef305f44deb8f5c64da3b999.pdf	effective_dat..." with entities "[(1140, 1156, 'effective_date'), (23227, 23243, 'e...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 53%|█████▎    | 135/254 [02:00<00:34,  3.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "73bfeebfeca04b3a804d844cbf16d7f3.pdf	effective_dat..." with entities "[(3546, 3556, 'jurisdiction'), (9893, 9903, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 54%|█████▍    | 137/254 [02:01<00:38,  3.01it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7496116e8680dac321f36147b6312411.pdf	effective_dat..." with entities "[(13407, 13420, 'effective_date'), (26980, 26993, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 54%|█████▍    | 138/254 [02:02<01:22,  1.41it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7684f321eb08514fa1794427e73479b9.pdf	effective_dat..." with entities "[(16189, 16197, 'jurisdiction'), (16325, 16333, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 55%|█████▍    | 139/254 [02:05<02:34,  1.34s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "782c651fc7cf288ec2f8857de0d6bb58.pdf	effective_dat..." with entities "[(5539, 5552, 'jurisdiction'), (11301, 11314, 'jur...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 59%|█████▊    | 149/254 [02:08<00:25,  4.18it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7cf3dfaf7afd9989de90cb3cbd8d6a83.pdf	effective_dat..." with entities "[(13571, 13578, 'effective_date'), (27080, 27087, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 59%|█████▉    | 151/254 [02:09<00:48,  2.13it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7cfa17a4165369964337c2f46c40e3a2.pdf	effective_dat..." with entities "[(13099, 13104, 'jurisdiction'), (13304, 13309, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 62%|██████▏   | 157/254 [02:12<00:29,  3.33it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "82b263d025fddef5a8048b34eed91942.pdf	effective_dat..." with entities "[(15481, 15493, 'jurisdiction'), (15986, 15998, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 62%|██████▏   | 158/254 [02:14<01:10,  1.37it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "83a79ed689ef320a8f65e0268de91e10.pdf	effective_dat..." with entities "[(57077, 57087, 'jurisdiction'), (57207, 57217, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 63%|██████▎   | 161/254 [02:24<02:49,  1.83s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "86e9c90fa3986691fcb140266f514c7d.pdf	effective_dat..." with entities "[(14567, 14575, 'jurisdiction'), (30390, 30398, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 64%|██████▍   | 162/254 [02:26<02:49,  1.84s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "8a7fedc5ffa5c2ffa424753229b52943.pdf	effective_dat..." with entities "[(10588, 10594, 'jurisdiction'), (11064, 11070, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 66%|██████▌   | 167/254 [02:28<00:50,  1.71it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "988f7c53c00bb333a4b7188738a25378.pdf	effective_dat..." with entities "[(7568, 7576, 'jurisdiction'), (10008, 10016, 'jur...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 68%|██████▊   | 172/254 [02:33<00:47,  1.71it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "9d70181e77cf74279fb6712c569da104.pdf	effective_dat..." with entities "[(226, 239, 'effective_date'), (26381, 26394, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 69%|██████▉   | 176/254 [02:37<00:45,  1.73it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a373847e741d0b4db97466b8964a66ae.pdf	effective_dat..." with entities "[(18914, 18922, 'jurisdiction'), (19057, 19065, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 70%|██████▉   | 177/254 [02:40<01:47,  1.40s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a3ba9b969b390ce8ec0f62dde48f5a1f.pdf	effective_dat..." with entities "[(291, 303, 'effective_date'), (17055, 17067, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 70%|███████   | 178/254 [02:42<01:56,  1.54s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a527509f8b744d57fc406679ab2287e0.pdf	effective_dat..." with entities "[(315, 337, 'effective_date'), (11945, 11967, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 72%|███████▏  | 182/254 [02:44<00:44,  1.63it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a87ebed40675b7ed9c2d4a0721abbefb.pdf	effective_dat..." with entities "[(25895, 25903, 'jurisdiction'), (55588, 55596, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 74%|███████▍  | 189/254 [02:48<00:22,  2.85it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "b960e85adabccfba6d758948a1ecc804.pdf	effective_dat..." with entities "[(12062, 12075, 'effective_date'), (24158, 24171, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 77%|███████▋  | 195/254 [02:53<00:46,  1.28it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "c2149cc784d2d783c2de0c7b2f02a12f.pdf	effective_dat..." with entities "[(11364, 11371, 'effective_date'), (11398, 11405, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 78%|███████▊  | 197/254 [02:54<00:42,  1.33it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "c4ccca5a5502597fc4a75b4ca50337df.pdf	effective_dat..." with entities "[(4849, 4859, 'jurisdiction'), (11069, 11079, 'jur...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 80%|████████  | 204/254 [02:56<00:16,  3.04it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "c94fdb196d2502f60e21793b387023de.pdf	effective_dat..." with entities "[(240, 256, 'effective_date'), (23602, 23618, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 81%|████████▏ | 207/254 [02:59<00:28,  1.64it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "cbbcc01ea9cfa4ec8bfa27f0f9f71088.pdf	effective_dat..." with entities "[(24491, 24504, 'jurisdiction'), (24775, 24788, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 83%|████████▎ | 210/254 [03:03<00:33,  1.33it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "cf34c9403e0092eca75ed9fc61284268.pdf	effective_dat..." with entities "[(256, 272, 'effective_date'), (16765, 16781, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 85%|████████▍ | 215/254 [03:05<00:16,  2.31it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "d2cedafb5d6fc0a7a2f4693f652606ef.pdf	effective_dat..." with entities "[(34099, 34104, 'jurisdiction'), (34219, 34224, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 85%|████████▌ | 216/254 [03:11<01:09,  1.83s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "d50b5f4cf1b059aed9adb4d3d8953d84.pdf	effective_dat..." with entities "[(218, 223, 'jurisdiction'), (15327, 15332, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 86%|████████▌ | 219/254 [03:13<00:37,  1.06s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "d789f0680308f0638a05078c5d896b7a.pdf	effective_dat..." with entities "[(292, 310, 'effective_date'), (24569, 24587, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 90%|████████▉ | 228/254 [03:18<00:09,  2.81it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "e29c3877a103aaefcf77ebb110f981a5.pdf	effective_dat..." with entities "[(24146, 24159, 'jurisdiction'), (24324, 24337, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 90%|█████████ | 229/254 [03:21<00:29,  1.18s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "e33d3ca6885f31faa68b2ab766afc86b.pdf	effective_dat..." with entities "[(360, 365, 'jurisdiction'), (551, 556, 'jurisdict...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 96%|█████████▌| 243/254 [03:26<00:02,  3.98it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "f4d4ef76c5ce9b0d5bca8c55369b753c.pdf	effective_dat..." with entities "[(19234, 19242, 'jurisdiction'), (19468, 19476, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 97%|█████████▋| 246/254 [03:29<00:04,  1.84it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "f6cf95250272fd7f3fd767819ee11255.pdf	effective_dat..." with entities "[(6008, 6016, 'jurisdiction'), (14222, 14230, 'jur...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 98%|█████████▊| 248/254 [03:31<00:03,  1.78it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "fbf608b62ef498171b70fb7b36be61a0.pdf	effective_dat..." with entities "[(30197, 30220, 'effective_date'), (3688, 3695, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
 99%|█████████▉| 252/254 [03:32<00:00,  2.37it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "fdf657ad612664d6f363040992f9a93c.pdf	effective_dat..." with entities "[(205, 221, 'effective_date'), (18571, 18587, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  warnings.warn(
100%|██████████| 254/254 [03:35<00:00,  1.18it/s]
{'ner': 135477.30665303842}
100%|██████████| 254/254 [03:31<00:00,  1.20it/s]
{'ner': 2773.646277800689}
100%|██████████| 254/254 [03:33<00:00,  1.19it/s]
{'ner': 2577.063714375327}
# Sanity-check the trained pipeline: run it on the first training example
# only (the slice yields nothing if trainData is empty) and print the text
# and label of every entity span the model predicts.
for text, _ in trainData[:1]:
    doc = nlp(text)
    print(
        'Entities',
        [(span.text, span.label_) for span in doc.ents],
    )
Entities [('New', 'jurisdiction'), ('New', 'jurisdiction'), ('New', 'jurisdiction'), ('New', 'jurisdiction')]
# Write the trained pipeline object `nlp` to disk under the relative
# path 'NER' (resolved against the current working directory).
nlp.to_disk('NER')