45 KiB
45 KiB
import lzma
# Read the xz-compressed training file; each line is decoded from UTF-8
# and kept (including its trailing newline) as one document string.
with lzma.open('train/in.tsv.xz') as f:
    NDAs = [raw.decode('utf-8') for raw in f]
# Read expected information: one row per line, split on single spaces into
# raw 'label=value' tokens.
expected = []
# Explicit UTF-8 so the result does not depend on the platform's default
# encoding (the companion in.tsv.xz file is decoded as UTF-8 as well).
with open('train/expected.tsv', encoding='utf-8') as f:
    for line in f:
        expected.append(line.rstrip('\n').split(' '))
import re
# Maps a zero-padded month number ('01'..'12') to its English month name.
months = {
    '%02d' % position: name
    for position, name in enumerate(
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
        start=1,
    )
}
def dayToWord(day):
    """Return the day of the month as an English ordinal, e.g. '1st', '22nd'.

    Args:
        day: Day number as an int or a numeric string; leading zeros are
            dropped ('05' -> '5th').

    Returns:
        The number followed by its ordinal suffix.

    Raises:
        ValueError: If ``day`` cannot be converted to an integer.
    """
    day = int(day)
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        # The suffix is determined by the last digit (21st, 22nd, 23rd, 31st).
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix
# English words for 1-20 and the multiples of ten up to 90.
_NUMBER_WORDS = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
    15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen',
    19: 'nineteen', 20: 'twenty',
    30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}


def numToWord(number):
    """Spell out an integer from 1 to 99 in English (e.g. 21 -> 'twenty-one').

    Args:
        number: An int or a numeric string.

    Returns:
        The English words for values 1-99. Any other input (non-numeric
        text, 0, negatives, 100 and above) is returned unchanged, so a
        string argument always yields a string that callers can concatenate.
    """
    try:
        value = int(number)
    except (TypeError, ValueError):
        # Not a number at all: hand the input back untouched.
        return number
    if value in _NUMBER_WORDS:
        return _NUMBER_WORDS[value]
    if 20 < value < 100:
        tens, ones = divmod(value, 10)
        return _NUMBER_WORDS[tens * 10] + '-' + _NUMBER_WORDS[ones]
    # Outside the supported range: return the caller's original value rather
    # than the int conversion, so string inputs stay strings.
    return number
def labelJurisdiction(text, jurisdiction):
    """Find every literal occurrence of a jurisdiction name in ``text``.

    Args:
        text: The document to search.
        jurisdiction: The expected value with underscores in place of
            spaces (e.g. 'New_York').

    Returns:
        A list of ``(start, end, 'jurisdiction')`` tuples, one per
        case-sensitive match, in order of appearance.
    """
    needle = jurisdiction.replace('_', ' ')
    if not needle:
        # An empty pattern would match at every position in the text.
        return []
    # Escape the value so characters such as '.' are matched literally
    # instead of being interpreted as regex syntax.
    pattern = re.escape(needle)
    return [(match.start(), match.end(), 'jurisdiction')
            for match in re.finditer(pattern, text)]
def labelEffectiveDate(text, date):
    """Find occurrences of an ISO date in ``text`` in several written forms.

    Recognised forms are numeric ``M/D/Y`` (with and without leading zeros,
    with a four- or two-digit year), ``<ordinal> of <Month>, <year>``,
    ``<ordinal> day of <Month>, <year>`` and ``<Month> <day>, <year>``.
    Matching is case-insensitive.

    Args:
        text: The document to search.
        date: The expected date as 'YYYY-MM-DD'.

    Returns:
        A list of ``(start, end, 'effective_date')`` tuples in order of
        appearance. Spans never overlap: at each position the longest form
        is preferred, and a form is not matched when it is directly preceded
        or followed by another digit (so '1/15/2020' is not found inside
        '11/15/2020', nor '12/05/20' inside '12/05/2019').

    Raises:
        ValueError: If ``date`` does not consist of three '-'-separated
            numeric parts.
        KeyError: If the month is not one of the keys of ``months``.
    """
    year, month, day = date.split('-')
    shortYear = year[-2:]
    # Strip leading zeros numerically; taking the second character would
    # turn month '10' into '0' and day '15' into '5'.
    monthNum = str(int(month))
    dayNum = str(int(day))
    monthName = months[month.zfill(2)]
    ordinal = dayToWord(day)

    candidates = [m + '/' + d + '/' + y
                  for m in (month, monthNum)
                  for d in (day, dayNum)
                  for y in (year, shortYear)]
    candidates += [
        ordinal + ' of ' + monthName + ', ' + year,
        ordinal + ' day of ' + monthName + ', ' + year,
        monthName + ' ' + day + ', ' + year,
        monthName + ' ' + dayNum + ', ' + year,
    ]

    # One alternation, longest form first, so a single scan yields
    # non-overlapping spans and the fullest form wins at each position.
    ordered = sorted(set(candidates), key=lambda c: (-len(c), c))
    pattern = (r'(?<!\d)(?:'
               + '|'.join(re.escape(c) for c in ordered)
               + r')(?!\d)')
    return [(match.start(), match.end(), 'effective_date')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
def labelParties(text, party):
    """Find case-insensitive occurrences of a party name in ``text``.

    Args:
        text: The document to search.
        party: The expected name with underscores in place of spaces
            (e.g. 'Acme_Inc').

    Returns:
        A list of ``(start, end, 'party')`` tuples in order of appearance.
        For names containing 'Inc', the words may be separated by any run
        of non-word characters (so 'Acme_Inc' matches 'Acme, Inc.'), and a
        trailing period is included in the span when present. Other names
        must appear with single spaces between the words.
    """
    words = [word for word in party.split('_') if word]
    if not words:
        # An empty pattern would match at every position in the text.
        return []
    if 'Inc' in party:
        # Allow punctuation/whitespace between words only. A greedy '.*'
        # here would stretch the span from the first word to the end of
        # the line instead of covering just the name.
        pattern = r'\W*'.join(re.escape(word) for word in words) + r'\.?'
    else:
        pattern = re.escape(party.replace('_', ' '))
    return [(match.start(), match.end(), 'party')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
def labelTerms(text, term):
    """Find occurrences of a term such as '2_years' in ``text``.

    The quantity is matched spelled out ('two years'), as digits
    ('2 years'), or in the combined form ('two (2) years'), followed by
    whitespace and the unit. Matching is case-insensitive.

    Args:
        text: The document to search.
        term: The expected value as '<number>_<units>'.

    Returns:
        A list of ``(start, end, 'term')`` tuples in order of appearance;
        an empty list when ``term`` has no number or no unit part.
    """
    pieces = term.split('_')
    if len(pieces) < 2 or not pieces[0] or not pieces[1]:
        return []
    digits, units = pieces[0], pieces[1]
    # numToWord may hand back a non-string for values it cannot spell out,
    # so coerce before building the pattern.
    word = str(numToWord(digits))

    wordPattern = re.escape(word)
    digitPattern = re.escape(digits)
    # Most specific alternative first so 'two (2) years' is taken whole.
    alternatives = [wordPattern + r'\s*\(\s*' + digitPattern + r'\s*\)',
                    wordPattern]
    if word != digits:
        alternatives.append(digitPattern)
    # The lookbehind keeps '2 years' from matching inside '12 years' and
    # 'two years' from matching inside 'twenty-two years'.
    pattern = (r'(?<![\w-])(?:' + '|'.join(alternatives) + r')\s+'
               + re.escape(units))
    return [(match.start(), match.end(), 'term')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
# Parse every expected row into a list of (label, value) pairs.
expectEntities = []
for expect in expected:
    entities = []
    for e in expect:
        # Skip blank tokens (empty lines, doubled or trailing spaces) and
        # anything that is not a 'label=value' pair.
        if '=' not in e:
            continue
        # Split on the first '=' only, so a value containing '=' stays whole.
        label, _, entity = e.partition('=')
        entities.append((label, entity))
    expectEntities.append(entities)
# Build the training set: one (text, {'entities': [...]}) pair per document,
# where the entity list holds the (start, end, label) spans found for every
# expected value of that document.
labelers = {
    'effective_date': labelEffectiveDate,
    'jurisdiction': labelJurisdiction,
    'party': labelParties,
}
trainData = []
for index, docEntities in enumerate(expectEntities):
    spans = []
    for label, value in docEntities:
        # Any label without a dedicated labeler is treated as a term.
        finder = labelers.get(label, labelTerms)
        spans.extend(finder(NDAs[index], value))
    trainData.append((NDAs[index], {'entities': spans}))
import spacy
# from spacy.tokens import DocBin
# Pipeline setup: `model` names an existing pipeline to load, or None to
# start from a blank English one; `nIter` is the number of training passes.
model = None
nIter = 3
if model is None:
    nlp = spacy.blank('en')
    print('Created blank "en" model')
else:
    nlp = spacy.load(model)
    print('Loaded model')

# Reuse the pipeline's NER component when it has one; otherwise append it.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.add_pipe('ner', last=True)
# Output: Created blank "en" model
# Register every entity label that occurs in the training data with the
# NER component.
for _, annotations in trainData:
    for span in annotations.get('entities'):
        ner.add_label(span[2])

# Names of all pipeline components other than NER.
otherPipes = [name for name in nlp.pipe_names if name != 'ner']
# import random
from tqdm import tqdm
from spacy.training.example import Example
# Train with every component other than NER disabled.
with nlp.disable_pipes(*otherPipes):
    optimizer = nlp.begin_training()
    for itn in range(nIter):
        # NOTE: examples are visited in the same order on every pass (no shuffling).
        losses = {}
        skipped = {}  # exception type name -> number of examples skipped
        for text, annotations in tqdm(trainData):
            try:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
            except Exception as err:
                # Best effort: one bad example must not abort the run, but
                # Ctrl-C still works (KeyboardInterrupt is not an Exception)
                # and the failure is counted rather than hidden.
                kind = type(err).__name__
                skipped[kind] = skipped.get(kind, 0) + 1
        if skipped:
            print('Skipped examples by error type:', skipped)
        print(losses)
1%| | 3/254 [00:00<01:13, 3.43it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "03fd0e629b617da00c54794a8a78b24d.pdf effective_dat..." with entities "[(287, 300, 'effective_date'), (25276, 25289, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 2%|▏ | 6/254 [00:03<03:03, 1.35it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "04bf0791804e8487c91ab84eaa47a335.pdf effective_dat..." with entities "[(198, 216, 'effective_date'), (22663, 22681, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 3%|▎ | 8/254 [00:06<04:03, 1.01it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0587275477c6ad6d0d72419383e04b88.pdf effective_dat..." with entities "[(4528, 4536, 'jurisdiction'), (4604, 4612, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 4%|▎ | 9/254 [00:13<10:42, 2.62s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "05947711a24a5b7ce401911d31e19c91.pdf effective_dat..." with entities "[(18271, 18279, 'jurisdiction'), (18507, 18515, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. 
Misaligned entities ('-') will be ignored during training. warnings.warn( 6%|▌ | 14/254 [00:18<04:28, 1.12s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0859334b76224ff82c1312ae7b2b5da1.pdf effective_dat..." with entities "[(279, 296, 'effective_date'), (22981, 22998, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 7%|▋ | 17/254 [00:21<03:21, 1.18it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0c3ab1d0c8bb3b1c2f7a64f3ab584368.pdf effective_dat..." with entities "[(243, 259, 'effective_date'), (35225, 35241, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 7%|▋ | 18/254 [00:23<05:00, 1.27s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0c7b90701575b147c4ac245ca478ee7c.pdf effective_dat..." with entities "[(10058, 10065, 'jurisdiction'), (10252, 10259, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 8%|▊ | 20/254 [00:26<04:32, 1.17s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "0f446b4ed10d8d40824270d746511cca.pdf jurisdiction ..." with entities "[(261, 268, 'jurisdiction'), (901, 908, 'jurisdict...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 8%|▊ | 21/254 [00:27<05:02, 1.30s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1058cd8d541c0622ad959facd34235ea.pdf effective_dat..." with entities "[(21973, 21981, 'jurisdiction'), (46056, 46064, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 9%|▉ | 23/254 [00:30<05:02, 1.31s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "15398fb3b5f357981a8be88dc4bb376e.pdf effective_dat..." with entities "[(579, 591, 'jurisdiction'), (17167, 17179, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 9%|▉ | 24/254 [00:35<08:26, 2.20s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "154d30f607c74aa8a5f582bf84f7a5e2.pdf effective_dat..." with entities "[(379, 387, 'jurisdiction'), (22505, 22513, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 13%|█▎ | 32/254 [00:39<01:50, 2.02it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1a5847e0b968e25ddcf41ac9c6fc63b4.pdf effective_dat..." 
with entities "[(210, 227, 'effective_date'), (708, 725, 'effecti...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 14%|█▍ | 36/254 [00:43<02:24, 1.51it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1c1705ebb86fb8c9ddd2c765d1d59486.pdf effective_dat..." with entities "[(356, 373, 'effective_date'), (14632, 14649, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 15%|█▍ | 37/254 [00:45<03:27, 1.05it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "1c36bbc314ee3f0cbe059d15d4fdd36a.pdf effective_dat..." with entities "[(250, 267, 'effective_date'), (31244, 31261, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 17%|█▋ | 43/254 [00:51<02:11, 1.61it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "22526e24107177141dc9b66afed7106d.pdf effective_dat..." with entities "[(265, 273, 'jurisdiction'), (12609, 12617, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 18%|█▊ | 45/254 [00:52<02:14, 1.55it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "232b3bee703427df8e9893e4a52d5d60.pdf effective_dat..." with entities "[(16031, 16039, 'jurisdiction'), (16220, 16228, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 19%|█▉ | 48/254 [00:55<02:04, 1.65it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "247166e0245431dcf97ee884f1f07e35.pdf effective_dat..." with entities "[(156, 170, 'effective_date'), (508, 522, 'effecti...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 20%|█▉ | 50/254 [00:56<01:53, 1.80it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "2632c4c1238356489cab88d58e1a5fb0.pdf effective_dat..." with entities "[(15689, 15697, 'effective_date'), (15719, 15727, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 22%|██▏ | 56/254 [00:59<01:19, 2.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "2ab67f26bc51d57492e3f27b244fae3e.pdf effective_dat..." with entities "[(22097, 22111, 'effective_date'), (44207, 44221, ...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 24%|██▎ | 60/254 [01:05<03:01, 1.07it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "2ce3bbe2d6836d8b023c55883294fa63.pdf effective_dat..." with entities "[(9413, 9422, 'jurisdiction'), (9806, 9815, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 29%|██▉ | 74/254 [01:11<00:50, 3.57it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "376f9746de69416a9561e92517c356ee.pdf effective_dat..." with entities "[(306, 330, 'effective_date'), (9197, 9221, 'effec...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 30%|███ | 77/254 [01:12<00:54, 3.23it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "39610c6bf605fdd8d0d9bcb2aacb5e74.pdf effective_dat..." with entities "[(51002, 51012, 'jurisdiction'), (51132, 51142, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 31%|███ | 79/254 [01:22<06:30, 2.23s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "3acc6f6bdad6eaaf7ab21faea5ea95fa.pdf effective_dat..." 
with entities "[(251, 265, 'effective_date'), (23669, 23683, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 31%|███▏ | 80/254 [01:25<06:45, 2.33s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "3c19cab83f40f722fc8c1432299d7655.pdf effective_dat..." with entities "[(290, 312, 'effective_date'), (30391, 30413, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 37%|███▋ | 94/254 [01:32<00:52, 3.07it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "480fcdb1b3d02989c11ace2c69bc9ba6.pdf effective_dat..." with entities "[(22255, 22269, 'effective_date'), (44472, 44486, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 38%|███▊ | 96/254 [01:35<02:11, 1.20it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "495f7d16921a1c8531be0844db0828a4.pdf effective_dat..." with entities "[(130, 155, 'effective_date'), (19409, 19434, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 40%|███▉ | 101/254 [01:36<00:59, 2.56it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "4dc5c39e601cd476f4c2def0e6b96915.pdf effective_dat..." with entities "[(329, 343, 'jurisdiction'), (29204, 29218, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 41%|████▏ | 105/254 [01:41<01:30, 1.64it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "4f0e455a90c53f8e40e09d324aab4ea3.pdf effective_dat..." with entities "[(533, 550, 'effective_date'), (2868, 2885, 'effec...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 42%|████▏ | 106/254 [01:44<03:16, 1.33s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "4fd432d8ce6796dabc17d3838d8539a2.pdf effective_dat..." with entities "[(162, 176, 'effective_date'), (15065, 15079, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 44%|████▎ | 111/254 [01:46<01:15, 1.91it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "5b070e9583099dfdcddc9c9c811b7d44.pdf effective_dat..." with entities "[(10864, 10876, 'jurisdiction'), (23351, 23363, 'j...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 44%|████▍ | 112/254 [01:48<01:52, 1.26it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "5d18471dc0cb8c824fe86d5899aeb24b.pdf effective_dat..." with entities "[(25946, 25952, 'effective_date'), (52332, 52338, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 48%|████▊ | 122/254 [01:53<00:35, 3.71it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "6c0e2103cb185f28b0c1e9109c674836.pdf effective_dat..." with entities "[(20591, 20598, 'effective_date'), (20707, 20714, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 50%|████▉ | 126/254 [01:56<00:50, 2.56it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "6ecf1846ef305f44deb8f5c64da3b999.pdf effective_dat..." with entities "[(1140, 1156, 'effective_date'), (23227, 23243, 'e...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 53%|█████▎ | 135/254 [02:00<00:34, 3.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "73bfeebfeca04b3a804d844cbf16d7f3.pdf effective_dat..." with entities "[(3546, 3556, 'jurisdiction'), (9893, 9903, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 54%|█████▍ | 137/254 [02:01<00:38, 3.01it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7496116e8680dac321f36147b6312411.pdf effective_dat..." with entities "[(13407, 13420, 'effective_date'), (26980, 26993, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 54%|█████▍ | 138/254 [02:02<01:22, 1.41it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7684f321eb08514fa1794427e73479b9.pdf effective_dat..." with entities "[(16189, 16197, 'jurisdiction'), (16325, 16333, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 55%|█████▍ | 139/254 [02:05<02:34, 1.34s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "782c651fc7cf288ec2f8857de0d6bb58.pdf effective_dat..." with entities "[(5539, 5552, 'jurisdiction'), (11301, 11314, 'jur...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 59%|█████▊ | 149/254 [02:08<00:25, 4.18it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7cf3dfaf7afd9989de90cb3cbd8d6a83.pdf effective_dat..." with entities "[(13571, 13578, 'effective_date'), (27080, 27087, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 59%|█████▉ | 151/254 [02:09<00:48, 2.13it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "7cfa17a4165369964337c2f46c40e3a2.pdf effective_dat..." with entities "[(13099, 13104, 'jurisdiction'), (13304, 13309, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 62%|██████▏ | 157/254 [02:12<00:29, 3.33it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "82b263d025fddef5a8048b34eed91942.pdf effective_dat..." with entities "[(15481, 15493, 'jurisdiction'), (15986, 15998, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 62%|██████▏ | 158/254 [02:14<01:10, 1.37it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "83a79ed689ef320a8f65e0268de91e10.pdf effective_dat..." with entities "[(57077, 57087, 'jurisdiction'), (57207, 57217, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 63%|██████▎ | 161/254 [02:24<02:49, 1.83s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "86e9c90fa3986691fcb140266f514c7d.pdf effective_dat..." with entities "[(14567, 14575, 'jurisdiction'), (30390, 30398, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 64%|██████▍ | 162/254 [02:26<02:49, 1.84s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "8a7fedc5ffa5c2ffa424753229b52943.pdf effective_dat..." with entities "[(10588, 10594, 'jurisdiction'), (11064, 11070, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 66%|██████▌ | 167/254 [02:28<00:50, 1.71it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "988f7c53c00bb333a4b7188738a25378.pdf effective_dat..." with entities "[(7568, 7576, 'jurisdiction'), (10008, 10016, 'jur...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 68%|██████▊ | 172/254 [02:33<00:47, 1.71it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "9d70181e77cf74279fb6712c569da104.pdf effective_dat..." with entities "[(226, 239, 'effective_date'), (26381, 26394, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 69%|██████▉ | 176/254 [02:37<00:45, 1.73it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a373847e741d0b4db97466b8964a66ae.pdf effective_dat..." with entities "[(18914, 18922, 'jurisdiction'), (19057, 19065, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 70%|██████▉ | 177/254 [02:40<01:47, 1.40s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a3ba9b969b390ce8ec0f62dde48f5a1f.pdf effective_dat..." with entities "[(291, 303, 'effective_date'), (17055, 17067, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 70%|███████ | 178/254 [02:42<01:56, 1.54s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a527509f8b744d57fc406679ab2287e0.pdf effective_dat..." with entities "[(315, 337, 'effective_date'), (11945, 11967, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 72%|███████▏ | 182/254 [02:44<00:44, 1.63it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "a87ebed40675b7ed9c2d4a0721abbefb.pdf effective_dat..." with entities "[(25895, 25903, 'jurisdiction'), (55588, 55596, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 74%|███████▍ | 189/254 [02:48<00:22, 2.85it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "b960e85adabccfba6d758948a1ecc804.pdf effective_dat..." with entities "[(12062, 12075, 'effective_date'), (24158, 24171, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 77%|███████▋ | 195/254 [02:53<00:46, 1.28it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "c2149cc784d2d783c2de0c7b2f02a12f.pdf effective_dat..." with entities "[(11364, 11371, 'effective_date'), (11398, 11405, ...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 78%|███████▊ | 197/254 [02:54<00:42, 1.33it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "c4ccca5a5502597fc4a75b4ca50337df.pdf effective_dat..." with entities "[(4849, 4859, 'jurisdiction'), (11069, 11079, 'jur...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 80%|████████ | 204/254 [02:56<00:16, 3.04it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "c94fdb196d2502f60e21793b387023de.pdf effective_dat..." with entities "[(240, 256, 'effective_date'), (23602, 23618, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 81%|████████▏ | 207/254 [02:59<00:28, 1.64it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "cbbcc01ea9cfa4ec8bfa27f0f9f71088.pdf effective_dat..." with entities "[(24491, 24504, 'jurisdiction'), (24775, 24788, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 83%|████████▎ | 210/254 [03:03<00:33, 1.33it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "cf34c9403e0092eca75ed9fc61284268.pdf effective_dat..." with entities "[(256, 272, 'effective_date'), (16765, 16781, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 85%|████████▍ | 215/254 [03:05<00:16, 2.31it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "d2cedafb5d6fc0a7a2f4693f652606ef.pdf effective_dat..." with entities "[(34099, 34104, 'jurisdiction'), (34219, 34224, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 85%|████████▌ | 216/254 [03:11<01:09, 1.83s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "d50b5f4cf1b059aed9adb4d3d8953d84.pdf effective_dat..." with entities "[(218, 223, 'jurisdiction'), (15327, 15332, 'juris...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 86%|████████▌ | 219/254 [03:13<00:37, 1.06s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "d789f0680308f0638a05078c5d896b7a.pdf effective_dat..." with entities "[(292, 310, 'effective_date'), (24569, 24587, 'eff...". 
Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 90%|████████▉ | 228/254 [03:18<00:09, 2.81it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "e29c3877a103aaefcf77ebb110f981a5.pdf effective_dat..." with entities "[(24146, 24159, 'jurisdiction'), (24324, 24337, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 90%|█████████ | 229/254 [03:21<00:29, 1.18s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "e33d3ca6885f31faa68b2ab766afc86b.pdf effective_dat..." with entities "[(360, 365, 'jurisdiction'), (551, 556, 'jurisdict...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 96%|█████████▌| 243/254 [03:26<00:02, 3.98it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "f4d4ef76c5ce9b0d5bca8c55369b753c.pdf effective_dat..." with entities "[(19234, 19242, 'jurisdiction'), (19468, 19476, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. 
warnings.warn( 97%|█████████▋| 246/254 [03:29<00:04, 1.84it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "f6cf95250272fd7f3fd767819ee11255.pdf effective_dat..." with entities "[(6008, 6016, 'jurisdiction'), (14222, 14230, 'jur...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 98%|█████████▊| 248/254 [03:31<00:03, 1.78it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "fbf608b62ef498171b70fb7b36be61a0.pdf effective_dat..." with entities "[(30197, 30220, 'effective_date'), (3688, 3695, 'j...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 99%|█████████▉| 252/254 [03:32<00:00, 2.37it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text "fdf657ad612664d6f363040992f9a93c.pdf effective_dat..." with entities "[(205, 221, 'effective_date'), (18571, 18587, 'eff...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training. warnings.warn( 100%|██████████| 254/254 [03:35<00:00, 1.18it/s]
{'ner': 135477.30665303842}
100%|██████████| 254/254 [03:31<00:00, 1.20it/s]
{'ner': 2773.646277800689}
100%|██████████| 254/254 [03:33<00:00, 1.19it/s]
{'ner': 2577.063714375327}
# Smoke test: run the trained pipeline on the first training document
# and print the entities it predicts.
for text, _ in trainData[:1]:
    doc = nlp(text)
    found = [(span.text, span.label_) for span in doc.ents]
    print('Entities', found)
Entities [('New', 'jurisdiction'), ('New', 'jurisdiction'), ('New', 'jurisdiction'), ('New', 'jurisdiction')]
# Persist the trained pipeline in the 'NER' directory (relative path).
modelDir = 'NER'
nlp.to_disk(modelDir)