Add main
This commit is contained in:
parent
4b09fb6937
commit
5dc80126c0
303
main.ipynb
Normal file
303
main.ipynb
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import lzma\n",
|
||||||
|
"\n",
|
||||||
|
"# Read file with lzma\n",
|
||||||
|
"NDAs = []\n",
|
||||||
|
"\n",
|
||||||
|
"with lzma.open('train/in.tsv.xz') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" NDAs.append(line.decode('utf-8'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Read expected information\n",
|
||||||
|
"expected = []\n",
|
||||||
|
"\n",
|
||||||
|
"with open('train/expected.tsv') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" expected.append(line.replace('\\n', '').split(' '))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
|
"months = {'01': 'January', '02': 'February', '03': 'March', \n",
|
||||||
|
" '04': 'April', '05': 'May', '06': 'June',\n",
|
||||||
|
" '07': 'July', '08': 'August', '09': 'September',\n",
|
||||||
|
" '10': 'October', '11': 'November', '12': 'December'}\n",
|
||||||
|
"\n",
|
||||||
|
"def dayToWord(day):\n",
|
||||||
|
" day = int(day)\n",
|
||||||
|
" if day > 3:\n",
|
||||||
|
" return str(day) + 'th'\n",
|
||||||
|
" elif day == 3:\n",
|
||||||
|
" return str(day) + 'rd'\n",
|
||||||
|
" elif day == 2 :\n",
|
||||||
|
" return str(day) + 'nd'\n",
|
||||||
|
" else: return str(day) + 'st'\n",
|
||||||
|
"\n",
|
||||||
|
"def numToWord(number):\n",
|
||||||
|
" try:\n",
|
||||||
|
" number = int(number)\n",
|
||||||
|
" d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',\n",
|
||||||
|
" 6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',\n",
|
||||||
|
" 11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',\n",
|
||||||
|
" 15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',\n",
|
||||||
|
" 19 : 'nineteen', 20 : 'twenty',\n",
|
||||||
|
" 30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',\n",
|
||||||
|
" 70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }\n",
|
||||||
|
" if number < 20:\n",
|
||||||
|
" return d[number]\n",
|
||||||
|
" else:\n",
|
||||||
|
" if number % 10 == 0: return d[number]\n",
|
||||||
|
" else: return d[number // 10 * 10] + '-' + d[number % 10]\n",
|
||||||
|
" except:\n",
|
||||||
|
" return number\n",
|
||||||
|
"\n",
|
||||||
|
"def labelJurisdiction(text, jurisdiction):\n",
|
||||||
|
" jurisdictions = []\n",
|
||||||
|
" jurisdiction = jurisdiction.replace('_', ' ')\n",
|
||||||
|
" for match in re.finditer(jurisdiction, text):\n",
|
||||||
|
" tup = (match.start(), match.end(), 'jurisdiction')\n",
|
||||||
|
" jurisdictions.append(tup)\n",
|
||||||
|
" return jurisdictions\n",
|
||||||
|
"\n",
|
||||||
|
"def labelEffectiveDate(text, date):\n",
|
||||||
|
" dates = []\n",
|
||||||
|
" year, month, day = date.split('-')\n",
|
||||||
|
" \n",
|
||||||
|
" dateFormats = [month + '/' + day + '/' + year,\n",
|
||||||
|
" month + '/' + day + '/' + year[-2:], \n",
|
||||||
|
" month[1] + '/' + day + '/' + year, \n",
|
||||||
|
" month[1] + '/' + day[1] + '/' + year, \n",
|
||||||
|
" month[1] + '/' + day + '/' + year[-2:], \n",
|
||||||
|
" month[1] + '/' + day[1] + '/' + year[-2:],\n",
|
||||||
|
" dayToWord(day) + ' of ' + months[month] + ', ' + year,\n",
|
||||||
|
" dayToWord(day) + ' day of ' + months[month] + ', ' + year,\n",
|
||||||
|
" months[month] + ' ' + day + ', ' + year ]\n",
|
||||||
|
"\n",
|
||||||
|
" for format in dateFormats:\n",
|
||||||
|
" for match in re.finditer(format, text, flags=re.IGNORECASE):\n",
|
||||||
|
" tup = (match.start(), match.end(), 'effective_date')\n",
|
||||||
|
" dates.append(tup)\n",
|
||||||
|
"\n",
|
||||||
|
" return dates\n",
|
||||||
|
"\n",
|
||||||
|
"def labelParties(text, party):\n",
|
||||||
|
" parties = []\n",
|
||||||
|
" if 'Inc' in party:\n",
|
||||||
|
" regular = ''\n",
|
||||||
|
" for word in party.split('_'):\n",
|
||||||
|
" regular += word + '(.*)'\n",
|
||||||
|
" party = regular\n",
|
||||||
|
" party = party.replace('_', ' ')\n",
|
||||||
|
" for match in re.finditer(party, text, flags=re.IGNORECASE):\n",
|
||||||
|
" tup = (match.start(), match.end(), 'party')\n",
|
||||||
|
" parties.append(tup)\n",
|
||||||
|
" return parties\n",
|
||||||
|
"\n",
|
||||||
|
"def labelTerms(text, term):\n",
|
||||||
|
" terms = []\n",
|
||||||
|
" term = term.split('_')\n",
|
||||||
|
" number = numToWord(term[0])\n",
|
||||||
|
" units = term[1]\n",
|
||||||
|
" for match in re.finditer(number + ' ' + units, text, flags=re.IGNORECASE):\n",
|
||||||
|
" tup = (match.start(), match.end(), 'term')\n",
|
||||||
|
" terms.append(tup)\n",
|
||||||
|
" return terms"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"expectEntities = []\n",
|
||||||
|
"\n",
|
||||||
|
"for expect in expected:\n",
|
||||||
|
" # expect = expect.split()\n",
|
||||||
|
" entities = []\n",
|
||||||
|
" for e in expect:\n",
|
||||||
|
" label, entity = e.split('=')\n",
|
||||||
|
" entities.append((label, entity))\n",
|
||||||
|
" expectEntities.append(entities)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"trainData =[]\n",
|
||||||
|
"\n",
|
||||||
|
"for i in range(len(expectEntities)):\n",
|
||||||
|
" listOfEntities = []\n",
|
||||||
|
" for entity in expectEntities[i]:\n",
|
||||||
|
" if entity[0] == 'effective_date':\n",
|
||||||
|
" listOfEntities.append(labelEffectiveDate(NDAs[i], entity[1]))\n",
|
||||||
|
" elif entity[0] == 'jurisdiction':\n",
|
||||||
|
" listOfEntities.append(labelJurisdiction(NDAs[i], entity[1]))\n",
|
||||||
|
" elif entity[0] == 'party':\n",
|
||||||
|
" listOfEntities.append(labelParties(NDAs[i], entity[1]))\n",
|
||||||
|
" else: listOfEntities.append(labelTerms(NDAs[i], entity[1]))\n",
|
||||||
|
" listOfEntities = [item for sublist in listOfEntities for item in sublist]\n",
|
||||||
|
" trainData.append((NDAs[i], {'entities': listOfEntities}))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Created blank \"en\" model\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import spacy\n",
|
||||||
|
"from spacy.tokens import DocBin\n",
|
||||||
|
"\n",
|
||||||
|
"model = None\n",
|
||||||
|
"nIter = 100\n",
|
||||||
|
"\n",
|
||||||
|
"if model is not None:\n",
|
||||||
|
" nlp = spacy.load(model)\n",
|
||||||
|
" print('Loaded model')\n",
|
||||||
|
"else:\n",
|
||||||
|
" nlp = spacy.blank('en')\n",
|
||||||
|
" print('Created blank \"en\" model')\n",
|
||||||
|
"\n",
|
||||||
|
"if 'ner' not in nlp.pipe_names:\n",
|
||||||
|
" # ner = nlp.create_pipe('ner')\n",
|
||||||
|
" ner = nlp.add_pipe('ner', last=True)\n",
|
||||||
|
"else:\n",
|
||||||
|
" ner = nlp.get_pipe('ner')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for data in trainData:\n",
|
||||||
|
" for ent in data[1].get('entities'):\n",
|
||||||
|
" ner.add_label(ent[2])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"otherPipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" 1%| | 3/254 [00:00<01:11, 3.49it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"03efbda01358533c167ca9b1e6d72051.pdf\teffective_dat...\" with entities \"[(7513, 7521, 'effective_date'), (15032, 15040, 'e...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 2%|▏ | 4/254 [00:01<02:28, 1.68it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"03fd0e629b617da00c54794a8a78b24d.pdf\teffective_dat...\" with entities \"[(287, 300, 'effective_date'), (25276, 25289, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 2%|▏ | 6/254 [00:04<04:11, 1.01s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"04bf0791804e8487c91ab84eaa47a335.pdf\teffective_dat...\" with entities \"[(198, 216, 'effective_date'), (22663, 22681, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 3%|▎ | 8/254 [00:07<04:37, 1.13s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0587275477c6ad6d0d72419383e04b88.pdf\teffective_dat...\" with entities \"[(4528, 4536, 'jurisdiction'), (4604, 4612, 'juris...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 4%|▎ | 9/254 [00:12<09:04, 2.22s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"05947711a24a5b7ce401911d31e19c91.pdf\teffective_dat...\" with entities \"[(18271, 18279, 'jurisdiction'), (18507, 18515, 'j...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 6%|▌ | 14/254 [00:18<04:18, 1.08s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0859334b76224ff82c1312ae7b2b5da1.pdf\teffective_dat...\" with entities \"[(279, 296, 'effective_date'), (22981, 22998, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 7%|▋ | 17/254 [00:20<03:29, 1.13it/s]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0c3ab1d0c8bb3b1c2f7a64f3ab584368.pdf\teffective_dat...\" with entities \"[(243, 259, 'effective_date'), (35225, 35241, 'eff...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 7%|▋ | 18/254 [00:23<04:38, 1.18s/it]/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/spacy/training/iob_utils.py:141: UserWarning: [W030] Some entities could not be aligned in the text \"0c7b90701575b147c4ac245ca478ee7c.pdf\teffective_dat...\" with entities \"[(10058, 10065, 'jurisdiction'), (10252, 10259, 'j...\". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.\n",
|
||||||
|
" warnings.warn(\n",
|
||||||
|
" 7%|▋ | 19/254 [00:25<05:25, 1.39s/it]"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# import random\n",
|
||||||
|
"from tqdm import tqdm\n",
|
||||||
|
"\n",
|
||||||
|
"from spacy.training.example import Example\n",
|
||||||
|
"\n",
|
||||||
|
"with nlp.disable_pipes(*otherPipes):\n",
|
||||||
|
" optimizer = nlp.begin_training()\n",
|
||||||
|
" for itn in range(nIter):\n",
|
||||||
|
" # random.shuffle(trainData)\n",
|
||||||
|
" losses = {}\n",
|
||||||
|
" for text, annotations in tqdm(trainData):\n",
|
||||||
|
" try:\n",
|
||||||
|
" doc = nlp.make_doc(text)\n",
|
||||||
|
" example = Example.from_dict(doc, annotations)\n",
|
||||||
|
" nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)\n",
|
||||||
|
" except:\n",
|
||||||
|
" pass\n",
|
||||||
|
" print(losses)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.9.2 64-bit",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.2"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
123
main.py
Normal file
123
main.py
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
import lzma
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Maps the zero-padded month number (the middle field of a YYYY-MM-DD date)
# to the English month name used when spelling dates out in words.
months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}
|
||||||
|
|
||||||
|
|
||||||
|
def dayToWord(day):
    """Return a day number as an English ordinal numeral, e.g. ``'21st'``.

    Args:
        day: Day of the month as an int or a (possibly zero-padded)
            numeric string such as ``'05'``.

    Returns:
        The number without leading zeros followed by its ordinal suffix
        (``'1st'``, ``'2nd'``, ``'3rd'``, ``'4th'``, ``'11th'``, ``'22nd'``).

    Raises:
        ValueError: If ``day`` cannot be converted to an integer.
    """
    day = int(day)
    # 11, 12 and 13 are irregular: they take 'th' although they end in 1/2/3.
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        # Only the last digit decides the suffix (21st, 22nd, 23rd, 31st).
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix
|
||||||
|
|
||||||
|
# Words for every number that is not a plain "tens-ones" compound.
# Built once at import time instead of on every call.
_NUMBER_WORDS = {
    0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
    15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen',
    19: 'nineteen', 20: 'twenty',
    30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}


def numToWord(number):
    """Spell out an integer between 0 and 99 in English words.

    Args:
        number: An int or a numeric string such as ``'21'``.

    Returns:
        The spelled-out number (``'five'``, ``'twenty-one'``). Values that
        cannot be spelled out are returned as a string instead of raising:
        out-of-range integers as their decimal digits, non-numeric input
        unchanged.
    """
    try:
        value = int(number)
    except ValueError:
        # Not a number at all: hand the text back so callers can still
        # search for it verbatim.
        return str(number)

    if value in _NUMBER_WORDS:
        return _NUMBER_WORDS[value]
    if 20 < value < 100:
        tens, ones = divmod(value, 10)
        return _NUMBER_WORDS[tens * 10] + '-' + _NUMBER_WORDS[ones]
    # Negative numbers and values >= 100 have no entry; fall back to digits.
    return str(value)
|
||||||
|
|
||||||
|
def labelJurisdiction(text, jurisdiction):
    """Find every literal occurrence of a jurisdiction name in ``text``.

    Args:
        text: Document text to search.
        jurisdiction: Jurisdiction name in which words are joined by
            underscores (e.g. ``'New_York'``); underscores are treated as
            spaces when searching.

    Returns:
        A list of ``(start, end, 'JURISDICTION')`` character-offset tuples,
        in order of appearance. The search is case-sensitive.
    """
    name = jurisdiction.replace('_', ' ')
    if not name.strip():
        # An empty pattern would match at every position of the text.
        return []
    # Escape the name so characters such as '.' or '(' are matched literally.
    pattern = re.escape(name)
    return [(match.start(), match.end(), 'JURISDICTION')
            for match in re.finditer(pattern, text)]
|
||||||
|
|
||||||
|
def labelEffectiveDate(text, date):
    """Find textual renderings of an ISO date inside ``text``.

    The date is searched for in numeric ``M/D/Y`` form (with and without
    leading zeros, with a four- or two-digit year) and in the worded forms
    ``'<ordinal> of <Month>, <year>'``, ``'<ordinal> day of <Month>, <year>'``
    and ``'<Month> <day>, <year>'``. Matching is case-insensitive.

    Args:
        text: Document text to search.
        date: Date formatted as ``'YYYY-MM-DD'``.

    Returns:
        A list of unique ``(start, end, 'EFFECTIVE_DATE')`` character-offset
        tuples sorted by position.

    Raises:
        ValueError: If ``date`` does not have three ``-``-separated fields
            or its month/day fields are not numeric.
        KeyError: If the month field is not a key of ``months``.
    """
    year, month, day = date.split('-')
    shortYear = year[-2:]
    # Drop leading zeros numerically; taking the second character instead
    # would turn month '10' into '0' and day '15' into '5'.
    monthForms = (month, str(int(month)))
    dayForms = (day, str(int(day)))
    monthName = months[month]
    ordinal = dayToWord(day)

    candidates = [m + '/' + d + '/' + y
                  for m in monthForms
                  for d in dayForms
                  for y in (year, shortYear)]
    candidates.append(ordinal + ' of ' + monthName + ', ' + year)
    candidates.append(ordinal + ' day of ' + monthName + ', ' + year)
    candidates.extend(monthName + ' ' + d + ', ' + year for d in dayForms)

    spans = set()
    # dict.fromkeys removes duplicate renderings (e.g. when the month has no
    # leading zero) while keeping their order.
    for candidate in dict.fromkeys(candidates):
        # The digit guards stop a short rendering from matching inside a
        # longer one ('1/5/20' inside '11/5/2020'), which would produce
        # overlapping entity spans.
        pattern = r'(?<!\d)' + re.escape(candidate) + r'(?!\d)'
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            spans.add((match.start(), match.end()))

    return [(start, end, 'EFFECTIVE_DATE') for start, end in sorted(spans)]
|
||||||
|
|
||||||
|
def labelParties(text, party):
    """Find every occurrence of a party name in ``text``, ignoring case.

    Args:
        text: Document text to search.
        party: Party name in which words are joined by underscores
            (e.g. ``'Acme_Inc.'``).

    Returns:
        A list of ``(start, end, 'PARTY')`` character-offset tuples, in
        order of appearance. Names containing ``'Inc'`` tolerate punctuation
        or whitespace between their words (``'Acme, Inc.'``); all other
        names must appear with single spaces between the words.
    """
    # Escape each word so '.', '(' and friends in company names are literal.
    words = [re.escape(word) for word in party.split('_') if word]
    if not words:
        # An empty pattern would match at every position of the text.
        return []

    # A bounded separator keeps the match tight around the name. A greedy
    # '.*' between and after the words stretches the span to the end of
    # the line.
    separator = r'\W+' if 'Inc' in party else ' '
    pattern = separator.join(words)

    return [(match.start(), match.end(), 'PARTY')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
|
||||||
|
|
||||||
|
def labelTerms(text, term):
    """Find mentions of a contract term such as ``'two years'`` in ``text``.

    Args:
        text: Document text to search.
        term: Term encoded as ``'<number>_<units>'``, e.g. ``'2_years'``.

    Returns:
        A list of ``(start, end, 'TERM')`` character-offset tuples, in order
        of appearance. Both the spelled-out number (``'two years'``) and
        the number as written in ``term`` (``'2 years'``) are matched,
        ignoring case. A term without both a number and a unit yields an
        empty list.
    """
    parts = term.split('_')
    if len(parts) < 2 or not parts[0] or not parts[1]:
        return []
    amount, units = parts[0], parts[1]

    try:
        spelled = str(numToWord(amount))
    except (KeyError, ValueError):
        # Number cannot be spelled out; search for it as written.
        spelled = amount

    # dict.fromkeys drops the duplicate when both forms are identical.
    numberForms = '|'.join(re.escape(form)
                           for form in dict.fromkeys((spelled, amount)))
    # The lookbehind keeps 'two years' from matching inside
    # 'twenty-two years' and '2 years' from matching inside '12 years'.
    pattern = r'(?<![\w-])(?:' + numberForms + r') ' + re.escape(units)

    return [(match.start(), match.end(), 'TERM')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Read NDAs with lzma: each line of the compressed file is one document.
    with lzma.open('train/in.tsv.xz') as f:
        NDAs = [line.decode('utf-8') for line in f]

    # Read expected information: each line holds whitespace-separated
    # 'label=value' fields for the document on the same line number.
    # The line must be split into fields here; iterating the raw string
    # later would walk over single characters and fail on the unpacking.
    with open('train/expected.tsv', encoding='utf-8') as f:
        expected = [line.split() for line in f]

    # Turn the expected fields into (label, value) pairs.
    expectEntities = []
    for expect in expected:
        entities = []
        for e in expect:
            # partition splits on the first '=' only, so a value that
            # itself contains '=' stays intact.
            label, _, entity = e.partition('=')
            entities.append((label, entity))
        expectEntities.append(entities)

    # Training data for Spacy: (text, {'entities': [(start, end, label), ...]})
    labelers = {
        'effective_date': labelEffectiveDate,
        'jurisdiction': labelJurisdiction,
        'party': labelParties,
    }
    trainData = []
    for text, entities in zip(NDAs, expectEntities):
        listOfEntities = []
        for label, entity in entities:
            # Any label without a dedicated labeler is treated as a term.
            labeler = labelers.get(label, labelTerms)
            listOfEntities.extend(labeler(text, entity))
        trainData.append((text, {'entities': listOfEntities}))
|
Loading…
Reference in New Issue
Block a user