Update main.py
This commit is contained in:
parent
f28d4d4789
commit
c2748dc657
123
creatingModel.py
Normal file
123
creatingModel.py
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
import lzma
|
||||||
|
import re
|
||||||
|
|
||||||
|
months = {'01': 'January', '02': 'February', '03': 'March',
|
||||||
|
'04': 'April', '05': 'May', '06': 'June',
|
||||||
|
'07': 'July', '08': 'August', '09': 'September',
|
||||||
|
'10': 'October', '11': 'November', '12': 'December'}
|
||||||
|
|
||||||
|
|
||||||
|
def dayToWord(day):
    """Return a day of the month as an English ordinal, e.g. '1st' or '22nd'.

    Args:
        day: Day number as an int or a numeric string. A leading zero
            (as in '05') disappears in the int conversion.

    Returns:
        The decimal number followed by its ordinal suffix.

    Raises:
        ValueError: If ``day`` cannot be converted to an int.
    """
    day = int(day)
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= day % 100 <= 13:
        suffix = 'th'
    else:
        # Only the last digit decides otherwise: 21st, 22nd, 23rd, 31st ...
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')
    return str(day) + suffix
|
||||||
|
|
||||||
|
def numToWord(number):
    """Spell out a number between 1 and 99 in lowercase English words.

    Args:
        number: The value as an int or a numeric string.

    Returns:
        The spelled-out number; tens and ones are joined with a hyphen
        (for example 'twenty-one').

    Raises:
        KeyError: If the value has no entry in the lookup table
            (e.g. 0, negative numbers, or 100 and above).
        ValueError: If ``number`` cannot be converted to an int.
    """
    value = int(number)
    small = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
             'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
             'twenty']
    tens = ['thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty',
            'ninety']
    # 1..20 map one-to-one; 30..90 are the remaining round tens.
    words = dict(zip(range(1, 21), small))
    words.update(zip(range(30, 100, 10), tens))

    tensPart, onesPart = divmod(value, 10)
    # Values below twenty and round tens are looked up directly.
    if value < 20 or onesPart == 0:
        return words[value]
    return words[tensPart * 10] + '-' + words[onesPart]
|
||||||
|
|
||||||
|
def labelJurisdiction(text, jurisdiction):
    """Find every occurrence of a jurisdiction name in ``text``.

    The name is matched literally and case-sensitively. An underscore in
    ``jurisdiction`` matches either an underscore or a space in the text,
    since multi-word values in the expected data appear to use '_' as the
    word separator (labelParties and labelTerms treat it that way).

    Args:
        text: The document to search.
        jurisdiction: The jurisdiction value, e.g. 'New_York'.

    Returns:
        A list of ``(start, end, 'JURISDICTION')`` tuples with character
        offsets into ``text``, in order of appearance.
    """
    # Escape each word so characters such as '.' in 'D.C.' are literal.
    words = [re.escape(word) for word in jurisdiction.split('_') if word]
    if not words:
        # An empty pattern would "match" at every position of the text.
        return []
    pattern = '[ _]'.join(words)
    return [(match.start(), match.end(), 'JURISDICTION')
            for match in re.finditer(pattern, text)]
|
||||||
|
|
||||||
|
def labelEffectiveDate(text, date):
    """Find the effective date in ``text`` in several common spellings.

    Searched spellings (case-insensitive), with and without leading zeros on
    month and day: ``MM/DD/YYYY``, ``MM/DD/YY``, ``<Month> D, YYYY``,
    ``<ordinal> of <Month>, YYYY`` and ``<ordinal> day of <Month>, YYYY``.

    Args:
        text: The document to search.
        date: The date as ``'YYYY-MM-DD'`` with a two-digit month.

    Returns:
        A sorted list of unique ``(start, end, 'EFFECTIVE_DATE')`` tuples
        with character offsets into ``text``.

    Raises:
        ValueError: If ``date`` does not consist of three '-'-separated
            numeric parts.
        KeyError: If the month is not a key of the module-level ``months``.
    """
    year, month, day = date.split('-')
    monthName = months[month]
    ordinal = dayToWord(day)

    # int() drops a leading zero ('05' -> '5') and leaves '10'..'31' intact;
    # indexing the second character would turn '12' into '2'.
    monthForms = {month, str(int(month))}
    dayForms = {day, str(int(day))}

    variants = {ordinal + ' of ' + monthName + ', ' + year,
                ordinal + ' day of ' + monthName + ', ' + year}
    for dayForm in dayForms:
        variants.add(monthName + ' ' + dayForm + ', ' + year)
        for monthForm in monthForms:
            for yearForm in (year, year[-2:]):
                variants.add(monthForm + '/' + dayForm + '/' + yearForm)

    spans = set()
    for variant in variants:
        # The digit guards keep '5/12/20' from matching inside '05/12/2010',
        # which would otherwise produce overlapping spans for the same date.
        pattern = r'(?<!\d)' + re.escape(variant) + r'(?!\d)'
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            spans.add((match.start(), match.end(), 'EFFECTIVE_DATE'))

    return sorted(spans)
|
||||||
|
|
||||||
|
def labelParties(text, party):
    """Find every occurrence of a party name in ``text`` (case-insensitive).

    ``party`` uses '_' as the word separator. The words are matched
    literally. For names containing 'Inc' the words may be separated by any
    run of punctuation/whitespace, so 'Acme_Inc.' also finds 'Acme, Inc.';
    for all other names the words must be separated by a single space.

    Args:
        text: The document to search.
        party: The party name, e.g. 'CompuDyne_Corporation'.

    Returns:
        A list of ``(start, end, 'PARTY')`` tuples with character offsets
        into ``text``, in order of appearance.
    """
    # Escape each word so '.', '(' etc. in company names are literal.
    words = [re.escape(word) for word in party.split('_') if word]
    if not words:
        # An empty pattern would "match" at every position of the text.
        return []

    # A bounded separator replaces the former greedy '(.*)', which extended
    # every match to the end of the line.
    separator = r'\W+' if 'Inc' in party else ' '
    pattern = separator.join(words)

    return [(match.start(), match.end(), 'PARTY')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
|
||||||
|
|
||||||
|
def labelTerms(text, term):
    """Find the agreement term, written in words, in ``text``.

    ``term`` has the form ``'<number>_<units>'`` (e.g. ``'2_years'``). The
    number is spelled out with numToWord and the phrase
    ``'<words> <units>'`` is searched case-insensitively.

    Args:
        text: The document to search.
        term: The term value, number and units separated by '_'.

    Returns:
        A list of ``(start, end, 'TERM')`` tuples with character offsets
        into ``text``, in order of appearance.

    Raises:
        IndexError: If ``term`` contains no '_'.
    """
    parts = term.split('_')
    number = numToWord(parts[0])
    units = parts[1]
    # The lookbehind rejects hits that are the tail of a longer number word,
    # e.g. 'two years' inside 'twenty-two years'.
    pattern = r'(?<![\w-])' + re.escape(number + ' ' + units)
    return [(match.start(), match.end(), 'TERM')
            for match in re.finditer(pattern, text, flags=re.IGNORECASE)]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Read the NDAs: one document per line of the xz-compressed file.
    with lzma.open('train/in.tsv.xz') as f:
        NDAs = [line.decode('utf-8') for line in f]

    # Read the expected annotations; line i is paired with NDAs[i] below.
    with open('train/expected.tsv', encoding='utf-8') as f:
        expected = [line.rstrip('\n') for line in f]

    # Turn every expected line into a list of (label, value) pairs.
    # The line has to be split into fields first: iterating over the string
    # itself yields single characters, which cannot be unpacked into a pair.
    # NOTE(review): assumes fields are whitespace-separated 'label=value'
    # items - confirm against train/expected.tsv.
    expectEntities = []
    for expect in expected:
        entities = []
        for field in expect.split():
            # Split on the first '=' only so a value may itself contain '='.
            label, entity = field.split('=', 1)
            entities.append((label, entity))
        expectEntities.append(entities)

    # Training data for spaCy: (text, {'entities': [(start, end, LABEL), ...]})
    labelers = {'effective_date': labelEffectiveDate,
                'jurisdiction': labelJurisdiction,
                'party': labelParties}
    trainData = []
    # zip stops at the shorter of the two lists.
    for nda, entities in zip(NDAs, expectEntities):
        listOfEntities = []
        for label, value in entities:
            # Any label other than the three above is treated as a term.
            labeler = labelers.get(label, labelTerms)
            listOfEntities.extend(labeler(nda, value))
        trainData.append((nda, {'entities': listOfEntities}))
|
168
heSaidEdgar.ipynb
Normal file
168
heSaidEdgar.ipynb
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import lzma\n",
|
||||||
|
"\n",
|
||||||
|
"NDAs = []\n",
|
||||||
|
"with lzma.open('train/in.tsv.xz') as f:\n",
|
||||||
|
" for line in f:\n",
|
||||||
|
" NDAs.append(line.decode('utf-8'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 40,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import spacy\n",
|
||||||
|
"from spacy import displacy\n",
|
||||||
|
"\n",
|
||||||
|
"nlp = spacy.load('NER')\n",
|
||||||
|
"\n",
|
||||||
|
"text = NDAs[9]\n",
|
||||||
|
"doc = nlp(text)\n",
|
||||||
|
"\n",
|
||||||
|
"effective_date = []\n",
|
||||||
|
"jurisdiction = []\n",
|
||||||
|
"party = []\n",
|
||||||
|
"term = []\n",
|
||||||
|
"\n",
|
||||||
|
"for word in doc.ents:\n",
|
||||||
|
" if word.label_ == 'effective_date':\n",
|
||||||
|
" effective_date.append(word.text)\n",
|
||||||
|
" elif word.label_ == 'jurisdiction':\n",
|
||||||
|
" jurisdiction.append(word.text)\n",
|
||||||
|
" elif word.label_ == 'party':\n",
|
||||||
|
" party.append(word.text)\n",
|
||||||
|
" else:\n",
|
||||||
|
" term.append(word.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 44,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"12"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 44,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"jurisdiction.count('New York')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"juris = { j : jurisdiction.count(j) for j in jurisdiction}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 47,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'New York': 12}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 47,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"juris"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 39,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"CompuDyne Corporation --> party\n",
|
||||||
|
"two years --> term\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"CompuDyne Corporation --> party\n",
|
||||||
|
"two years --> term\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"CompuDyne Corporation --> party\n",
|
||||||
|
"two years --> term\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"CompuDyne Corporation --> party\n",
|
||||||
|
"two years --> term\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n",
|
||||||
|
"New York --> jurisdiction\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"text = NDAs[9]\n",
|
||||||
|
"doc = nlp(text)\n",
|
||||||
|
"for word in doc.ents:\n",
|
||||||
|
" print(word.text, '-->', word.label_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "3a5b3979b9a2fc2c8e649de363a592bbf5a2c9da164843b1adb5b45661722ad0"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.8.10 64-bit",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.10"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
20
main.ipynb
20
main.ipynb
File diff suppressed because one or more lines are too long
126
main.py
126
main.py
@ -1,123 +1,19 @@
|
|||||||
import lzma
|
import lzma
|
||||||
import re
|
import spacy
|
||||||
|
|
||||||
months = {'01': 'January', '02': 'February', '03': 'March',
|
|
||||||
'04': 'April', '05': 'May', '06': 'June',
|
|
||||||
'07': 'July', '08': 'August', '09': 'September',
|
|
||||||
'10': 'October', '11': 'November', '12': 'December'}
|
|
||||||
|
|
||||||
|
|
||||||
def dayToWord(day):
|
def readInput(dir):
|
||||||
day = int(day)
|
NDAs = []
|
||||||
if day > 3:
|
with lzma.open(dir) as f:
|
||||||
return str(day) + 'th'
|
for line in f:
|
||||||
elif day == 3:
|
NDAs.append(line.decode('utf-8'))
|
||||||
return str(day) + 'rd'
|
return NDAs
|
||||||
elif day == 2 :
|
|
||||||
return str(day) + 'nd'
|
|
||||||
else: return str(day) + 'st'
|
|
||||||
|
|
||||||
def numToWord(number):
|
|
||||||
number = int(number)
|
|
||||||
d = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
|
|
||||||
6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
|
|
||||||
11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
|
|
||||||
15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
|
|
||||||
19 : 'nineteen', 20 : 'twenty',
|
|
||||||
30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
|
|
||||||
70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }
|
|
||||||
if number < 20:
|
|
||||||
return d[number]
|
|
||||||
else:
|
|
||||||
if number % 10 == 0: return d[number]
|
|
||||||
else: return d[number // 10 * 10] + '-' + d[number % 10]
|
|
||||||
|
|
||||||
def labelJurisdiction(text, jurisdiction):
|
|
||||||
jurisdictions = []
|
|
||||||
for match in re.finditer(jurisdiction, text):
|
|
||||||
tup = (match.start(), match.end(), 'JURISDICTION')
|
|
||||||
jurisdictions.append(tup)
|
|
||||||
return jurisdictions
|
|
||||||
|
|
||||||
def labelEffectiveDate(text, date):
|
|
||||||
dates = []
|
|
||||||
year, month, day = date.split('-')
|
|
||||||
|
|
||||||
dateFormats = [month + '/' + day + '/' + year,
|
|
||||||
month + '/' + day + '/' + year[-2:],
|
|
||||||
month[1] + '/' + day + '/' + year,
|
|
||||||
month[1] + '/' + day[1] + '/' + year,
|
|
||||||
month[1] + '/' + day + '/' + year[-2:],
|
|
||||||
month[1] + '/' + day[1] + '/' + year[-2:],
|
|
||||||
dayToWord(day) + ' of ' + months[month] + ', ' + year,
|
|
||||||
dayToWord(day) + ' day of ' + months[month] + ', ' + year,
|
|
||||||
months[month] + ' ' + day + ', ' + year ]
|
|
||||||
|
|
||||||
for format in dateFormats:
|
|
||||||
for match in re.finditer(format, text, flags=re.IGNORECASE):
|
|
||||||
tup = (match.start(), match.end(), 'EFFECTIVE_DATE')
|
|
||||||
dates.append(tup)
|
|
||||||
|
|
||||||
return dates
|
|
||||||
|
|
||||||
def labelParties(text, party):
|
|
||||||
parties = []
|
|
||||||
if 'Inc' in party:
|
|
||||||
regular = ''
|
|
||||||
for word in party.split('_'):
|
|
||||||
regular += word + '(.*)'
|
|
||||||
party = regular
|
|
||||||
party = party.replace('_', ' ')
|
|
||||||
for match in re.finditer(party, text, flags=re.IGNORECASE):
|
|
||||||
tup = (match.start(), match.end(), 'PARTY')
|
|
||||||
parties.append(tup)
|
|
||||||
return parties
|
|
||||||
|
|
||||||
def labelTerms(text, term):
|
|
||||||
terms = []
|
|
||||||
term = term.split('_')
|
|
||||||
number = numToWord(term[0])
|
|
||||||
units = term[1]
|
|
||||||
for match in re.finditer(number + ' ' + units, text, flags=re.IGNORECASE):
|
|
||||||
tup = (match.start(), match.end(), 'TERM')
|
|
||||||
terms.append(tup)
|
|
||||||
return terms
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Read NDAs with lzma
|
NDAs = readInput('train/in.tsv.xz')
|
||||||
NDAs = []
|
|
||||||
with lzma.open('train/in.tsv.xz') as f:
|
|
||||||
for line in f:
|
|
||||||
NDAs.append(line.decode('utf-8'))
|
|
||||||
|
|
||||||
# Read expected information
|
ner = spacy.load('NER')
|
||||||
expected = []
|
|
||||||
with open('train/expected.tsv') as f:
|
|
||||||
for line in f:
|
|
||||||
expected.append(line.replace('\n', ''))
|
|
||||||
|
|
||||||
# Expected to labeled entities
|
for nda in NDAs:
|
||||||
expectEntities = []
|
print('pass')
|
||||||
for expect in expected:
|
|
||||||
entities = []
|
|
||||||
for e in expect:
|
|
||||||
label, entity = e.split('=')
|
|
||||||
entities.append((label, entity))
|
|
||||||
expectEntities.append(entities)
|
|
||||||
|
|
||||||
|
|
||||||
# Training data for Spacy
|
|
||||||
trainData =[]
|
|
||||||
for i in range(len(expectEntities)):
|
|
||||||
listOfEntities = []
|
|
||||||
for entity in expectEntities[i]:
|
|
||||||
if entity[0] == 'effective_date':
|
|
||||||
listOfEntities.append(labelEffectiveDate(NDAs[i], entity[1]))
|
|
||||||
elif entity[0] == 'jurisdiction':
|
|
||||||
listOfEntities.append(labelJurisdiction(NDAs[i], entity[1]))
|
|
||||||
elif entity[0] == 'party':
|
|
||||||
listOfEntities.append(labelParties(NDAs[i], entity[1]))
|
|
||||||
else: listOfEntities.append(labelTerms(NDAs[i], entity[1]))
|
|
||||||
listOfEntities = [item for sublist in listOfEntities for item in sublist]
|
|
||||||
trainData.append((NDAs[i], {'entities': listOfEntities}))
|
|
Loading…
Reference in New Issue
Block a user