kleister-nda/main.py

141 lines
4.7 KiB
Python
Raw Normal View History

2022-05-03 20:10:12 +02:00
import lzma
2022-05-03 21:54:24 +02:00
from matplotlib.pyplot import getp
2022-05-03 20:10:12 +02:00
import spacy
2022-05-03 21:54:24 +02:00
import csv
# Zero-padded month number -> English month name. getEffectiveDate looks
# month names up in this table to build the numeric part of an ISO date.
months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}

# Characters removed from a date string before it is split into tokens.
# Same character set as string.punctuation; the backslash appears twice.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'

# The 50 US state names, in alphabetical order, used to normalise the
# jurisdiction entities.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# NOTE: despite its name this maps integer -> English number word;
# getTerm searches it in reverse (word -> integer).
wordToNumber = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
    15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen',
    19: 'nineteen', 20: 'twenty',
    30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}
2022-05-03 20:10:12 +02:00
def readInput(dir):
    """Read an xz-compressed text file and return its lines.

    Args:
        dir: path of the ``.xz`` file to read (despite the name, a file
            path, not a directory).

    Returns:
        A list with one UTF-8 decoded string per line of the file; each
        string keeps its trailing newline, exactly as stored.
    """
    # The archive is opened in lzma's default binary mode, so every line is
    # decoded explicitly.
    with lzma.open(dir) as archive:
        return [rawLine.decode('utf-8') for rawLine in archive]
2022-05-03 21:54:24 +02:00
def getEffectiveDate(document):
    """Return the most frequently tagged effective date as 'YYYY-MM-DD'.

    Args:
        document: object exposing ``ents``; every entity has a ``label_``
            and a ``text`` attribute (the script passes the result of the
            loaded spaCy pipeline).

    Returns:
        The ISO-formatted date (e.g. '2011-07-13'), or '' when no
        'effective_date' entity exists or when the most frequent one does
        not contain a month name, a day (1-31) and a year (1901-2029).
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    candidates = [entity.text for entity in document.ents
                  if entity.label_ == 'effective_date']
    if not candidates:
        return ''

    # Counter preserves first-seen order, so ties go to the earliest mention.
    counts = Counter(candidates)
    mostCommon = max(counts, key=counts.get)

    # Punctuation becomes a space (not nothing) so that "13,2011" still
    # splits into two tokens instead of fusing into "132011".
    cleaned = mostCommon.translate(
        str.maketrans(punctuation, ' ' * len(punctuation)))
    monthNumbers = {name: number for number, name in months.items()}

    year = month = day = None
    for token in cleaned.split():
        # Month names are matched case-insensitively ("JULY", "july").
        monthNumber = monthNumbers.get(token.capitalize())
        if monthNumber is not None:
            month = monthNumber
            continue
        # Accept ordinals such as "13th" by dropping the suffix.
        if token[-2:].lower() in ('st', 'nd', 'rd', 'th'):
            token = token[:-2]
        # Words like "day" or "of" are skipped instead of aborting the parse.
        if not token.isdecimal():
            continue
        value = int(token)
        if 1 <= value <= 31:
            day = '{:02d}'.format(value)  # zero-pad: '4' -> '04'
        elif 1900 < value < 2030:
            year = str(value)

    if year is None or month is None or day is None:
        return ''
    return year + '-' + month + '-' + day  # e.g. '2011-07-13'
def getJurisdiction(document):
    """Return the most frequently tagged jurisdiction as a US state name.

    Every 'jurisdiction' entity casts votes:
      * text equal to a state name votes for that state;
      * text that is a fragment of state names (e.g. 'York') votes for
        every state containing it;
      * otherwise, text that contains a state name (e.g. 'State of New
        York') votes for the longest state name it contains, so that
        'West Virginia' wins over 'Virginia'.
    Empty or whitespace-only text is ignored; it would otherwise be a
    substring of every state name and vote for all of them.

    Args:
        document: object exposing ``ents``; every entity has a ``label_``
            and a ``text`` attribute.

    Returns:
        The winning state with spaces replaced by underscores (e.g.
        'New_York'), or '' when nothing matched. Ties go to the state that
        received its first vote earliest.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    votes = []
    for entity in document.ents:
        if entity.label_ != 'jurisdiction':
            continue
        text = entity.text
        if not text.strip():
            continue
        if text in states:
            votes.append(text)
            continue
        fragmentOf = [state for state in states if text in state]
        if fragmentOf:
            votes.extend(fragmentOf)
            continue
        contained = [state for state in states if state in text]
        if contained:
            votes.append(max(contained, key=len))

    if not votes:
        return ''
    counts = Counter(votes)
    return max(counts, key=counts.get).replace(' ', '_')  # e.g. 'New_York'
def getParties(document):
    """Collect the distinct party names tagged in the document.

    Args:
        document: object exposing ``ents``; every entity has a ``label_``
            and a ``text`` attribute.

    Returns:
        A list of the 'party' entity texts in order of first appearance,
        de-duplicated on the raw text, with spaces replaced by underscores
        (e.g. ['CompuDyne_Corporation']).
    """
    alreadySeen = set()
    names = []
    for entity in document.ents:
        if entity.label_ != 'party':
            continue
        rawName = entity.text
        # De-duplication happens on the raw text, before the underscore
        # substitution.
        if rawName in alreadySeen:
            continue
        alreadySeen.add(rawName)
        names.append(rawName.replace(' ', '_'))
    return names
def getTerm(document):
    """Return the most frequently tagged contract term, e.g. '3_years'.

    The first word of the winning entity is converted to a number: digits
    are kept ('3 years'), number words are looked up case-insensitively in
    ``wordToNumber`` ('Three years'), and hyphenated tens-plus-units
    compounds are summed ('twenty-four months' -> 24). The remaining words
    are kept and everything is joined with underscores.

    Args:
        document: object exposing ``ents``; every entity has a ``label_``
            and a ``text`` attribute.

    Returns:
        The normalised term, or '' when there is no 'term' entity, the
        winning entity is blank, or its first word is not a recognised
        number (previously this case raised ValueError / IndexError).
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    candidates = [entity.text for entity in document.ents
                  if entity.label_ == 'term']
    if not candidates:
        return ''

    # Counter preserves first-seen order, so ties go to the earliest mention.
    counts = Counter(candidates)
    tokens = max(counts, key=counts.get).split()
    if not tokens:
        return ''

    # wordToNumber is keyed by integer; invert it once for word lookups.
    wordValues = {word: number for number, word in wordToNumber.items()}

    first = tokens[0].lower()
    if first.isdecimal():
        amount = int(first)
    else:
        parts = [wordValues.get(part) for part in first.split('-')]
        if None in parts:
            return ''
        if len(parts) == 1:
            amount = parts[0]
        elif len(parts) == 2 and parts[0] >= 20 and parts[1] < 10:
            # tens + units, e.g. 'twenty-four'
            amount = parts[0] + parts[1]
        else:
            return ''

    tokens[0] = str(amount)
    return '_'.join(tokens)  # e.g. '3_years'
2022-05-03 20:10:12 +02:00
def formatPrediction(effectiveDate, jurisdiction, parties, term):
    """Build one output line from the extracted fields.

    Args:
        effectiveDate: date string, or '' when not found.
        jurisdiction: jurisdiction string, or '' when not found.
        parties: list of party names (may be empty).
        term: term string, or '' when not found.

    Returns:
        The non-empty fields as space-separated ``key=value`` pairs in the
        order effective_date, jurisdiction, party..., term. There is no
        trailing space, and the result is '' when nothing was found.
    """
    fields = []
    if effectiveDate:
        fields.append('effective_date=' + effectiveDate)
    if jurisdiction:
        fields.append('jurisdiction=' + jurisdiction)
    fields.extend('party=' + party for party in parties)
    if term:
        fields.append('term=' + term)
    return ' '.join(fields)


def main(inputPath='train/in.tsv.xz', modelPath='NER',
         outputPath='train/out.tsv'):
    """Run the NER model over every input document and write predictions.

    Args:
        inputPath: xz-compressed input file, one document per line.
        modelPath: name or path handed to ``spacy.load``.
        outputPath: file that receives one prediction line per document.
    """
    NDAs = readInput(inputPath)
    ner = spacy.load(modelPath)

    predicted = []
    for text in NDAs:
        document = ner(text)
        predicted.append(formatPrediction(
            getEffectiveDate(document),
            getJurisdiction(document),
            getParties(document),
            getTerm(document),
        ))

    # Plain text lines are written directly. csv.writer.writerows() treats
    # each string as a row of single-character fields and would emit
    # 'e,f,f,e,c,t,...' instead of the prediction.
    with open(outputPath, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(line + '\n' for line in predicted)


if __name__ == '__main__':
    main()