kleister-nda/main.py
Iwona Christop b9815844a4 Ready to go
2022-05-03 21:54:24 +02:00

141 lines
4.7 KiB
Python

import csv
import lzma
from collections import Counter

from matplotlib.pyplot import getp
import spacy
# Zero-padded month number ('01'..'12') -> English month name.
months = {
    f'{number:02d}': name
    for number, name in enumerate(
        ('January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'),
        start=1)
}

# Characters stripped from date strings before they are tokenised.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'

# Canonical state names that jurisdiction entities are matched against.
states = [
    'Alabama',
    'Alaska',
    'Arizona',
    'Arkansas',
    'California',
    'Colorado',
    'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho',
    'Illinois',
    'Indiana',
    'Iowa',
    'Kansas',
    'Kentucky',
    'Louisiana',
    'Maine',
    'Maryland',
    'Massachusetts',
    'Michigan',
    'Minnesota',
    'Mississippi',
    'Missouri',
    'Montana',
    'Nebraska',
    'Nevada',
    'New Hampshire',
    'New Jersey',
    'New Mexico',
    'New York',
    'North Carolina',
    'North Dakota',
    'Ohio',
    'Oklahoma',
    'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina',
    'South Dakota',
    'Tennessee',
    'Texas',
    'Utah',
    'Vermont',
    'Virginia',
    'Washington',
    'West Virginia',
    'Wisconsin',
    'Wyoming',
]

# NOTE: despite the name, the keys are integers and the values are the
# spelled-out English words (1..20, then the tens 30..90).
wordToNumber = dict(enumerate(
    ('one', 'two', 'three', 'four', 'five',
     'six', 'seven', 'eight', 'nine', 'ten',
     'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
     'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty'),
    start=1))
wordToNumber.update(zip(
    range(30, 100, 10),
    ('thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety')))
def readInput(dir):
    """Read an xz-compressed text file into a list of lines.

    dir -- path handed to ``lzma.open`` (a file path, despite the name).

    Returns one ``str`` per line, decoded as UTF-8. Lines are read in
    binary mode, so each keeps its original line terminator.
    """
    with lzma.open(dir) as archive:
        return [rawLine.decode('utf-8') for rawLine in archive]
def getEffectiveDate(document):
    """Return the document's effective date as 'YYYY-MM-DD', or ''.

    document -- object whose ``ents`` iterable yields entities exposing
        ``label_`` and ``text`` (presumably a spaCy ``Doc``; confirm with
        the caller).

    The most frequent 'effective_date' entity text is used; ties go to the
    text seen first. Punctuation is stripped and the remaining tokens are
    scanned for an English month name (a value of ``months``), a day
    (1-31) and a year (1901-2029). Tokens that are none of these are
    ignored; when a part occurs more than once the last occurrence wins.

    Returns '' when there is no such entity or when month, day or year
    cannot be found, e.g. '2011-07-13' for 'July 13, 2011'.
    """
    candidates = [entity.text for entity in document.ents
                  if entity.label_ == 'effective_date']
    if not candidates:
        return ''

    mostCommon = Counter(candidates).most_common(1)[0][0]
    # One pass that deletes every punctuation character.
    cleaned = mostCommon.translate(str.maketrans('', '', punctuation))

    monthNumbers = {name: number for number, name in months.items()}
    month = day = year = None
    for token in cleaned.split():
        if token in monthNumbers:
            month = monthNumbers[token]
        elif token.isdecimal():
            value = int(token)
            if 1 <= value < 32:
                # Zero-pad so single-digit days still give an ISO date.
                day = f'{value:02d}'
            elif 1900 < value < 2030:
                year = str(value)

    if month is None or day is None or year is None:
        return ''
    return f'{year}-{month}-{day}'
def getJurisdiction(document):
    """Return the most frequently mentioned state, e.g. 'New_York', or ''.

    document -- object whose ``ents`` iterable yields entities exposing
        ``label_`` and ``text`` (presumably a spaCy ``Doc``; confirm with
        the caller).

    Every 'jurisdiction' entity votes for a name in ``states``:
      * text that is exactly a state name votes for that state;
      * any other text votes for every state whose name contains it, so a
        fragment such as 'York' resolves to 'New York' (an ambiguous
        fragment such as 'Dakota' votes for both matching states).
    The state with the most votes wins, ties going to the one voted for
    first. Spaces in the result are replaced with underscores. Returns ''
    when nothing matched.
    """
    votes = Counter()
    for entity in document.ents:
        if entity.label_ != 'jurisdiction':
            continue
        name = entity.text.strip()
        if not name:
            # '' is a substring of every string and would otherwise cast a
            # vote for all states at once.
            continue
        if name in states:
            votes[name] += 1
        else:
            votes.update(state for state in states if name in state)

    if not votes:
        return ''
    return votes.most_common(1)[0][0].replace(' ', '_')
def getParties(document):
    """Collect the distinct party names found in *document*.

    document -- object whose ``ents`` iterable yields entities exposing
        ``label_`` and ``text``.

    Returns a list with the text of every 'party' entity, in order of first
    appearance, with spaces replaced by underscores
    (e.g. ['CompuDyne_Corporation']). Duplicates are dropped by comparing
    the raw entity text, i.e. before the underscore substitution.
    """
    formatted = {}
    for entity in document.ents:
        if entity.label_ == 'party':
            # setdefault keeps the first-seen position of each raw text.
            formatted.setdefault(entity.text, entity.text.replace(' ', '_'))
    return list(formatted.values())
def getTerm(document):
    """Return the contract term as '<number>_<unit>', e.g. '3_years', or ''.

    document -- object whose ``ents`` iterable yields entities exposing
        ``label_`` and ``text`` (presumably a spaCy ``Doc``; confirm with
        the caller).

    The most frequent 'term' entity text is used; ties go to the text seen
    first. Its first token is turned into digits: a number word listed in
    ``wordToNumber`` (matched case-insensitively) is replaced by its
    number, and a token that already consists of digits is kept. The
    tokens are then joined with underscores.

    Returns '' when there is no such entity, when its text is blank, or
    when the first token is neither digits nor a known number word. An
    unknown word used to raise ValueError and abort the whole run.
    """
    candidates = [entity.text for entity in document.ents
                  if entity.label_ == 'term']
    if not candidates:
        return ''

    tokens = Counter(candidates).most_common(1)[0][0].split()
    if not tokens:
        return ''

    quantity = tokens[0]
    if quantity.isdecimal():
        tokens[0] = str(int(quantity))
    else:
        # wordToNumber maps number -> word; invert it for the lookup.
        numberByWord = {word: number for number, word in wordToNumber.items()}
        number = numberByWord.get(quantity.lower())
        if number is None:
            return ''
        tokens[0] = str(number)
    return '_'.join(tokens)
if __name__ == '__main__':
    # Run the NER pipeline stored under 'NER' over every document in the
    # training input and write one line of space-separated key=value
    # predictions per document.
    NDAs = readInput('train/in.tsv.xz')
    ner = spacy.load('NER')

    predicted = []
    for text in NDAs:
        document = ner(text)
        effectiveDate = getEffectiveDate(document)
        jurisdiction = getJurisdiction(document)
        parties = getParties(document)
        term = getTerm(document)

        fields = []
        if effectiveDate:
            fields.append('effective_date=' + effectiveDate)
        if jurisdiction:
            fields.append('jurisdiction=' + jurisdiction)
        fields.extend('party=' + party for party in parties)
        if term:
            fields.append('term=' + term)
        # Joining avoids the trailing space left when a field is missing.
        predicted.append(' '.join(fields))

    # Written as plain lines: csv.writer.writerows() treats each string as
    # a row and would emit every character as its own comma-separated field.
    with open('train/out.tsv', 'w', encoding='utf-8', newline='') as f:
        f.writelines(line + '\n' for line in predicted)