kleister-nda/run.py

134 lines
4.7 KiB
Python
Raw Permalink Normal View History

2022-05-03 23:45:59 +02:00
import csv
import lzma
import re
from collections import Counter
# Names of the 50 US states; getJurisdiction() searches documents for these.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Two-digit month number -> English month name.
months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}

# NOTE: despite the name, this maps a number to its English word
# (1 -> 'one'); getTerm() performs the reverse lookup.
wordToNumber = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
    15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen',
    19: 'nineteen', 20: 'twenty',
    30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}

# Duration units; not referenced by the code visible in this file.
terms = ['days', 'months', 'years']

# Characters stripped from a document before date/term matching.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
def readInput(dir):
    """Load the documents stored one per line in an xz-compressed file.

    Every line is decoded as UTF-8 and each escaped line break (a
    backslash followed by the letter ``n``) is replaced with a single
    space.  The real line terminator at the end of each line is kept.

    Args:
        dir: Path of the ``.xz`` file to read.

    Returns:
        A list with one string per line of the file.
    """
    with lzma.open(dir) as archive:
        return [raw.decode('utf-8').replace('\\n', ' ') for raw in archive]
def getEffectiveDate(nda):
    """Guess the effective date of an agreement.

    Punctuation is removed from the text, then every occurrence of
    ``<number> [day] [of] <Month> <number>`` is collected, where the
    first number may carry an ordinal suffix ("1st", "22nd", ...).
    One number must be a day (1-31) and the other a year strictly
    between 1900 and 2030, in either order.  The date mentioned most
    often wins; ties go to the date that appears first in the text.

    Args:
        nda: Full text of the document.

    Returns:
        The winning date formatted as ``YYYY-MM-DD``, or ``''`` when no
        valid date is found.
    """
    # Delete punctuation so "1st day of January, 2015" becomes
    # "1st day of January 2015".
    text = nda.translate(str.maketrans('', '', punctuation))

    monthNumbers = {name: number for number, name in months.items()}
    # The ordinal suffix and the "day of" filler are matched in place.
    # Stripping "st"/"th" from the whole text would also mangle month
    # names ("August" -> "Augu") and hide those dates.
    pattern = re.compile(
        r'\b(\d+)(?:st|nd|rd|th)?\s+(?:day\s+)?(?:of\s+)?('
        + '|'.join(monthNumbers)
        + r')\s+(\d+)\b'
    )

    votes = Counter()
    for before, monthName, after in pattern.findall(text):
        first, second = int(before), int(after)
        if 1 <= first <= 31 and 1900 < second < 2030:
            day, year = first, second
        elif 1 <= second <= 31 and 1900 < first < 2030:
            day, year = second, first
        else:
            # Not a plausible day/year pair; ignore this candidate
            # instead of letting it win the vote and blank the result.
            continue
        votes['{}-{}-{:02d}'.format(year, monthNumbers[monthName], day)] += 1

    if not votes:
        return ''
    return max(votes, key=votes.get)
def getJurisdiction(nda, candidates=None):
    """Guess the governing jurisdiction as the most-mentioned state.

    Every whole-word, case-sensitive occurrence of a candidate name is
    counted, so a state named three times beats one named once.  Names
    are matched as whole words, which keeps "Kansas" from matching
    inside "Arkansas" and "Virginia" inside "West Virginia".  Ties go
    to the name that appears first in the text.

    Args:
        nda: Full text of the document.
        candidates: Optional iterable of jurisdiction names to look
            for.  Defaults to the module-level ``states`` list.

    Returns:
        The winning name with spaces replaced by underscores, or ``''``
        when no candidate occurs in the text.
    """
    names = list(states if candidates is None else candidates)
    if not names:
        return ''

    # Longest names first so a multi-word name is preferred over any
    # shorter name that could match at the same position.
    names.sort(key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(name) for name in names) + r')\b'
    )

    mentions = Counter(pattern.findall(nda))
    if not mentions:
        return ''
    return max(mentions, key=mentions.get).replace(' ', '_')
def getTerm(nda):
    """Guess the duration of the agreement, e.g. ``'2_years'``.

    Punctuation is removed from the text, then every ``<quantity>
    years|months|days`` phrase is collected.  A quantity is kept only
    if it is a decimal number or one of the number words known to
    ``wordToNumber`` (matched case-insensitively); words are converted
    to digits, so "two years" and "2 years" count as the same
    candidate.  The most frequent candidate wins; ties go to years,
    then months, then days, and within a unit to the earliest mention.

    Args:
        nda: Full text of the document.

    Returns:
        ``'<number>_<unit>'`` for the winning candidate, or ``''`` when
        the text contains no usable duration.
    """
    text = nda.translate(str.maketrans('', '', punctuation))

    # wordToNumber maps number -> word; invert it once for lookups.
    numberOf = {word: str(number) for number, word in wordToNumber.items()}

    found = []
    for unit in ('years', 'months', 'days'):
        for quantity in re.findall(r'(\w+) ' + unit + r'\b', text):
            if quantity.isdecimal():
                value = str(int(quantity))
            else:
                value = numberOf.get(quantity.lower())
            # Phrases like "the years" carry no quantity; drop them so
            # they cannot win the vote.
            if value is not None:
                found.append(value + '_' + unit)

    if not found:
        return ''
    counts = Counter(found)
    return max(counts, key=counts.get)
def getParties(nda):
    """Extract company names of the form ``<Name> Inc`` / ``<Name> Inc.``.

    Commas are removed first so "Acme, Inc." is recognised.  A name is a
    single token of one or more capital letters followed by optional
    lower-case letters.  "Inc" must be a complete word, so
    "Incorporated" does not match, and the trailing period is optional.
    Each party is reported once, in order of first appearance, and is
    always normalised to end in ``Inc.``.

    Args:
        nda: Full text of the document.

    Returns:
        A list of strings such as ``'Acme_Inc.'``, or the empty string
        ``''`` when no party is found (kept for compatibility with
        existing callers, which only test the length and iterate).
    """
    text = nda.replace(',', '')
    # The period is matched literally and optionally; an unescaped "."
    # would swallow whatever character follows "Inc".
    names = re.findall(r'([A-Z]+[a-z]*) Inc\b\.?', text)
    if not names:
        return ''
    # dict.fromkeys removes duplicates while preserving order.
    return [name + '_Inc.' for name in dict.fromkeys(names)]
if __name__ == '__main__':
    # Read the test documents, extract the fields from each one and write
    # one line of space-separated key=value predictions per document.
    NDAs = readInput('test-A/in.tsv.xz')
    predicted = []
    for nda in NDAs:
        j = getJurisdiction(nda)
        t = getTerm(nda)
        ed = getEffectiveDate(nda)
        p = getParties(nda)

        # Collect the fields in a fixed order and join them afterwards so
        # a line never ends with a stray space.
        fields = []
        if ed:
            fields.append('effective_date=' + ed)
        if j:
            fields.append('jurisdiction=' + j)
        if p:
            fields.extend('party=' + party for party in p)
        if t:
            fields.append('term=' + t)
        predicted.append(' '.join(fields))

    # Write plain lines instead of going through csv.writer: the csv
    # module ends rows with "\r\n" by default and writes a lone empty
    # field as "" (two quote characters), so documents without any
    # prediction would not produce an empty line.
    with open('test-A/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(line + '\n' for line in predicted)