134 lines
4.7 KiB
Python
134 lines
4.7 KiB
Python
|
import csv
import lzma
import re
from collections import Counter
|
||
|
|
||
|
|
||
|
# Names of the 50 US states, in alphabetical order.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Two-digit month number -> English month name.
months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}

# NOTE: despite its name, this table maps a number to its English word.
wordToNumber = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
    15: 'fifteen', 16: 'sixteen', 17: 'seventeen', 18: 'eighteen',
    19: 'nineteen', 20: 'twenty',
    30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}

# Units of time a contract term can be expressed in.
terms = ['days', 'months', 'years']

# Characters stripped from the text before pattern matching.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'
|
||
|
|
||
|
|
||
|
def readInput(dir):
    """Load the documents stored in an xz-compressed file, one per line.

    Each line is decoded as UTF-8 and every literal backslash-n sequence
    in it is replaced by a single space. The real line terminator at the
    end of each line is kept.

    Args:
        dir: Path of the ``.xz`` file to read.

    Returns:
        A list with one string per line of the decompressed file.
    """
    # lzma.open defaults to binary mode, so each item is a bytes line.
    with lzma.open(dir) as archive:
        return [raw.decode('utf-8').replace('\\n', ' ') for raw in archive]
|
||
|
|
||
|
def getEffectiveDate(nda):
    """Return the most frequently mentioned date in ``nda`` as ``YYYY-MM-DD``.

    Dates are recognised in the day-first form ``<day> <Month> <year>``.
    Before matching, punctuation, ordinal suffixes ("1st", "22nd") and a
    few filler words ("day of", "dated", ...) are stripped, so
    "the 1st day of August, 2019" is read as "1 August 2019".

    Args:
        nda: Full text of one agreement.

    Returns:
        The date as ``'YYYY-MM-DD'``, or ``''`` when no date with a day in
        1-31 and a year in 1901-2029 is found. When several dates are
        equally frequent, the one collected first wins (months are scanned
        in calendar order, then text order).
    """
    text = nda.translate(str.maketrans('', '', punctuation))

    # Remove ordinal suffixes only when they directly follow a digit.
    # Deleting 'st'/'nd'/'rd'/'th' everywhere would turn "August" into
    # "Augu" and make every August date undetectable.
    text = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', text)
    for filler in ('day ', 'of ', 'date ', 'Date ',
                   'as ', 'on ', 'from ', 'dated '):
        text = text.replace(filler, '')

    found = []
    for month_name in months.values():
        found.extend(
            re.findall(r'\d+ ' + re.escape(month_name) + r' \d+', text))
    if not found:
        return ''

    counts = Counter(found)
    # max() returns the first maximal key, which keeps tie-breaking stable.
    best = max(counts, key=counts.get)

    number_by_name = {name: number for number, name in months.items()}
    day = month = year = None
    for token in best.split():
        if token in number_by_name:
            month = number_by_name[token]
            continue
        value = int(token)
        if 1 <= value < 32:
            # Normalise through int so "5" and "05" both become "05".
            day = str(value).zfill(2)
        elif 1900 < value < 2030:
            year = str(value)

    # A match such as "12 January 12" has no plausible year; report no date.
    if day is None or month is None or year is None:
        return ''
    return '-'.join((year, month, day))
|
||
|
|
||
|
def getJurisdiction(nda):
    """Return the US state that is mentioned most often in ``nda``.

    Every whole-word occurrence of a state name is counted. Previously
    each state was recorded at most once, so all counts were 1 and the
    result was simply the first state in table order that appeared.

    Args:
        nda: Full text of one agreement.

    Returns:
        The state name with spaces replaced by underscores (for example
        ``'New_York'``), or ``''`` when no state is mentioned. Ties go to
        the state that comes first in ``states``.
    """
    # Longest names first, so "West Virginia" is consumed as one match and
    # is not also counted as "Virginia".
    alternatives = sorted(states, key=len, reverse=True)
    pattern = r'\b(?:' + '|'.join(re.escape(s) for s in alternatives) + r')\b'
    counts = Counter(re.findall(pattern, nda))
    if not counts:
        return ''

    # Iterate `states` rather than `counts` so ties resolve in table order.
    best = max(states, key=lambda state: counts[state])
    return best.replace(' ', '_')
|
||
|
|
||
|
def getTerm(nda):
    """Return the most frequently mentioned duration in ``nda``.

    A duration is a quantity followed by ``years``, ``months`` or ``days``.
    The quantity must be a number written in digits or one of the number
    words in ``wordToNumber`` (matched case-insensitively). Other words
    before a unit, as in "the years", are ignored instead of being
    reported as a term. Spelled-out quantities are converted to digits
    before counting, so "two years" and "2 years" are counted together.

    Args:
        nda: Full text of one agreement.

    Returns:
        The duration as ``'<number>_<unit>'`` (for example ``'2_years'``),
        or ``''`` when no duration is found. Ties go to the candidate
        collected first (years, then months, then days; text order within
        a unit).
    """
    text = nda.translate(str.maketrans('', '', punctuation))

    # wordToNumber maps number -> word; invert it once for direct lookup.
    digits_by_word = {word: str(number)
                      for number, word in wordToNumber.items()}

    candidates = []
    for unit in ('years', 'months', 'days'):
        for quantity in re.findall(r'(\w+) ' + unit + r'\b', text):
            if quantity.isdecimal():
                candidates.append(quantity + '_' + unit)
                continue
            digits = digits_by_word.get(quantity.lower())
            if digits is not None:
                candidates.append(digits + '_' + unit)

    if not candidates:
        return ''
    counts = Counter(candidates)
    # max() returns the first maximal key, which keeps tie-breaking stable.
    return max(counts, key=counts.get)
|
||
|
|
||
|
def getParties(nda):
    """Return the distinct "<Name> Inc." companies mentioned in ``nda``.

    Commas are removed first, so "Acme, Inc." is matched like "Acme Inc.".
    Only the single word directly before "Inc" is taken as the name; it
    must be one or more capitals followed by optional lowercase letters.
    "Inc" must be a whole word, so "Incorporated" is not matched, and the
    trailing period is optional in the text but always present in the
    result.

    Args:
        nda: Full text of one agreement.

    Returns:
        A list of strings such as ``'Acme_Inc.'`` in order of first
        appearance without duplicates, or ``''`` (an empty string, not an
        empty list) when no company is found.
    """
    text = nda.replace(',', '')
    # The period is escaped and optional; an unescaped '.' would match any
    # character and yield names like "Acme_Inco" or "Acme_Inc_".
    names = re.findall(r'([A-Z]+[a-z]*) Inc\b\.?', text)
    if not names:
        return ''
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return [name + '_Inc.' for name in dict.fromkeys(names)]
|
||
|
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Predict the fields of every agreement in the test set and write one
    # space-separated line of "key=value" pairs per agreement.
    NDAs = readInput('test-A/in.tsv.xz')
    predicted = []

    for nda in NDAs:
        j = getJurisdiction(nda)
        t = getTerm(nda)
        ed = getEffectiveDate(nda)
        p = getParties(nda)

        fields = []
        if ed:
            fields.append('effective_date=' + ed)
        if j:
            fields.append('jurisdiction=' + j)
        # getParties returns '' when nothing is found; iterating it is a no-op.
        fields.extend('party=' + party for party in p)
        if t:
            fields.append('term=' + t)

        # Joining avoids the trailing space left when a later field is missing.
        predicted.append(' '.join(fields))

    # Write plain lines rather than using csv.writer: the csv writer emits
    # an empty prediction as '""' and terminates lines with '\r\n'.
    with open('test-A/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(line + '\n' for line in predicted)
|
||
|
|