# kleister-nda/run.py

import lzma
import re
import csv


# The fifty US state names, in alphabetical order (one line per initial).
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Two-digit month number (as text) -> English month name, January first.
months = {
    '%02d' % position: name
    for position, name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

# Despite its name this maps integer -> English word (1..20, then the tens).
wordToNumber = dict(enumerate(
    (
        'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine', 'ten',
        'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
        'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
    ),
    start=1,
))
wordToNumber.update(zip(
    range(30, 100, 10),
    ('thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'),
))

# Time units a contract term can be expressed in.
terms = 'days months years'.split()

# Characters stripped from the text before pattern matching.  The backslash
# appears twice in this string; that is harmless because every character is
# removed independently.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'


def readInput(dir):
    """Load the documents stored in an xz-compressed text file.

    Every line of the archive is decoded as UTF-8 and each literal
    two-character sequence backslash + ``n`` is replaced by a single space.
    The real line terminator at the end of a line, if present, is kept.

    Args:
        dir: Path of the ``.xz`` file; handed unchanged to ``lzma.open``.

    Returns:
        A list containing one string per line of the file.
    """
    with lzma.open(dir) as archive:
        return [raw.decode('utf-8').replace('\\n', ' ') for raw in archive]

def getEffectiveDate(nda):
    """Return the most frequently mentioned date of a document as ISO text.

    Punctuation, a few filler words and ordinal suffixes are removed so that
    a phrase such as "1st day of August, 2014" collapses to "1 August 2014".
    Every "<number> <Month> <number>" occurrence is then collected and the
    most common one wins.  Ties go to the earliest month in calendar order,
    then to the earliest occurrence in the text.

    Of the two numbers, one in 1..31 is taken as the day and one strictly
    between 1900 and 2030 as the year, in either position.

    Args:
        nda: Full text of one document.

    Returns:
        The date formatted ``YYYY-MM-DD``, or ``''`` when no candidate is
        found or the winning candidate lacks a usable day or year.
    """
    for char in punctuation:
        nda = nda.replace(char, '')
    # Filler words are dropped only where a word starts, never from the
    # inside of another word.
    nda = re.sub(r'\b(?:day|of|date|Date|as|on|from|dated) ', '', nda)
    # Ordinal suffixes are dropped only directly after a digit.  Removing
    # every 'st' in the text used to turn "August" into "Augu", so dates in
    # August were never recognised.
    nda = re.sub(r'(?<=\d)(?:st|nd|rd|th)', '', nda)

    # candidate (first number, month number, second number) -> occurrences;
    # insertion order (calendar order, then text order) decides ties below.
    tally = {}
    for number, name in months.items():
        if name not in nda:
            continue
        for first, second in re.findall(r'(\d+) ' + name + r' (\d+)', nda):
            candidate = (first, number, second)
            tally[candidate] = tally.get(candidate, 0) + 1
    if not tally:
        return ''

    first, month, second = max(tally, key=tally.get)
    day = year = None
    for token in (first, second):
        value = int(token)
        if 1 <= value <= 31:
            # Format from the integer so "5" and "05" both become "05".
            day = '%02d' % value
        elif 1900 < value < 2030:
            year = str(value)
    if day is None or year is None:
        return ''
    return year + '-' + month + '-' + day

def getJurisdiction(nda):
    """Return the US state that is named most often in a document.

    Matching is case-sensitive and substring based.  Ties are resolved in
    favour of the state that comes first in ``states``.

    Args:
        nda: Full text of one document.

    Returns:
        The winning state name with spaces replaced by underscores
        (for example ``'New_York'``), or ``''`` when no state is named.
    """
    # Longest names first, so "West Virginia" is consumed as a whole and is
    # not counted a second time as "Virginia".
    pattern = '|'.join(
        re.escape(state) for state in sorted(states, key=len, reverse=True)
    )
    # Count every occurrence; a plain membership test would give each state
    # a count of one and make the frequency vote meaningless.
    mentions = {}
    for hit in re.findall(pattern, nda):
        mentions[hit] = mentions.get(hit, 0) + 1

    # Walk `states` in its own order so that equal counts are resolved by
    # list position rather than by position in the text.
    present = [state for state in states if state in mentions]
    if not present:
        return ''
    return max(present, key=mentions.get).replace(' ', '_')

def _termQuantity(token):
    """Return *token* as a string of digits, or None if it is not a number.

    Accepts plain ASCII digits, the words listed in ``wordToNumber`` in any
    letter case, and a tens word glued to a ones word ("thirtysix"), which
    is what a hyphenated number looks like once punctuation is stripped.
    """
    if re.fullmatch(r'[0-9]+', token):
        return token
    word = token.lower()
    byWord = {name: number for number, name in wordToNumber.items()}
    if word in byWord:
        return str(byWord[word])
    for name, number in byWord.items():
        if number >= 20 and word.startswith(name):
            ones = byWord.get(word[len(name):])
            if ones is not None and ones < 10:
                return str(number + ones)
    return None


def getTerm(nda):
    """Return the most frequently mentioned duration of a document.

    After punctuation is stripped, every "<word> years", "<word> months"
    and "<word> days" occurrence is inspected.  Only occurrences whose
    first word is a number (digits or a number word) are kept; phrases
    such as "business days" are ignored.  Number words are converted to
    digits before counting, so "five years" and "5 years" vote together.

    Args:
        nda: Full text of one document.

    Returns:
        The winning duration as ``'<number>_<unit>'`` (for example
        ``'2_years'``), or ``''`` when no numeric duration is found.  Ties
        go to years, then months, then days, then to text order.
    """
    for char in punctuation:
        nda = nda.replace(char, '')

    # normalised duration -> occurrences; insertion order decides ties.
    tally = {}
    for unit in ('years', 'months', 'days'):
        for token in re.findall(r'(\w+) ' + unit, nda):
            quantity = _termQuantity(token)
            if quantity is None:
                continue
            candidate = quantity + '_' + unit
            tally[candidate] = tally.get(candidate, 0) + 1
    if not tally:
        return ''
    return max(tally, key=tally.get)

def getParties(nda):
    """Return the distinct "<Name> Inc" company names found in a document.

    Commas are removed first, so "Widget, Inc." is read as "Widget Inc.".
    A name is a single word made of upper-case letters followed by optional
    lower-case letters, then a space and "Inc" with an optional full stop.

    Args:
        nda: Full text of one document.

    Returns:
        A list of names in order of first appearance, without duplicates
        and with spaces replaced by underscores (``['Acme_Inc.']``).  When
        nothing matches the empty string ``''`` is returned, not a list.
    """
    nda = nda.replace(',', '')
    # The full stop is escaped and optional, and "Inc" must end at a word
    # boundary.  An unescaped '.' matched any character, which produced
    # names such as "Acme_Inc_" or "Acme_Inco" (from "Acme Incorporated").
    found = re.findall(r'[A-Z]+[a-z]* Inc\b\.?', nda)
    if not found:
        return ''
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return [name.replace(' ', '_') for name in dict.fromkeys(found)]


def main(inputPath='test-A/in.tsv.xz', outputPath='test-A/out.tsv'):
    """Extract the fields of every document and write one line per document.

    Each output line holds the space-separated ``key=value`` pairs found
    for the corresponding input document, in the order effective_date,
    jurisdiction, party (repeated once per party), term.  A document with
    no extracted field produces an empty line.

    Args:
        inputPath: xz-compressed input file, one document per line.
        outputPath: Text file to create or overwrite with the predictions.
    """
    lines = []
    for nda in readInput(inputPath):
        fields = []
        effectiveDate = getEffectiveDate(nda)
        if effectiveDate:
            fields.append('effective_date=' + effectiveDate)
        jurisdiction = getJurisdiction(nda)
        if jurisdiction:
            fields.append('jurisdiction=' + jurisdiction)
        # getParties returns '' when nothing is found; iterating over it
        # simply adds no field.
        fields.extend('party=' + party for party in getParties(nda))
        term = getTerm(nda)
        if term:
            fields.append('term=' + term)
        # join() avoids the trailing space left by appending "value + ' '".
        lines.append(' '.join(fields))

    # The file is written as plain text rather than through csv.writer.
    # The csv writer terminates rows with "\r\n", writes a lone empty field
    # as "" and wraps any line containing a quote or comma in quotes.
    with open(outputPath, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(line + '\n' for line in lines)


if __name__ == '__main__':
    main()