# kleister-nda/main.py

import lzma
from matplotlib.pyplot import getp
import spacy
import csv


# Two-digit month number -> English month name. getEffectiveDate searches the
# values to map a month name found in the text back to its number.
months = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}

# Characters that getEffectiveDate deletes from a date entity before splitting
# it into tokens.
# NOTE(review): the backslash occurs twice in this string; that is harmless for
# character-by-character removal but looks unintended — confirm.
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'

# Names of the 50 U.S. states, in alphabetical order. getJurisdiction matches
# 'jurisdiction' entities against this list.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Integer -> English number word. Despite the name, the integers are the keys;
# getTerm searches the values to turn a word such as 'three' into 3.
wordToNumber = {
    # 1-10
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
    # 11-20
    11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen', 15: 'fifteen',
    16: 'sixteen', 17: 'seventeen', 18: 'eighteen', 19: 'nineteen', 20: 'twenty',
    # remaining tens
    30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}


def readInput(dir):
    """Read an xz-compressed text file and return its lines.

    dir -- path of the compressed file; it is handed straight to lzma.open.

    Returns a list with one str per line, decoded as UTF-8. Line terminators
    are kept on each element.
    """
    with lzma.open(dir) as compressed:
        return [raw.decode('utf-8') for raw in compressed]

def getEffectiveDate(document):
    """Return the document's effective date as 'YYYY-MM-DD', or '' if unknown.

    document -- object whose .ents yields entities exposing .label_ and .text.

    The most frequent text among entities labelled 'effective_date' is chosen
    (ties go to the text seen first). Punctuation is removed, then the text is
    split on whitespace and each token is classified:
      * a month name from `months`  -> month number
      * an integer from 1 to 31     -> day
      * an integer in 1901..2029    -> year
    Any other token is ignored. When a component occurs more than once the
    last occurrence wins. '' is returned if there is no such entity or if any
    of the three components is missing.

    The day is zero-padded to two digits so the result is always a
    well-formed 'YYYY-MM-DD' string, e.g. '2011-07-05'.
    """
    candidates = [ent.text for ent in document.ents
                  if ent.label_ == 'effective_date']
    if not candidates:
        return ''

    # max() returns the first maximal element, so ties resolve to the text
    # that appeared first in the document.
    best = max(candidates, key=candidates.count)
    for char in punctuation:
        best = best.replace(char, '')

    monthNumbers = {name: number for number, name in months.items()}

    year = month = day = None
    for token in best.split():
        if token in monthNumbers:
            month = monthNumbers[token]
        elif token.isdecimal():
            value = int(token)
            if 1 <= value <= 31:
                day = value
            elif 1900 < value < 2030:
                year = value
        # Non-numeric tokens ("of", "day", "13th", ...) are skipped instead of
        # aborting the whole parse.

    if year is None or month is None or day is None:
        return ''

    return '{}-{}-{:02d}'.format(year, month, day)  # e.g. '2011-07-13'

def getJurisdiction(document):
    """Return the most frequently mentioned state, or '' if none was found.

    document -- object whose .ents yields entities exposing .label_ and .text.

    Only entities labelled 'jurisdiction' are considered. An entity whose text
    is exactly a name in `states` counts once for that state. Any other text
    counts once for every state whose name contains that text as a substring.
    The state with the highest count wins (ties go to the one counted first);
    spaces in its name are replaced with underscores, e.g. 'New_York'.
    """
    matches = []

    for ent in document.ents:
        if ent.label_ != 'jurisdiction':
            continue
        if ent.text in states:
            matches.append(ent.text)
        else:
            matches.extend(state for state in states if ent.text in state)

    if not matches:
        return ''

    # max() yields the first maximal element, i.e. the earliest-counted state
    # among those with the highest number of mentions.
    winner = max(matches, key=matches.count)
    return winner.replace(' ', '_')

def getParties(document):
    """Return the distinct party names found in the document.

    document -- object whose .ents yields entities exposing .label_ and .text.

    Texts of entities labelled 'party' are de-duplicated in order of first
    appearance, then spaces are replaced with underscores, e.g.
    ['CompuDyne_Corporation']. An empty list is returned when there are none.
    """
    firstSeen = {}
    for ent in document.ents:
        if ent.label_ == 'party':
            # Dict keys keep insertion order, giving an ordered de-duplication.
            firstSeen.setdefault(ent.text, None)

    return [name.replace(' ', '_') for name in firstSeen]

def getTerm(document):
    """Return the agreement term such as '3_years', or '' if unknown.

    document -- object whose .ents yields entities exposing .label_ and .text.

    The most frequent text among entities labelled 'term' is chosen (ties go
    to the text seen first) and split on whitespace. Its first word is turned
    into a number: a word found in `wordToNumber` (matched case-insensitively)
    becomes the corresponding integer, and a word that already consists of
    digits is kept as that integer. The words are then joined with
    underscores.

    '' is returned when there is no 'term' entity, when the chosen text is
    blank, or when its first word cannot be interpreted as a number. These
    cases used to raise and abort the whole run.
    """
    candidates = [ent.text for ent in document.ents if ent.label_ == 'term']
    if not candidates:
        return ''

    # max() returns the first maximal element, so ties resolve to the text
    # that appeared first in the document.
    words = max(candidates, key=candidates.count).split()
    if not words:
        return ''

    first = words[0]
    if first.isdecimal():
        number = int(first)
    else:
        numberByWord = {word: number for number, word in wordToNumber.items()}
        number = numberByWord.get(first.lower())
        if number is None:
            return ''

    words[0] = str(number)
    return '_'.join(words)  # e.g. '3_years'


if __name__ == '__main__':
    # Run the NER model over every document of test-A and write one prediction
    # line per document to test-A/out.tsv.
    NDAs = readInput('test-A/in.tsv.xz')

    ner = spacy.load('NER')

    predicted = []

    for nda in NDAs:
        # Each input line is one document; drop its line terminator.
        document = ner(nda.replace('\n', ' '))

        ed = getEffectiveDate(document)
        j = getJurisdiction(document)
        p = getParties(document)
        t = getTerm(document)

        # Space-separated key=value pairs; attributes with no value are omitted.
        fields = []
        if ed:
            fields.append('effective_date=' + ed)
        if j:
            fields.append('jurisdiction=' + j)
        fields.extend('party=' + party for party in p)
        if t:
            fields.append('term=' + t)

        predicted.append(' '.join(fields))

    # Lines are written verbatim rather than through csv.writer, which wrapped
    # any line containing a comma or quote in double quotes, emitted '""' for
    # an empty prediction, and terminated lines with '\r\n'.
    with open('test-A/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(line + '\n' for line in predicted)