kleister-nda/main.py

import csv
import lzma
from collections import Counter

import spacy
from matplotlib.pyplot import getp


months = {'01': 'January', '02': 'February', '03': 'March', 
    '04': 'April', '05': 'May', '06': 'June',
    '07': 'July', '08': 'August', '09': 'September',
    '10': 'October', '11': 'November', '12': 'December'}

punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{|}~'

states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
    'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
    'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
    'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

wordToNumber = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
        6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
        11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
        15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
        19 : 'nineteen', 20 : 'twenty',
        30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
        70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }


def readInput(dir):
    """Read an xz-compressed text file and return its lines as strings.

    The file at path ``dir`` is opened with ``lzma.open`` (binary mode),
    and every line is decoded as UTF-8. Line endings are kept, so each
    returned string still ends with the newline found in the file.
    """
    with lzma.open(dir) as archive:
        return [raw.decode('utf-8') for raw in archive]

def getEffectiveDate(document):
    """Return the document's effective date as 'YYYY-MM-DD', or ''.

    The text of every entity in ``document.ents`` labelled
    'effective_date' is collected and the most frequent mention is used
    (ties go to the mention seen first). That mention is parsed token by
    token:

    * a token equal to a month name in ``months`` sets the month,
    * a number from 1 to 31 sets the day (ordinals such as '13th' are
      accepted) and is zero-padded to two digits,
    * a number strictly between 1900 and 2030 sets the year,
    * anything else (filler such as 'day' or 'of') is ignored.

    When a part occurs more than once, the last occurrence wins.

    Returns:
        The date string, e.g. '2011-07-13', or '' when there is no
        'effective_date' entity or the chosen mention lacks a year,
        month or day.
    """
    mentions = [ent.text for ent in document.ents
                if ent.label_ == 'effective_date']
    if not mentions:
        return ''

    chosen = Counter(mentions).most_common(1)[0][0]
    # Blank punctuation out with spaces rather than deleting it, so that
    # 'July 13,2011' still splits into separate day and year tokens.
    chosen = chosen.translate({ord(char): ' ' for char in punctuation})

    monthNumbers = {name: number for number, name in months.items()}
    year = month = day = None
    for token in chosen.split():
        if token in monthNumbers:
            month = monthNumbers[token]
            continue
        # Drop an ordinal suffix: '13th' -> '13', '1st' -> '1'.
        if token[-2:].lower() in ('st', 'nd', 'rd', 'th'):
            token = token[:-2]
        # isdecimal() only accepts characters int() can parse.
        if not token.isdecimal():
            continue
        value = int(token)
        if 1 <= value <= 31:
            day = str(value).zfill(2)
        elif 1900 < value < 2030:
            year = str(value)

    if year is None or month is None or day is None:
        return ''
    return year + '-' + month + '-' + day

def getJurisdiction(document):
    """Return the most frequently mentioned state, e.g. 'New_York', or ''.

    Every entity in ``document.ents`` labelled 'jurisdiction' casts
    votes: a mention that is exactly a name in ``states`` votes for that
    state, while any other mention votes for every state whose name
    contains the mention as a substring. The state with the most votes
    wins (ties go to the state that received a vote first) and is
    returned with spaces replaced by underscores. Returns '' when no
    vote was cast.
    """
    votes = []
    for ent in document.ents:
        if ent.label_ != 'jurisdiction':
            continue
        mention = ent.text
        if mention in states:
            votes.append(mention)
        else:
            # Partial mention: credit every state name that contains it.
            votes.extend(name for name in states if mention in name)

    if not votes:
        return ''

    # Tally in first-seen order so max() breaks ties towards the earliest.
    tally = {}
    for name in votes:
        tally[name] = tally.get(name, 0) + 1
    winner = max(tally, key=tally.get)
    return winner.replace(' ', '_')

def getParties(document):
    """Return the distinct party names found in the document.

    Collects the text of every entity in ``document.ents`` labelled
    'party', drops repeated mentions while keeping first-seen order, and
    replaces spaces with underscores, e.g. ['CompuDyne_Corporation'].
    Returns an empty list when there is no 'party' entity.
    """
    # A dict keeps insertion order, so it works as an ordered set here.
    seen = {}
    for ent in document.ents:
        if ent.label_ == 'party':
            seen.setdefault(ent.text, None)

    return [name.replace(' ', '_') for name in seen]

def getTerm(document):
    """Return the document's term as '<number>_<unit>', e.g. '3_years'.

    The text of every entity in ``document.ents`` labelled 'term' is
    collected and the most frequent mention is used (ties go to the
    mention seen first). Its first word is converted to digits, either
    because it already is a number ('2 years') or by looking it up,
    case-insensitively, among the number words in ``wordToNumber``
    ('Two years'). The words are then joined with underscores.

    Returns:
        The normalised term, or '' when there is no 'term' entity, the
        chosen mention is blank, or its first word is not a recognised
        number. An unrecognised word yields '' instead of raising, so
        one odd mention cannot abort a whole run.
    """
    mentions = [ent.text for ent in document.ents if ent.label_ == 'term']
    if not mentions:
        return ''

    words = Counter(mentions).most_common(1)[0][0].split()
    if not words:
        return ''

    amount = words[0].lower()
    if amount.isdecimal():
        # Already numeric; normalise through int() to drop leading zeros.
        amount = str(int(amount))
    else:
        # wordToNumber maps number -> word, so invert it for the lookup.
        numberByWord = {word: number for number, word in wordToNumber.items()}
        if amount not in numberByWord:
            return ''
        amount = str(numberByWord[amount])

    words[0] = amount
    return '_'.join(words)


def formatPrediction(effectiveDate, jurisdiction, parties, term):
    """Build one output line of space-separated 'key=value' pairs.

    Args:
        effectiveDate: date string, or '' to leave the pair out.
        jurisdiction: state name, or '' to leave the pair out.
        parties: iterable of party names; each gets its own 'party=' pair.
        term: term string, or '' to leave the pair out.

    Returns:
        The pairs in the order effective_date, jurisdiction, party...,
        term, joined by single spaces with no leading or trailing space.
        Returns '' when every value is empty.
    """
    pairs = []
    if effectiveDate:
        pairs.append('effective_date=' + effectiveDate)
    if jurisdiction:
        pairs.append('jurisdiction=' + jurisdiction)
    pairs.extend('party=' + party for party in parties)
    if term:
        pairs.append('term=' + term)
    return ' '.join(pairs)


if __name__ == '__main__':
    NDAs = readInput('train/in.tsv.xz')

    # NOTE(review): 'NER' is presumably a locally trained spaCy pipeline
    # directory next to this script; confirm before running elsewhere.
    ner = spacy.load('NER')

    # One prediction line per input document, in input order.
    predicted = []
    for nda in NDAs:
        document = ner(nda)
        predicted.append(formatPrediction(
            getEffectiveDate(document),
            getJurisdiction(document),
            getParties(document),
            getTerm(document),
        ))

    # Write the lines directly. csv.writer.writerows() would treat each
    # string as a row and emit every character as its own comma-separated
    # field.
    with open('train/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(line + '\n' for line in predicted)
Update main.py 2022-05-03 20:10:12 +02:00			`import lzma`
Ready to go 2022-05-03 21:54:24 +02:00			`from matplotlib.pyplot import getp`
Update main.py 2022-05-03 20:10:12 +02:00			`import spacy`
Ready to go 2022-05-03 21:54:24 +02:00			`import csv`


			`months = {'01': 'January', '02': 'February', '03': 'March',`
			`'04': 'April', '05': 'May', '06': 'June',`
			`'07': 'July', '08': 'August', '09': 'September',`
			`'10': 'October', '11': 'November', '12': 'December'}`

			punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\\\]^_`{\|}~'

			`states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',`
			`'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',`
			`'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',`
			`'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',`
			`'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',`
			`'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',`
			`'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',`
			`'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',`
			`'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',`
			`'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']`

			`wordToNumber = {1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',`
			`6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',`
			`11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',`
			`15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',`
			`19 : 'nineteen', 20 : 'twenty',`
			`30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',`
			`70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }`
Update main.py 2022-05-03 20:10:12 +02:00

			`def readInput(dir):`
			`NDAs = []`
			`with lzma.open(dir) as f:`
			`for line in f:`
			`NDAs.append(line.decode('utf-8'))`
			`return NDAs`

Ready to go 2022-05-03 21:54:24 +02:00			`def getEffectiveDate(document):`
			`effectiveDate = []`

			`for word in document.ents:`
			`if word.label_ == 'effective_date':`
			`effectiveDate.append(word.text)`

			`#if len(effectiveDate) > 0:`
			`try:`
			`effectiveDate = { date : effectiveDate.count(date) for date in effectiveDate }`
			`effectiveDate = max(effectiveDate, key=effectiveDate.get)`
			`for char in punctuation: effectiveDate = effectiveDate.replace(char, '')`
			`for d in effectiveDate.split():`
			`if d in list(months.values()):`
			`month = list(months.keys())[list(months.values()).index(d)]`
			`elif int(d) < 32:`
			`day = d`
			`elif int(d) > 1900 and int(d) < 2030:`
			`year = d`
			`effectiveDate = year + '-' + month + '-' + day`
			`except:`
			`effectiveDate = ''`

			`return effectiveDate # effectiveDate = '2011-07-13'`

			`def getJurisdiction(document):`
			`jurisdiction = []`

			`for word in document.ents:`
			`if word.label_ == 'jurisdiction':`
			`if word.text not in states:`
			`for state in states:`
			`if word.text in state:`
			`jurisdiction.append(state)`
			`else:`
			`jurisdiction.append(word.text)`

			`if len(jurisdiction) > 0:`
			`jurisdiction = { state : jurisdiction.count(state) for state in jurisdiction }`
			`jurisdiction = max(jurisdiction, key=jurisdiction.get).replace(' ', '_')`
			`else:`
			`jurisdiction = ''`

			`return jurisdiction # jurisdiction = 'New_York'`

			`def getParties(document):`
			`party = []`

			`for word in document.ents:`
			`if word.label_ == 'party':`
			`party.append(word.text)`

			`party = list(dict.fromkeys(party))`
			`party = [ p.replace(' ', '_') for p in party]`

			`return party # party = ['CompuDyne_Corporation']`

			`def getTerm(document):`
			`term = []`

			`for word in document.ents:`
			`if word.label_ == 'term':`
			`term.append(word.text)`

			`if len(term) > 0:`
			`term = { time : term.count(time) for time in term }`
			`term = max(term, key=term.get)`
			`term = term.split()`
			`term[0] = str(list(wordToNumber.keys())[list(wordToNumber.values()).index(term[0])])`
			`term = '_'.join(term)`
			`else: term = ''`

			`return term # term = '3_years'`

Update main.py 2022-05-03 20:10:12 +02:00
			`if __name__ == '__main__':`
			`NDAs = readInput('train/in.tsv.xz')`

			`ner = spacy.load('NER')`

Ready to go 2022-05-03 21:54:24 +02:00			`predicted = [''] * len(NDAs)`

			`document = ner(NDAs[9])`

			`for i in range(len(NDAs)):`
			`document = ner(NDAs[i])`

			`ed = getEffectiveDate(document)`
			`j = getJurisdiction(document)`
			`p = getParties(document)`
			`t = getTerm(document)`

			`if len(ed) > 0: predicted[i] += 'effective_date=' + ed + ' '`
			`if len(j) > 0: predicted[i] += 'jurisdiction=' + j + ' '`
			`if len(p) > 0:`
			`for party in p: predicted[i] += 'party=' + party + ' '`
			`if len(t) > 0: predicted[i] += 'term=' + t`

			`with open('train/out.tsv', 'w', newline='') as f:`
			`writer = csv.writer(f)`
			`writer.writerows(predicted)`