"""Extract key fields from NDA documents with a spaCy NER pipeline.

Each input document is run through the pipeline loaded from ``NER``; entities
labelled ``effective_date``, ``jurisdiction``, ``party`` and ``term`` are
normalised and written out as one line of space-separated ``key=value`` pairs
per document.
"""
# NOTE(review): `csv` is no longer used by this script (see main()); the
# import is kept in case other code relies on it -- confirm and remove.
import csv
import lzma
import re
import string
from collections import Counter

import spacy
# NOTE(review): `getp` is not referenced anywhere in this file and looks like
# an accidental auto-import -- confirm and remove.
from matplotlib.pyplot import getp

# Month number (zero-padded string) -> English month name.
months = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December',
}
# Reverse view of ``months`` so a name can be resolved with one dict lookup.
_MONTH_NUMBERS = {name: number for number, name in months.items()}

# Characters removed from a date entity before it is tokenised.
punctuation = string.punctuation
_STRIP_PUNCTUATION = str.maketrans('', '', punctuation)

states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
]

# Despite its name this maps number -> word; the name is kept for
# backward compatibility.
wordToNumber = {
    1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five', 6: 'six',
    7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten', 11: 'eleven',
    12: 'twelve', 13: 'thirteen', 14: 'fourteen', 15: 'fifteen',
    16: 'sixteen', 17: 'seventeen', 18: 'eighteen', 19: 'nineteen',
    20: 'twenty', 30: 'thirty', 40: 'forty', 50: 'fifty', 60: 'sixty',
    70: 'seventy', 80: 'eighty', 90: 'ninety',
}
# Word -> number, the direction getTerm() actually needs.
_WORD_VALUES = {word: number for number, word in wordToNumber.items()}

# A run of digits with an optional English ordinal suffix ("13", "13th").
_DAY_OR_YEAR = re.compile(r'(\d+)(?:st|nd|rd|th)?')
# Exclusive bounds for a number to be accepted as a year.
_MIN_YEAR = 1900
_MAX_YEAR = 2030


def _most_frequent(values):
    """Return the most frequent item of *values*, or None if it is empty.

    Ties are resolved in favour of the item that appears first.
    """
    if not values:
        return None
    counts = Counter(values)
    return max(counts, key=counts.get)


def _entity_texts(document, label):
    """Return the text of every entity in *document* carrying *label*."""
    return [ent.text for ent in document.ents if ent.label_ == label]


def readInput(dir):
    """Read an xz-compressed file and return its lines as UTF-8 strings.

    Args:
        dir: Path of the ``.xz`` file (the name is kept for compatibility
            even though it shadows the builtin).

    Returns:
        A list with one string per line; line terminators are preserved.
    """
    with lzma.open(dir) as f:
        return [raw.decode('utf-8') for raw in f]


def getEffectiveDate(document):
    """Return the most frequent ``effective_date`` entity as ``YYYY-MM-DD``.

    Punctuation is removed from the entity text and each whitespace-separated
    token is classified as a month name (a value of ``months``), a day
    (1-31, optionally with an ordinal suffix) or a year (strictly between
    1900 and 2030). Other tokens are ignored. When a component occurs more
    than once, the last occurrence wins.

    Args:
        document: Object exposing ``ents``, an iterable of entities with
            ``label_`` and ``text`` attributes.

    Returns:
        The date string, e.g. ``'2011-07-13'``, or ``''`` when there is no
        such entity or a year, month or day cannot be found in it.
    """
    best = _most_frequent(_entity_texts(document, 'effective_date'))
    if best is None:
        return ''

    year = month = day = None
    for token in best.translate(_STRIP_PUNCTUATION).split():
        if token in _MONTH_NUMBERS:
            month = _MONTH_NUMBERS[token]
            continue
        match = _DAY_OR_YEAR.fullmatch(token)
        if match is None:
            # Filler such as "of" or "day": skip it instead of giving up.
            continue
        value = int(match.group(1))
        if 1 <= value <= 31:
            # Zero-pad so single-digit days still give a valid ISO date.
            day = f'{value:02d}'
        elif _MIN_YEAR < value < _MAX_YEAR:
            year = str(value)

    if year is None or month is None or day is None:
        return ''
    return f'{year}-{month}-{day}'


def _match_states(text):
    """Map one jurisdiction entity text onto names from ``states``.

    An exact match yields that state. Otherwise every state whose name
    contains *text* is returned (so ``'York'`` gives ``['New York']``).
    Failing that, the longest state name contained in *text* is returned
    (so ``'State of New York'`` gives ``['New York']``).
    """
    text = text.strip()
    if not text:
        # An empty string is a substring of every state name.
        return []
    if text in states:
        return [text]
    containing = [state for state in states if text in state]
    if containing:
        return containing
    contained = [state for state in states if state in text]
    if contained:
        # Longest match, so "West Virginia" is preferred over "Virginia".
        return [max(contained, key=len)]
    return []


def getJurisdiction(document):
    """Return the most frequently mentioned US state, spaces as underscores.

    Args:
        document: Object exposing ``ents``, an iterable of entities with
            ``label_`` and ``text`` attributes.

    Returns:
        A state name such as ``'New_York'``, or ``''`` when no
        ``jurisdiction`` entity can be matched to an entry of ``states``.
    """
    matches = []
    for text in _entity_texts(document, 'jurisdiction'):
        matches.extend(_match_states(text))
    best = _most_frequent(matches)
    return '' if best is None else best.replace(' ', '_')


def getParties(document):
    """Return the distinct ``party`` entities in order of first appearance.

    Spaces are replaced with underscores before de-duplication, so two
    spellings that normalise to the same name are reported once.

    Args:
        document: Object exposing ``ents``, an iterable of entities with
            ``label_`` and ``text`` attributes.

    Returns:
        A list such as ``['CompuDyne_Corporation']``; empty if none found.
    """
    normalised = (
        text.replace(' ', '_') for text in _entity_texts(document, 'party')
    )
    return list(dict.fromkeys(normalised))


def _parse_count(token):
    """Return the integer meant by *token*, or None if it is not a number.

    Accepts decimal digits (``'3'``), the words in ``wordToNumber`` in any
    letter case (``'Three'``) and hyphenated tens-units (``'twenty-four'``).
    """
    lowered = token.lower()
    if lowered.isdecimal():
        return int(lowered)
    if lowered in _WORD_VALUES:
        return _WORD_VALUES[lowered]
    tens_word, hyphen, unit_word = lowered.partition('-')
    tens = _WORD_VALUES.get(tens_word, 0)
    units = _WORD_VALUES.get(unit_word, 0)
    if hyphen and tens >= 20 and 1 <= units <= 9:
        return tens + units
    return None


def getTerm(document):
    """Return the most frequent ``term`` entity with a numeric first token.

    The entity text is split on whitespace, its first token is converted to
    digits and the tokens are joined with underscores.

    Args:
        document: Object exposing ``ents``, an iterable of entities with
            ``label_`` and ``text`` attributes.

    Returns:
        A string such as ``'3_years'``, or ``''`` when there is no ``term``
        entity or its first token cannot be read as a number.
    """
    best = _most_frequent(_entity_texts(document, 'term'))
    if best is None:
        return ''
    tokens = best.split()
    if not tokens:
        return ''
    count = _parse_count(tokens[0])
    if count is None:
        return ''
    tokens[0] = str(count)
    return '_'.join(tokens)


def main():
    """Run the pipeline over ``train/in.tsv.xz`` and write ``train/out.tsv``.

    One output line is written per input line, holding the extracted fields
    as space-separated ``key=value`` pairs; the line is empty when nothing
    was extracted.
    """
    NDAs = readInput('train/in.tsv.xz')
    ner = spacy.load('NER')

    predicted = []
    for text in NDAs:
        document = ner(text)
        fields = []

        effective_date = getEffectiveDate(document)
        if effective_date:
            fields.append('effective_date=' + effective_date)

        jurisdiction = getJurisdiction(document)
        if jurisdiction:
            fields.append('jurisdiction=' + jurisdiction)

        fields.extend('party=' + party for party in getParties(document))

        term = getTerm(document)
        if term:
            fields.append('term=' + term)

        predicted.append(' '.join(fields))

    # Written as plain lines: csv.writer.writerows() treats each string as a
    # row of single-character fields and would emit "e,f,f,e,c,t,...".
    with open('train/out.tsv', 'w', encoding='utf-8', newline='') as f:
        f.writelines(line + '\n' for line in predicted)


if __name__ == '__main__':
    main()