"""Rule-based baseline that extracts contract fields from TSV documents.

For every line of ``<dataset>/in.tsv`` one line is written to
``<dataset>/out.tsv`` holding space-separated ``key=value`` predictions:

* ``effective_date`` - only when the line mentions that key and a date is found,
* ``jurisdiction``   - always (falls back to ``Alabama``),
* ``term``           - only when a duration such as "two (2) years" is found.
"""
import re
from collections import Counter

states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida',
          'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
          'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
          'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
          'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
          'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

months = {'january': '01', 'february': '02', 'march': '03', 'april': '04', 'may': '05', 'june': '06', 'july': '07',
          'august': '08', 'september': '09', 'october': '10', 'november': '11', 'december': '12'}

# ---------------------------------------------------------------------------
# Number words
# ---------------------------------------------------------------------------
_UNITS = (
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
)
_TENS = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
_SCALES = ("hundred", "thousand", "million", "billion", "trillion")


def _build_numwords():
    """Return the ``word -> (scale, increment)`` table used by ``text2int``."""
    table = {"and": (1, 0)}
    table.update((word, (1, idx)) for idx, word in enumerate(_UNITS))
    # The two leading placeholders in _TENS only keep the index aligned.
    table.update((word, (1, idx * 10)) for idx, word in enumerate(_TENS) if word)
    # "hundred" is 10**2; every later scale is a power of 1000.
    table.update((word, (10 ** (idx * 3 or 2), 0)) for idx, word in enumerate(_SCALES))
    return table


_NUMWORDS = _build_numwords()


def text2int(textnum, numwords=None):
    """Convert an English number phrase such as "twenty-four" to an int.

    Args:
        textnum: Words separated by spaces and/or hyphens, in any letter case.
        numwords: Optional ``word -> (scale, increment)`` table; the built-in
            English table is used when it is omitted or empty.

    Raises:
        ValueError: If a word is not present in the table.
    """
    table = numwords or _NUMWORDS
    current = result = 0
    for word in textnum.lower().replace('-', ' ').split():
        if word not in table:
            raise ValueError("Illegal word: " + word)
        scale, increment = table[word]
        current = current * scale + increment
        if scale > 100:
            # thousand/million/...: the group is complete, bank it.
            result += current
            current = 0
    return result + current


# ---------------------------------------------------------------------------
# Jurisdiction
# ---------------------------------------------------------------------------
_STATE_LOOKUP = {name.lower(): name for name in states}
# Longest names first so "West Virginia" is not consumed as "Virginia"; the
# word boundaries stop "Kansas" from matching inside "Arkansas".
_STATE_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(name) for name in sorted(states, key=len, reverse=True)) + r')\b',
    re.IGNORECASE,
)


def find_state(line):
    """Return the state mentioned most often in *line*, spaces as underscores.

    Matching ignores letter case. Ties go to the state listed first in
    ``states``; ``'Alabama'`` is returned when no state is mentioned.
    """
    counts = Counter(_STATE_LOOKUP[hit.lower()] for hit in _STATE_RE.findall(line))
    if not counts:
        return 'Alabama'
    best = max(states, key=lambda name: counts[name])
    return best.replace(' ', '_')


# ---------------------------------------------------------------------------
# Effective date
# ---------------------------------------------------------------------------
_MONTH = '|'.join(months)
_DAY = r'(?:3[01]|[12][0-9]|0?[1-9])'
_YEAR = r'[0-2][0-9]{3}'
_DATE_PATTERNS = (
    # February 10, 2017
    re.compile(rf'\b(?:{_MONTH}) {_DAY}, {_YEAR}\b', re.IGNORECASE),
    # 6th day of January, 2012
    re.compile(rf'\b{_DAY}(?:th|rd|nd|st) day of (?:{_MONTH}), {_YEAR}\b', re.IGNORECASE),
    # 02/10/17 or 02/10/2017
    re.compile(rf'(?<![\d/]){_DAY}/{_DAY}/(?:[0-9]{{4}}|[0-9]{{2}})(?![\d/])'),
)


def position_and_date(regex, text):
    """Return every substring of *text* matched by *regex*, in text order."""
    return [match.group() for match in re.finditer(regex, text)]


def find_dates(text):
    """Return all date strings found in *text*.

    Results are grouped by pattern (month-first, "day of", slashed) and kept
    in text order within each group.
    """
    found = []
    for pattern in _DATE_PATTERNS:
        found.extend(position_and_date(pattern, text))
    return found


def clean_date(date):
    """Normalise the first date in *date* to ``YYYY-MM-DD``.

    Args:
        date: A non-empty list of date strings as returned by ``find_dates``
            (a bare string is accepted too). Only the first entry is used.

    Raises:
        ValueError: If the string is not in one of the supported formats.
    """
    raw = date if isinstance(date, str) else date[0]
    if '/' in raw:
        parts = raw.split('/')
        if len(parts) != 3:
            raise ValueError(f'Unrecognised date: {raw!r}')
        month, day, year = parts
        if len(year) == 2:
            year = '20' + year
        # NOTE(review): slashed dates are read as US month/day/year; the
        # fields are swapped only when the first one cannot be a month.
        if int(month) > 12 >= int(day):
            month, day = day, month
        month = f'{int(month):02d}'
    else:
        words = re.sub(r'(?:th|rd|nd|st) day of', ' ', raw.replace(',', ' ')).split()
        if len(words) != 3:
            raise ValueError(f'Unrecognised date: {raw!r}')
        first, second, year = words
        if first.lower() in months:
            name, day = first, second
        else:
            day, name = first, second
        if name.lower() not in months:
            raise ValueError(f'Unrecognised date: {raw!r}')
        month = months[name.lower()]
    return f'{year}-{month}-{int(day):02d}'


# ---------------------------------------------------------------------------
# Term
# ---------------------------------------------------------------------------
_ONES_ALT = '|'.join(_UNITS[1:10])
_TENS_ALT = '|'.join(_TENS[2:])
_SIMPLE_ALT = '|'.join(sorted(_UNITS[1:], key=len, reverse=True))
# "five", "sixteen", "twenty", "twenty four", "twenty-four", ...
_NUMBER_WORD = rf'(?:(?:{_TENS_ALT})(?:[ -](?:{_ONES_ALT}))?|{_SIMPLE_ALT})'
# Group 1: quantity; group 2: the quantity again when it is a multi-digit
# number (kept so the tuple shape matches earlier versions); group 3: unit.
# The look-behind stops "36 months" from being read as "6 months".
_TERM_RE = re.compile(
    r'(\([0-9]{1,3}\)|(?<![\w-])(?:([0-9]{2,3})|[0-9]|' + _NUMBER_WORD + r'))'
    r' (years|months)\b',
    re.IGNORECASE,
)


def find_term(line):
    """Return ``(quantity, multi_digit_quantity_or_'', unit)`` tuples for
    every duration such as "36 months", "(2) years" or "five years"."""
    return _TERM_RE.findall(line)


def clean_term(t):
    """Format the first match of ``find_term`` as e.g. ``'2_years'``.

    Returns an empty list when *t* is empty, which callers test for.
    """
    if not t:
        return []
    quantity = re.sub(r'[()]', '', t[0][0])
    unit = t[0][-1].lower()
    if re.search(r'\d', quantity):
        quantity = str(int(quantity))
    else:
        quantity = str(text2int(quantity))
    return quantity + '_' + unit


# ---------------------------------------------------------------------------
# Script entry point
# ---------------------------------------------------------------------------
def _predict(line):
    """Build one output line of ``key=value`` predictions for *line*."""
    fields = []
    if 'effective_date' in line:
        dates = find_dates(line)
        if dates:
            fields.append('effective_date=' + clean_date(dates))
    fields.append('jurisdiction=' + find_state(line))
    term = clean_term(find_term(line))
    if term:
        fields.append('term=' + term)
    return ' '.join(fields) + '\n'


def _process(dataset):
    """Read ``<dataset>/in.tsv`` and write predictions to ``<dataset>/out.tsv``."""
    with open(f'{dataset}/in.tsv', 'r', encoding='utf8') as src:
        lines = src.readlines()
    with open(f'{dataset}/out.tsv', 'wt', encoding='utf8') as dst:
        dst.writelines(_predict(line) for line in lines)


def main():
    """Generate predictions for the dev, test and train splits."""
    for dataset in ('dev-0', 'test-A', 'train'):
        _process(dataset)


if __name__ == '__main__':
    main()