s444356

2022-04-28 23:12:04 +02:00 · 2022-04-28 23:12:04 +02:00 · 9d30e595f0
commit 9d30e595f0
parent 3bbf080d37
1 changed files with 183 additions and 0 deletions
--- a/run.py
+++ b/run.py
@ -0,0 +1,183 @@
+import re
+
+states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida',
+          'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
+          'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montan', 'Nebraska', 'Nevada', 'New Hampshire',
+          'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
+          'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
+          'West Virginia', 'Wisconsin', 'Wyoming']
+
+months = {'january': '01', 'february': '02', 'march': '03', 'april': '04', 'may': '05', 'june': '06', 'july': '07', 'august': '08', 'september': '09',
+          'october': '10', 'november': '11', 'december': '12'}
+
+def text2int(textnum, numwords={}):
+    if not numwords:
+      units = [
+        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
+        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+        "sixteen", "seventeen", "eighteen", "nineteen",
+      ]
+
+      tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
+
+      scales = ["hundred", "thousand", "million", "billion", "trillion"]
+
+      numwords["and"] = (1, 0)
+      for idx, word in enumerate(units):    numwords[word] = (1, idx)
+      for idx, word in enumerate(tens):     numwords[word] = (1, idx * 10)
+      for idx, word in enumerate(scales):   numwords[word] = (10 ** (idx * 3 or 2), 0)
+
+    current = result = 0
+    for word in textnum.split():
+        if word not in numwords:
+          raise Exception("Illegal word: " + word)
+
+        scale, increment = numwords[word]
+        current = current * scale + increment
+        if scale > 100:
+            result += current
+            current = 0
+
+    return result + current
+
+def find_state(line):
+    ctr = {}
+    for state in states:
+        if state.lower() in line.lower():
+            ctr[state] = len(re.findall(state, line))
+    if ctr != {}:
+        state = max(ctr, key=ctr.get)
+        state = state.replace(' ', '_')
+        return state
+    else:
+        return 'Alabama'
+
+def position_and_date(regex, text):
+    dates = [date.group() for date in re.finditer(regex, text)]
+    return dates
+
+# February 10, 2017
+# 6th day of January, 2012
+def find_dates(text):
+    res = []
+    dates_regex = [r'[A-Z][A-Za-z]+ ([0-2][0-9]|3[0-1]), [0-2][0-9]{3}',
+                   r'([0-2][0-9]|3[0-1])(th|rd|nd|st) day of [A-Za-z]+, [0-2][0-9]{3}',
+                   r'([0-2][0-9]|3[0-1])/([0-2][0-9]|3[0-1])/[0-9][0-9]']
+    for i in range(0, len(dates_regex)):
+        d = position_and_date(dates_regex[i], text)
+        if d != []:
+            res += d
+    return res
+
+def clean_date(date):
+    if ',' in date[0]:
+        date = date[0].replace(',', '')
+    if 'day of' in date:
+        date = re.sub(r'(th|rd|nd|st) day of', '', date)
+    if '/' in date[0]:
+        date = date[0].split('/')
+        date[2] = '20' + date[2]
+    else:
+        date = date.split()
+    for m in months:
+        if m in date[0].lower():
+            s = date[0].lower()
+            date[0] = date[1]
+            date[1] = s
+        if m in date[1].lower():
+            date[1] = date[1].lower().replace(m, months[m])
+    d = date[0]
+    m = date[1]
+    y = date[2]
+    correct_date = str(f'{y}-{m}-{d}')
+    return correct_date
+
+def find_term(line):
+    regex = r'([0-9]|([0-2][0-9]|3[0-1])|three|nine|\([0-9]\)) (years|months)'
+    s = re.findall(regex, line)
+    return s
+
+def clean_term(t):
+
+    term = []
+    if t != []:
+        s = t[0][0]
+        s = re.sub(r'\(|\)', '', s)
+        term = list(t[0])
+        term[0] = s
+        term.pop(1)
+        if not bool(re.search(r'\d', term[0])):
+            term[0] = str(text2int(term[0]))
+        term = term[0] + '_' + term[1]
+    return term
+
+with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
+    dev0 = f.readlines()
+
+with open('dev-0/out.tsv', 'wt') as f:
+    for l in dev0:
+        res = ""
+        if 'effective_date' in l:
+            date = find_dates(l)
+            if date != []:
+                date = clean_date(date)
+                res = ('effective_date=' + str(date) + ' ')
+
+
+        t = find_term(l)
+        t = clean_term(t)
+        if t != []:
+            res += ('jurisdiction=' + str(find_state(l)) + ' ')
+            res += ('term=' + str(t) + '\n')
+        else:
+            res += ('jurisdiction=' + str(find_state(l)) + '\n')
+
+        f.write(res)
+
+f.close()
+
+
+with open('test-A/in.tsv', 'r', encoding='utf8') as f:
+    test_A = f.readlines()
+
+with open('test-A/out.tsv', 'wt') as f:
+    for l in test_A:
+        res = ""
+        if 'effective_date' in l:
+            date = find_dates(l)
+            if date != []:
+                date = clean_date(date)
+                res = ('effective_date=' + str(date) + ' ')
+        t = find_term(l)
+        t = clean_term(t)
+        if t != []:
+            res += ('jurisdiction=' + str(find_state(l)) + ' ')
+            res += ('term=' + str(t) + '\n')
+        else:
+            res += ('jurisdiction=' + str(find_state(l)) + '\n')
+        f.write(res)
+
+f.close()
+
+
+with open('train/in.tsv', 'r', encoding='utf8') as f:
+    train = f.readlines()
+
+with open('train/out.tsv', 'wt') as f:
+    for l in train:
+        res = ""
+        if 'effective_date' in l:
+            date = find_dates(l)
+            if date != []:
+                date = clean_date(date)
+                res = ('effective_date=' + str(date) + ' ')
+        t = find_term(l)
+        t = clean_term(t)
+        if t != []:
+            res += ('jurisdiction=' + str(find_state(l)) + ' ')
+            res += ('term=' + str(t) + '\n')
+        else:
+            res += ('jurisdiction=' + str(find_state(l)) + '\n')
+        f.write(res)
+
+f.close()