This commit is contained in:
Maciej Czajka 2022-04-28 23:12:04 +02:00
parent 3bbf080d37
commit 9d30e595f0

183
run.py Normal file
View File

@ -0,0 +1,183 @@
import re
# Names of the 50 U.S. states, in alphabetical order.  find_state() scans
# input text for these names and reports the matched entry verbatim, so every
# name must be spelled exactly as it should appear in the output.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
    'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
    'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
# Lower-case English month name -> two-digit, zero-padded month number.
months = {
    name: f'{number:02d}'
    for number, name in enumerate(
        (
            'january', 'february', 'march', 'april', 'may', 'june',
            'july', 'august', 'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}
def text2int(textnum, numwords=None):
    """Convert a spelled-out English number to an int.

    Args:
        textnum: Number words separated by whitespace, e.g. ``"three"`` or
            ``"one hundred and five"``.  Words are matched case-insensitively
            and hyphenated compounds such as ``"twenty-one"`` are accepted.
        numwords: Optional vocabulary mapping each word to a
            ``(scale, increment)`` pair.  When omitted, a fresh table is
            built for the call.  When an empty dict is passed it is filled
            in place with the default vocabulary; a non-empty dict is used
            as given.

    Returns:
        The integer value of ``textnum`` (``0`` for an empty string).

    Raises:
        ValueError: If a word is not part of the vocabulary.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if numwords is None:
        numwords = {}

    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen",
        ]
        # The two leading placeholders keep index * 10 equal to the value.
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        # "and" is a filler word: multiply by 1, add 0.
        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # do not register the empty placeholders as words
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # hundred -> 10**2, thousand -> 10**3, million -> 10**6, ...
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    # Keep tokens the vocabulary knows verbatim; otherwise break hyphenated
    # compounds ("twenty-one") into their component words.
    tokens = []
    for token in textnum.split():
        tokens.extend([token] if token in numwords else token.split("-"))

    current = result = 0
    for token in tokens:
        # Exact lookup first so caller-supplied tables keep working unchanged;
        # fall back to the lower-cased spelling.
        word = token if token in numwords else token.lower()
        if word not in numwords:
            raise ValueError("Illegal word: " + token)

        scale, increment = numwords[word]
        current = current * scale + increment
        # Scales above one hundred close the current group of digits.
        if scale > 100:
            result += current
            current = 0

    return result + current
def find_state(line):
    """Return the U.S. state mentioned most often in ``line``.

    Every entry of the module-level ``states`` list is counted as a
    case-insensitive, whole-name occurrence (not as a fragment of a longer
    word, so "Kansas" is not found inside "Arkansas").  The state with the
    highest count wins; on equal counts the longer name is preferred, so that
    "West Virginia" is not reported as "Virginia".

    Args:
        line: Text to scan.

    Returns:
        The winning state name with spaces replaced by underscores, or
        ``'Alabama'`` when no state name occurs in ``line``.
    """
    best_state = None
    best_rank = (0, 0)
    for state in states:
        # re.escape keeps the name literal; the look-arounds reject matches
        # that are glued to other letters.
        pattern = r'(?<![A-Za-z])' + re.escape(state) + r'(?![A-Za-z])'
        hits = len(re.findall(pattern, line, flags=re.IGNORECASE))
        rank = (hits, len(state))
        # Strict ">" keeps the earliest list entry on a complete tie.
        if hits and rank > best_rank:
            best_state, best_rank = state, rank

    if best_state is None:
        # Fallback used when the text names no state at all.
        return 'Alabama'
    return best_state.replace(' ', '_')
def position_and_date(regex, text):
    """Return the text of every non-overlapping match of ``regex`` in ``text``.

    Args:
        regex: Regular-expression pattern.
        text: String to search.

    Returns:
        List of matched substrings, in order of appearance.
    """
    return [match.group() for match in re.finditer(regex, text)]


def find_dates(text):
    """Collect date-like substrings of ``text``.

    Three layouts are recognised, with one- or two-digit day numbers:

    * ``February 10, 2017``
    * ``6th day of January, 2012``
    * ``12/31/17`` or ``12/31/2017``

    Args:
        text: String to search.

    Returns:
        List of matched substrings: all matches of the first layout, then
        all of the second, then all of the third.  Empty when nothing matches.
    """
    day = r'(?:3[01]|[0-2]?[0-9])'
    dates_regex = [
        # February 10, 2017
        r'[A-Z][A-Za-z]+ ' + day + r', [0-2][0-9]{3}',
        # 6th day of January, 2012 (the look-behind avoids starting inside a
        # longer number)
        r'(?<![0-9])' + day + r'(?:th|rd|nd|st) day of [A-Za-z]+, [0-2][0-9]{3}',
        # 12/31/17 or 12/31/2017 (the look-ahead stops a four-digit year from
        # being cut down to its first two digits)
        r'(?<![0-9])' + day + r'/' + day + r'/(?:[0-9]{4}|[0-9]{2})(?![0-9])',
    ]
    res = []
    for regex in dates_regex:
        res += position_and_date(regex, text)
    return res


def clean_date(date):
    """Normalise the first date found by ``find_dates`` to ``YYYY-MM-DD``.

    Args:
        date: Non-empty list of date strings as returned by ``find_dates``;
            only the first element is used.

    Returns:
        The date formatted as ``year-month-day``.  Numeric day and month are
        zero-padded to two digits.  A month name missing from the
        module-level ``months`` table is passed through unchanged.

    Raises:
        IndexError: If ``date`` is empty or its first element has fewer than
            three components.
    """
    raw = date[0]
    if '/' in raw:
        pieces = raw.split('/')
        # Slash dates are read as day/month/year.
        day, month, year = pieces[0], pieces[1], pieces[2]
        if len(year) == 2:
            # Two-digit years are taken to be in the 2000s.
            year = '20' + year
        # If the second field cannot be a month but the first can, the date
        # was written month/day/year; swap rather than emit month 13..31.
        if day.isdigit() and month.isdigit() and int(month) > 12 >= int(day):
            day, month = month, day
    else:
        # "6th day of January, 2012" -> "6 January 2012"
        # "February 10, 2017"        -> "February 10 2017"
        text = re.sub(r'(th|rd|nd|st) day of', '', raw.replace(',', ''))
        tokens = text.split()
        day, month, year = tokens[0], tokens[1], tokens[2]
        if day.lower() in months:
            # Month-first layout: put the day first.
            day, month = month, day
        month = months.get(month.lower(), month)

    if day.isdigit():
        day = day.zfill(2)
    if month.isdigit():
        month = month.zfill(2)
    return f'{year}-{month}-{day}'
def find_term(line):
    """Find durations such as ``3 years`` or ``twelve months`` in ``line``.

    The amount may be written as digits (``24``), as digits in parentheses
    (``(3)``), or as a lower-case English number word below one hundred
    (``five``, ``twenty four``, ``twenty-four``).  It must be followed by a
    single space and ``years`` or ``months``.

    Args:
        line: Text to scan.

    Returns:
        List of ``(amount, multi_digit_amount, unit)`` tuples, one per match.
        ``amount`` is the matched amount, with a hyphen inside a number word
        replaced by a space.  ``multi_digit_amount`` repeats ``amount`` when
        it is a bare number of two or more digits and is ``''`` otherwise.
        ``unit`` is ``'years'`` or ``'months'``.
    """
    units = 'one|two|three|four|five|six|seven|eight|nine'
    teens = ('ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|'
             'seventeen|eighteen|nineteen')
    tens = 'twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety'
    # "(?<!-)" stops "nine" from matching as the tail of an unrecognised
    # hyphenated compound.
    words = f'(?<!-)(?:(?:{tens})(?:[ -](?:{units}))?|{units}|{teens})'
    regex = (
        # Never start in the middle of a longer number or word, so that
        # "120 months" is not read as "20 months".
        r'(?<![0-9A-Za-z])'
        r'([0-9]|([0-9]{2,})|' + words + r'|\([0-9]+\))'
        r' (years|months)'
    )
    return [
        (amount.replace('-', ' '), multi_digit, unit)
        for amount, multi_digit, unit in re.findall(regex, line)
    ]
def clean_term(t):
    """Format the first match returned by ``find_term`` as ``<amount>_<unit>``.

    Parentheses are stripped from the amount.  An amount containing no digit
    is converted with ``text2int``.

    Args:
        t: List of match tuples from ``find_term``; only the first is used.

    Returns:
        A string such as ``'3_years'``, or ``[]`` when ``t`` is an empty list.
    """
    if t == []:
        return []

    fields = list(t[0])
    fields[0] = re.sub(r'\(|\)', '', fields[0])
    # Drop the middle field so that only [amount, unit] remains.
    del fields[1]

    amount = fields[0]
    if re.search(r'\d', amount) is None:
        amount = str(text2int(amount))
    return amount + '_' + fields[1]
def _extract_fields(line):
    """Build the output line for one input line.

    The result has the form
    ``[effective_date=<date> ]jurisdiction=<state>[ term=<term>]`` followed
    by a newline.  The date is only looked for when the input line contains
    the text ``effective_date``.
    """
    parts = []
    if 'effective_date' in line:
        found_dates = find_dates(line)
        if found_dates:
            parts.append('effective_date=' + str(clean_date(found_dates)))
    parts.append('jurisdiction=' + str(find_state(line)))
    # clean_term() returns [] when no term was found.
    term = clean_term(find_term(line))
    if term:
        parts.append('term=' + str(term))
    return ' '.join(parts) + '\n'


def _process_split(directory):
    """Read ``<directory>/in.tsv`` and write one line per input line to
    ``<directory>/out.tsv``."""
    # The input is read completely before the output file is created.
    with open(f'{directory}/in.tsv', 'r', encoding='utf8') as source:
        lines = source.readlines()
    with open(f'{directory}/out.tsv', 'wt') as target:
        target.writelines(_extract_fields(line) for line in lines)


# Produce predictions for every data split, in the original order.
for _split in ('dev-0', 'test-A', 'train'):
    _process_split(_split)