s444356
This commit is contained in:
parent
3bbf080d37
commit
9d30e595f0
183
run.py
Normal file
183
run.py
Normal file
@ -0,0 +1,183 @@
|
||||
import re
|
||||
|
||||
# Names of the 50 US states, exactly as they are looked up in the input text.
# The order matters to find_state(): ties are resolved in favour of the
# earlier entry.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# Lower-case English month name -> two-digit month number.
months = {
    'january': '01', 'february': '02', 'march': '03', 'april': '04',
    'may': '05', 'june': '06', 'july': '07', 'august': '08',
    'september': '09', 'october': '10', 'november': '11', 'december': '12',
}
|
||||
|
||||
def _fill_numwords(table):
    """Fill *table* with the (scale, increment) pair of every known number word."""
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]

    # "and" is a filler word: it neither scales nor adds anything.
    table["and"] = (1, 0)
    for idx, word in enumerate(units):
        table[word] = (1, idx)
    for idx, word in enumerate(tens):
        if word:  # the two leading placeholders only exist to align the index
            table[word] = (1, idx * 10)
    for idx, word in enumerate(scales):
        # "hundred" -> 10**2, then "thousand" -> 10**3, "million" -> 10**6, ...
        table[word] = (10 ** (idx * 3 or 2), 0)
    return table


# Built once at import time and shared by every call that passes no table.
_NUMWORDS = _fill_numwords({})


def text2int(textnum, numwords=None):
    """Convert a number written in English words to an int.

    Words are separated by whitespace or hyphens ("twenty-one"); a word that
    is not found as written is looked up again in lower case.

    Args:
        textnum: the number in words, e.g. "one hundred and twenty three".
        numwords: optional word -> (scale, increment) table.  When omitted the
            built-in English table is used; when an empty dict is passed it is
            filled with the built-in table (and can be reused by the caller).

    Returns:
        The integer value; 0 for an empty string.

    Raises:
        ValueError: if a word is not present in the table.
    """
    if numwords is None:
        numwords = _NUMWORDS
    elif not numwords:
        _fill_numwords(numwords)

    current = result = 0
    for word in textnum.replace("-", " ").split():
        key = word if word in numwords else word.lower()
        if key not in numwords:
            raise ValueError("Illegal word: " + word)

        scale, increment = numwords[key]
        current = current * scale + increment
        # Anything above "hundred" closes the current group of three digits.
        if scale > 100:
            result += current
            current = 0

    return result + current
|
||||
|
||||
def find_state(line):
    """Guess which US state *line* refers to.

    Every name in the module-level ``states`` list is counted in *line*
    (exact spelling and case).  Names are counted longest first and each hit
    is blanked out afterwards, so an occurrence of "West Virginia" is not
    counted a second time as "Virginia".  A state that only appears with
    different letter case stays a candidate with a count of zero.

    Args:
        line: the text to search.

    Returns:
        The most frequent state name with spaces replaced by underscores;
        ties go to the state listed first in ``states``.  'Alabama' is
        returned when no state is found at all.
    """
    lowered = line.lower()
    remaining = line
    counts = {}
    for state in sorted(states, key=len, reverse=True):
        hits = remaining.count(state)
        if hits:
            counts[state] = hits
            # Remove the hits so shorter names contained in this one do not
            # match the same text again.
            remaining = remaining.replace(state, ' ')
        elif state.lower() in lowered:
            counts[state] = 0

    if not counts:
        return 'Alabama'

    # Walk the candidates in list order so max() breaks ties the same way
    # regardless of the counting order used above.
    best = max((state for state in states if state in counts), key=counts.get)
    return best.replace(' ', '_')
|
||||
|
||||
def position_and_date(regex, text):
    """Return the matched text of every non-overlapping hit of *regex* in *text*.

    The whole match is returned for each hit (capture groups are ignored),
    in the order the hits occur in *text*.
    """
    found = []
    for hit in re.finditer(regex, text):
        found.append(hit.group())
    return found
|
||||
|
||||
# Date spellings recognised by find_dates():
#   February 10, 2017
#   6th day of January, 2012
#   10/02/17 or 10/02/2017
def find_dates(text):
    """Return every date-like substring of *text*.

    The result is grouped by spelling, not by position: first all
    "Month DD, YYYY" hits, then all "DDth day of Month, YYYY" hits, then all
    slash-separated hits.  Days may be written with one or two digits.

    Args:
        text: the text to search.

    Returns:
        A list with the matched substrings (empty when nothing matches).
    """
    day = r'(?:3[01]|[0-2]?[0-9])'
    dates_regex = [
        r'[A-Z][A-Za-z]+ ' + day + r', [0-2][0-9]{3}',
        day + r'(?:th|rd|nd|st) day of [A-Za-z]+, [0-2][0-9]{3}',
        # The look-arounds keep the match from starting or ending in the
        # middle of a longer number, e.g. cutting "2017" down to "20".
        r'(?<![0-9/])' + day + r'/' + day + r'/(?:[0-9]{4}|[0-9]{2})(?![0-9/])',
    ]
    res = []
    for regex in dates_regex:
        res += position_and_date(regex, text)
    return res


def clean_date(date):
    """Normalise a date found by find_dates() to 'YYYY-MM-DD'.

    Args:
        date: the list returned by find_dates() (only its first entry is
            used) or a single date string.

    Returns:
        The date as 'YYYY-MM-DD'.  For slash-separated dates the first
        number is used as the day and the second as the month; a two-digit
        year is prefixed with '20'.  Month names are replaced through the
        module-level ``months`` table.

    Raises:
        ValueError: if the input does not contain three date components.
    """
    if isinstance(date, str):
        raw = date
    else:
        raw = date[0] if date else ''

    if '/' in raw:
        parts = raw.split('/')
        if len(parts) != 3:
            raise ValueError('Unrecognised date: ' + repr(raw))
        day, month, year = parts
        if len(year) == 2:
            year = '20' + year
        return f'{year}-{month.zfill(2)}-{day.zfill(2)}'

    # "6th day of January, 2012" -> "6 January 2012"
    text = re.sub(r'(th|rd|nd|st) day of', '', raw.replace(',', ''))
    parts = text.split()
    if len(parts) < 3:
        raise ValueError('Unrecognised date: ' + repr(raw))

    # "February 10 2017" -> day first, month second.
    if any(name in parts[0].lower() for name in months):
        parts[0], parts[1] = parts[1], parts[0].lower()

    for name, number in months.items():
        if name in parts[1].lower():
            parts[1] = parts[1].lower().replace(name, number)

    day, month, year = parts[0].zfill(2), parts[1], parts[2]
    return f'{year}-{month}-{day}'
|
||||
|
||||
def find_term(line):
    """Find durations such as "5 years", "36 months" or "(3) years" in *line*.

    The amount may be a number of any length, a number in parentheses, or
    one of the words "three" / "nine".  A number is only matched from its
    first digit, so "36 months" is reported as 36 and not as 6.

    Args:
        line: the text to search.

    Returns:
        A list of ``(amount, multi_digit_amount, unit)`` tuples as produced
        by ``re.findall``: *amount* is the text matched for the quantity,
        *multi_digit_amount* repeats it when it is a bare number of two or
        more digits (otherwise it is ''), and *unit* is 'years' or 'months'.
    """
    regex = (
        r'(?<![0-9])'  # never start in the middle of a longer number
        r'([0-9]|([0-9]{2,})|three|nine|\([0-9]+\))'
        r' (years|months)'
    )
    return re.findall(regex, line)
|
||||
|
||||
def clean_term(t):
    """Turn the first match returned by find_term() into 'amount_unit'.

    Parentheses around the amount are dropped and an amount without any
    digit is converted with text2int().  An empty list is returned when
    *t* is an empty list.
    """
    if t == []:
        return []

    first_match = t[0]
    amount = first_match[0].replace('(', '').replace(')', '')
    # Position 1 only repeats the amount, so the unit sits at position 2.
    unit = first_match[2]

    if re.search(r'\d', amount) is None:
        amount = str(text2int(amount))

    return amount + '_' + unit
|
||||
|
||||
def _build_row(line):
    """Build the output row (ending in a newline) for one input line."""
    fields = []
    # The effective date is only looked for when the line mentions the key.
    if 'effective_date' in line:
        dates = find_dates(line)
        if dates:
            fields.append('effective_date=' + str(clean_date(dates)))

    fields.append('jurisdiction=' + str(find_state(line)))

    term = clean_term(find_term(line))
    if term != []:
        fields.append('term=' + str(term))

    return ' '.join(fields) + '\n'


def _process_split(directory):
    """Read ``<directory>/in.tsv`` and write one row per line to ``<directory>/out.tsv``."""
    with open(directory + '/in.tsv', 'r', encoding='utf8') as source:
        lines = source.readlines()

    with open(directory + '/out.tsv', 'wt', encoding='utf8') as target:
        target.writelines(_build_row(line) for line in lines)


# Runs at import time, one data split after the other.
for _split in ('dev-0', 'test-A', 'train'):
    _process_split(_split)
|
Loading…
Reference in New Issue
Block a user