219 lines
7.2 KiB
Python
219 lines
7.2 KiB
Python
import re
|
|
|
|
# Names of the 50 U.S. states, in alphabetical order. find_state() searches
# contract text for these names and returns the winner verbatim (spaces
# replaced by underscores), so every entry must be spelled exactly.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
|
|
|
|
# Lower-case English month name -> two-digit month number (as a string).
months = {
    name: f'{number:02d}'
    for number, name in enumerate(
        (
            'january', 'february', 'march', 'april', 'may', 'june',
            'july', 'august', 'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}
|
|
|
|
def _build_numwords():
    """Return the default ``word -> (scale, increment)`` table for text2int()."""
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    # The two leading placeholders make ``idx * 10`` line up with the word.
    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]

    table = {"and": (1, 0)}
    for idx, word in enumerate(units):
        table[word] = (1, idx)
    for idx, word in enumerate(tens):
        if word:
            table[word] = (1, idx * 10)
    for idx, word in enumerate(scales):
        # "hundred" (idx 0) is 10**2; every later scale is 10**(3 * idx).
        table[word] = (10 ** (idx * 3 or 2), 0)
    return table


# Built once at import time instead of being cached in a mutable default.
_DEFAULT_NUMWORDS = _build_numwords()


def text2int(textnum, numwords=None):
    """Convert a number written in English words to an ``int``.

    Args:
        textnum: Whitespace-separated number words, e.g. ``"two hundred and
            five"``. Capitalised and hyphenated words ("Twenty-One") are
            accepted. An empty string yields 0.
        numwords: Optional mapping of word -> ``(scale, increment)``. When it
            is omitted or empty the built-in English table is used. The
            mapping is never modified.

    Returns:
        The integer value of the phrase.

    Raises:
        ValueError: If a word is not present in the table.
    """
    table = numwords if numwords else _DEFAULT_NUMWORDS

    current = result = 0
    for token in textnum.split():
        # Try the token verbatim first so caller-supplied tables behave as
        # before; only then fall back to lower-casing and hyphen splitting.
        words = [token] if token in table else token.lower().split("-")
        for word in words:
            if word not in table:
                raise ValueError("Illegal word: " + token)

            scale, increment = table[word]
            current = current * scale + increment
            # Scales above "hundred" close a group: bank it and start afresh.
            if scale > 100:
                result += current
                current = 0

    return result + current
|
|
|
|
def find_state(line, candidates=None):
    """Return the state named most often in *line*, with spaces as underscores.

    Names are matched as whole words and case-insensitively. Longer names
    are tried first, so "West Virginia" is not also counted as "Virginia".
    When several names are equally frequent, the one listed first in the
    candidate list wins.

    Args:
        line: Text to search.
        candidates: Optional list of names to look for; defaults to the
            module-level ``states`` list.

    Returns:
        The winning name with spaces replaced by underscores, or
        ``'Alabama'`` when no candidate occurs in *line*.
    """
    names = states if candidates is None else candidates
    if not names:
        return 'Alabama'

    longest_first = sorted(names, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(name) for name in longest_first) + r')\b',
        re.IGNORECASE,
    )

    counts = {}
    for match in pattern.finditer(line):
        key = match.group().lower()
        counts[key] = counts.get(key, 0) + 1

    # Walk the names in their listed order so max() breaks ties in favour
    # of the earliest entry.
    mentioned = [name for name in names if name.lower() in counts]
    if not mentioned:
        return 'Alabama'
    best = max(mentioned, key=lambda name: counts[name.lower()])
    return best.replace(' ', '_')
|
|
|
|
def position_and_date(regex, text):
    """Return the full matched text of every occurrence of *regex* in *text*.

    Matches are listed in the order they appear; the list is empty when the
    pattern does not occur.
    """
    matched = []
    for hit in re.finditer(regex, text):
        matched.append(hit.group())
    return matched
|
|
|
|
# Recognised date formats:
#   February 10, 2017
#   6th day of January, 2012
#   10/02/17
def find_dates(text):
    """Return every date-like substring of *text*.

    The three formats listed above are searched one after another, so the
    result is grouped by format (month-name dates first, then "Nth day of"
    dates, then slash dates) and ordered by position within each group.

    Args:
        text: Text to search.

    Returns:
        A list of the matched substrings; empty when nothing matches.
    """
    # One- or two-digit day of month, so "6th" and "March 3" are recognised.
    day = r'(?:3[0-1]|[0-2]?[0-9])'
    dates_regex = (
        r'[A-Z][A-Za-z]+ ' + day + r', [0-2][0-9]{3}',
        r'\b' + day + r'(?:th|rd|nd|st) day of [A-Za-z]+, [0-2][0-9]{3}',
        # The lookahead stops "10/02/2017" from being cut down to "10/02/20".
        r'(?:[0-2][0-9]|3[0-1])/(?:[0-2][0-9]|3[0-1])/[0-9][0-9](?![0-9])',
    )

    res = []
    for pattern in dates_regex:
        res.extend(match.group() for match in re.finditer(pattern, text))
    return res
|
|
|
|
def clean_date(date, month_numbers=None):
    """Normalise a date string to ``YEAR-MONTH-DAY`` form.

    Accepted inputs are "February 10, 2017", "6th day of January, 2012"
    and "10/02/17".

    Args:
        date: Either a date string or a non-empty list of them. Only the
            first element of a list is used.
        month_numbers: Optional mapping of lower-case month name to two-digit
            month number; defaults to the module-level ``months`` table.

    Returns:
        The date as ``'YYYY-MM-DD'``. Day and month are zero-padded to two
        digits. A token that is not a known month name is passed through
        unchanged.
    """
    table = months if month_numbers is None else month_numbers
    raw = date if isinstance(date, str) else date[0]

    if '/' in raw:
        # NOTE(review): slash dates are read as day/month/year, exactly as
        # before; confirm the corpus does not use month/day/year.
        parts = raw.split('/')
        if len(parts[2]) == 2:
            # Two-digit years are assumed to be in the 2000s.
            parts[2] = '20' + parts[2]
    else:
        text = raw.replace(',', '')
        text = re.sub(r'(th|rd|nd|st) day of', '', text)
        parts = text.split()
        # "February 10 2017" -> put the day first, as in "10 February 2017".
        if parts[0].lower() in table:
            parts[0], parts[1] = parts[1], parts[0]
        parts[1] = table.get(parts[1].lower(), parts[1])

    day, month, year = parts[0], parts[1], parts[2]
    return f'{year}-{month.zfill(2)}-{day.zfill(2)}'
|
|
|
|
def find_term(line):
    """Find durations such as "5 years", "12 months" or "(3) years" in *line*.

    Returns:
        One 3-tuple per match: ``(amount, two_digit_amount, unit)``. The
        amount may be a digit, a two-digit number, "three", "nine" or a
        parenthesised digit. The middle item is filled only when the amount
        has two digits, otherwise it is ``''``. The unit is "years" or
        "months".
    """
    pattern = r'([0-9]|([0-2][0-9]|3[0-1])|three|nine|\([0-9]\)) (years|months)'
    return [hit.groups(default='') for hit in re.finditer(pattern, line)]
|
|
|
|
def clean_term(t):
    """Format the first match returned by find_term() as ``'<amount>_<unit>'``.

    Parentheses around the amount are removed. An amount that contains no
    digit is converted from words with text2int().

    Returns:
        A string such as ``'3_years'``, or an empty list when *t* is empty.
    """
    if t == []:
        return []

    first_match = t[0]
    amount = first_match[0].replace('(', '').replace(')', '')
    unit = first_match[2]
    if re.search(r'\d', amount) is None:
        amount = str(text2int(amount))
    return '_'.join((amount, unit))
|
|
|
|
def find_part(l):
    """Extract company names from *l* and normalise them for output.

    A name is up to three capitalised words followed by one of the suffixes
    inc, holding, corporation, corp or llc (any letter case).

    Each name is cleaned word by word:
      * commas are dropped;
      * the words "and", "of" and "us" are removed;
      * "inc" and "ltd" get a trailing period;
      * the result is title-cased and joined with underscores, so "LLC"
        comes out as "Llc", as it did before.

    Only whole words are rewritten, so names that merely contain one of
    those letter sequences ("Nexus", "Lincoln", "Brand") are left intact.

    Args:
        l: Text to search.

    Returns:
        The normalised names in order of appearance; duplicates are kept.
    """
    regex = r'\b[A-Z]\w+(?:[ -]+?[A-Z]\w+?){0,2}[,\s]+(?i:inc|holding|corporation|corp|llc)\b'
    dropped_words = {'and', 'of', 'us'}
    dotted_words = {'inc': 'inc.', 'ltd': 'ltd.'}

    parties = []
    for name in re.findall(regex, l):
        kept = []
        for word in name.replace(',', ' ').split():
            key = word.lower()
            if key in dropped_words:
                continue
            kept.append(dotted_words.get(key, word))
        parties.append('_'.join(kept).title())
    return parties
|
|
|
|
def _extract_record(line):
    """Return the output record for one input *line*, without a newline.

    Fields are space-separated and appear in this order: ``effective_date``
    (only when the line contains the text "effective_date" and a date is
    found), ``jurisdiction``, one ``party`` per distinct company name, and
    ``term`` (only when a duration is found).
    """
    fields = []

    if 'effective_date' in line:
        found_dates = find_dates(line)
        if found_dates:
            fields.append('effective_date=' + str(clean_date(found_dates)))

    fields.append('jurisdiction=' + str(find_state(line)))

    # dict.fromkeys drops repeated names while keeping first-seen order.
    # Parties always go on the record's own line: the original dev-0 loop
    # appended them after the newline (pushing them onto the next record)
    # and the test-A/train loops dropped them when no term was found.
    for party in dict.fromkeys(find_part(line)):
        fields.append('party=' + str(party))

    term = clean_term(find_term(line))
    if term != []:
        fields.append('term=' + str(term))

    return ' '.join(fields)


def _convert(in_path, out_path):
    """Write one extracted record to *out_path* per line of *in_path*."""
    with open(in_path, 'r', encoding='utf8') as source:
        lines = source.readlines()

    # The output is written as UTF-8 to match the input encoding instead
    # of relying on the platform default.
    with open(out_path, 'wt', encoding='utf8') as target:
        for line in lines:
            target.write(_extract_record(line) + '\n')


for _split in ('dev-0', 'test-A', 'train'):
    _convert(_split + '/in.tsv', _split + '/out.tsv')