8.7 KiB
8.7 KiB
import regex as re
import pandas as pd
import us
from collections import Counter
# Column names assigned to every tab-separated input file.
columns_names = ['filename', 'params', 'text1', 'text2', 'text3', 'text4']

# Load the three splits in a fixed order: train, dev-0, test-A.
data_train, data_dev, data_test = (
    pd.read_csv(in_path, sep='\t', names=columns_names)
    for in_path in ('./train/in.tsv', './dev-0/in.tsv', './test-A/in.tsv')
)
# English month names mapped to their two-digit month numbers.
months = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12'
}


def transform_date_format(date):
    """Convert a tuple of regex groups describing a date to ``YYYY-MM-DD``.

    Args:
        date: ``None`` or a tuple of strings.  A 4-tuple is read as
            ``(full_match, month, day, year)`` where ``month`` is a month
            name or a number and ``year`` has two or four digits.  A 5-tuple
            is read as ``(full_match, ordinal_phrase, suffix, month_name,
            year)``, e.g. for "6th day of January, 2012".

    Returns:
        The formatted date, or ``""`` when ``date`` is ``None``, has an
        unsupported shape, or (5-tuple only) lacks a known month name or a
        day number.
    """
    if date is None:
        return ""

    if len(date) == 4:
        _, month, day, year = date
        # Month names are matched regardless of the case of the first
        # letter; anything that is not a known name (a numeric month) is
        # passed through unchanged.
        month = months.get(str(month).capitalize(), month)
        if len(year) == 2:
            # Two-digit years: 00-49 are read as 20xx, 50-99 as 19xx.
            century = "20" if int(year[0]) < 5 else "19"
            year = century + year
        elif len(year) != 4:
            return ""
        return f"{year}-{month}-{day}"

    if len(date) == 5:
        _, ordinal_phrase, _, month_name, year = date
        month = months.get(str(month_name).capitalize())
        # Take the whole leading number ("16th" -> "16"), not just its last
        # digit, and pad it so the day is always two digits wide.
        day = re.search(r'\d+', ordinal_phrase)
        if month is None or day is None:
            return ""
        return f"{year}-{month}-{day.group().zfill(2)}"

    return ""
def get_effective_date(text):
    """Return the regex groups of the first date found in *text*.

    Four layouts are tried in order; the first layout with any match wins
    and its first match is returned:

    1. ``04/18/01``                 -> ``(full, month, day, yy)``
    2. ``01/21/2016``               -> ``(full, month, day, yyyy)``
    3. ``February 28, 2011``        -> ``(full, month_name, day, yyyy)``
    4. ``6th day of January, 2012`` -> ``(full, ordinal_phrase, suffix,
       month_name, yyyy)``

    Returns:
        A tuple of strings as described above, or ``None`` when no layout
        matches.
    """
    month_num = r'0[1-9]|1[0-2]'
    day_num = r'0[1-9]|[12][0-9]|3[01]'
    year4 = r'(?:19|20)[0-9][0-9]'
    month_name = (
        r'[jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|[jJ]uly'
        r'|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember'
    )

    patterns = (
        # "04/18/01".  The digit guards on both sides stop this layout from
        # matching the first eight characters of "01/21/2016".
        rf'(?<!\d)(({month_num})/({day_num})/([0-9][0-9]))(?!\d)',
        # "01/21/2016"
        rf'(?<!\d)(({month_num})/({day_num})/({year4}))(?!\d)',
        # "January, 13 2021", "February 28, 2011"
        rf'(({month_name})[,\s]+({day_num})[,\s]+({year4}))',
        # "6th day of January, 2012".
        # NOTE: the day is [1-9]+ with a th/rd/nd suffix, so "10th", "20th",
        # "30th" and "1st"/"21st"/"31st" are not recognised by this layout.
        rf'(([1-9]+((?:th|rd|nd)\sday\sof\s))({month_name}),\s({year4}))',
    )

    for pattern in patterns:
        found = re.findall(pattern, text)
        if found:
            return found[0]
    return None
def get_terms(text):
    """Collect the numeric text that precedes "years" or "months" in *text*.

    Each hit is a run of digits, whitespace, periods, commas and parentheses
    that follows a whitespace character and is immediately followed by the
    word.  Hits before "years" take precedence over hits before "months".

    Returns:
        A non-empty list of matched strings, or ``None`` when neither word
        has such a prefix.
    """
    year_spans = re.findall(r'(?<=\s)[0-9.\s.,\(\)]+(?=years)', text)
    month_spans = re.findall(r'(?<=\s)[0-9.\s.,\(\)]+(?=months)', text)
    # An empty list is falsy, so this falls through years -> months -> None.
    return year_spans or month_spans or None
def get_parties(text):
    """Extract the first party named after the word "between" in *text*.

    The run of Latin letters, whitespace, periods and commas that follows
    the first usable "between" is taken as the candidate.  Commas are
    dropped, surrounding whitespace is stripped, spaces become underscores,
    and the result is cut right after the earliest company-form suffix
    (``Inc.``, ``INC.``, ``inc.``, ``LLC`` or ``llc``).

    Returns:
        The party name with underscores, or ``""`` when there is no
        candidate or the candidate contains none of the suffixes.
    """
    suffixes = ("Inc.", "INC.", "inc.", "LLC", "llc")

    # NOTE: \p{Latin} is a Unicode script class of the third-party `regex`
    # module, which this file imports under the name `re`.
    # No trailing lookahead is used: one that accepts any character only
    # has an effect at the very end of the text, where it would chop the
    # final character off the candidate (and with it a closing "Inc.").
    candidates = re.findall(r'(?<=between)[\p{Latin}\s.,]+', text)
    if not candidates:
        return ""

    prepared = candidates[0].replace(",", "").strip().replace(" ", "_")

    # Cut after whichever suffix ends first, so that
    # "Foo LLC and Bar Inc." yields "Foo_LLC" rather than both parties.
    cut_points = [
        prepared.index(suffix) + len(suffix)
        for suffix in suffixes
        if suffix in prepared
    ]
    if not cut_points:
        return ""
    return prepared[:min(cut_points)]
def get_jurisdiction(text):
    """Return the US state named as jurisdiction in *text*, or ``""``.

    Two passes are made over the names obtained from ``us.states.STATES``
    (``str(state)`` is used as the name to search for):

    1. names appearing, with exact case, in the run of word characters and
       whitespace that follows "laws of the";
    2. if pass 1 finds nothing, names appearing anywhere in the text,
       compared in lower case.

    Names are matched as whole words and a longer name consumes a shorter
    one it contains, so "West Virginia" is never reported as "Virginia" and
    "Indianapolis" does not count as "Indiana".  When several states
    qualify, the one listed first in ``us.states.STATES`` is returned.

    Returns:
        The state name with spaces replaced by underscores, or ``""``.
    """
    names = [str(state) for state in us.states.STATES]

    def whole_name_pattern(candidates):
        # One alternation over all names, longest first, anchored on word
        # boundaries.
        ordered = sorted(candidates, key=len, reverse=True)
        return re.compile(
            r"\b(?:" + "|".join(re.escape(name) for name in ordered) + r")\b"
        )

    # Pass 1: states mentioned right after "laws of the ...".
    exact = whole_name_pattern(names)
    found = set()
    for run in re.finditer(r"(?<=laws\sof\sthe)[\w\s]*", text):
        found.update(exact.findall(run.group()))

    # Pass 2: any mention at all, ignoring case.
    if not found:
        lowered = whole_name_pattern([name.lower() for name in names])
        hits = set(lowered.findall(text.lower()))
        found = {name for name in names if name.lower() in hits}

    for name in names:
        if name in found:
            return name.replace(" ", "_")
    return ""
def process_parameters(params, text):
    """Build the ``key=value`` output fragment for the requested keys.

    Args:
        params: Space-separated key names.  Only ``effective_date``,
            ``jurisdiction`` and ``party`` produce output; any other key is
            skipped.
        text: Document text handed to the extractors.

    Returns:
        The concatenation of ``" key=value"`` for each recognised key, in
        the order given (so a non-empty result starts with a space), or
        ``""`` when no key is recognised.
    """
    # Extractors are wrapped in lambdas so each one is only looked up and
    # run when its key is actually requested.
    extractors = {
        "effective_date": lambda: transform_date_format(get_effective_date(text)),
        "jurisdiction": lambda: get_jurisdiction(text),
        "party": lambda: get_parties(text),
    }

    fragments = []
    for key in params.split(" "):
        extract = extractors.get(key)
        if extract is None:
            continue
        fragments.append(" " + key + "=" + str(extract()))
    return "".join(fragments)
def _write_predictions(frame, out_path):
    """Write one line of extracted ``key=value`` pairs per row of *frame*.

    For every row, the keys listed in the ``params`` column are extracted
    from the ``text1`` column and written to *out_path*, one row per line.
    """
    # The encoding is fixed so that non-ASCII characters in extracted values
    # are written the same way on every platform instead of depending on
    # the locale's default encoding.
    with open(out_path, 'w', encoding='utf-8') as writer:
        for _, row in frame.iterrows():
            writer.write(process_parameters(row['params'], row['text1']) + "\n")


# Produce the output file for each split, in the same order as before.
_write_predictions(data_train, 'train/out.tsv')
_write_predictions(data_dev, 'dev-0/out.tsv')
_write_predictions(data_test, 'test-A/out.tsv')