2022-05-05 16:31:55 +02:00
|
|
|
import re
|
2022-05-05 17:33:36 +02:00
|
|
|
import datefinder
|
2022-05-05 16:31:55 +02:00
|
|
|
|
|
|
|
# Names of the 50 U.S. states, in alphabetical order (one row per initial
# letter). Multi-word names are spelled with spaces.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
|
|
|
|
|
|
|
|
|
|
|
|
def count_strings(text_in, search_str):
    """Count case-insensitive, non-overlapping regex matches in a text.

    Args:
        text_in: The text to search.
        search_str: A regular-expression pattern. It is compiled as given,
            so regex metacharacters keep their special meaning.

    Returns:
        The number of non-overlapping matches of ``search_str`` found in
        ``text_in``.
    """
    # Flags must be given to re.compile(). The second positional argument of
    # Pattern.findall() is the start position, so passing re.IGNORECASE
    # (numeric value 2) there would silently skip the first two characters
    # and leave the match case-sensitive.
    pattern = re.compile(search_str, re.IGNORECASE)
    return len(pattern.findall(text_in))
|
|
|
|
|
|
|
|
|
|
|
|
def predict_state(text):
    """Return the state from ``states`` that ``text`` mentions most often.

    Mentions are matched case-insensitively and only as whole words. Each
    mention is attributed to exactly one state, so "Arkansas" is not also
    counted as "Kansas" and "West Virginia" is not also counted as
    "Virginia".

    Args:
        text: The text to scan for state names.

    Returns:
        The winning state name with spaces replaced by underscores
        (e.g. ``"New_York"``). On a tie, or when no state is mentioned at
        all, the candidate that comes first in ``states`` is returned.
    """
    # One alternation over all names: the regex engine consumes the text
    # left to right, so a span matched as "West Virginia" is never rescanned
    # for "Virginia". Longest names go first so that a longer name would win
    # if one name were ever a prefix of another.
    names = sorted(states, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:'
        + '|'.join('(' + re.escape(name) + ')' for name in names)
        + r')\b',
        # The flag belongs here; passing it to findall()/finditer() would be
        # interpreted as a start position instead.
        re.IGNORECASE,
    )

    # Pre-fill in ``states`` order: max() returns the first key with the
    # highest count, so this ordering is what breaks ties.
    state_dict = {state.replace(" ", "_"): 0 for state in states}
    for match in pattern.finditer(text):
        # Exactly one capturing group takes part in each match; its index
        # identifies which name was found, independent of the text's casing.
        matched_name = names[match.lastindex - 1]
        state_dict[matched_name.replace(" ", "_")] += 1

    return max(state_dict, key=state_dict.get)
|
|
|
|
|
|
|
|
|
2022-05-05 17:33:36 +02:00
|
|
|
def predict_date(text):
    """Return the first ``YYYY-MM-DD``-shaped substring of ``text``.

    Only the digit layout (four digits, two digits, two digits, separated
    by hyphens) is checked; the value is not validated as a calendar date.

    Args:
        text: The text to search.

    Returns:
        The matched substring, or ``None`` when ``text`` contains no such
        substring.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', text)
    if match is None:
        return None
    # The pattern has no capturing groups, so group(1) would raise
    # IndexError; group(0) is the whole match.
    return match.group(0)
|
|
|
|
|
|
|
|
|
2022-05-05 16:31:55 +02:00
|
|
|
def get_jurisdiction(file_in, file_out):
    """Write one prediction line to ``file_out`` per line of ``file_in``.

    Each output line has the form
    ``effective_date=<date>, jurisdiction=<state>`` where the values come
    from ``predict_date`` and ``predict_state`` applied to the
    corresponding input line.

    Args:
        file_in: Path of the UTF-8 text file to read.
        file_out: Path of the file to create or overwrite.
    """
    # Read everything before opening the output, so the input is fully
    # consumed before the output file is created or truncated.
    with open(file_in, 'r', encoding='utf8') as source:
        lines = source.readlines()

    # Write with the same encoding the input is read with, instead of the
    # platform default. The ``with`` block closes the file, so no explicit
    # close() is needed.
    with open(file_out, 'wt', encoding='utf8') as sink:
        for line in lines:
            sink.write("effective_date=" + str(predict_date(line)) + ', ')
            sink.write("jurisdiction=" + str(predict_state(line)) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
# Generate the prediction file for each data split, in the same order as
# before, reporting every output file once it has been written.
for src_path, dst_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    get_jurisdiction(src_path, dst_path)
    print("Created " + dst_path)
|
|
|
|
|