# kleister-nda/run.py (52 lines, 1.9 KiB, Python)

2022-05-05 16:31:55 +02:00
import re
2022-05-05 17:33:36 +02:00
import datefinder
2022-05-05 16:31:55 +02:00
# Names of the 50 US states, in alphabetical order, used as jurisdiction
# candidates. Entries are grouped by initial letter for readability.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
def count_strings(text_in, search_str):
    """Count case-insensitive, non-overlapping regex matches in a text.

    Args:
        text_in: Text to search.
        search_str: Regular-expression pattern to look for.

    Returns:
        The number of non-overlapping matches of ``search_str`` in
        ``text_in``, ignoring case.
    """
    # Flags must be given to re.compile(); the second positional argument of
    # Pattern.findall() is the start position, so passing re.IGNORECASE there
    # would skip the first characters and leave the search case-sensitive.
    pattern = re.compile(search_str, re.IGNORECASE)
    return len(pattern.findall(text_in))
def predict_state(text):
    """Guess which US state is mentioned most often in a text.

    Every name in the module-level ``states`` list is counted in ``text``
    ignoring case, and the name with the highest count is returned.

    Args:
        text: Text to search for state names.

    Returns:
        The most frequently mentioned state name, with spaces replaced by
        underscores. On a tie (including when no state occurs at all) the
        name that comes first in ``states`` is returned.
    """
    state_dict = {}
    for state in states:
        state_name = state.replace(" ", "_")
        # The flag goes to re.compile(); Pattern.findall()'s second positional
        # argument is a start offset, not a flags value.
        pattern = re.compile(state, re.IGNORECASE)
        # NOTE: this is a plain substring count, so e.g. "Virginia" is also
        # counted inside every "West Virginia".
        state_dict[state_name] = len(pattern.findall(text))
    return max(state_dict, key=state_dict.get)
2022-05-05 17:33:36 +02:00
def predict_date(text):
    """Return the first ``YYYY-MM-DD``-shaped date found in a text.

    Args:
        text: Text to search.

    Returns:
        The first substring made of four digits, a dash, two digits, a dash
        and two digits, or ``None`` when the text contains no such substring.
    """
    match = re.search(r'\d{4}-\d{2}-\d{2}', text)
    if match is None:
        return None
    # The pattern has no capture groups, so the matched text is group 0;
    # asking for group 1 raises IndexError.
    return match.group(0)
2022-05-05 16:31:55 +02:00
def get_jurisdiction(file_in, file_out):
    """Write one prediction line for every line of an input file.

    For each line read from ``file_in`` a line of the form
    ``effective_date=<date>, jurisdiction=<state>`` is written to
    ``file_out``, where ``<date>`` is the result of ``predict_date`` and
    ``<state>`` the result of ``predict_state`` for that input line.

    Args:
        file_in: Path of the UTF-8 text file to read.
        file_out: Path of the file to create or overwrite with predictions.
    """
    # Distinct handle names keep the path parameters intact instead of
    # rebinding them to file objects.
    with open(file_in, 'r', encoding='utf8') as source:
        lines = source.readlines()
    # The with-statement closes the handle, so no explicit close() is needed.
    with open(file_out, 'wt', encoding='utf8') as target:
        for line in lines:
            target.write("effective_date=" + str(predict_date(line)) + ', ')
            target.write("jurisdiction=" + str(predict_state(line)) + '\n')
# Produce predictions for each dataset split in turn, reporting every output
# file once it has been written.
for in_path, out_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    get_jurisdiction(in_path, out_path)
    print("Created " + out_path)