38 lines
1.4 KiB
Python
38 lines
1.4 KiB
Python
import re
|
|
|
|
# Names of the 50 U.S. states in alphabetical order, one initial letter per
# row. The order is significant: predict_state() iterates this list and
# resolves ties in favour of the entry that appears first.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
|
|
|
|
|
|
def count_strings(text_in, search_str):
    """Count the non-overlapping, case-insensitive matches of a pattern.

    Args:
        text_in: The text to scan.
        search_str: A regular-expression pattern; plain words such as state
            names match themselves literally.

    Returns:
        The number of matches ``findall`` reports over the whole of
        ``text_in``.
    """
    # The flag has to be given to re.compile(): the second positional
    # argument of Pattern.findall() is the start offset ``pos``, so passing
    # re.IGNORECASE there skipped the first two characters of the text and
    # left the search case-sensitive.
    pattern = re.compile(search_str, re.IGNORECASE)
    return len(pattern.findall(text_in))
|
|
|
|
|
|
def predict_state(text):
    """Return the state whose name scores the highest count in ``text``.

    Every entry of the module-level ``states`` list is scored with
    ``count_strings``. The returned value is the winning name with spaces
    replaced by underscores (e.g. ``New_York``).

    Ties go to the name that appears earliest in ``states``; when every
    score is equal (for instance all zero) that is the first list entry.
    """
    hits_by_state = {
        name.replace(" ", "_"): count_strings(text, name)
        for name in states
    }
    # max() keeps the first maximal key, and the dict preserves list order.
    return max(hits_by_state, key=hits_by_state.get)
|
|
|
|
|
|
def get_jurisdiction(file_in, file_out):
    """Write one predicted-jurisdiction row for every line of an input file.

    Each line read from ``file_in`` is passed to ``predict_state`` and the
    result is written to ``file_out`` as ``jurisdiction=<state>`` followed by
    a newline.

    Args:
        file_in: Path of the UTF-8 text file to read.
        file_out: Path of the text file to create or overwrite.
    """
    # Build every row before the output file is opened: the input is fully
    # consumed first, and a failure while predicting leaves no partial output.
    with open(file_in, 'r', encoding='utf8') as source:
        rows = [
            "jurisdiction=" + str(predict_state(line)) + '\n'
            for line in source
        ]

    # The context manager closes the file, so no explicit close() is needed.
    with open(file_out, 'wt', encoding='utf8') as target:
        target.writelines(rows)
|
|
|
|
|
|
# Produce predictions for each dataset split: (input path, output path).
for source_path, target_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    get_jurisdiction(source_path, target_path)
|