52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
|
import re
|
||
|
|
||
|
# Names of the 50 U.S. states, in alphabetical order.  The order matters to
# callers that break ties by position in this list.
states = [
    'Alabama',
    'Alaska',
    'Arizona',
    'Arkansas',
    'California',
    'Colorado',
    'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho',
    'Illinois',
    'Indiana',
    'Iowa',
    'Kansas',
    'Kentucky',
    'Louisiana',
    'Maine',
    'Maryland',
    'Massachusetts',
    'Michigan',
    'Minnesota',
    'Mississippi',
    'Missouri',
    'Montana',
    'Nebraska',
    'Nevada',
    'New Hampshire',
    'New Jersey',
    'New Mexico',
    'New York',
    'North Carolina',
    'North Dakota',
    'Ohio',
    'Oklahoma',
    'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina',
    'South Dakota',
    'Tennessee',
    'Texas',
    'Utah',
    'Vermont',
    'Virginia',
    'Washington',
    'West Virginia',
    'Wisconsin',
    'Wyoming',
]
|
||
|
|
||
|
|
||
|
def count_strings(input, str_):
    """Count case-insensitive, non-overlapping matches of ``str_`` in ``input``.

    Args:
        input: Text to search.
        str_: Regular-expression pattern to look for.  It is compiled as-is
            (not escaped), so regex metacharacters keep their meaning.

    Returns:
        The number of non-overlapping matches found anywhere in ``input``.
    """
    # The flag has to be given to re.compile(): the second positional
    # argument of Pattern.findall() is the start offset ``pos``, so passing
    # re.IGNORECASE there skipped the first two characters of the text and
    # left the matching case-sensitive.
    pattern = re.compile(str_, re.IGNORECASE)
    return len(pattern.findall(input))
|
||
|
|
||
|
def predict_state(text):
    """Predict the state of jurisdiction as the state named most often in ``text``.

    State names are matched case-insensitively and only as whole words, and
    every mention is credited to exactly one state.  This keeps
    "West Virginia" from also counting as "Virginia", and "Arkansas" from
    also counting as "Kansas".

    Args:
        text: The document text to scan.

    Returns:
        The winning state name with spaces replaced by underscores
        (e.g. ``"New_York"``).  Ties go to the state that appears first in
        ``states``; if no state is mentioned at all, every count is zero and
        the first entry of ``states`` is returned.
    """
    # Longest names first, so a multi-word name is always preferred over a
    # shorter name it contains if both could start at the same position.
    by_length = sorted(states, key=len, reverse=True)
    # re.ASCII keeps case folding ASCII-only, so every matched string
    # lower-cases to exactly one of the keys in ``label_for`` below.
    matcher = re.compile(
        r"\b(?:" + "|".join(re.escape(name) for name in by_length) + r")\b",
        re.IGNORECASE | re.ASCII,
    )

    label_for = {name.lower(): name.replace(" ", "_") for name in states}
    # Seed the tally in list order: max() returns the first maximal key, so
    # insertion order is what decides ties.
    state_dict = {name.replace(" ", "_"): 0 for name in states}

    for mention in matcher.findall(text):
        state_dict[label_for[mention.lower()]] += 1

    return max(state_dict, key=state_dict.get)
|
||
|
|
||
|
|
||
|
|
||
|
# For every dataset split, read ``in.tsv`` line by line and write one
# "jurisdiction=<State>" prediction per input line to ``out.tsv`` in the same
# directory.
# NOTE(review): this overwrites train/out.tsv as well - confirm that file is
# not supposed to hold reference labels.
for split_dir in ('dev-0', 'train', 'test-A'):
    # Read the whole input before opening the output, so a failure while
    # reading never leaves a partially written out.tsv behind.
    with open(split_dir + '/in.tsv', 'r', encoding='utf8') as f:
        rows = f.readlines()

    # The with-statement closes the file on exit; no explicit close() needed.
    with open(split_dir + '/out.tsv', 'wt') as f:
        for x in rows:
            f.write("jurisdiction=" + str(predict_state(x)) + '\n')
|