2022-05-03 23:59:27 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# coding: utf-8
|
|
|
|
|
2022-05-04 00:23:23 +02:00
|
|
|
# In[26]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
2022-05-04 00:44:18 +02:00
|
|
|
# In[41]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Names of the 50 U.S. states in alphabetical order, grouped by initial letter.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
|
|
|
|
|
|
|
|
|
2022-05-04 00:44:18 +02:00
|
|
|
# In[42]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
def counter(text_in, query):
    """Count case-insensitive occurrences of ``query`` in ``text_in``.

    Args:
        text_in: The text to search, as a single string.
        query: A regular-expression pattern. To search for a literal phrase
            that contains regex metacharacters, escape it with ``re.escape``
            before passing it in.

    Returns:
        The number of non-overlapping matches of ``query`` in ``text_in``.
    """
    # The flag has to be given to re.compile(): the second positional
    # argument of a compiled pattern's findall() is the start position, so
    # passing re.IGNORECASE (== 2) there skips the first two characters and
    # leaves the search case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    # NOTE: matching is by substring, so e.g. "Kansas" is also found inside
    # "Arkansas" and "Virginia" inside "West Virginia".
    return len(pattern.findall(text_in))
|
|
|
|
|
|
|
|
|
2022-05-04 00:44:18 +02:00
|
|
|
# In[43]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
def state_prediction(text_in):
    """Map every state name to the count ``counter`` reports for ``text_in``.

    Keys are the entries of ``states`` with spaces replaced by underscores
    (e.g. ``New_York``); values are the results of ``counter(text_in, name)``.
    Entries appear in the same order as in ``states``.
    """
    return {name.replace(" ", "_"): counter(text_in, name) for name in states}
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
2022-05-04 00:44:18 +02:00
|
|
|
# In[44]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
def jurisdiction(path_in, path_out):
    """Write one ``jurisdiction=<counts>`` line to ``path_out`` per input line.

    Args:
        path_in: Path of the UTF-8 text file to read. Each line is passed to
            ``state_prediction`` on its own.
        path_out: Path of the file to create or overwrite with the results.
    """
    # Read everything before opening the output, so the input is fully
    # consumed before the output file is created/truncated.
    with open(path_in, 'r', encoding='utf8') as file:
        lines = file.readlines()

    # Explicit encoding keeps the output independent of the platform default,
    # matching how the input is opened. The ``with`` block closes the file,
    # so no separate close() call is needed.
    with open(path_out, 'wt', encoding='utf8') as file_out:
        file_out.writelines(
            "jurisdiction=" + str(state_prediction(line)) + '\n'
            for line in lines
        )
|
|
|
|
|
|
|
|
|
2022-05-04 00:44:18 +02:00
|
|
|
# In[45]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Run the predictor over each dataset split, in order, writing the results
# next to the corresponding input file.
for source_path, target_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    jurisdiction(source_path, target_path)
|
|
|
|
|
|
|
|
|
2022-05-04 00:44:18 +02:00
|
|
|
# In[39]:
|
2022-05-03 23:59:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
# get_ipython().system('jupyter nbconvert --to script run.ipynb')
|
|
|
|
|