kleister-nda/.ipynb_checkpoints/run-checkpoint.py

63 lines
1.6 KiB
Python
Raw Normal View History

2022-05-03 23:59:27 +02:00
#!/usr/bin/env python
# coding: utf-8
2022-05-04 01:35:27 +02:00
# In[56]:
2022-05-03 23:59:27 +02:00
import re
2022-05-04 01:35:27 +02:00
# In[57]:
2022-05-03 23:59:27 +02:00
# Names of the 50 U.S. states, in alphabetical order.  The order matters:
# when several states are tied for the highest count (including the case
# where nothing matched at all), the earliest entry in this list wins.
# Iowa, Montana and Oregon were missing from the original list and could
# therefore never be predicted.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
]
2022-05-04 01:35:27 +02:00
# In[58]:
2022-05-03 23:59:27 +02:00
def counter(text_in, query):
    """Count the case-insensitive matches of ``query`` in ``text_in``.

    Args:
        text_in: Text to search.
        query: Regular-expression pattern; it is compiled as given, so
            callers must escape any characters they want matched literally.

    Returns:
        The number of non-overlapping matches found in ``text_in``.
    """
    # The flag has to be given to compile().  A compiled pattern's findall()
    # has the signature (string, pos, endpos), so passing re.IGNORECASE there
    # set pos=2: the first two characters were skipped and matching stayed
    # case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))
2022-05-04 01:35:27 +02:00
# In[59]:
2022-05-03 23:59:27 +02:00
def state_prediction(text_in):
    """Return the state that is mentioned most often in ``text_in``.

    The text is scanned once with a single case-insensitive pattern that
    matches whole state names only.  Each position in the text is credited
    to at most one state, so a mention of "West Virginia" is no longer also
    counted as "Virginia" (which previously made "West Virginia" impossible
    to predict, because "Virginia" always tied or led and comes first).

    Args:
        text_in: Text to search for state names.

    Returns:
        The winning state name with spaces replaced by underscores.  Ties
        are resolved in favour of the state listed first in ``states``;
        when no state is mentioned at all this is the first entry.

    Raises:
        ValueError: If ``states`` is empty.
    """
    if not states:
        raise ValueError("no states configured")

    keys = [state.replace(" ", "_") for state in states]
    # Initialised in ``states`` order so max() breaks ties the same way the
    # per-state dictionary did before.
    tally = dict.fromkeys(keys, 0)

    # Longest names first, so a longer name is preferred over a shorter one
    # that could match at the same position.
    by_length = sorted(zip(states, keys), key=lambda pair: len(pair[0]),
                       reverse=True)
    # One capture group per state: match.lastindex then identifies the state
    # without having to normalise the matched text.
    alternatives = "|".join("(" + re.escape(name) + ")" for name, _ in by_length)
    pattern = re.compile(r"\b(?:" + alternatives + r")\b", re.IGNORECASE)

    for match in pattern.finditer(text_in):
        tally[by_length[match.lastindex - 1][1]] += 1

    return max(tally, key=tally.get)
2022-05-03 23:59:27 +02:00
2022-05-04 01:35:27 +02:00
# In[60]:
2022-05-03 23:59:27 +02:00
def jurisdiction(path_in, path_out):
    """Write one jurisdiction prediction per line of an input file.

    Every line of ``path_in`` is passed to ``state_prediction`` and the
    result is written to ``path_out`` as ``jurisdiction=<state>``, one
    output line per input line, in the same order.

    Args:
        path_in: Path of the UTF-8 text file to read.
        path_out: Path of the file to create or overwrite.
    """
    # Iterate the file lazily and keep only the short predictions in memory
    # instead of every raw input line.  The input is still fully consumed
    # before the output is opened, so nothing is truncated if a prediction
    # fails part-way through.
    with open(path_in, 'r', encoding='utf8') as file:
        predictions = [state_prediction(line) for line in file]

    # Explicit encoding so the output does not depend on the locale; the
    # ``with`` block closes the file, so no manual close() is needed.
    with open(path_out, 'wt', encoding='utf8') as file_out:
        file_out.writelines(
            "jurisdiction=" + str(prediction) + '\n'
            for prediction in predictions
        )
2022-05-04 01:35:27 +02:00
# In[61]:
2022-05-03 23:59:27 +02:00
# Generate the prediction file for each data split, in the original order.
for source_path, target_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    jurisdiction(source_path, target_path)
2022-05-04 01:35:27 +02:00
# In[62]:
2022-05-03 23:59:27 +02:00
# get_ipython().system('jupyter nbconvert --to script run.ipynb')