kleister-nda/.ipynb_checkpoints/run-checkpoint.py

65 lines
1.7 KiB
Python
Raw Normal View History

2022-05-03 23:59:27 +02:00
#!/usr/bin/env python
# coding: utf-8
2022-05-04 00:23:23 +02:00
# In[26]:
2022-05-03 23:59:27 +02:00
import re
2022-05-04 00:44:18 +02:00
# In[41]:
2022-05-03 23:59:27 +02:00
# Names of the 50 U.S. states, in alphabetical order, grouped by first letter.
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas',
    'California', 'Colorado', 'Connecticut',
    'Delaware',
    'Florida',
    'Georgia',
    'Hawaii',
    'Idaho', 'Illinois', 'Indiana', 'Iowa',
    'Kansas', 'Kentucky',
    'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota',
    'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania',
    'Rhode Island',
    'South Carolina', 'South Dakota',
    'Tennessee', 'Texas',
    'Utah',
    'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]
2022-05-04 00:44:18 +02:00
# In[42]:
2022-05-03 23:59:27 +02:00
def counter(text_in, query):
    """Count case-insensitive, non-overlapping matches of ``query`` in ``text_in``.

    Args:
        text_in: The string to search.
        query: Pattern handed to ``re.compile``; regex metacharacters in it
            are therefore interpreted as such.

    Returns:
        The number of matches found (0 for an empty ``text_in``).
    """
    # The flag has to be given to re.compile(): the second positional argument
    # of a compiled pattern's findall() is ``pos``, so passing re.IGNORECASE
    # there skipped the first two characters and kept the search
    # case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))
2022-05-04 00:44:18 +02:00
# In[43]:
2022-05-03 23:59:27 +02:00
def state_prediction(text_in):
    """Return a dict of per-state match counts for ``text_in``.

    Keys are the entries of ``states`` with spaces replaced by underscores;
    each value is what ``counter`` returns for that state name.
    """
    return {name.replace(" ", "_"): counter(text_in, name) for name in states}
2022-05-03 23:59:27 +02:00
2022-05-04 00:44:18 +02:00
# In[44]:
2022-05-03 23:59:27 +02:00
def jurisdiction(path_in, path_out):
    """Write one ``jurisdiction=...`` line to ``path_out`` per line of ``path_in``.

    Every input line is passed to ``state_prediction`` and the string form of
    the result is written after the ``jurisdiction=`` prefix.

    Args:
        path_in: Path of the input file, read as UTF-8.
        path_out: Path of the output file, opened in write mode (existing
            content is replaced).
    """
    # Read the whole input before the output is opened.
    with open(path_in, 'r', encoding='utf8') as file:
        lines = file.readlines()

    # An explicit encoding keeps the output independent of the platform
    # default. The with-block closes the file, so no manual close() is needed.
    with open(path_out, 'wt', encoding='utf8') as file_out:
        for line in lines:
            file_out.write("jurisdiction=" + str(state_prediction(line)) + '\n')
2022-05-04 00:44:18 +02:00
# In[45]:
2022-05-03 23:59:27 +02:00
# Run the prediction for each input/output file pair, in this order.
for _src, _dst in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('train/in.tsv', 'train/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    jurisdiction(_src, _dst)
2022-05-04 00:44:18 +02:00
# In[39]:
2022-05-03 23:59:27 +02:00
# get_ipython().system('jupyter nbconvert --to script run.ipynb')