#!/usr/bin/env python
# coding: utf-8
"""Predict a US-state "jurisdiction" label for each line of a text file.

For every input line the state name mentioned most often (case-insensitively)
is written to the output file as ``jurisdiction=<State_Name>``.
"""

# In[56]:

import re
from collections import Counter

# In[57]:

# Candidate labels, in tie-break priority order (earlier entries win ties).
# NOTE(review): Iowa, Montana and Oregon are not in this list -- confirm
# whether that is intentional for the target label set before adding them.
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
          'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
          'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Kansas', 'Kentucky',
          'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
          'Minnesota', 'Mississippi', 'Missouri', 'Nebraska', 'Nevada',
          'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
          'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
          'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
          'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
          'West Virginia', 'Wisconsin', 'Wyoming']

# Lower-cased name -> canonical spelling, used to normalise regex matches.
_CANONICAL = {name.lower(): name for name in states}

# One alternation over all state names, longest first, so that a mention of
# "West Virginia" is consumed as a whole and never also counted as
# "Virginia" (likewise "Arkansas" vs. "Kansas").  re.ASCII keeps the
# case-insensitive matching to ASCII letters, so every match lower-cases to
# a key of _CANONICAL.
_STATE_PATTERN = re.compile(
    "|".join(re.escape(name)
             for name in sorted(states, key=len, reverse=True)),
    re.IGNORECASE | re.ASCII,
)

# In[58]:


def counter(text_in: str, query: str) -> int:
    """Return the number of case-insensitive matches of *query* in *text_in*.

    *query* is interpreted as a regular expression.  The flag is given to
    ``re.compile``; passing it to ``Pattern.findall`` would be taken as the
    ``pos`` start offset instead of as a flag.
    """
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))


# In[59]:


def state_prediction(text_in: str) -> str:
    """Return the most frequently mentioned state in *text_in*.

    Spaces in the returned name are replaced by underscores
    (e.g. ``"New_York"``).  Ties are resolved in favour of the state that
    appears first in ``states``; consequently a text that mentions no state
    at all yields ``"Alabama"``.
    """
    mentions = Counter(
        _CANONICAL[found.lower()] for found in _STATE_PATTERN.findall(text_in)
    )
    # max() returns the first maximal element, which preserves list-order
    # tie-breaking; Counter reports 0 for states that never matched.
    best = max(states, key=lambda name: mentions[name])
    return best.replace(" ", "_")


# In[60]:


def jurisdiction(path_in: str, path_out: str) -> None:
    """Write one ``jurisdiction=<state>`` line to *path_out* per input line.

    *path_in* is read as UTF-8.  All predictions are computed before the
    output file is opened, so *path_in* and *path_out* may be the same path.
    """
    with open(path_in, 'r', encoding='utf8') as file:
        predictions = [
            "jurisdiction=" + state_prediction(line) + '\n' for line in file
        ]
    with open(path_out, 'wt', encoding='utf8') as file_out:
        file_out.writelines(predictions)


# In[61]:


def main() -> None:
    """Generate predictions for the dev, train and test splits."""
    jurisdiction('dev-0/in.tsv', 'dev-0/out.tsv')
    jurisdiction('train/in.tsv', 'train/out.tsv')
    jurisdiction('test-A/in.tsv', 'test-A/out.tsv')


# Only run the batch job when executed as a script, so importing this module
# does not require the data files to exist.
if __name__ == "__main__":
    main()

# In[62]:

# get_ipython().system('jupyter nbconvert --to script run.ipynb')