#!/usr/bin/env python
# coding: utf-8
"""Count US state-name mentions per input line and write them as features.

For every line of an input file, the number of occurrences of each state
name is counted and one ``jurisdiction={...}`` line is written to the
output file.
"""

import re


states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
]


def counter(text_in: str, query: str) -> int:
    """Return the number of case-insensitive matches of *query* in *text_in*.

    Args:
        text_in: Text to search.
        query: Regular expression to look for (it is compiled as-is, not
            escaped).

    Returns:
        The count of non-overlapping matches over the whole string.
    """
    # The flag has to be given at compile time: the second positional
    # argument of ``Pattern.findall`` is the start offset ``pos``, not a
    # flags value, so passing re.IGNORECASE there would skip the first two
    # characters and leave the search case-sensitive.
    pattern = re.compile(query, re.IGNORECASE)
    return len(pattern.findall(text_in))


def state_prediction(text_in: str) -> dict:
    """Map every state name to its number of mentions in *text_in*.

    Keys are the entries of ``states`` with spaces replaced by underscores
    (e.g. ``"New_York"``); values are the counts returned by ``counter``.

    NOTE: matching is plain substring matching without word boundaries, so
    a name contained in another one is counted for both (e.g. "Virginia"
    inside "West Virginia", "Kansas" inside "Arkansas").
    """
    return {
        state.replace(" ", "_"): counter(text_in, state)
        for state in states
    }


def jurisdiction(path_in: str, path_out: str) -> None:
    """Write one ``jurisdiction={...}`` line to *path_out* per input line.

    Args:
        path_in: Path of the UTF-8 text file to read, one document per line.
        path_out: Path of the file to (over)write with the predictions.
    """
    # Read everything before opening the output so that the input is fully
    # consumed before any writing starts.
    with open(path_in, 'r', encoding='utf8') as file:
        lines = file.readlines()

    with open(path_out, 'wt', encoding='utf8') as file_out:
        for line in lines:
            file_out.write("jurisdiction=" + str(state_prediction(line)) + '\n')


if __name__ == "__main__":
    # Only touch the data directories when run as a script, so that the
    # functions above can be imported without side effects.
    jurisdiction('dev-0/in.tsv', 'dev-0/out.tsv')
    jurisdiction('train/in.tsv', 'train/out.tsv')
    jurisdiction('test-A/in.tsv', 'test-A/out.tsv')