54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
|
#!/usr/bin/env python
|
||
|
# coding: utf-8
|
||
|
|
||
|
# In[13]:
|
||
|
|
||
|
|
||
|
import re
|
||
|
|
||
|
|
||
|
# In[1]:
|
||
|
|
||
|
|
||
|
# Names of the jurisdictions to look for in each input line.
# "District of Columbia" must be a single entry: when it was split into
# "District " and "of Columbia" the pattern could only ever match fragments.
state_names = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
    "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota",
    "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
    "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
    "Wisconsin", "West Virginia", "Wyoming",
]

# One alternation over all names, anchored on word boundaries so that a name
# is not matched inside a longer word (e.g. "Kansas" inside "Arkansas").
# Names are escaped because they are literal text, not sub-patterns.
regex = re.compile(
    r'\b(' + '|'.join(re.escape(name) for name in state_names) + r')\b'
)
|
||
|
|
||
|
|
||
|
# In[43]:
|
||
|
|
||
|
|
||
|
# Load the whole dev-0 input into memory as a list of lines
# (each element keeps its trailing newline).
with open('dev-0/in.tsv', mode='r', encoding='utf8') as infile:
    lines = list(infile)
|
||
|
|
||
|
|
||
|
# In[44]:
|
||
|
|
||
|
|
||
|
# Write one output row per input line to dev-0/out.tsv: the first name that
# `regex` finds in the line, with spaces replaced by underscores, or an empty
# row when nothing matches.
# The encoding is stated explicitly (the input is read as utf8 as well) so the
# result does not depend on the platform's default encoding.
with open('dev-0/out.tsv', 'w', encoding='utf8') as file:
    for line in lines:
        jur = regex.search(line)
        if jur:
            file.write('jurisdiction=' + jur.group().replace(' ', '_') + '\n')
        else:
            # Still emit a row so output rows stay aligned with input rows.
            file.write('\n')
|
||
|
|
||
|
|
||
|
# In[41]:
|
||
|
|
||
|
|
||
|
# Load the whole test-A input into memory as a list of lines
# (each element keeps its trailing newline).
with open('test-A/in.tsv', mode='r', encoding='utf8') as infile:
    lines = list(infile)
|
||
|
|
||
|
|
||
|
# In[42]:
|
||
|
|
||
|
|
||
|
# Write one output row per input line to test-A/out.tsv: the first name that
# `regex` finds in the line, with spaces replaced by underscores, or an empty
# row when nothing matches.
# The encoding is stated explicitly (the input is read as utf8 as well) so the
# result does not depend on the platform's default encoding.
with open('test-A/out.tsv', 'w', encoding='utf8') as file:
    for line in lines:
        jur = regex.search(line)
        if jur:
            file.write('jurisdiction=' + jur.group().replace(' ', '_') + '\n')
        else:
            # Still emit a row so output rows stay aligned with input rows.
            file.write('\n')
|
||
|
|