34 lines
1.8 KiB
Python
34 lines
1.8 KiB
Python
import re
|
|
import sys
|
|
|
|
# First words of the multi-word names handled by the "State of ..." branch:
# when the word after "State of" is one of these, the next word is needed too.
_TWO_WORD_PREFIXES = frozenset({'New', 'South', 'West', 'Rhode', 'North'})

# Names searched for (as plain substrings, in this order) when no usable
# "State of ..." phrase is present; the first name found wins.
# 'West Virginia' is deliberately listed before 'Virginia': 'Virginia' is a
# substring of it and would otherwise always be reported instead.
_STATE_NAMES = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'West Virginia',
    'Virginia', 'Washington', 'Wisconsin', 'Wyoming',
)

# Patterns are compiled once instead of on every input line.
# Literal backslash escapes left in the text: "\n", "\f", "\t" or a lone "\".
_ESCAPES_RE = re.compile(r'(\\n)|(\\f)|(\\t)|(\\)')
_NON_LETTERS_RE = re.compile('[^A-Za-z ]')
_SPACE_RUN_RE = re.compile(' +')
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
_STATE_OF_RE = re.compile(r'[sS]tate of [A-Z]\w+( \w+)?')


def _clean_text(text):
    """Reduce *text* to ASCII letters separated by single spaces."""
    text = _ESCAPES_RE.sub(' ', text)
    text = _NON_LETTERS_RE.sub(' ', text)
    return _SPACE_RUN_RE.sub(' ', text)


def jurisdiction_line(line):
    """Return the output line for one tab-separated input *line*.

    Only the last tab-separated column is inspected. After cleaning it, the
    result is one of:

    * ``'jurisdiction=<Word>\\n'`` or ``'jurisdiction=<Word>_<Word>\\n'`` when
      a "State of <Name>" phrase is found (two words are used when the first
      is New/South/West/Rhode/North and a second word follows);
    * ``'jurisdiction=<name>\\n'`` for the first entry of the known-name list
      that occurs in the text;
    * ``'\\n'`` when nothing is recognised.

    The returned string always ends with a newline, so the output file keeps
    one line per input line.
    """
    text = _clean_text(line.split('\t')[-1])

    match = _STATE_OF_RE.search(text)
    if match:
        words = match.group().split()
        name = words[2]
        if name not in _TWO_WORD_PREFIXES:
            return 'jurisdiction=' + name + '\n'
        if len(words) > 3:
            return 'jurisdiction=' + name + '_' + words[3] + '\n'
        # "State of New" with nothing after it: the second word is missing
        # (indexing words[3] here used to raise IndexError), so fall through
        # to the plain name scan instead of emitting half a name.

    for state in _STATE_NAMES:
        if state in text:
            # NOTE(review): this branch emits the name with a space
            # ("New York") while the branch above emits "New_York"; the
            # original output format is kept here -- confirm which one the
            # consumer of out.tsv expects.
            return 'jurisdiction=' + state + '\n'
    return '\n'


def main():
    """Read the file named by the first CLI argument and write ``out.tsv``.

    One output line is written per input line; see ``jurisdiction_line``.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' INPUT_FILE')
    filename = sys.argv[1]

    with open(filename, encoding='utf-8') as f_in, \
            open('out.tsv', 'w', encoding='utf-8') as f_out:
        f_out.writelines(jurisdiction_line(line) for line in f_in)


if __name__ == '__main__':
    main()
|