kleister-nda/run.py

34 lines
1.8 KiB
Python

import re
import sys
# Patterns are compiled once at import time instead of once per input line.
# Literal escape sequences (a backslash followed by n, f or t) and any other
# lone backslash are blanked out of the text before matching.
_ESCAPE_RE = re.compile(r'(\\n)|(\\f)|(\\t)|(\\)')
_NON_LETTER_RE = re.compile(r'[^A-Za-z ]')
_SPACES_RE = re.compile(r' +')
# "State of <Capitalised word>" with an optional following word.
_STATE_OF_RE = re.compile(r'[sS]tate of [A-Z]\w+( \w+)?')
# "Virginia" that is not the tail of "West Virginia".
_VIRGINIA_RE = re.compile(r'(?<!West )Virginia')

# First words of the two-word state names.
_TWO_WORD_PREFIXES = frozenset(['New', 'South', 'West', 'Rhode', 'North'])

# Scanned in this order; the first name found in the text wins.
_US_STATES = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming',
)


def _mentions(state: str, text: str) -> bool:
    """Return True if *text* contains *state* as a substring.

    "Virginia" is special-cased so that an occurrence inside
    "West Virginia" does not count as a mention of Virginia.
    """
    if state == 'Virginia':
        return _VIRGINIA_RE.search(text) is not None
    return state in text


def extract_jurisdiction(line: str) -> str:
    """Return the ``jurisdiction=...`` prediction for one input line.

    Only the last tab-separated column of *line* is inspected. The text is
    reduced to ASCII letters and single spaces, then searched in two steps:

    1. A "State of X" / "state of X" phrase. If X is the first word of a
       two-word state name (New, South, West, Rhode, North) the following
       word is appended with an underscore.
    2. Otherwise, the first entry of the state list that occurs in the text.

    Returns an empty string when nothing is found. Spaces inside state names
    are always written as underscores.
    """
    text = line.split('\t')[-1]
    text = _ESCAPE_RE.sub(' ', text)
    text = _NON_LETTER_RE.sub(' ', text)
    text = _SPACES_RE.sub(' ', text)

    match = _STATE_OF_RE.search(text)
    if match:
        words = match.group().split()
        state = words[2]
        if state not in _TWO_WORD_PREFIXES:
            return 'jurisdiction=' + state
        if len(words) > 3:
            return 'jurisdiction=' + state + '_' + words[3]
        # A two-word prefix with no second word (e.g. the text ends with
        # "State of New"): previously an IndexError, now handled by the
        # state-name scan below.

    for name in _US_STATES:
        if _mentions(name, text):
            # Use the same underscore form as the "State of" branch.
            return 'jurisdiction=' + name.replace(' ', '_')
    return ''


def main() -> None:
    """Read the file named by the first CLI argument and write ``out.tsv``.

    One output line is written per input line: the prediction, or an empty
    line when no jurisdiction was found.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: run.py INPUT_TSV')
    filename = sys.argv[1]
    with open(filename, encoding='utf-8') as f_in, \
            open('out.tsv', 'w', encoding='utf-8') as f_out:
        for line in f_in:
            f_out.write(extract_jurisdiction(line) + '\n')


if __name__ == '__main__':
    main()