kleister-nda/run.py

34 lines
1.8 KiB
Python
Raw Permalink Normal View History

2022-04-22 22:25:21 +02:00
import re
import sys
# Names of the 50 U.S. states, scanned in this order by the fallback search in
# find_jurisdiction(); the first name found in the text wins.  'West Virginia'
# is deliberately listed before 'Virginia': the latter is a substring of the
# former and would otherwise always shadow it.
STATES = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine',
    'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
    'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
    'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'West Virginia',
    'Virginia', 'Washington', 'Wisconsin', 'Wyoming',
)

# First words that signal a two-word state name after "State of".
TWO_WORD_PREFIXES = frozenset(('New', 'South', 'West', 'Rhode', 'North'))

# Patterns are compiled once instead of being looked up for every input line.
ESCAPES_RE = re.compile(r'(\\n)|(\\f)|(\\t)|(\\)')  # literal "\n", "\f", "\t", "\"
NON_LETTERS_RE = re.compile('[^A-Za-z ]')
SPACES_RE = re.compile(' +')
STATE_OF_RE = re.compile(r'[sS]tate of [A-Z]\w+( \w+)?')


def clean_text(text):
    """Return *text* reduced to ASCII letters separated by single spaces.

    Literal backslash escapes (``\\n``, ``\\f``, ``\\t`` and stray
    backslashes) are turned into spaces first, so the letter of an escape
    sequence does not get glued onto a neighbouring word.
    """
    text = ESCAPES_RE.sub(' ', text)
    text = NON_LETTERS_RE.sub(' ', text)
    return SPACES_RE.sub(' ', text)


def find_jurisdiction(text):
    """Return the ``jurisdiction=<State>`` prediction for *text*, or ``''``.

    The text is cleaned with clean_text().  A "State of <Name>" phrase is
    preferred; if there is none (or it is a truncated two-word name), the
    first entry of STATES that occurs anywhere in the cleaned text is used.
    Spaces inside a state name are written as underscores in both cases.
    """
    cleaned = clean_text(text)

    match = STATE_OF_RE.search(cleaned)
    if match:
        words = match.group().split()
        if words[2] not in TWO_WORD_PREFIXES:
            return 'jurisdiction=' + words[2]
        # The second name word is optional in the pattern, so "State of New"
        # at the end of the text yields only three tokens.  Indexing words[3]
        # unconditionally raised IndexError; fall through to the scan instead.
        if len(words) > 3:
            return 'jurisdiction=' + words[2] + '_' + words[3]

    for state in STATES:
        if state in cleaned:
            # Same underscore convention as the "State of" branch above.
            return 'jurisdiction=' + state.replace(' ', '_')
    return ''


def main(argv=None):
    """Write one prediction line to ``out.tsv`` per line of the input file.

    *argv* defaults to ``sys.argv``; ``argv[1]`` is the path of a
    tab-separated input file whose last column holds the document text.
    Lines without a detected jurisdiction produce an empty output line.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        raise SystemExit('usage: run.py <input.tsv>')
    filename = argv[1]
    with open(filename, encoding='utf-8') as f_in, \
            open('out.tsv', 'w', encoding='utf-8') as f_out:
        for line in f_in:
            cols = line.split('\t')
            f_out.write(find_jurisdiction(cols[-1]) + '\n')


if __name__ == '__main__':
    main()