"""Guess the US-state jurisdiction named in the last column of a TSV file.

Usage: python <this script> INPUT.tsv

For every input line one line is written to ``out.tsv``: either
``jurisdiction=<State>`` (spaces in the state name replaced by ``_``) or an
empty line when no state could be identified.
"""
import re
import sys

# Literal backslash escapes (the two characters "\" + "n" etc.) and stray
# backslashes left in the text column; each becomes a space.
_ESCAPES_RE = re.compile(r'\\n|\\f|\\t|\\')
_NON_LETTERS_RE = re.compile(r'[^A-Za-z ]')
_SPACES_RE = re.compile(r' +')
# "State of X" optionally followed by one more word (for two-word states).
_STATE_OF_RE = re.compile(r'[sS]tate of [A-Z]\w+( \w+)?')

# First words that only name a state together with the following word.
_TWO_WORD_PREFIXES = frozenset({'New', 'South', 'West', 'Rhode', 'North'})

# Scanned in order with substring tests, so 'West Virginia' must precede
# 'Virginia' (and 'Arkansas' must precede 'Kansas') or the longer name could
# never be reported.
_STATES = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'West Virginia', 'Virginia',
    'Washington', 'Wisconsin', 'Wyoming',
)


def _clean(text):
    """Reduce *text* to letters separated by single spaces."""
    text = _ESCAPES_RE.sub(' ', text)
    text = _NON_LETTERS_RE.sub(' ', text)
    return _SPACES_RE.sub(' ', text)


def extract_jurisdiction(text):
    """Return the jurisdiction found in *text*, or ``None`` if there is none.

    A "State of <Name>" phrase is preferred; otherwise the first entry of the
    state list that occurs anywhere in the cleaned text is used.  Multi-word
    names are returned with underscores (``New_York``) in both cases.
    """
    cleaned = _clean(text)

    match = _STATE_OF_RE.search(cleaned)
    if match:
        words = match.group().split()
        first = words[2]
        if first not in _TWO_WORD_PREFIXES:
            return first
        if len(words) > 3:
            return first + '_' + words[3]
        # "State of New" with nothing after it: the second word is missing,
        # so fall through to the plain scan instead of indexing past the end.

    for state in _STATES:
        if state in cleaned:
            return state.replace(' ', '_')
    return None


def main():
    """Read the TSV named on the command line and write ``out.tsv``."""
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' INPUT.tsv')
    filename = sys.argv[1]
    with open(filename, encoding='utf-8') as f_in, \
            open('out.tsv', 'w', encoding='utf-8') as f_out:
        for line in f_in:
            # Only the last tab-separated column holds the document text.
            jurisdiction = extract_jurisdiction(line.split('\t')[-1])
            if jurisdiction is None:
                f_out.write('\n')
            else:
                f_out.write('jurisdiction=' + jurisdiction + '\n')


if __name__ == '__main__':
    main()