"""Rule-based extraction of contract fields from tab-separated input files.

Each input row is expected to carry, in column index 1, a whitespace-separated
list of the field names to extract and, in its last column, the text to search.
For every row one output line of space-separated ``key=value`` pairs is
written.  Supported keys are ``jurisdiction``, ``effective_date`` and ``term``.
"""
import re
import csv
from collections import Counter
from datetime import datetime

import datefinder

pat = r"Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming"
# Word boundaries stop the case-insensitive search from matching state names
# embedded in longer words (e.g. "Maine" inside "remained").
states = re.compile(rf"\b(?:{pat})\b", flags=re.I)
terms = re.compile(r"\((\d{1,2})\) (years?|months?)", flags=re.I)

# Only dates whose year lies strictly between these bounds are considered.
_MIN_YEAR = 1990
_MAX_YEAR = 2022


def _most_common(values):
    """Return the most frequent element of *values*, or None if it is empty.

    Ties are resolved in favour of the element seen first.
    """
    counts = Counter(values)
    if not counts:
        return None
    return counts.most_common(1)[0][0]


def _extract_jurisdiction(text):
    """Return the most frequently mentioned state as ``Word_Word``, or None."""
    # Normalise *before* counting so that "DELAWARE" and "Delaware" vote for
    # the same candidate instead of splitting the count.
    normalised = (
        '_'.join(word.capitalize() for word in match.split())
        for match in states.findall(text)
    )
    return _most_common(normalised)


def _extract_effective_date(text):
    """Return the most frequent calendar date found in *text*, or None."""
    # Reduce each hit to its calendar date before counting so that the same
    # day found with different time-of-day components is counted once.
    days = (
        found.date()
        for found in datefinder.find_dates(text)
        if _MIN_YEAR < found.year < _MAX_YEAR
    )
    return _most_common(days)


def _extract_term(text):
    """Return the most frequent "(N) unit" duration as ``N_unit``, or None."""
    best = _most_common(terms.findall(text))
    if best is None:
        return None
    amount, unit = best
    return f"{amount}_{unit}"


# Output order of the fields follows the order of this table.
_EXTRACTORS = (
    ('jurisdiction', _extract_jurisdiction),
    ('effective_date', _extract_effective_date),
    ('term', _extract_term),
)


def _extract_row(item):
    """Build the ``key=value`` output line for one parsed input row.

    Rows with fewer than two columns (e.g. blank lines) produce an empty
    string so that output lines stay aligned with input lines.
    """
    if len(item) < 2:
        return ''
    wanted = set(item[1].split())
    text_best = item[-1]
    pairs = []
    for key, extract in _EXTRACTORS:
        # Skip the (potentially expensive) search when the key is not requested.
        if key not in wanted:
            continue
        value = extract(text_best)
        if value is not None:
            pairs.append(f"{key}={value}")
    return ' '.join(pairs)


def process(filepath_in: str, filepath_out: str) -> None:
    """Extract fields for every row of *filepath_in* and write them out.

    Args:
        filepath_in: Path of the tab-separated input file (UTF-8).
        filepath_out: Path of the output file; it is overwritten.  One line
            is written per input row, joined by newlines without a trailing
            newline.
    """
    rowstrings = []
    # newline='' is what the csv module documents for files handed to a reader.
    with open(filepath_in, 'r', encoding='utf8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        for idx, item in enumerate(reader):
            print(idx)  # progress indicator
            rowstrings.append(_extract_row(item))
    with open(filepath_out, 'w', encoding='utf8') as file:
        file.write('\n'.join(rowstrings))


def main() -> None:
    """Run the extraction over the train, dev-0 and test-A splits."""
    for split in ('train', 'dev-0', 'test-A'):
        process(f'{split}/in.tsv', f'{split}/out.tsv')


if __name__ == '__main__':
    main()