2022-05-03 19:04:44 +02:00
|
|
|
import re
|
|
|
|
import csv
|
|
|
|
from collections import Counter
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
import datefinder
|
|
|
|
|
|
|
|
# U.S. state names, listed in the order the alternation below tries them.
_STATE_NAMES = (
    "Alabama", "Alaska", "Arizona", "Arkansas", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
    "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
)

# Single alternation pattern: "Alabama|Alaska|...|Wyoming".
pat = "|".join(_STATE_NAMES)

# Case-insensitive matcher for any state name; there are no word-boundary
# anchors, so a name embedded in a longer token also matches.
states = re.compile(pat, flags=re.IGNORECASE)

# Matches a parenthesised 1-2 digit number followed by a unit, e.g.
# "(2) years" or "(6) month"; groups are (number, unit).
terms = re.compile(r"\((\d{1,2})\) (years?|months?)", flags=re.IGNORECASE)

# Matches one or two words followed by a company suffix (inc., llc, ltd.);
# group 1 is the whole name including the suffix.
# NOTE(review): the leading optional "n" presumably swallows the "n" of an
# escaped newline ("\n") glued to the first word - confirm against the data.
parties = re.compile(r"n?((\w+ )?\w+,? (inc\.|llc|ltd\.))", flags=re.IGNORECASE)
|
2022-05-03 19:04:44 +02:00
|
|
|
|
|
|
|
def process(filepath_in, filepath_out):
    """Extract key=value annotations from a TSV file and write them out.

    Each input row is tab-separated. Column 1 holds a whitespace-separated
    list of the keys requested for that row, and the last column holds the
    text to search. For every row one output line is produced, made of
    space-separated ``key=value`` pairs:

    * ``jurisdiction`` - most frequent state name found in the text, each
      word capitalised and joined with ``_``.
    * ``effective_date`` - most frequent date found by ``datefinder`` whose
      year lies strictly between 1990 and 2022, written as a ``date``.
    * ``term`` - most frequent ``(N) year(s)/month(s)`` match, as ``N_unit``.
    * ``party`` - one ``party=NAME`` pair per distinct company name found
      (commas removed, spaces replaced by ``_``, upper-cased).

    A key is only emitted when it is requested for the row and at least one
    match exists. Output lines are joined with newlines (no trailing one).

    Args:
        filepath_in: Path of the UTF-8 TSV file to read.
        filepath_out: Path of the UTF-8 file to (over)write.
    """
    rowstrings = []

    with open(filepath_in, 'r', encoding='utf8') as file:
        reader = csv.reader(file, delimiter='\t')

        for idx, item in enumerate(reader):
            print(idx)  # progress indicator: index of the row being handled
            results = {}
            # Reset per row so parties from an earlier row can never leak
            # into this one (and the name is always bound).
            part = []
            keys = item[1].split()
            text_best = item[-1]

            # Each extractor runs only when its key is requested, which
            # avoids needless scans (the date search is the costly one).
            if 'jurisdiction' in keys:
                found = states.findall(text_best)
                if found:
                    jur = Counter(found).most_common(1)[0][0]
                    results['jurisdiction'] = '_'.join(
                        w.capitalize() for w in jur.split()
                    )

            if 'effective_date' in keys:
                # Keep only plausible years; anything else is treated as noise.
                found = [
                    dat for dat in datefinder.find_dates(text_best)
                    if 2022 > dat.year > 1990
                ]
                if found:
                    date = Counter(found).most_common(1)[0][0]
                    results['effective_date'] = date.date()

            if 'term' in keys:
                found = terms.findall(text_best)
                if found:
                    # Each match is a (number, unit) tuple.
                    term = Counter(found).most_common(1)[0][0]
                    results['term'] = f"{term[0]}_{term[1]}"

            if 'party' in keys:
                found = parties.findall(text_best)
                # p[0] is the full company name; de-duplicate and sort so the
                # output order is stable across runs.
                part = sorted({
                    p[0].replace(',', '').replace(' ', '_').upper()
                    for p in found
                })

            line = ' '.join([f"{k}={v}" for k, v in results.items()])
            if part:
                y = ' '.join([f"party={v}" for v in part])
                line += f' {y}'

            rowstrings.append(line)

    with open(filepath_out, 'w', encoding='utf8') as file:
        file.write('\n'.join(rowstrings))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the extraction over every dataset split: (input TSV, output TSV).
for source_path, target_path in (
    ('train/in.tsv', 'train/out.tsv'),
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    process(source_path, target_path)
|