69 lines
2.8 KiB
Python
69 lines
2.8 KiB
Python
import csv
|
|
import re
|
|
|
|
import datefinder
|
|
|
|
# Alternation of the 50 US state names, in alphabetical order. The pieces
# below are adjacent string literals, so they concatenate into one pattern.
us_states = (
    r"Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|"
    r"Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|"
    r"Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|"
    r"Mississippi|Missouri|Montana|Nebraska|Nevada|New Hampshire|New Jersey|"
    r"New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|"
    r"Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|"
    r"Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming"
)

# Case-insensitive matcher for any single state name.
states = re.compile(us_states, flags=re.IGNORECASE)

# A one- or two-digit number in parentheses followed by a space and a unit,
# e.g. "(2) years"; group 1 is the number, group 2 the unit.
terms = re.compile(r"\((\d{1,2})\) (years?|months?)", flags=re.IGNORECASE)

# A company-like name: optional word, a word, optional comma, then a legal
# suffix (inc. / llc / ltd.). Group 1 is the whole name.
# NOTE(review): the leading "n?" presumably swallows the "n" of a literal
# "\n" escape left in the text - confirm against the input data.
parties = re.compile(
    r"n?((\w+ )?\w+,? (inc\.|llc|ltd\.))",
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
def _extract_tokens(requested_keys, document_text):
    """Return the ``key=value`` tokens found in one document.

    Only the keys listed in ``requested_keys`` are looked for. Scalar fields
    come first, in the order their keys were requested, followed by one
    ``party=...`` token per party match.
    """
    fields = {}
    # Initialised once per document (not once per key) so that parties
    # survive when "party" is not the last requested key, and so the name is
    # always defined even when no keys are requested.
    party_names = []

    for key in requested_keys:
        if key == "jurisdiction":
            state_match = states.search(document_text)
            if state_match:
                # The output line is space-delimited, so multi-word states
                # ("New York") are joined with "_" like the term value is.
                fields['jurisdiction'] = state_match.group(0).replace(' ', '_')
        elif key == "effective_date":
            # Only the first date is used; pull it lazily instead of parsing
            # every date in the document.
            first_date = next(iter(datefinder.find_dates(document_text)), None)
            if first_date is not None:
                fields['effective_date'] = first_date.date()
        elif key == "term":
            term_match = terms.search(document_text)
            if term_match:
                fields['term'] = f"{term_match.group(1)}_{term_match.group(2)}"
        elif key == "party":
            # Keep only the first word of each matched company name.
            party_names = [
                re.split(',| ', found[0])[0]
                for found in parties.findall(document_text)
            ]

    tokens = [f"{k}={v}" for k, v in fields.items()]
    tokens.extend(f"party={name}" for name in party_names)
    return tokens


def extract_from_file(file_path):
    """Extract requested fields from every row of a tab-separated file.

    Each row is read as: column 0 an identifier (only printed for progress),
    column 1 a whitespace-separated list of keys to extract, and the last
    column the text that is searched.

    Args:
        file_path: Path of the TSV file to read (decoded as UTF-8).

    Returns:
        A list with one string per input row, each a space-separated sequence
        of ``key=value`` tokens (possibly empty).
    """
    file_output = []

    # newline='' is what the csv module asks for when it is handed a file.
    with open(file_path, 'r', encoding='utf8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quoting=csv.QUOTE_NONE)

        for item in reader:
            if len(item) < 2:
                # Blank or truncated row: nothing to extract, but emit an
                # empty line so output rows stay aligned with input rows.
                file_output.append('')
                continue

            print(item[0])

            tokens = _extract_tokens(item[1].split(), item[-1])
            # A single join puts a space between every token, including
            # between the last scalar field and the first party.
            file_output.append(' '.join(tokens))
            print(f'file: {item[0]} processedd')

    return file_output
|
|
|
|
|
|
def save_to_file(output, output_file_path):
    """Write the strings in ``output`` to ``output_file_path``, one per line.

    The file is created or truncated and written as UTF-8. Lines are joined
    with a newline, so no newline follows the last one.
    """
    with open(output_file_path, mode='w+', encoding='utf8') as sink:
        joined_lines = '\n'.join(output)
        sink.write(joined_lines)
|
|
|
|
|
|
# Run the extraction for each dataset split: read <split>/in.tsv and write
# the extracted lines to <split>/out.tsv, in this order.
for split_dir in ('train', 'dev-0', 'test-A'):
    output = extract_from_file(f'{split_dir}/in.tsv')
    save_to_file(output, f'{split_dir}/out.tsv')
|