"""Heuristic extraction of jurisdiction, effective date, term length and
contracting parties from contract text supplied in TSV challenge format."""
import csv
|
|
import lzma
|
|
import re
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
import datefinder
|
|
from word2number import w2n
|
|
from itertools import combinations
|
|
|
|
|
|
def get_jurisdiction(text: str) -> Optional[str]:
    """Return the most frequently mentioned U.S. state in *text*.

    Matching is case-insensitive; the winner is returned in Title Case with
    internal spaces replaced by underscores (e.g. "New_York").  Returns None
    when no state name occurs.
    """
    us_states_regex = r"(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\sHampshire|New\sJersey|New\sMexico|New\sYork|North\sCarolina|North\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\sIsland|South\sCarolina|South\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\sVirginia|Wisconsin|Wyoming)"

    matches = re.findall(us_states_regex, text, re.MULTILINE | re.IGNORECASE)

    # Normalize before counting: the IGNORECASE search returns matches in
    # their original casing ("TEXAS", "texas", ...), which the previous code
    # counted as distinct Counter keys, splitting the tally.  Collapsing
    # internal whitespace also handles "New\nYork", which `\s` can match but
    # the final `.replace(" ", "_")` could not fix.
    states_counter = Counter(" ".join(m.split()).title() for m in matches)

    result = states_counter.most_common(1)

    if not result:
        return None
    return result[0][0].replace(" ", "_")
|
|
|
|
|
|
def get_date(text: str, min_year: int = 1950, max_year: int = 2022) -> Optional[str]:
    """Return the most common plausible date in *text* as "YYYY-MM-DD".

    Candidate dates are located with *datefinder*; only years strictly
    between *min_year* and *max_year* (both exclusive, matching the original
    hard-coded 1950/2022 bounds) are kept.  Returns None when no plausible
    date is found.
    """
    df_matches = datefinder.find_dates(text)

    matches = []

    # datefinder's generator can raise on malformed candidates mid-stream,
    # so drain it manually instead of with a plain for-loop.  The previous
    # bare `except:` also swallowed KeyboardInterrupt/SystemExit; narrow it.
    while True:
        try:
            m = next(df_matches)
        except StopIteration:
            break
        except Exception:
            # Skip the candidate datefinder choked on and keep scanning.
            continue
        matches.append(m)

    plausible = [d for d in matches if min_year < d.year < max_year]

    dates = [d.strftime("%Y-%m-%d") for d in plausible]

    result = Counter(dates).most_common(1)

    if not result:
        return None
    return result[0][0]
|
|
|
|
|
|
def get_term(text: str) -> Optional[str]:
    """Return the first duration mentioned in *text* as "<n>_<unit>".

    Looks for "<number> months/years" where the number may be digits
    ("24 months") or a spelled-out word ("five years", converted via
    word2number).  Returns None when no parseable term is present.
    """
    term_regex = r"\b([\w()]*)\s(months?|years?)\b"

    matches = list(re.finditer(term_regex, text, re.MULTILINE))

    if not matches:
        return None
    # Only the first occurrence is used, per the original behavior.
    number, unit = matches[0].groups()

    if m := re.match(r"\d+", number):
        number = m.group()
    else:
        # `[\w()]*` can match the empty string, in which case this word
        # match is None; the old code relied on a bare except to hide the
        # resulting AttributeError.
        word = re.match(r"\b\w+\b", number)
        if word is None:
            return None
        try:
            # word2number raises ValueError on unrecognized words.
            number = w2n.word_to_num(word.group())
        except Exception:
            return None

    return str(number) + "_" + unit
|
|
|
|
|
|
def get_parties(text: str):
    """Return up to two contracting-party names found in *text*.

    Company names (detected by an Inc./LLC/Ltd./Company/Corporation suffix)
    are preferred; person names ("John Q. Public") fill any remaining slots.
    Names are title-cased (an "LLC" suffix is kept verbatim), trailing commas
    stripped, and spaces replaced by underscores.  Returns None when nothing
    matches.
    """
    # Bug fix: the original class was [A-za-z]; the A-z range also matches
    # "[", "\", "]", "^", "_" and "`", so tokens like "Ab_cd" were accepted
    # as name words.  [A-Za-z] is what was intended.
    company_regex = r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION))"
    person_regex = r"([A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+)"

    c_matches = [m[0] for m in re.findall(company_regex, text, re.MULTILINE)]
    p_matches = re.findall(person_regex, text, re.MULTILINE)

    companies = []
    for raw in c_matches:
        words = raw.split()
        if words[-1] == "LLC":
            # Preserve the all-caps "LLC" suffix; normalize the other words.
            words = [w.capitalize().rstrip(",") for w in words[:-1]] + [words[-1]]
        else:
            words = [w.capitalize().rstrip(",") for w in words]
        companies.append(" ".join(words))

    result = Counter(companies).most_common(2)

    if len(result) < 2:
        # Top up with the most frequent person names.
        result.extend(Counter(p_matches).most_common(2 - len(result)))

    if not result:
        return None
    return [name.replace(" ", "_") for name, _count in result]
|
|
|
|
|
|
def extract(text: str, needed_info) -> str:
    """Build the "key=value ..." summary string for *text*.

    Only the fields named in *needed_info* ("jurisdiction",
    "effective_date", "term", "party") are looked up; fields whose
    extractor finds nothing are omitted from the output.  Party entries,
    if any, are appended after the other fields.
    """
    jurisdiction = get_jurisdiction(text) if "jurisdiction" in needed_info else None
    date = get_date(text) if "effective_date" in needed_info else None
    term = get_term(text) if "term" in needed_info else None
    parties = get_parties(text) if "party" in needed_info else None

    pairs = []
    if jurisdiction is not None:
        pairs.append(f"jurisdiction={jurisdiction}")
    if date is not None:
        pairs.append(f"effective_date={date}")
    if term is not None:
        pairs.append(f"term={term}")

    output = " ".join(pairs)

    if parties is not None:
        for party in parties:
            output += f" party={party}"

    return output
|
|
|
|
|
|
def _process_split(in_path: str, out_path: str) -> None:
    """Read one challenge TSV, extract the requested fields per row, and
    write one result line per row to *out_path*.

    Each input row is: [id, space-separated field names, document text].
    Colons in the extracted string are replaced with underscores to match
    the expected output format.
    """
    with open(in_path, "r", encoding="utf-8") as in_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        reader = csv.reader(in_file, delimiter="\t", quoting=csv.QUOTE_NONE)
        for item in reader:
            needed_info = item[1].strip().split()
            # The corpus stores literal "\n"/"\f"/"\t" escape sequences;
            # flatten them to spaces before extraction.
            text = item[2].replace("\\n", " ").replace("\\f", " ").replace("\\t", " ").strip()
            extracted = extract(text, needed_info).replace(":", "_")
            out_file.write(extracted + "\n")


if __name__ == "__main__":
    # The original repeated the identical read/extract/write loop three
    # times, once per split; loop over the split names instead.  Output
    # files now get an explicit UTF-8 encoding to match the inputs.
    for split in ("dev-0", "train", "test-A"):
        _process_split(f"{split}/in.tsv", f"{split}/out.tsv")
|