# kleister-nda/extractor.py
#
# Listing metadata from the original source page: 172 lines, 5.4 KiB, Python.

import csv
import lzma
import re
from collections import Counter
from datetime import datetime
from typing import Optional
import datefinder
from word2number import w2n
from itertools import combinations
# Names of the 50 US states, in the order the original pattern listed them.
_US_STATES = (
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York",
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "West Virginia", "Wisconsin", "Wyoming",
)

# Word boundaries stop a state name from matching inside a longer word
# (case-insensitively, "remained" contains "maine").  ``\s+`` lets a two-word
# name span a line wrap or a run of spaces.
_US_STATE_RE = re.compile(
    r"\b(" + "|".join(name.replace(" ", r"\s+") for name in _US_STATES) + r")\b",
    re.IGNORECASE,
)


def get_jurisdiction(text: str) -> Optional[str]:
    """Return the US state mentioned most often in *text*.

    Matching is case-insensitive and restricted to whole words.  Every hit is
    normalized (whitespace collapsed, each word capitalized) before counting,
    so "NEW YORK", "new york" and "New\\nYork" are tallied as one state.

    Args:
        text: Document text to scan.

    Returns:
        The most frequent state name with spaces replaced by underscores
        (e.g. ``"New_York"``), or ``None`` when no state name occurs.
    """
    hits = _US_STATE_RE.findall(text)
    if not hits:
        return None
    # split()/join collapses any whitespace the pattern's \s+ matched.
    normalized = (
        " ".join(word.capitalize() for word in hit.split()) for hit in hits
    )
    state, _count = Counter(normalized).most_common(1)[0]
    return state.replace(" ", "_")
def get_date(text: str, min_year: int = 1950, max_year: int = 2022) -> Optional[str]:
    """Return the date that occurs most often in *text*.

    Candidates come from ``datefinder.find_dates``.  Each candidate is expected
    to expose ``.year`` and ``.strftime`` (presumably ``datetime`` objects --
    confirm against the datefinder documentation).

    Args:
        text: Document text to scan.
        min_year: Exclusive lower bound on the accepted year.
        max_year: Exclusive upper bound on the accepted year.

    Returns:
        The most frequent accepted date formatted as ``YYYY-MM-DD``, or
        ``None`` when no candidate falls inside the year window.
    """
    candidates = datefinder.find_dates(text)
    found = []
    while True:
        try:
            candidate = next(candidates)
        except StopIteration:
            break
        except Exception:
            # Best effort: a parsing failure on one candidate must not abort
            # the whole document.  Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        found.append(candidate)

    dates = [
        candidate.strftime("%Y-%m-%d")
        for candidate in found
        if min_year < candidate.year < max_year
    ]
    if not dates:
        return None
    return Counter(dates).most_common(1)[0][0]
def get_term(text: str) -> Optional[str]:
    """Return the first parseable duration in *text* as ``"<number>_<unit>"``.

    A candidate is a token made of word characters and parentheses, followed
    by one whitespace character and ``month(s)`` or ``year(s)``.  The token is
    read as leading digits (``"12)"`` -> ``12``) or, failing that, as a number
    word converted with ``w2n.word_to_num`` (``"five"`` -> ``5``).

    Candidates whose token is not a number (e.g. "recent years") are skipped,
    so a later valid term is still found instead of giving up on the first hit.

    Args:
        text: Document text to scan.

    Returns:
        A string such as ``"2_years"`` or ``"12_months"``, with the unit
        spelled as it appears in the text, or ``None`` when no candidate
        carries a parseable number.
    """
    term_regex = r"\b([\w()]*)\s(months?|years?)\b"
    for candidate in re.finditer(term_regex, text):
        number, unit = candidate.groups()

        if digits := re.match(r"\d+", number):
            return f"{digits.group()}_{unit}"

        word = re.match(r"\b\w+\b", number)
        if word is None:
            # Empty or punctuation-only token: nothing to convert.
            continue
        try:
            value = w2n.word_to_num(word.group())
        except Exception:
            # Best effort around the third-party parser: any failure means
            # this token is not a usable number, so try the next candidate.
            continue
        return f"{value}_{unit}"
    return None
def get_parties(text: str) -> Optional[list]:
    """Return up to two party names found in *text*.

    Company names are one or more capitalized words followed by a
    legal-form suffix (Inc., LLC, Ltd., Company, Corporation).  The two most
    frequent companies are returned.  If fewer than two are found, the
    remaining slots are filled with the most frequent person names of the
    form ``First M. Last``.

    Company words are capitalized and stripped of trailing commas; a final
    ``LLC`` keeps its upper-case spelling.

    Args:
        text: Document text to scan.

    Returns:
        A list of one or two names with whitespace replaced by underscores,
        or ``None`` when nothing matches.
    """
    # The word class is [A-Za-z]; the former "A-z" range also matched
    # the ASCII punctuation between "Z" and "a" ([ \ ] ^ _ `).
    company_regex = r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION))"
    person_regex = r"([A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+)"

    companies = []
    for match in re.finditer(company_regex, text):
        # The pattern guarantees at least one name word plus the suffix.
        *name_words, suffix = match.group(1).split()
        words = [word.capitalize().rstrip(",") for word in name_words]
        words.append(suffix if suffix == "LLC" else suffix.capitalize())
        companies.append(" ".join(words))

    result = Counter(companies).most_common(2)
    if len(result) < 2:
        people = re.findall(person_regex, text)
        result.extend(Counter(people).most_common(2 - len(result)))

    if not result:
        return None
    # split()/join also handles a newline or tab matched by \s inside a name.
    return ["_".join(name.split()) for name, _count in result]
def extract(text: str, needed_info) -> str:
    """Build the space-separated ``key=value`` line for one document.

    Only the fields named in *needed_info* are extracted.  Fields appear in
    the order jurisdiction, effective_date, term, then one ``party=`` entry
    per party.  Fields whose extractor returns ``None`` are omitted.

    Args:
        text: Document text to scan.
        needed_info: Container of requested field names; the recognized names
            are "jurisdiction", "effective_date", "term" and "party".

    Returns:
        The assembled line, or an empty string when nothing was extracted.
        The pieces are joined in one step, so the line never starts with a
        stray space when only parties are found.
    """
    parts = []

    if "jurisdiction" in needed_info:
        jurisdiction = get_jurisdiction(text)
        if jurisdiction is not None:
            parts.append(f"jurisdiction={jurisdiction}")

    if "effective_date" in needed_info:
        date = get_date(text)
        if date is not None:
            parts.append(f"effective_date={date}")

    if "term" in needed_info:
        term = get_term(text)
        if term is not None:
            parts.append(f"term={term}")

    if "party" in needed_info:
        parties = get_parties(text)
        if parties is not None:
            parts.extend(f"party={party}" for party in parties)

    return " ".join(parts)
def _process_split(directory: str) -> None:
    """Read ``<directory>/in.tsv`` and write one extracted line per row to ``<directory>/out.tsv``.

    Column 1 of each row holds the whitespace-separated field names to extract
    and column 2 holds the document text, in which the literal two-character
    sequences ``\\n``, ``\\f`` and ``\\t`` are replaced by spaces.
    """
    # The output is written as UTF-8 to match the input; the platform default
    # encoding can fail on non-ASCII names.
    with open(f"{directory}/in.tsv", "r", encoding="utf-8") as in_file, \
            open(f"{directory}/out.tsv", "w", encoding="utf-8") as out_file:
        reader = csv.reader(in_file, delimiter="\t", quoting=csv.QUOTE_NONE)
        for item in reader:
            needed_info = item[1].strip().split()
            text = item[2].replace("\\n", " ").replace("\\f", " ").replace("\\t", " ").strip()
            # Colons in an extracted value are replaced with underscores.
            extracted = extract(text, needed_info).replace(":", "_")
            out_file.write(extracted + "\n")


if __name__ == "__main__":
    # Same three dataset splits, in the same order, as the original script.
    for split in ("dev-0", "train", "test-A"):
        _process_split(split)