kleister-nda/run.py

161 lines
4.1 KiB
Python

#!/usr/bin/env python
# coding: utf-8
# In[54]:
import csv
import re
from collections import Counter
from datetime import datetime
import datefinder
import word2number
# In[12]:
def most_frequent(List, howmany=1):
    """Return the most common items of ``List`` together with their counts.

    Args:
        List: Iterable of hashable items to count.
        howmany: Maximum number of distinct items to return (default 1).

    Returns:
        A list of ``(item, count)`` tuples ordered from most to least
        frequent; empty when ``List`` is empty. Items with equal counts keep
        the order in which they were first encountered.
    """
    # Honour ``howmany``: the previous hard-coded ``most_common(1)`` meant
    # callers asking for the top two entries silently received only one.
    return Counter(List).most_common(howmany)
# In[49]:
def get_jurisdiction(text):
    """Return the US state mentioned most often in ``text``.

    State names are matched case-insensitively as whole words, normalised to
    title case with single spaces, and counted. The winner is returned with
    spaces replaced by underscores (e.g. ``"New_York"``).

    Args:
        text: Document text to search.

    Returns:
        The most frequent state name, or ``None`` when no state is mentioned.
    """
    us_states= r"(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\sHampshire|New\sJersey|New\sMexico|New\sYork|North\sCarolina|North\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\sIsland|South\sCarolina|South\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\sVirginia|Wisconsin|Wyoming)"
    # Word boundaries stop case-insensitive hits inside ordinary words, e.g.
    # "remained" -> "Maine" or "Indianapolis" -> "Indiana".
    pattern = r"\b" + us_states + r"\b"
    matches = re.findall(pattern, text, re.MULTILINE | re.IGNORECASE)

    # Collapse case and whitespace variants ("NEW YORK", "New\nYork") into one
    # canonical spelling so they are counted together.
    normalized = [" ".join(m.split()).title() for m in matches]

    top = Counter(normalized).most_common(1)
    if not top:
        return None
    return top[0][0].replace(" ", "_")
# In[47]:
def get_parties(text):
    """Return up to two party names found in ``text``.

    Company-like names (capitalised words followed by a suffix such as
    ``Inc.``, ``LLC`` or ``Corporation``) are preferred. When fewer than two
    distinct companies are found, the remaining slots are filled with the
    most frequent personal names of the form ``First M. Last``.

    Args:
        text: Document text to search.

    Returns:
        A list of one or two names with spaces replaced by underscores,
        most frequent first, or ``None`` when nothing matches.
    """
    # ``[A-Za-z]`` (not ``[A-za-z]``): the latter range also matches the
    # punctuation between "Z" and "a" in ASCII ("[", "\", "]", "^", "_", "`").
    company_regex = r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION|Bank|Com|Council|Technology|Systems))"
    person_regex = r"([A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+)"

    # findall yields one tuple per match (the pattern has several groups);
    # element 0 is the whole company name. Each word is capitalised and
    # stripped of trailing commas so spelling variants are counted together.
    companies = [
        " ".join(word.capitalize().rstrip(",") for word in found[0].split())
        for found in re.findall(company_regex, text, re.MULTILINE)
    ]
    result = Counter(companies).most_common(2)

    if len(result) < 2:
        # Normalise inner whitespace so a name split across a tab/newline
        # still ends up with underscores in the output.
        people = [
            " ".join(name.split())
            for name in re.findall(person_regex, text, re.MULTILINE)
        ]
        result.extend(Counter(people).most_common(2 - len(result)))

    if not result:
        return None
    return [name.replace(" ", "_") for name, _count in result]
# In[5]:
def get_date(text):
    """Return the most frequently mentioned date in ``text``.

    Dates are discovered with ``datefinder.find_dates``; only those whose
    year lies strictly between 1950 and 2022 are kept.

    Args:
        text: Document text to search.

    Returns:
        The most common date formatted as ``YYYY-MM-DD``, or ``None`` when no
        acceptable date is found.
    """
    found = []
    candidates = datefinder.find_dates(text)
    while True:
        try:
            candidate = next(candidates)
        except StopIteration:
            break
        except Exception:
            # Best effort: a candidate that fails to parse is skipped.
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            continue
        found.append(candidate)

    # Hard-coded plausibility window for the year (exclusive on both ends).
    dates = [d.strftime("%Y-%m-%d") for d in found if 2022 > d.year > 1950]

    result = most_frequent(dates)
    if not result:
        return None
    return result[0][0]
# In[51]:
def get_term(text):
    """Extract the first "<number> months/years" term from ``text``.

    The token directly before the unit may be digits (optionally wrapped in
    or followed by parentheses, e.g. ``2)``) or a spelled-out number
    (e.g. ``twelve``). Only the first occurrence in the text is considered.

    Args:
        text: Document text to search.

    Returns:
        A string such as ``"2_years"`` or ``"12_months"``, or ``None`` when no
        term is found or the token before the unit is not a number.
    """
    term_regex = r"\b([\w()]*)\s(months?|years?)\b"
    found = re.search(term_regex, text, re.MULTILINE)
    if found is None:
        return None
    number, unit = found.groups()

    if m := re.match(r"\d+", number):
        return m.group() + "_" + unit

    word = re.match(r"\b\w+\b", number)
    if word is None:
        return None

    # The file-level ``import word2number`` does not bind the name ``w2n``,
    # so the converter is imported here; otherwise the call below raises
    # NameError and spelled-out numbers are never recognised.
    from word2number import w2n

    try:
        value = w2n.word_to_num(word.group())
    except (ValueError, KeyError, IndexError):
        # Not a number word (e.g. "several months"): treat as "no term".
        return None
    return str(value) + "_" + unit
# In[52]:
def run(text, needed_info):
    """Extract the requested fields from ``text`` and format them as one line.

    Args:
        text: Document text to analyse.
        needed_info: Collection of field names to extract; recognised names
            are ``"jurisdiction"``, ``"effective_date"``, ``"term"`` and
            ``"party"``. Other names are ignored.

    Returns:
        Space-separated ``key=value`` tokens in the order ``effective_date``,
        ``jurisdiction``, ``term``, followed by one ``party=...`` token per
        party. Fields that were not requested or not found are omitted; the
        result is an empty string when nothing was extracted.
    """
    jurisdiction = get_jurisdiction(text) if "jurisdiction" in needed_info else None
    date = get_date(text) if "effective_date" in needed_info else None
    term = get_term(text) if "term" in needed_info else None
    parties = get_parties(text) if "party" in needed_info else None

    # Collect tokens and join once, so the line never starts with a stray
    # space when only parties were extracted.
    tokens = []
    if date:
        tokens.append(f"effective_date={date}")
    if jurisdiction:
        tokens.append(f"jurisdiction={jurisdiction}")
    if term:
        tokens.append(f"term={term}")
    if parties:
        tokens.extend(f"party={p}" for p in parties)
    return " ".join(tokens)
# In[53]:
# (input, output) path pairs; each input row produces exactly one output line.
filenames=[('dev-0/in.tsv',"dev-0/out.tsv"), ('train/in.tsv', "train/out.tsv"), ('test-A/in.tsv', 'test-A/out.tsv')]
for in_path, out_path in filenames:
    # Write the output as UTF-8 as well: the input is read as UTF-8, and the
    # platform default encoding may be unable to represent extracted names.
    with open(in_path, 'r', encoding="utf-8") as in_file, open(out_path, "w", encoding="utf-8") as out_file:
        # QUOTE_NONE: quote characters in the text are treated as ordinary
        # characters instead of starting a quoted field.
        reader = csv.reader(in_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for item in reader:
            # Column 1 holds the space-separated names of the fields to extract.
            needed_info = item[1].strip().split()
            # Column 2 holds the text; literal "\n", "\f" and "\t" escape
            # sequences in it are replaced with spaces.
            text = item[2].replace("\\n", " ").replace("\\f", " ").replace("\\t", " ").strip()
            # Colons in extracted values are replaced with underscores.
            extracted = run(text, needed_info).replace(":", "_")
            out_file.write(extracted + "\n")