#!/usr/bin/env python
# coding: utf-8

# In[54]:

import csv
import re
from collections import Counter
from datetime import datetime

# Third-party packages are imported defensively so the regex-only extractors
# below remain importable without them; get_date raises ImportError itself
# when datefinder is missing.
try:
    import datefinder
except ImportError:
    datefinder = None

try:
    import word2number
    # get_term refers to the name ``w2n``; ``import word2number`` alone only
    # binds ``word2number``, so the submodule is imported explicitly.
    from word2number import w2n
except ImportError:
    word2number = None
    w2n = None


# Whole-word, case-insensitive match of a US state name.
_US_STATES_RE = re.compile(
    r"\b(Alabama|Alaska|Arizona|Arkansas|California|Colorado|Connecticut|Delaware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Missouri|Montana|Nebraska|Nevada|New\sHampshire|New\sJersey|New\sMexico|New\sYork|North\sCarolina|North\sDakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode\sIsland|South\sCarolina|South\sDakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West\sVirginia|Wisconsin|Wyoming)\b",
    re.MULTILINE | re.IGNORECASE,
)

# One or more capitalised words followed by a company-like suffix.
_COMPANY_RE = re.compile(
    r"(([A-Z][A-Za-z]+,?\s)+(Inc\.|LLC|Ltd\.|Company|Corporation|INC\.|LTD\.|COMPANY|CORPORATION|Bank|Com|Council|Technology|Systems))",
    re.MULTILINE,
)

# "Firstname X. Lastname" style personal names.
_PERSON_RE = re.compile(r"([A-Z][a-z]+\s[A-Z]\.\s[A-Z][a-z]+)", re.MULTILINE)


# In[12]:

def most_frequent(List, howmany=1):
    """Return the ``howmany`` most common items of ``List``.

    The result is a list of ``(item, count)`` tuples as produced by
    ``Counter.most_common``; it is empty when ``List`` is empty.
    """
    return Counter(List).most_common(howmany)


# In[49]:

def get_jurisdiction(text):
    """Return the US state mentioned most often in ``text``.

    Matches are whole-word and case-insensitive. Each hit is normalised
    (whitespace collapsed, title-cased) before counting so that variants
    such as "NEW YORK" and "New York" are tallied together. The winner is
    returned with spaces replaced by underscores, or ``None`` when no state
    name occurs.
    """
    names = [" ".join(hit.split()).title() for hit in _US_STATES_RE.findall(text)]
    top = most_frequent(names)
    if not top:
        return None
    return top[0][0].replace(" ", "_")


# In[47]:

def get_parties(text):
    """Return up to two party names found in ``text``.

    Company-like names are preferred; each word is capitalised and stripped
    of a trailing comma before counting. When fewer than two distinct
    companies are found, the remaining slots are filled with the most
    frequent "Firstname X. Lastname" personal names. Names are returned with
    spaces replaced by underscores; ``None`` is returned when nothing matched.
    """
    companies = [
        " ".join(word.capitalize().rstrip(",") for word in hit[0].split())
        for hit in _COMPANY_RE.findall(text)
    ]
    result = most_frequent(companies, 2)
    if len(result) < 2:
        # Collapse whitespace so a name broken across lines counts as one.
        people = [" ".join(name.split()) for name in _PERSON_RE.findall(text)]
        result.extend(most_frequent(people, 2 - len(result)))
    if not result:
        return None
    return [name.replace(" ", "_") for name, _count in result]


# In[5]:

def get_date(text):
    """Return the most frequent date in ``text`` as ``YYYY-MM-DD``.

    Candidates come from ``datefinder.find_dates``; only those whose year is
    strictly between 1950 and 2022 are counted. Returns ``None`` when no
    candidate survives the filter.

    Raises:
        ImportError: if the ``datefinder`` package is not installed.
    """
    if datefinder is None:
        raise ImportError("get_date requires the 'datefinder' package")

    candidates = datefinder.find_dates(text)
    found = []
    while True:
        try:
            found.append(next(candidates))
        except StopIteration:
            break
        except Exception:
            # Tolerate errors raised while advancing the date iterator
            # instead of aborting the whole document.
            continue

    stamps = [d.strftime("%Y-%m-%d") for d in found if 1950 < d.year < 2022]
    top = most_frequent(stamps)
    return top[0][0] if top else None
# In[51]:

def get_term(text):
    """Return the first duration found in ``text`` as ``"<number>_<unit>"``.

    The first token directly followed by ``month(s)`` or ``year(s)`` is
    taken. Leading digits in that token are used as-is; otherwise its first
    word is converted with ``word2number`` (e.g. "twelve" -> 12). Returns
    ``None`` when no duration is found or the token is not a number.

    Raises:
        ImportError: if a spelled-out number has to be converted and the
            ``word2number`` package is not installed.
    """
    found = re.search(r"\b([\w()]*)\s(months?|years?)\b", text, re.MULTILINE)
    if found is None:
        return None
    number, unit = found.groups()

    if m := re.match(r"\d+", number):
        number = m.group()
    else:
        word = re.match(r"\b\w+\b", number)
        if word is None:
            return None
        # Imported here because the file-level ``import word2number`` does
        # not bind the ``w2n`` submodule.
        from word2number import w2n
        try:
            number = w2n.word_to_num(word.group())
        except ValueError:
            # The token is not a recognisable number word.
            return None
    return str(number) + "_" + unit


# In[52]:

def run(text, needed_info):
    """Extract the requested fields from ``text`` and format them.

    ``needed_info`` is a collection of field names; the recognised ones are
    ``"jurisdiction"``, ``"effective_date"``, ``"term"`` and ``"party"``.
    Only requested fields are extracted. The result is a space-separated
    string of ``key=value`` pairs in the order effective_date, jurisdiction,
    term, followed by one ``party=...`` entry per party. Fields that could
    not be extracted are omitted; the string is empty when nothing was found.
    """
    jurisdiction, date, term, parties = None, None, None, None
    if "jurisdiction" in needed_info:
        jurisdiction = get_jurisdiction(text)
    if "effective_date" in needed_info:
        date = get_date(text)
    if "term" in needed_info:
        term = get_term(text)
    if "party" in needed_info:
        parties = get_parties(text)

    pairs = []
    if date:
        pairs.append(f"effective_date={date}")
    if jurisdiction:
        pairs.append(f"jurisdiction={jurisdiction}")
    if term:
        pairs.append(f"term={term}")
    if parties:
        pairs.extend(f"party={p}" for p in parties)
    # Joining once avoids a leading space when only parties were found.
    return " ".join(pairs)


# In[53]:

filenames = [
    ('dev-0/in.tsv', "dev-0/out.tsv"),
    ('train/in.tsv', "train/out.tsv"),
    ('test-A/in.tsv', 'test-A/out.tsv'),
]


def main():
    """Run the extraction over every (input, output) pair in ``filenames``.

    Each input row is tab-separated; column 1 lists the requested fields and
    column 2 holds the document text. One output line is written per input
    row.
    """
    for in_path, out_path in filenames:
        with open(in_path, 'r', encoding="utf-8", newline="") as in_file, \
                open(out_path, "w", encoding="utf-8") as out_file:
            reader = csv.reader(in_file, delimiter='\t', quoting=csv.QUOTE_NONE)
            for item in reader:
                if len(item) < 3:
                    # Malformed row: emit an empty line so output rows stay
                    # aligned with input rows.
                    out_file.write("\n")
                    continue
                needed_info = item[1].strip().split()
                # The text carries escaped control sequences as literal
                # backslash pairs; flatten them to spaces.
                text = item[2].replace("\\n", " ").replace("\\f", " ").replace("\\t", " ").strip()
                extracted = run(text, needed_info).replace(":", "_")
                out_file.write(extracted + "\n")


if __name__ == "__main__":
    main()