"""Extract party, jurisdiction and effective-date fields from contract text.

The input is an xz-compressed, tab-separated file.  For every row, column 1
holds a space-separated list of field names to extract and column 2 holds the
text to search.  One output line per input row is written to ``out.tsv`` in
the same directory as the input file.
"""
import csv
import lzma
import os
import sys

try:
    import regex as re

    # Characters allowed inside a party name: letters, whitespace, '.' and ','.
    _NAME_CHAR = r"[\p{L}\s.,]"
except ImportError:  # stdlib fallback: ``re`` has no \p{L}
    import re

    # [^\W\d_] is the closest stdlib spelling of "a Unicode letter".
    _NAME_CHAR = r"(?:[^\W\d_]|[\s.,])"

out_file_name = "out.tsv"
out_sep = '\t'

months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

# "between <name> Inc.," / "between <name> LLC." -- the dots after "inc" are
# escaped so that only a literal period is accepted.
_PARTY_RE = re.compile(
    r"between(" + _NAME_CHAR + r"+)(inc\.|Inc\.|INC\.|Llc|LLC|llc)[,.]"
)

# The tail uses ".*" (not ".+") so that a terminator directly after the name
# ("... State of New York.") does not force the capture to be truncated.
_JURISDICTION_RE = re.compile(
    r"laws? of the (?:State of|Commonwealth of) "
    r"([A-Z][a-z]+(?: [A-Z][a-z]+)?).*(?: and|[,.])"
)

# MM/DD/YYYY or MM/DD/YY ('/' or '-' separators).  The four-digit year is
# tried first, and the look-arounds stop the pattern from matching a slice of
# a longer digit run (e.g. the "01/12/05" inside "2001/12/05").
_NUMERIC_DATE_RE = re.compile(
    r"(?<!\d)(0[1-9]|1[0-2])[/-](0[1-9]|[12][0-9]|3[01])[/-]"
    r"((?:19|20)\d{2}|\d{2})(?!\d)"
)

# "<Month name> <day>, <year>"; the day may have one or two digits.
_TEXT_DATE_RE = re.compile(
    r"\b(" + "|".join(months) + r")[,\s]+"
    r"([12][0-9]|3[01]|0?[1-9])[,\s]+((?:19|20)\d{2})(?!\d)",
    re.IGNORECASE,
)


def main(fname: str):
    """Process the xz-compressed TSV *fname* and write ``out.tsv`` beside it.

    Each input row produces exactly one output line (possibly empty).
    """
    out_path = os.path.join(os.path.dirname(fname), out_file_name)
    # newline="" lets the csv module do its own line-ending handling; the
    # output is written as UTF-8 to match the input encoding.
    with lzma.open(fname, mode="rt", encoding="utf-8", newline="") as tsv_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        reader = csv.reader(tsv_file, delimiter='\t')
        for item in reader:
            out_file.write(process_entry(item) + '\n')


def process_entry(entry: list) -> str:
    """Return the ``name=value`` pairs extracted for one input row.

    ``entry[1]`` is a space-separated list of requested field names
    (``party``, ``jurisdiction``, ``effective_date``; other names are
    ignored) and ``entry[2]`` is the text to search.  Pairs for which a value
    was found are joined with ``out_sep`` in the order requested.  A row with
    fewer than three columns yields an empty string.
    """
    if len(entry) < 3:
        return ""
    text = entry[2]
    fields = []
    for name in entry[1].split(" "):
        if name == "party":
            value = get_party(text)
        elif name == "jurisdiction":
            value = get_jurisdiction(text)
        elif name == "effective_date":
            value = format_date(get_date(text))
        else:
            continue
        if value is not None:
            fields.append(f"{name}={value}")
    return out_sep.join(fields)


def get_party(text: str):
    """Return the party name found after "between", or ``None``.

    The name must end in an ``Inc.`` / ``LLC`` suffix (in the spellings listed
    in the pattern) followed by ``,`` or ``.``.  Commas are dropped, spaces
    become underscores and the suffix is normalised to ``Inc.`` or ``LLC``.
    """
    m = _PARTY_RE.search(text)
    if m is None:
        return None
    name = m.group(1).strip().replace(",", "").replace(" ", "_")
    suffix = m.group(2).lower().replace("inc.", "Inc.").replace("llc", "LLC")
    return f"{name}_{suffix}"


def get_jurisdiction(text: str):
    """Return the state named in "law(s) of the State/Commonwealth of X".

    The name is one or two capitalised words; spaces are replaced with
    underscores.  Returns ``None`` when the phrase is not found.
    """
    m = _JURISDICTION_RE.search(text)
    if m is None:
        return None
    return m.group(1).strip().replace(" ", "_")


def get_date(text: str):
    """Return a match object for the first date found in *text*, or ``None``.

    Numeric ``MM/DD/YYYY`` / ``MM/DD/YY`` dates are preferred over
    "<Month name> <day>, <year>" dates.  In both cases group 1 is the month,
    group 2 the day and group 3 the year.
    """
    found = _NUMERIC_DATE_RE.search(text)
    if found is not None:
        return found
    return _TEXT_DATE_RE.search(text)


def format_date(date):
    """Format a match object from :func:`get_date` as ``YYYY-MM-DD``.

    Returns ``None`` when *date* is ``None``.  Month names are translated via
    ``months`` regardless of letter case, the day is zero-padded, and a
    two-digit year is expanded to 19xx when its first digit is greater than 2
    and to 20xx otherwise.
    """
    if date is None:
        return None
    month = months.get(date[1].capitalize(), date[1])
    day = date[2].zfill(2)
    year = date[3]
    if len(year) == 2:
        century = "19" if int(year[0]) > 2 else "20"
        year = century + year
    return f"{year}-{month}-{day}"


if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise Exception("Input file not provided")
    main(sys.argv[1])