2021-04-27 21:03:51 +02:00
import csv
import sys
import lzma
2021-04-27 23:18:56 +02:00
import os
import regex as re
2021-04-27 21:03:51 +02:00
2021-04-27 23:18:56 +02:00
# Name of the result file; it is created next to the input file.
out_file_name = "out.tsv"
# Separator written in front of every extracted "key=value" field.
out_sep = '\t'

# English month name -> zero-padded two-digit month number.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}
def main(fname: str):
    """Extract fields from every row of an xz-compressed TSV file.

    Reads ``fname`` as UTF-8 text, passes each parsed row to
    ``process_entry`` and writes one result line per row to
    ``out_file_name`` in the same directory as ``fname``.

    Args:
        fname: Path to the xz-compressed, tab-separated input file.
    """
    out_path = os.path.join(os.path.dirname(fname), out_file_name)
    # The output is written as UTF-8 explicitly so that it matches the input
    # encoding instead of depending on the platform's default locale.
    with lzma.open(fname, mode="rt", encoding="utf-8") as tsv_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        for item in csv.reader(tsv_file, delimiter='\t'):
            out_file.write(process_entry(item) + '\n')
def process_entry(entry: list) -> str:
    """Build the output line for one input row.

    ``entry[1]`` holds the space-separated names of the requested fields and
    ``entry[2]`` the text to search. For every requested field that could be
    extracted, ``out_sep`` followed by ``name=value`` is appended to the
    result, in the order the names were requested. Unknown names and fields
    that cannot be extracted contribute nothing.

    Args:
        entry: One parsed row of the input file.

    Returns:
        The concatenated fields, or an empty string when nothing was found.
    """
    requested = entry[1].split(" ")
    text = entry[2]
    fields = []
    for name in requested:
        if name == "party":
            value = get_party(text)
        elif name == "jurisdiction":
            value = get_jurisdiction(text)
        elif name == "effective_date":
            found = get_date(text)
            value = format_date(found) if found is not None else None
        else:
            continue
        if value is not None:
            fields.append(out_sep + name + "=" + value)
    return "".join(fields)
def get_party(text: str):
    """Extract a party name that follows the word "between".

    The name is the run of letters, whitespace, dots and commas after
    "between", up to a company suffix ("Inc." or "LLC" in the listed
    spellings) that is itself followed by a comma or a dot.

    Args:
        text: The text to search.

    Returns:
        The name with commas removed and spaces replaced by underscores,
        followed by ``_Inc.`` or ``_LLC``; ``None`` when nothing matches.
    """
    # The dots of "inc." are escaped so that they only match a literal dot;
    # unescaped they matched any character (e.g. "incl."), after which the
    # suffix normalisation below would not recognise the captured text.
    m = re.search(
        r"between([\p{L}\s.,]+)(inc\.|Inc\.|INC\.|Llc|LLC|llc)[\,\.]", text)
    if m is None:
        return None
    name = m.group(1).strip().replace(",", "").replace(" ", "_")
    # Normalise the suffix spelling regardless of how it was capitalised.
    suffix = m.group(2).lower().replace("inc.", "Inc.").replace("llc", "LLC")
    return name + "_" + suffix
def get_jurisdiction(text: str):
    """Extract the state named in a "law(s) of the State of ..." clause.

    Recognises "State of" and "Commonwealth of" followed by one or two
    capitalised words. The clause must be followed, somewhere later on the
    same line, by " and", a comma or a dot.

    Args:
        text: The text to search.

    Returns:
        The state name with spaces replaced by underscores, or ``None`` when
        no such clause is found.
    """
    # ".*" (not ".+") before the terminator: the terminator may follow the
    # name directly, as in "State of Delaware.". Requiring at least one
    # character in between made the engine give up the last letter of the
    # name to satisfy the pattern.
    m = re.search(
        r"laws? of the (?:(?:State of)|(?:Commonwealth of)) ([A-Z][a-z]+(?: [A-Z][a-z]+)?).*(?:(?: and)|[,.])", text)
    if m is None:
        return None
    return m.group(1).strip().replace(" ", "_")
def get_date(text: str):
    """Find the first date in ``text`` in one of the supported formats.

    The formats are tried in this order, and the first format that matches
    anywhere in the text wins:

    1. ``MM/DD/YY`` or ``MM-DD-YY`` (two-digit year, "00" not accepted)
    2. ``MM/DD/YYYY`` or ``MM-DD-YYYY`` (years 1900-2099)
    3. ``<Month name> DD, YYYY`` with the month's first letter in either case

    Days must always be written with two digits.

    Args:
        text: The text to search.

    Returns:
        A match object whose groups 1, 2 and 3 are month, day and year, or
        ``None`` when no date is found.
    """
    patterns = (
        # Two-digit year. The trailing (?!\d) stops this pattern from matching
        # the first two digits of a four-digit year ("10/20/2019" -> "20").
        r"(1[0-2]|0[1-9])[/-](0[1-9]|[12][0-9]|3[0-1])[/-](0[1-9]|[1-9][0-9])(?!\d)",
        # Four-digit year; month 10 and days 10 and 20 are valid values.
        r"(0[1-9]|1[0-2])[/-](0[1-9]|[12][0-9]|3[0-1])[/-](19[0-9][0-9]|20[0-9][0-9])",
        # Month name, then day and year separated by commas and/or whitespace.
        r"([jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|[jJ]uly"
        r"|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember)"
        r"[,\s]+(0[1-9]|[1-2][0-9]|3[0-1])[,\s]+(19[0-9][0-9]|20[0-9][0-9])",
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found is not None:
            return found
    return None
def format_date(date):
    """Format a date match as ``YYYY-MM-DD``.

    Args:
        date: A match object (or any indexable) whose items 1, 2 and 3 are
            month, day and year, as returned by ``get_date``; may be ``None``.
            The month is either numeric or an English month name.

    Returns:
        The formatted date, or ``None`` when ``date`` is ``None`` or the month
        name is not in the ``months`` table.
    """
    if date is None:
        return None
    month, day, year = date[1], date[2], date[3]
    if month.isdigit():
        # Numeric dates already carry the month number; only pad it.
        month = month.zfill(2)
    else:
        # The table is keyed by capitalised names, but matched names may
        # start with a lower-case letter.
        month = months.get(month.capitalize())
        if month is None:
            return None
    if len(year) == 2:
        # Two-digit years 30-99 are read as 19xx, 00-29 as 20xx.
        century = "19" if int(year[0]) > 2 else "20"
        year = century + year
    return f"{year}-{month}-{day.zfill(2)}"
2021-04-27 21:03:51 +02:00
if __name__ == "__main__":
if len(sys.argv) < 2:
raise Exception("Input file not provided")