# Forked from kubapok/kleister-nda-clone (126 lines, 3.2 KiB, Python).
import csv
|
|
import sys
|
|
import lzma
|
|
import os
|
|
import regex as re
|
|
|
|
# Name of the predictions file; it is created next to the input file.
out_file_name = "out.tsv"
# Separator written in front of every key=value pair of an output line.
out_sep = "\t"

# English month name -> zero-padded month number ("January" -> "01").
months = {
    month_name: f"{month_number:02d}"
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}
|
|
|
|
|
|
def main(fname: str):
    """Write one line of extracted fields for every row of the input file.

    The input is opened through ``lzma`` as UTF-8 text and parsed as
    tab-separated rows. Each row is passed to ``process_entry`` and the
    returned string is written, followed by a newline, to ``out_file_name``
    in the same directory as ``fname``.

    Args:
        fname: Path to the LZMA-compressed, tab-separated input file.
    """
    out_path = os.path.join(os.path.dirname(fname), out_file_name)

    # The output encoding is pinned to UTF-8 to mirror the input: extracted
    # values are copied from the UTF-8 text, so relying on the platform's
    # default encoding could raise UnicodeEncodeError or produce a file in a
    # locale-dependent encoding.
    with lzma.open(fname, mode="rt", encoding="utf-8") as tsv_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        for item in csv.reader(tsv_file, delimiter='\t'):
            out_file.write(process_entry(item) + '\n')
|
|
|
|
|
|
def process_entry(entry: list) -> str:
    """Build the output line for a single input row.

    ``entry[1]`` is read as a space-separated list of requested field names
    and ``entry[2]`` as the text to search. For every requested name among
    ``party``, ``jurisdiction`` and ``effective_date`` whose value can be
    extracted, ``out_sep + "<name>=<value>"`` is appended to the result, so a
    non-empty result starts with the separator. Other names are ignored.

    Args:
        entry: One parsed row; indexes 1 and 2 must exist.

    Returns:
        The concatenated key=value pairs, or ``""`` when nothing was found.
    """
    requested = entry[1].split(" ")
    text = entry[2]

    line = ""
    for name in requested:
        if name == "party":
            value = get_party(text)
        elif name == "jurisdiction":
            value = get_jurisdiction(text)
        elif name == "effective_date":
            # The date is located first and only then normalised; either
            # step may come back empty-handed.
            found = get_date(text)
            value = None if found is None else format_date(found)
        else:
            continue

        if value is not None:
            line += out_sep + name + "=" + value
    return line
|
|
|
|
|
|
def get_party(text: str):
    """Extract a party name that follows the word "between" in ``text``.

    The name is a run of letters, whitespace, dots and commas that ends in
    one of the suffixes ``inc.``/``Inc.``/``INC.``/``Llc``/``LLC``/``llc``,
    itself followed by a comma or a period. The run is matched greedily, so
    it extends to the last qualifying suffix the run can reach.

    The pattern uses ``\\p{L}``, so ``re`` must be the third-party ``regex``
    module that the file imports under that name.

    Args:
        text: Document text to search.

    Returns:
        The name with commas removed, words joined by ``_`` and a normalised
        ``_Inc.`` or ``_LLC`` suffix, or ``None`` when nothing matches.
    """
    # The dots after "inc" are escaped: unescaped they matched ANY character,
    # so ordinary words such as "incl," were accepted as a company suffix.
    m = re.search(
        r"between([\p{L}\s.,]+)(inc\.|Inc\.|INC\.|Llc|LLC|llc)[,.]", text)
    if m is None:
        return None

    # split() collapses every whitespace run (including tabs and newlines,
    # which the pattern's \s admits) so the value stays on one output line
    # and never contains doubled underscores.
    name = "_".join(m.group(1).replace(",", "").split())
    suffix = "LLC" if m.group(2).lower() == "llc" else "Inc."
    return name + "_" + suffix
|
|
|
|
|
|
def get_jurisdiction(text: str):
    """Extract the state named in a "laws of the State of ..." clause.

    Matches ``law``/``laws of the State of`` or ``... Commonwealth of``
    followed by one or two capitalised words (e.g. ``Utah``, ``New York``).

    Args:
        text: Document text to search.

    Returns:
        The state name with spaces replaced by ``_``, or ``None`` when no
        such clause is found.
    """
    # No trailing context is required after the name. The previous pattern
    # demanded ".+" followed by " and", "," or "."; when the name was
    # directly followed by the terminator the engine backtracked INTO the
    # captured name to satisfy ".+", yielding e.g. "Uta" for "State of Utah.".
    m = re.search(
        r"laws? of the (?:State|Commonwealth) of "
        r"([A-Z][a-z]+(?: [A-Z][a-z]+)?)",
        text)
    if m is None:
        return None
    return m.group(1).replace(" ", "_")
|
|
|
|
|
|
# MM/DD/YYYY or MM/DD/YY with "/" or "-" separators. Four-digit years are
# tried before two-digit ones and the year must not be followed by another
# digit, so "01/15/2010" is never read as the two-digit year "20".
_NUMERIC_DATE = re.compile(
    r"(0[1-9]|1[0-2])[/-](0[1-9]|[12][0-9]|3[01])[/-]"
    r"(19[0-9][0-9]|20[0-9][0-9]|0[1-9]|[1-9][0-9])(?![0-9])")

# "<Month> DD, YYYY" with the month's first letter in either case. The day
# must be written with two digits; single-digit days are not matched.
_NAMED_DATE = re.compile(
    r"([jJ]anuary|[fF]ebruary|[mM]arch|[aA]pril|[mM]ay|[jJ]une|[jJ]uly"
    r"|[aA]ugust|[sS]eptember|[oO]ctober|[nN]ovember|[dD]ecember)"
    r"[,\s]+(0[1-9]|[12][0-9]|3[01])[,\s]+(19[0-9][0-9]|20[0-9][0-9])")


def get_date(text: str):
    """Find the first date in ``text``.

    Numeric dates take priority over dates with a spelled-out month,
    regardless of which appears first in the text.

    Args:
        text: Document text to search.

    Returns:
        A match object whose groups 1, 2 and 3 are the month (two digits or
        a month name), the two-digit day and the year (two or four digits),
        or ``None`` when no date is found.
    """
    for pattern in (_NUMERIC_DATE, _NAMED_DATE):
        found = pattern.search(text)
        if found is not None:
            return found
    return None
|
|
|
|
|
|
def format_date(date):
    """Format a date match as ``YYYY-MM-DD``.

    Args:
        date: Match object (or ``None``) whose groups 1, 2 and 3 hold the
            month, the day and the year, as returned by ``get_date``.

    Returns:
        The formatted date string, or ``None`` when ``date`` is ``None``.
    """
    if date is None:
        return None

    month, day, year = date[1], date[2], date[3]

    # Month names are translated to their number; numeric months fall
    # through unchanged. The lookup is done on the capitalised form because
    # get_date also accepts names with a lowercase initial ("january"),
    # which previously missed the table and leaked into the output as-is.
    month = months.get(month.capitalize(), month)

    # Two-digit years: 30-99 are taken as 19xx, 00-29 as 20xx.
    if len(year) == 2:
        century = "19" if int(year[0]) > 2 else "20"
        year = century + year

    return f"{year}-{month}-{day}"
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the input file.
    cli_args = sys.argv[1:]
    if not cli_args:
        raise Exception("Input file not provided")
    main(cli_args[0])
|