Add solution

This commit is contained in:
nlitkowski 2021-04-27 23:18:56 +02:00
parent 72f787c21b
commit 99d7fabb68
2 changed files with 196 additions and 5 deletions

83
dev-0/out.tsv Normal file
View File

@ -0,0 +1,83 @@
effective_date=2014-05-20 jurisdiction=New_York party=LIQUIDMETAL_TECHNOLOGIES_Inc.
effective_date=2012-06-21 jurisdiction=New_York
jurisdiction=Delaware party=JDA_Software_Group_Inc.
effective_date=2009-02-23 jurisdiction=Massachusetts party=Kenneth_M._Bate_and_NitroMed_Inc.
jurisdiction=Delaware
jurisdiction=Washington
jurisdiction=New
effective_date=2008-03-21 jurisdiction=New_York
effective_date=2015-12-11
effective_date=2012-12-17 jurisdiction=Delaware
effective_date=2000-05-23 party=uDate.com_Inc.
jurisdiction=New_York party=Virgin_Mobile_USA_LLC
effective_date=2012-01-25 jurisdiction=Massachusetts
effective_date=2008-07-31 jurisdiction=Minnesota party=Cogent_Inc.
jurisdiction=California party=Penumbra_Inc.
effective_date=2020-11-25 jurisdiction=New_York party=AOL_Inc.
effective_date=2017-07-11 jurisdiction=Delaware
effective_date=2018-12-28 party=Flexsteel_Industries_Inc.
effective_date=2012-03-31 jurisdiction=California
jurisdiction=Virginia
effective_date=2004-10-11 jurisdiction=North_Carolina
jurisdiction=Arizona
jurisdiction=Indiana
jurisdiction=New_Jersey
effective_date=2004-02-27 jurisdiction=California
effective_date=2018-03-30 jurisdiction=Delaware party=Jamba_Inc.
jurisdiction=Georgia
jurisdiction=New_York
effective_date=2015-06-23 jurisdiction=New_York
jurisdiction=California
jurisdiction=California
party=CafePress_Inc.
effective_date=2017-01-13
jurisdiction=Ohio
party=GigPeak_Inc.
effective_date=2000-02-10 jurisdiction=Minnesota
jurisdiction=California
party=Opsware_Inc.
jurisdiction=Minnesota party=Flexsteel_Industries_Inc.
effective_date=2010-06-23 jurisdiction=Texas
jurisdiction=New_Jersey party=AlgoRx_Pharmaceuticals_Inc.
effective_date=2001-01-26 jurisdiction=Washington party=Corus_Pharma_Inc.
effective_date=2005-02-16
effective_date=2014-11-26
jurisdiction=Oregon
jurisdiction=Delaware
effective_date=2012-06-11 jurisdiction=Delaware party=Lightwave_Logic_Inc.
jurisdiction=Delaware party=Webex_Communications_Inc.
effective_date=2007-04-30 jurisdiction=Massachusetts
effective_date=2011-10-25
effective_date=2012-10-15
effective_date=2016-03-15 jurisdiction=Delaware
effective_date=2005-09-15 jurisdiction=Illinois
jurisdiction=Idaho
jurisdiction=Washington
effective_date=2016-07-15 jurisdiction=New_York party=Wizard_World_Inc.
jurisdiction=New_York
effective_date=2005-12-31
effective_date=2015-03-16 jurisdiction=Utah
jurisdiction=Delaware
effective_date=2004-02-29 jurisdiction=Virginia
effective_date=2006-12-19 jurisdiction=New_York
effective_date=2010-07-13
effective_date=2000-12-11 jurisdiction=Illinois party=Motorola_Inc.
effective_date=2011-12-31 party=GigOptix_LLC
effective_date=2011-03-29 jurisdiction=Texas
effective_date=2011-05-26 jurisdiction=California party=Skyworks_Solutions_Inc.
jurisdiction=Delaware
jurisdiction=Washington
effective_date=2011-01-18 jurisdiction=New_York
jurisdiction=Washington
jurisdiction=Illinois
Can't render this file because it has a wrong number of fields in line 2.

118
main.py
View File

@ -1,15 +1,123 @@
import csv
import sys
import lzma
import os
import regex as re
# Name of the result file that main() creates in the input file's directory.
out_file_name = "out.tsv"

# Separator used when emitting the key=value pairs of one output line.
out_sep = '\t'

# English month name -> zero-padded two-digit month number.
months = dict(zip(
    (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ),
    (
        "01", "02", "03", "04", "05", "06",
        "07", "08", "09", "10", "11", "12",
    ),
))
def main(fname: str):
    """Extract the requested fields from every row of an xz-compressed TSV.

    Each row of ``fname`` is handed to ``process_entry`` and the returned
    line is written, in input order, to ``out_file_name`` inside the same
    directory as ``fname``.

    Args:
        fname: Path to the xz-compressed, tab-separated input file.
    """
    out_path = os.path.join(os.path.dirname(fname), out_file_name)
    with lzma.open(fname, mode="rt", encoding="utf-8") as tsv_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        # The columns hold raw document text, so quote characters must not
        # be interpreted: with the default dialect a field starting with '"'
        # would swallow tabs and line breaks up to the next quote.
        reader = csv.reader(tsv_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        # Written as UTF-8 explicitly: extracted party names may contain
        # non-ASCII letters, and the platform default encoding may not
        # be able to represent them.
        for item in reader:
            out_file.write(process_entry(item) + '\n')
def process_entry(entry: list) -> str:
    """Build one output line for a single input row.

    ``entry[1]`` is read as a space-separated list of field names and
    ``entry[2]`` as the text to search.  The names ``party``,
    ``jurisdiction`` and ``effective_date`` are recognised; any other name
    is ignored, and a field whose value cannot be found is omitted.

    Args:
        entry: One parsed input row with at least three columns.

    Returns:
        The found ``name=value`` pairs, in the order the names were
        requested, joined by ``out_sep``.  Empty when nothing was found.

    Raises:
        IndexError: If ``entry`` has fewer than three columns.
    """
    requested = entry[1].split(" ")
    text = entry[2]

    fields = []
    for name in requested:
        if name == "party":
            value = get_party(text)
        elif name == "jurisdiction":
            value = get_jurisdiction(text)
        elif name == "effective_date":
            value = format_date(get_date(text))
        else:
            value = None
        if value is not None:
            fields.append(f"{name}={value}")

    # Join instead of prefixing every pair with the separator, so the line
    # does not start with an empty field.
    return out_sep.join(fields)
def get_party(text: str):
    """Return the company named after the word "between", or ``None``.

    Matches "between", a run of letters, whitespace, dots and commas, and
    then an "Inc." or "LLC" suffix (any letter case) that is directly
    followed by a comma or a period.

    Args:
        text: Text to search.

    Returns:
        The name with commas removed and whitespace runs turned into single
        underscores, followed by ``_Inc.`` or ``_LLC``; ``None`` when the
        pattern does not match or the name part is empty.
    """
    # NOTE: \p{L} requires the third-party ``regex`` module, which this file
    # imports under the name ``re``.
    # The dot of "inc." is escaped; unescaped it matches any character, so
    # words such as "province," were accepted as a company suffix.
    m = re.search(r"between([\p{L}\s.,]+)((?i:inc\.|llc))[,.]", text)
    if m is None:
        return None

    # split()/join collapses every whitespace run (including tabs and line
    # breaks matched by \s) so the value can never break the output line.
    name = "_".join(m.group(1).replace(",", "").split())
    if not name:
        return None

    suffix = "Inc." if m.group(2).lower() == "inc." else "LLC"
    return f"{name}_{suffix}"
def get_jurisdiction(text: str):
    """Return the state named in a governing-law phrase, or ``None``.

    Looks for the first occurrence of "law(s) of the State of X" or
    "law(s) of the Commonwealth of X", where X is one or two capitalised
    words separated by a single space.

    Args:
        text: Text to search.

    Returns:
        X with the space replaced by an underscore (e.g. ``New_York``), or
        ``None`` when no such phrase is present.
    """
    # The closing \b pins the capture to whole words.  Requiring "some text
    # and then a delimiter" after the name instead lets the regex engine
    # shorten the name to satisfy it (e.g. "New York." -> "New Yor").
    m = re.search(
        r"laws? of the (?:State|Commonwealth) of "
        r"([A-Z][a-z]+(?: [A-Z][a-z]+)?)\b",
        text)
    if m is None:
        return None
    return m.group(1).replace(" ", "_")
# Numeric date, month first: mm/dd/yyyy or mm/dd/yy, with "/" or "-".
# The look-arounds keep the pattern from matching inside a longer run of
# digits; without them 05/20/2014 is read as 05/20/20 and becomes 2020.
_NUMERIC_DATE_RE = re.compile(
    r"(?<!\d)(0[1-9]|1[0-2])[/-](0[1-9]|[12][0-9]|3[01])[/-]"
    r"((?:19|20)[0-9]{2}|[0-9]{2})(?!\d)")

# Written-out date: "<Month> <day> <year>", separated by commas and/or
# whitespace.  The day may have one or two digits; the year has four.
_NAMED_DATE_RE = re.compile(
    r"\b(January|February|March|April|May|June|July|August|September"
    r"|October|November|December)[,\s]+(3[01]|[12][0-9]|0?[1-9])[,\s]+"
    r"((?:19|20)[0-9]{2})(?!\d)",
    re.IGNORECASE)


def get_date(text: str):
    """Find the first date in ``text``.

    A numeric date (``mm/dd/yy`` or ``mm/dd/yyyy``) anywhere in the text
    takes precedence over a written-out one ("May 5, 2014").

    Args:
        text: Text to search.

    Returns:
        A match object whose groups 1, 2 and 3 hold the month, day and year
        exactly as they appear in the text, or ``None`` if no date is found.
    """
    found = _NUMERIC_DATE_RE.search(text)
    if found is None:
        found = _NAMED_DATE_RE.search(text)
    return found


def format_date(date):
    """Render a match returned by ``get_date`` as ``YYYY-MM-DD``.

    Args:
        date: Object indexable with 1, 2 and 3 for month, day and year
            (such as the match from ``get_date``), or ``None``.

    Returns:
        The formatted date, or ``None`` when ``date`` is ``None``.
    """
    if date is None:
        return None

    month, day, year = date[1], date[2], date[3]

    # Month names are matched case-insensitively but the lookup table is
    # keyed by the capitalised name; numeric months are not in the table
    # and pass through unchanged.
    month = months.get(month.capitalize(), month)

    if len(year) == 2:
        # Two-digit years: 30-99 are taken as 19xx, 00-29 as 20xx.
        century = "19" if int(year[0]) > 2 else "20"
        year = century + year

    # Pad so that single-digit days still give a fixed-width date.
    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
if __name__ == "__main__":
if len(sys.argv) < 2: