Add files with functions
This commit is contained in:
parent
aeba98f8f9
commit
8a1ff852c8
40
get_effective_date.py
Normal file
40
get_effective_date.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november",
          "december"]


def get_effective_date(data, line):
    """Append an ``effective_date=YYYY-MM-DD`` entry to ``data[-1]``.

    The function only acts when the token ``effective_date`` is among the
    first six whitespace-separated words of the (lower-cased) line. It then
    looks for the first four-digit year strictly between 1990 and 2023 that
    has a day and a month among the (at most five) tokens directly before it.

    Month names are rewritten to ``<month number>777`` before non-digit
    characters are stripped, so a month survives the digit filter and can be
    told apart from a day by its ``777`` suffix.

    Args:
        data: list of strings; the last element is extended in place.
        line: raw document text. Literal backslash-n sequences (the two
            characters ``\\`` and ``n``) are treated as spaces.

    Returns:
        The same ``data`` list, possibly with its last element extended.
    """
    text = line.lower().replace(r"\n", " ")

    # The label check does not depend on the token being inspected, so do it
    # once up front instead of re-splitting the line for every token.
    if "effective_date" not in text.split()[:6]:
        return data

    for month_number, month_name in enumerate(months, start=1):
        text = text.replace(month_name, f"{month_number}777")

    # isdecimal() (not isnumeric()) keeps only characters int() can parse.
    tokens = ["".join(ch for ch in word if ch.isdecimal()) for word in text.split()]

    for position, token in enumerate(tokens):
        if len(token) != 4 or not 1990 < int(token) < 2023:
            continue

        parts = [token]
        # Look back over at most five tokens, but never before the start of
        # the document: a negative index would silently wrap around and pick
        # up tokens from the END of the text.
        for back in range(position - 1, max(position - 6, -1), -1):
            candidate = tokens[back]
            if "777" in candidate:
                # Month marker; pad single-digit months ("1777" -> "01777").
                parts.append(f"0{candidate}" if len(candidate) == 4 else candidate)
            elif len(candidate) == 1:
                parts.append(f"0{candidate}")
            elif len(candidate) == 2:
                parts.append(candidate)
            if len(parts) == 3:
                break

        if len(parts) != 3:
            # No complete date in front of this year; try the next candidate.
            continue

        year, nearest, farthest = parts
        if "777" in nearest:
            month, day = nearest[:-3], farthest
        elif "777" in farthest:
            month, day = farthest[:-3], nearest
        else:
            # Two plain numbers: the one next to the year is taken as the day.
            month, day = farthest, nearest
        data[-1] += f"effective_date={year}-{month}-{day} "
        break

    return data
|
31
get_jurisdiction.py
Normal file
31
get_jurisdiction.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import re

us_states = ['new york', 'delaware', 'california', 'massachusetts', 'texas', 'alaska', 'arizona', 'arkansas', 'alabama',
             'colorado', 'connecticut', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa',
             'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'michigan', 'minnesota', 'mississippi', 'missouri',
             'montana', 'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'north carolina',
             'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina',
             'south dakota', 'tennessee', 'utah', 'vermont', 'virginia', 'washington', 'west virginia', 'wisconsin',
             'wyoming']

# One alternation over all state names. Matching whole words in a single
# left-to-right scan means "west virginia" is consumed as one match (and is
# not also counted as "virginia"), and "kansas" cannot match inside "arkansas".
_STATE_ALTERNATION = "|".join(re.escape(state) for state in sorted(us_states, key=len, reverse=True))
_OF_STATE_RE = re.compile(rf"\bof ({_STATE_ALTERNATION})\b")
_ANY_STATE_RE = re.compile(rf"\b({_STATE_ALTERNATION})\b")


def _most_mentioned_state(pattern, text):
    """Return the state matched most often by ``pattern`` in ``text``, or None.

    Ties go to the state listed first in ``us_states``.
    """
    counts = {}
    for match in pattern.finditer(text):
        name = match.group(1)
        counts[name] = counts.get(name, 0) + 1

    best_state = None
    best_count = 0
    for state in us_states:
        if counts.get(state, 0) > best_count:
            best_count = counts[state]
            best_state = state
    return best_state


def get_jurisdiction(data, line):
    """Append a ``jurisdiction=<State_Name>`` entry to ``data[-1]``.

    Nothing is written unless the token ``jurisdiction`` is among the first
    five whitespace-separated words of the raw line. The state is the one
    mentioned most often as ``of <state>``; if no such phrase occurs, the
    state mentioned most often on its own is used instead.

    Args:
        data: list of strings; the last element is extended in place.
        line: raw document text. Literal backslash-n sequences (the two
            characters ``\\`` and ``n``) are treated as spaces.

    Returns:
        The same ``data`` list, possibly with its last element extended.
    """
    # Checked on the raw line (before lower-casing), as the label is matched
    # case-sensitively.
    if "jurisdiction" not in line.split()[:5]:
        return data

    text = line.lower().replace(r"\n", " ")

    state = _most_mentioned_state(_OF_STATE_RE, text)
    if state is None:
        state = _most_mentioned_state(_ANY_STATE_RE, text)
    if state is None:
        return data

    label = state.title().replace(" ", "_")
    data[-1] += f"jurisdiction={label} "
    return data
|
37
get_parties.py
Normal file
37
get_parties.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
wrong_words = ["you", "the", "this", "me", "distributor", "any", "us", "residents", "employer"]


def _read_party_name(tokens, start):
    """Collect the words of one party name beginning at ``tokens[start]``.

    Reading stops at "a", "an", "and", a word starting with "(" or "[", or
    after seven words. Returns ``(name, stop)`` where ``stop`` is the index
    of the word that ended the name, or ``None`` if the tokens ran out first.
    """
    words = []
    for position in range(start, len(tokens)):
        word = tokens[position]
        if len(words) == 7 or word in ("a", "an", "and") or word[0] in "([":
            return " ".join(words), position
        words.append(word)
    return " ".join(words), None


def get_parties(data, line):
    """Append ``party=<Name>`` entries for the contracting parties to ``data[-1]``.

    Only the first 100 words of the line are examined (commas removed,
    lower-cased), and nothing is written unless ``party`` is among the first
    seven of them. A party name is read after each ``between`` that is not
    followed by a word from ``wrong_words`` or a bracketed word; when that
    name is ended by ``and`` (or by the seven-word limit landing on ``and``),
    a second name is read after it. The search stops once two names have been
    collected.

    Args:
        data: list of strings; the last element is extended in place.
        line: raw document text. Literal backslash-n sequences (the two
            characters ``\\`` and ``n``) are treated as spaces.

    Returns:
        The same ``data`` list, possibly with its last element extended.
    """
    line = " ".join(line.lower().replace(r"\n", " ").replace(",", "").split())
    tokens = line.split()[:100]
    if "party" not in tokens[:7]:
        return data

    parties = []
    for position, token in enumerate(tokens):
        # A trailing "between" has no following word to inspect; indexing
        # past the end used to raise IndexError here.
        if token != "between" or position + 1 >= len(tokens):
            continue
        following = tokens[position + 1]
        if following in wrong_words or following[0] in "([":
            continue

        first_name, stop = _read_party_name(tokens, position + 1)
        parties.append(first_name)
        if stop is not None and tokens[stop] == "and":
            second_name, _ = _read_party_name(tokens, stop + 1)
            parties.append(second_name)

        # Empty names still count here, matching the original stop rule.
        if len(parties) >= 2:
            break

    for name in parties:
        if name != "":
            # Replace the longer form first so "incorporated" becomes "Inc."
            # instead of "Inc.d".
            name = name.replace("incorporated", "Inc.").replace("incorporate", "Inc.")
            name = name.title().replace(" ", "_")
            data[-1] += f"party={name} "

    return data
|
83
get_term.py
Normal file
83
get_term.py
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.ensemble import GradientBoostingClassifier
|
||||||
|
|
||||||
|
|
||||||
|
word_numbers_to_integers = {
    "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
    "ten": "10", "eleven": "11", "twelve": "12", "fifteen": "15", "twenty": "20", "fifty": "50"
}


def _most_frequent_quantity(tokens, unit):
    """Return the number that most often directly precedes a ``unit`` word.

    A word counts as a unit word when it contains ``unit`` (so "years" and
    "year," match "year"). The word before it is reduced to its alphanumeric
    characters, translated through ``word_numbers_to_integers`` when it is
    exactly a known number word, and then reduced to its digits. Among the
    resulting digit strings the most frequent wins; ties go to the larger
    value. Returns ``None`` when no number is found.
    """
    quantities = []
    for position, token in enumerate(tokens):
        # position 0 has no preceding word; index -1 would wrap to the end.
        if position == 0 or unit not in token:
            continue
        previous = "".join(ch for ch in tokens[position - 1] if ch.isalnum())
        # Whole-word lookup: a substring replace would corrupt ordinary words
        # ("written" contains "ten") and compound numbers.
        previous = word_numbers_to_integers.get(previous, previous)
        # isdecimal() (not isnumeric()) keeps only characters int() can parse.
        digits = "".join(ch for ch in previous if ch.isdecimal())
        if digits:
            quantities.append(digits)

    if not quantities:
        return None
    # dict.fromkeys gives the distinct values in first-seen order.
    return max(dict.fromkeys(quantities), key=lambda value: (quantities.count(value), int(value)))


def get_term(data, line, vectorizer, model):
    """Append a ``term=<N>_year(s)`` or ``term=<N>_month(s)`` entry to ``data[-1]``.

    Nothing is written unless the token ``term`` is among the first five
    whitespace-separated words of the raw line and ``model.predict`` returns
    1 for the vectorized, lower-cased line. Years are used when the words
    "year"/"years" occur at least as often as "month"/"months", otherwise
    months. The reported number is the one that most often directly precedes
    the chosen unit.

    Args:
        data: list of strings; the last element is extended in place.
        line: raw document text. Literal backslash-n sequences (the two
            characters ``\\`` and ``n``) are treated as spaces.
        vectorizer: object whose ``transform([text])`` output is accepted by
            ``model.predict``.
        model: object whose ``predict(features)`` result compares equal to 1
            when the line should be processed.

    Returns:
        The same ``data`` list, possibly with its last element extended.
    """
    first_five_words = line.split()[:5]
    line = line.lower().replace(r"\n", " ")

    # Cheap label check first, so the model is only run when needed.
    if "term" not in first_five_words:
        return data
    query_vector = vectorizer.transform([line])
    if not (model.predict(query_vector) == 1):
        return data

    tokens = line.split()
    years_count = tokens.count("year") + tokens.count("years")
    months_count = tokens.count("month") + tokens.count("months")
    if years_count == 0 and months_count == 0:
        return data

    unit = "year" if years_count >= months_count else "month"
    quantity = _most_frequent_quantity(tokens, unit)
    if quantity is not None and int(quantity) > 0:
        # quantity is a string, so it must be converted before comparing to 1.
        suffix = unit if int(quantity) == 1 else f"{unit}s"
        data[-1] += f"term={quantity}_{suffix} "

    return data
|
Loading…
Reference in New Issue
Block a user