Add files with functions

This commit is contained in:
Ryszard Staruch 2022-05-04 16:47:57 +02:00
parent aeba98f8f9
commit 8a1ff852c8
4 changed files with 191 additions and 0 deletions

40
get_effective_date.py Normal file
View File

@ -0,0 +1,40 @@
months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november",
          "december"]


def get_effective_date(data, line):
    """Append an ``effective_date=YYYY-MM-DD `` entry to ``data[-1]`` if a date is found.

    The line is lower-cased, literal backslash-n sequences become spaces and
    every month name is replaced by ``<month number>777`` so that, once all
    non-digit characters are stripped from each token, a month is still
    recognisable by its ``777`` suffix. The first four-digit token that lies
    strictly between 1990 and 2023 is taken as the year; up to five tokens
    *before* it are then inspected for a day and a month.

    Args:
        data: List of output strings; only the last element is extended.
        line: Input line. Nothing is written unless the token
            ``effective_date`` is among its first six whitespace-separated
            tokens.

    Returns:
        The same ``data`` list (mutated in place when a date was found).
    """
    line = line.lower().replace(r"\n", " ")
    # Month names are replaced as plain substrings, exactly as before; the
    # "777" marker survives the digit filtering below.
    for month_number, month_name in enumerate(months, start=1):
        line = line.replace(month_name, f"{month_number}777")

    words = line.split()
    # The key check does not depend on the candidate year, so do it once.
    if "effective_date" not in words[:6]:
        return data

    # isdecimal (not isnumeric) so that every kept character is accepted by
    # int(); characters such as superscript digits are dropped.
    tokens = ["".join(char for char in word if char.isdecimal()) for word in words]

    for position, token in enumerate(tokens):
        if len(token) != 4 or not 1990 < int(token) < 2023:
            continue

        date_parts = [token]
        # Look back at most five tokens but never past the start of the
        # document: a negative index would silently read tokens from the end
        # of the line (or raise IndexError on very short input).
        for back in range(1, min(5, position) + 1):
            earlier = tokens[position - back]
            if len(earlier) == 1:
                date_parts.append(f"0{earlier}")
            elif len(earlier) == 2:
                date_parts.append(earlier)
            elif "777" in earlier:
                # Single-digit month ("1777") is zero-padded to "01777".
                date_parts.append(f"0{earlier}" if len(earlier) == 4 else earlier)
            if len(date_parts) == 3:
                break

        if len(date_parts) == 3:
            year, nearest, farthest = date_parts
            if "777" in nearest:
                month, day = nearest[:-3], farthest
            elif "777" in farthest:
                month, day = farthest[:-3], nearest
            else:
                # Two plain numbers: the one further from the year is the month.
                month, day = farthest, nearest
            data[-1] += f"effective_date={year}-{month}-{day} "
            break
    return data

31
get_jurisdiction.py Normal file
View File

@ -0,0 +1,31 @@
us_states = ['new york', 'delaware', 'california', 'massachusetts', 'texas', 'alaska', 'arizona', 'arkansas', 'alabama',
             'colorado', 'connecticut', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa',
             'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'michigan', 'minnesota', 'mississippi', 'missouri',
             'montana', 'nebraska', 'nevada', 'new hampshire', 'new jersey', 'new mexico', 'north carolina',
             'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode island', 'south carolina',
             'south dakota', 'tennessee', 'utah', 'vermont', 'virginia', 'washington', 'west virginia', 'wisconsin',
             'wyoming']


def _count_state_mentions(text, state, prefix=""):
    """Count occurrences of ``prefix + state`` that are not part of a longer state name."""
    needle = prefix + state
    total = text.count(needle)
    # "virginia" is a substring of "west virginia" and "kansas" of "arkansas";
    # without this correction the shorter name is credited with every mention
    # of the longer one.
    for other in us_states:
        longer = prefix + other
        if other != state and needle in longer:
            total -= text.count(longer)
    return total


def _most_mentioned_state(text, prefix=""):
    """Return the state with the most mentions, or None; ties keep the earliest list entry."""
    best_state = None
    best_count = 0
    for state in us_states:
        count = _count_state_mentions(text, state, prefix)
        if count > best_count:
            best_state, best_count = state, count
    return best_state


def get_jurisdiction(data, line):
    """Append a ``jurisdiction=<State> `` entry to ``data[-1]`` if a US state is found.

    Mentions of the form ``of <state>`` are preferred; only when there are
    none are bare state names counted. The most frequently mentioned state
    wins, with ties resolved in favour of the state listed first in
    ``us_states``. The state is written title-cased with spaces replaced by
    underscores.

    Args:
        data: List of output strings; only the last element is extended.
        line: Input line. Nothing is written unless ``jurisdiction`` is among
            its first five whitespace-separated tokens (checked before
            lower-casing, so the match is case-sensitive).

    Returns:
        The same ``data`` list (mutated in place when a state was found).
    """
    first_five_words = line.split()[:5]
    if "jurisdiction" not in first_five_words:
        return data

    line = line.lower().replace(r"\n", " ")
    state = _most_mentioned_state(line, "of ")
    if state is None:
        state = _most_mentioned_state(line)
    if state is not None:
        jurisdiction = state.title().replace(" ", "_")
        data[-1] += f"jurisdiction={jurisdiction} "
    return data

37
get_parties.py Normal file
View File

@ -0,0 +1,37 @@
wrong_words = ["you", "the", "this", "me", "distributor", "any", "us", "residents", "employer"]

# Words that terminate a party name while it is being collected.
_PARTY_STOP_WORDS = ("a", "an", "and")
# A party name is cut off after this many words.
_MAX_PARTY_WORDS = 7


def _collect_party(tokens, anchor):
    """Collect the party name that follows ``tokens[anchor]``.

    Returns ``(name, stop_index)`` where ``stop_index`` is the index of the
    token that ended the name, or ``None`` if the scan ran out of tokens.
    The final token of the document is never examined (unchanged from the
    original scan bounds).
    """
    words = []
    for offset in range(1, len(tokens) - anchor - 1):
        word = tokens[anchor + offset]
        if offset > _MAX_PARTY_WORDS or word in _PARTY_STOP_WORDS or word[0] in "([":
            return " ".join(words), anchor + offset
        words.append(word)
    return " ".join(words), None


def get_parties(data, line):
    """Append ``party=<Name> `` entries to ``data[-1]`` for parties named after "between".

    Only the first 100 tokens of the lower-cased, comma-free line are
    considered. A party starts right after the word ``between`` (unless the
    next token is in ``wrong_words`` or opens a bracket) and ends at ``a``,
    ``an``, ``and``, a bracketed token, or after seven words. Scanning stops
    as soon as two parties have been collected. Names are title-cased, spaces
    become underscores and "incorporate(d)" is abbreviated to "Inc.".

    Args:
        data: List of output strings; only the last element is extended.
        line: Input line. Nothing is written unless ``party`` is among its
            first seven tokens.

    Returns:
        The same ``data`` list (mutated in place when parties were found).
    """
    import re

    tokens = line.lower().replace(r"\n", " ").replace(",", "").split()[:100]
    if "party" not in tokens[:7]:
        return data

    parties = []
    # Stop one short of the end: a trailing "between" has no following token
    # to inspect (indexing it used to raise IndexError).
    for position in range(len(tokens) - 1):
        if tokens[position] != "between":
            continue
        candidate = tokens[position + 1]
        if candidate in wrong_words or candidate[0] in "([":
            continue

        first_party, stop = _collect_party(tokens, position)
        parties.append(first_party)
        # NOTE(review): a second party is only picked up when "and" is the
        # very token that ended the first one; this mirrors the effective
        # behaviour of the previous implementation - confirm it is intended.
        if stop is not None and tokens[stop] == "and":
            second_party, _ = _collect_party(tokens, stop)
            parties.append(second_party)
        if len(parties) >= 2:
            break

    for name in parties:
        if name:
            # Whole-word match so "incorporated" becomes "Inc." instead of
            # "Inc.D", and words merely containing "incorporate" are left alone.
            name = re.sub(r"\bincorporated?\b\.?", "inc.", name)
            name = name.title().replace(" ", "_")
            data[-1] += f"party={name} "
    return data

83
get_term.py Normal file
View File

@ -0,0 +1,83 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
word_numbers_to_integers = {
    "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
    "ten": "10", "eleven": "11", "twelve": "12", "fifteen": "15", "twenty": "20", "fifty": "50"
}


def _most_common_quantity(tokens, unit):
    """Return the number that most often precedes a token containing ``unit``.

    Each preceding token is reduced to its alphanumeric characters, converted
    from a spelled-out number when it is exactly one of the words in
    ``word_numbers_to_integers``, and then reduced to its decimal digits.
    Ties in frequency are resolved in favour of the larger number. Returns 0
    when no number is found.
    """
    quantities = []
    for position, token in enumerate(tokens):
        # position 0 has no predecessor; index -1 would wrap to the last token.
        if position == 0 or unit not in token:
            continue
        previous = "".join(char for char in tokens[position - 1] if char.isalnum())
        # Exact-word lookup: a substring replace would turn "fourteen" into "4teen".
        previous = word_numbers_to_integers.get(previous, previous)
        # isdecimal guarantees int() accepts every remaining character.
        digits = "".join(char for char in previous if char.isdecimal())
        if digits:
            quantities.append(int(digits))
    if not quantities:
        return 0
    return max(set(quantities), key=lambda value: (quantities.count(value), value))


def get_term(data, line, vectorizer, model):
    """Append a ``term=<N>_<unit> `` entry to ``data[-1]`` if a contract term is found.

    The unit is years when the tokens "year"/"years" occur at least as often
    as "month"/"months", otherwise months. The quantity is the number that
    most often directly precedes a token containing that unit. The unit is
    written in the singular only for a quantity of exactly 1.

    Args:
        data: List of output strings; only the last element is extended.
        line: Input line. Nothing is written unless ``term`` is among its
            first five whitespace-separated tokens (checked before
            lower-casing, so the match is case-sensitive).
        vectorizer: Object whose ``transform([text])`` result is passed to
            ``model.predict`` (presumably a fitted TfidfVectorizer - confirm
            against the caller).
        model: Object whose ``predict(...)`` result must compare equal to 1
            for extraction to proceed.

    Returns:
        The same ``data`` list (mutated in place when a term was found).
    """
    first_five_words = line.split()[:5]
    # Cheap key check first so the model is not run for lines that can never
    # produce output.
    if "term" not in first_five_words:
        return data

    line = line.lower().replace(r"\n", " ")
    query_vector = vectorizer.transform([line])
    if not (model.predict(query_vector) == 1):
        return data

    tokens = line.split()
    years_count = tokens.count("year") + tokens.count("years")
    months_count = tokens.count("month") + tokens.count("months")
    if years_count == 0 and months_count == 0:
        return data

    unit = "year" if years_count >= months_count else "month"
    quantity = _most_common_quantity(tokens, unit)
    if quantity > 0:
        label = unit if quantity == 1 else f"{unit}s"
        data[-1] += f"term={quantity}_{label} "
    return data