84 lines
3.8 KiB
Python
84 lines
3.8 KiB
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.ensemble import GradientBoostingClassifier
|
|
|
|
|
|
# Maps a spelled-out English number (lowercase, single word) to its digit
# string. Values are strings, not ints, so they can be substituted directly
# into text. Covers one through twenty plus the round tens; hyphenated
# compounds such as "twenty-five" are not covered.
word_numbers_to_integers = {
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
    "eleven": "11",
    "twelve": "12",
    "thirteen": "13",
    "fourteen": "14",
    "fifteen": "15",
    "sixteen": "16",
    "seventeen": "17",
    "eighteen": "18",
    "nineteen": "19",
    "twenty": "20",
    "thirty": "30",
    "forty": "40",
    "fifty": "50",
    "sixty": "60",
    "seventy": "70",
    "eighty": "80",
    "ninety": "90",
}
|
|
|
|
|
|
def _numbers_before_unit(tokens, unit):
    """Return the integers found in the token preceding each *unit* token.

    A token counts as a unit token when it contains *unit* as a substring
    (so "years", "year," and "year." all match "year"). The preceding token
    is stripped of non-alphanumeric characters, converted through
    ``word_numbers_to_integers`` when it is exactly a known number word,
    and then reduced to its decimal digits. Tokens that yield no digits are
    skipped.
    """
    numbers = []
    # Pairing each token with its predecessor starts at index 1 on purpose:
    # a unit word in first position has no preceding token, and indexing
    # with -1 would silently wrap around to the end of the line.
    for previous, current in zip(tokens, tokens[1:]):
        if unit not in current:
            continue
        candidate = "".join(char for char in previous if char.isalnum())
        # Exact-token lookup rather than substring replacement, so that e.g.
        # "nineteen" is not turned into "9teen" and then misread as 9.
        candidate = word_numbers_to_integers.get(candidate, candidate)
        # isdecimal() keeps only characters int() can parse; isnumeric()
        # would also admit characters such as "½" that make int() raise.
        digits = "".join(char for char in candidate if char.isdecimal())
        if digits:
            numbers.append(int(digits))
    return numbers


def _most_frequent_number(numbers):
    """Return the most frequent value in *numbers*; ties go to the larger one.

    Returns 0 when *numbers* is empty.
    """
    return max(
        set(numbers),
        key=lambda number: (numbers.count(number), number),
        default=0,
    )


def get_term(data, line, vectorizer, model):
    """Append a ``term=<n>_<unit>`` tag to ``data[-1]`` when *line* states a term.

    The line is processed only if the exact token ``"term"`` is among its
    first five whitespace-separated words and the classifier predicts label
    ``1`` for the lowercased line. The unit is "year" when the line contains
    at least as many "year"/"years" tokens as "month"/"months" tokens,
    otherwise "month". The number chosen is the most frequent number that
    directly precedes a unit word (ties resolved toward the larger number).
    Nothing is appended when no positive number is found.

    Args:
        data: Mutable sequence whose last element is a string; it is
            modified in place. Must be non-empty when a term is found.
        line: The text line to inspect.
        vectorizer: Object with a ``transform(list_of_str)`` method whose
            result is accepted by ``model.predict``.
        model: Object with a ``predict`` method returning an indexable
            result whose first element is the predicted label.

    Returns:
        The same *data* object, possibly with a tag appended to its last
        element.

    Raises:
        IndexError: If a term is found but *data* is empty.
    """
    # NOTE(review): this check runs on the raw line, so it is case-sensitive
    # and "Term"/"term:" do not match. Kept as-is; confirm that is intended.
    # It is evaluated first because it is far cheaper than the model call.
    if "term" not in line.split()[:5]:
        return data

    # r"\n" is the literal two-character sequence backslash + "n" (e.g. from
    # escaped text), not a newline; real newlines are handled by split().
    line = line.lower().replace(r"\n", " ")

    prediction = model.predict(vectorizer.transform([line]))
    if prediction[0] != 1:
        return data

    tokens = line.split()
    years_count = tokens.count("year") + tokens.count("years")
    months_count = tokens.count("month") + tokens.count("months")
    if years_count == 0 and months_count == 0:
        return data

    unit = "year" if years_count >= months_count else "month"
    term = _most_frequent_number(_numbers_before_unit(tokens, unit))
    if term > 0:
        label = unit if term == 1 else f"{unit}s"
        data[-1] += f"term={term}_{label} "

    return data
|