kleister-nda/get_term.py

84 lines
3.8 KiB
Python
Raw Permalink Normal View History

2022-05-04 16:47:57 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
# Spelled-out English numbers mapped to the digit strings they stand for.
# Coverage is partial: one..twelve, plus fifteen, twenty and fifty.
word_numbers_to_integers = dict(
    one="1",
    two="2",
    three="3",
    four="4",
    five="5",
    six="6",
    seven="7",
    eight="8",
    nine="9",
    ten="10",
    eleven="11",
    twelve="12",
    fifteen="15",
    twenty="20",
    fifty="50",
)
def _number_token_to_digits(token):
    """Return the decimal digits expressed by *token*, or "" if there are none.

    Digits embedded in the token win (``"(5)"`` -> ``"5"``). A token without
    any digit is looked up, stripped of punctuation, as a spelled-out number
    in ``word_numbers_to_integers`` (``"two,"`` -> ``"2"``).
    """
    cleaned = "".join(char for char in token if char.isalnum())
    # isdecimal (not isnumeric) so that every kept character is accepted by
    # int(); characters such as "½" or "²" are numeric but make int() raise.
    digits = "".join(char for char in cleaned if char.isdecimal())
    if digits:
        return digits
    # Exact-token lookup: substring replacement would turn "nineteen" into
    # "9teen" and then into a bogus "9".
    return word_numbers_to_integers.get(cleaned, "")


def _numbers_before_unit(tokens, unit):
    """Collect the numbers directly preceding each token that contains *unit*."""
    numbers = []
    for index, token in enumerate(tokens):
        # The first token has no predecessor; tokens[-1] would silently wrap
        # around to the last token of the line.
        if index == 0 or unit not in token:
            continue
        digits = _number_token_to_digits(tokens[index - 1])
        if digits:
            numbers.append(digits)
    return numbers


def _most_frequent_number(numbers):
    """Return the most frequent digit string; ties go to the larger value.

    Returns ``None`` when *numbers* is empty.
    """
    if not numbers:
        return None
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic even when two spellings share one int value.
    return max(
        dict.fromkeys(numbers),
        key=lambda number: (numbers.count(number), int(number)),
    )


def get_term(data, line, vectorizer, model):
    """Append a ``term=<n>_years `` / ``term=<n>_months `` tag to ``data[-1]``.

    The tag is added only when all of the following hold:

    * ``model`` predicts label ``1`` for the vectorized line,
    * the bare token ``"term"`` is among the first five whitespace-separated
      tokens of the raw line,
    * the line contains at least one ``year(s)`` or ``month(s)`` token, and
    * a positive number is found directly in front of such a unit.

    When several numbers are found, the most frequent one is used and ties
    are resolved in favour of the larger value.

    Args:
        data: List whose last element is a string; that element is extended
            in place.
        line: Raw text line to inspect.
        vectorizer: Object with a ``transform(list_of_str)`` method
            (presumably a fitted ``TfidfVectorizer`` -- confirm with caller).
        model: Object with a ``predict(features)`` method returning one label
            per sample (presumably a fitted ``GradientBoostingClassifier`` --
            confirm with caller).

    Returns:
        The same ``data`` list, modified in place when a term was found.
    """
    # Taken from the raw line, before lower-casing: the "term" check below is
    # therefore case-sensitive and does not match "Term" or "term.".
    first_five_words = line.split()[:5]
    # r"\n" is the literal two-character sequence backslash + "n", not a
    # real newline character.
    line = line.lower().replace(r"\n", " ")
    query_vector = vectorizer.transform([line])
    # One sample goes in, so exactly one label comes back.
    if model.predict(query_vector)[0] != 1 or "term" not in first_five_words:
        return data

    tokens = line.split()
    years_count = tokens.count("year") + tokens.count("years")
    months_count = tokens.count("month") + tokens.count("months")
    if years_count == 0 and months_count == 0:
        return data

    # Years win when both units are mentioned equally often.
    unit = "year" if years_count >= months_count else "month"
    number = _most_frequent_number(_numbers_before_unit(tokens, unit))
    if number is not None and int(number) > 0:
        # NOTE(review): the previous code had a `number == 1` branch meant to
        # emit a singular "_year"/"_month", but it compared a str with an int
        # and never ran. The plural suffix is what has always been emitted, so
        # it is kept to leave the output format unchanged -- confirm the
        # expected format before introducing a singular form.
        data[-1] += f"term={number}_{unit}s "
    return data