Better normalization

This commit is contained in:
emkarcinos 2022-04-06 10:21:38 +02:00
parent f14696c41b
commit c84af9ba00

View File

@@ -39,7 +39,7 @@ def okapi_bm25(query, document_vectors, vectorizer: TfidfVectorizer):
 def parse_owners(data):
     data = str(data)
     if data == 'nan':
-        return 1.0
+        return 1000.0
     return float(data.split('-')[1])
@@ -67,9 +67,11 @@ while True:
 vectorizer.inverse_transform
 similarities = okapi_bm25(query_vector, document_vectors, vectorizer)
 if enable_popularity:
-popularities = steam_data.join(steam_data_names, on='appid', lsuffix='name')['owners'].map(parse_owners).values
+popularities = steam_data.join(steam_data_names, on='appid', lsuffix='name', how='left')['owners'].map(parse_owners).values
 popularities_normalized = popularities / np.linalg.norm(popularities)
 similarities = np.multiply(similarities, popularities_normalized)
+similarities = similarities / np.linalg.norm(similarities)
 exec_time = time.time() - start_time
 results_count = len([x for x in similarities if x > 0])