From c84af9ba00fbdd39a64da2d59cdf006b681bc095 Mon Sep 17 00:00:00 2001 From: emkarcinos Date: Wed, 6 Apr 2022 10:21:38 +0200 Subject: [PATCH] Better normalization --- search-engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/search-engine.py b/search-engine.py index 4b69879..6872f26 100644 --- a/search-engine.py +++ b/search-engine.py @@ -39,7 +39,7 @@ def okapi_bm25(query, document_vectors, vectorizer: TfidfVectorizer): def parse_owners(data): data = str(data) if data == 'nan': - return 1.0 + return 1000.0 return float(data.split('-')[1]) @@ -67,9 +67,11 @@ while True: vectorizer.inverse_transform similarities = okapi_bm25(query_vector, document_vectors, vectorizer) if enable_popularity: - popularities = steam_data.join(steam_data_names, on='appid', lsuffix='name')['owners'].map(parse_owners).values + popularities = steam_data.join(steam_data_names, on='appid', lsuffix='name', how='left')['owners'].map(parse_owners).values popularities_normalized = popularities / np.linalg.norm(popularities) similarities = np.multiply(similarities, popularities_normalized) + + similarities = similarities / np.linalg.norm(similarities) exec_time = time.time() - start_time results_count = len([x for x in similarities if x > 0])