"""Interactive Okapi BM25 search over Steam store game descriptions.

Reads the description and metadata CSV files from ``data/``, vectorizes the
``detailed_description`` column, then answers queries typed on stdin with the
five best-scoring games.  Scores are weighted by owner counts unless the
first command-line argument is ``--no-popularity``.
"""

import sys
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Popularity weighting is on unless the first CLI argument disables it.
enable_popularity = not (len(sys.argv) >= 2 and sys.argv[1] == '--no-popularity')

# Owner count assumed for games whose 'owners' value is missing.
DEFAULT_OWNERS = 1000.0
# Maximum number of hits printed per query.
MAX_RESULTS = 5


def get_appid_for_idx(idx):
    """Return the ``appid`` stored in positional row ``idx`` of ``steam_data``."""
    return steam_data.iloc[idx]['appid']


def get_name_for_idx_from_description(idx):
    """Return the game name for positional row ``idx`` of ``steam_data``.

    The name is looked up in ``steam_data_names`` by app id; the first
    matching row wins.  If no metadata row carries that app id a placeholder
    string is returned so that result printing is not interrupted.
    """
    app_id = get_appid_for_idx(idx)
    names = steam_data_names.loc[steam_data_names['appid'] == app_id, 'name']
    if names.empty:
        return f'<unknown title, appid {app_id}>'
    return names.iloc[0]


def get_url_for_idx(idx):
    """Return the Steam store URL for positional row ``idx`` of ``steam_data``."""
    app_id = get_appid_for_idx(idx)
    return f'https://store.steampowered.com/app/{app_id}/'


def okapi_bm25(query, document_vectors, vectorizer: TfidfVectorizer, b=0.6, k=1.5):
    """Score every document against one query with the Okapi BM25 formula.

    Args:
        query: Single-row sparse matrix produced by ``vectorizer.transform``.
        document_vectors: Sparse (documents x vocabulary) matrix produced by
            the same vectorizer.  Its values are used as term frequencies
            and its row sums as document lengths.
        vectorizer: Fitted vectorizer supplying ``idf_``.  One is subtracted
            from each IDF value, which undoes the "+ 1" scikit-learn adds
            when ``smooth_idf=False``.
        b: Strength of the document-length normalization.
        k: Term-frequency saturation parameter.

    Returns:
        1-D float array holding one score per document.  All zeros when no
        query term is in the vocabulary.

    Raises:
        ValueError: If ``query`` does not contain exactly one row.
    """
    if query.shape[0] != 1:
        raise ValueError(f'expected a single-row query, got {query.shape[0]} rows')

    n_docs = document_vectors.shape[0]
    # Column indices of the vocabulary terms that occur in the query.
    term_ids = query.tocsr().indices
    if term_ids.size == 0:
        return np.zeros(n_docs)

    # np.asarray(...).ravel() copes with both np.matrix and ndarray row sums.
    doc_len = np.asarray(document_vectors.sum(axis=1), dtype=float).ravel()
    avdl = doc_len.mean()
    if avdl == 0:
        # Every document is empty, so nothing can match.
        return np.zeros(n_docs)

    # Only the query's columns are needed; CSC makes column slicing cheap and
    # the resulting (documents x query terms) slice is small enough to densify.
    tf = document_vectors.tocsc()[:, term_ids].toarray()
    idf = vectorizer.idf_[term_ids] - 1.0

    length_norm = k * (1.0 - b + b * doc_len / avdl)
    numerator = tf * idf * (k + 1.0)
    denominator = tf + length_norm[:, None]
    return (numerator / denominator).sum(axis=1)


def parse_owners(data):
    """Convert an ``owners`` cell into a float owner count.

    For a ``'<low>-<high>'`` string the second field is returned; a value
    without a dash is parsed as a plain number.  Missing values (NaN/None or
    the string ``'nan'``) yield ``DEFAULT_OWNERS``.
    """
    text = str(data)
    if pd.isna(data) or text == 'nan':
        return DEFAULT_OWNERS
    parts = text.split('-')
    return float(parts[1] if len(parts) > 1 else parts[0])


print('Loading dataset...')
steam_data = pd.read_csv('data/steam_description_data.csv', usecols=[0, 1, 3])
steam_data_names = pd.read_csv('data/steam.csv', usecols=[0, 1, 16])
print(f'Dataset loaded. Row count: {len(steam_data)}')

print('Vectorizing...')
vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
data_column = steam_data['detailed_description']
data_column_names = steam_data_names['name']
# NOTE(review): BM25 is normally defined on raw term counts, but these vectors
# are TF-IDF weighted (norm=None), so the "tf" and document lengths used by
# okapi_bm25 are IDF-scaled.  Confirm that this is intended.
# Converted to CSC once so that per-query column slicing does not re-convert.
document_vectors = vectorizer.fit_transform(data_column).tocsc()
print('Done.')
print()

if enable_popularity:
    # The popularity weights do not depend on the query, so build them once.
    # Owners are looked up by app id; games without a metadata row get NaN
    # here and therefore DEFAULT_OWNERS from parse_owners.
    owners_by_appid = (
        steam_data_names.drop_duplicates(subset='appid')
        .set_index('appid')['owners']
    )
    popularities = (
        steam_data['appid'].map(owners_by_appid).map(parse_owners).to_numpy()
    )
    popularities_normalized = popularities / np.linalg.norm(popularities)

while True:
    try:
        query_str = input('Enter query: ')
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session without a traceback.
        print()
        break

    start_time = time.perf_counter()
    query_vector = vectorizer.transform([query_str])
    similarities = okapi_bm25(query_vector, document_vectors, vectorizer)
    if enable_popularity:
        similarities = similarities * popularities_normalized
    score_norm = np.linalg.norm(similarities)
    if score_norm > 0:
        # Skip the rescaling when nothing matched to avoid a 0/0 division.
        similarities = similarities / score_norm
    exec_time = time.perf_counter() - start_time

    results_count = int(np.count_nonzero(similarities > 0))
    # Rank once, best score first, instead of re-sorting for every hit.
    ranking = np.argsort(similarities)[::-1]

    print()
    print(f"Results for query '{query_str}'")
    shown = min(MAX_RESULTS, results_count)
    for rank, data_index in enumerate(ranking[:shown], start=1):
        print(f'{rank}.')
        print(f'Game: {get_name_for_idx_from_description(data_index)}')
        print(f'Description: {data_column.iloc[data_index]}')
        print(f'URL: {get_url_for_idx(data_index)}')
        print(f'Score: {round(similarities[data_index], 3)}')
        print('-' * 10)
        print()
    print(f'{results_count} results in {round(exec_time, 5)}s')