# EKS_SteamSearchEngine/search-engine.py
#
# Web file-viewer header (not part of the program): 89 lines, 2.9 KiB, Python.
# Last change: 2022-04-05 23:23:00 +02:00
import numpy as np
import pandas as pd
import time
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
def popularity_enabled(argv):
    """Return True unless ``--no-popularity`` appears among the arguments.

    Args:
        argv: Full argument list in ``sys.argv`` form; element 0 (the
            program name) is ignored.

    Returns:
        ``False`` when ``--no-popularity`` was passed, ``True`` otherwise.
    """
    return '--no-popularity' not in argv[1:]


# Popularity weighting is on by default; the ``--no-popularity`` flag turns it
# off. (The previous expression required at least two arguments and was True
# only when the flag WAS given, i.e. the opposite of what the name says.)
enable_popularity = popularity_enabled(sys.argv)
def get_appid_for_idx(idx):
    """Return the ``appid`` value of the ``steam_data`` row at position ``idx``.

    ``idx`` is positional (it is passed to ``.iloc``), not an index label.
    """
    row = steam_data.iloc[idx]
    return row['appid']
def get_name_for_idx_from_description(idx):
    """Return the game name belonging to the description row at position ``idx``.

    The row's ``appid`` is looked up in ``steam_data_names`` and the ``name``
    of the first matching row is returned. If no row has that ``appid`` the
    final ``.iloc[0]`` raises ``IndexError``.
    """
    wanted_id = get_appid_for_idx(idx)
    has_wanted_id = steam_data_names['appid'] == wanted_id
    matching_names = steam_data_names[has_wanted_id]['name']
    return matching_names.iloc[0]
def get_url_for_idx(idx):
    """Build the Steam store URL for the description row at position ``idx``."""
    return 'https://store.steampowered.com/app/{}/'.format(get_appid_for_idx(idx))
def okapi_bm25(query, document_vectors, vectorizer: 'TfidfVectorizer', *,
               b=0.6, k=1.5):
    """Score every document against one query with the Okapi BM25 formula.

    Args:
        query: Sparse matrix holding exactly one row, e.g. the result of
            ``vectorizer.transform([text])``. Only the column indices of its
            stored entries (the query terms) are used; its values are ignored.
        document_vectors: Sparse document-term matrix, one row per document.
            Its entries are used as the term frequencies and its row sums as
            the document lengths.
        vectorizer: Fitted vectorizer; only its ``idf_`` array is read.
        b: Document-length normalisation strength (0 disables it).
        k: Term-frequency saturation parameter.

    Returns:
        1-D array with one score per row of ``document_vectors``.

    Raises:
        ValueError: If ``query`` does not contain exactly one row (raised by
            the unpacking below).

    NOTE(review): the caller in this file passes the TF-IDF-weighted output of
    ``fit_transform`` as ``document_vectors``, so the "term frequencies" used
    here are already scaled by idf -- confirm that this is intended.
    """
    q, = query  # exactly one query row is expected
    term_ids = q.indices

    # CSC makes selecting the query-term columns cheap.
    tf = document_vectors.tocsc()[:, term_ids]
    # scikit-learn's idf carries a "+ 1" offset; remove it to get the plain
    # logarithmic idf used by BM25.
    idf = vectorizer.idf_[None, term_ids] - 1.

    # Row sums serve as document lengths; compute them once and reuse them
    # for the average length.
    doc_len = np.asarray(document_vectors.sum(1)).ravel()
    avdl = doc_len.mean()

    length_norm = k * (1 - b + b * doc_len / avdl)
    top = tf.multiply(np.broadcast_to(idf, tf.shape)) * (k + 1)
    bot = tf + length_norm[:, None]
    return np.asarray((top / bot).sum(1)).ravel()
def parse_owners(data):
    """Convert an ``owners`` cell into a single number.

    Args:
        data: Cell value. A range written as ``"<low>-<high>"`` yields the
            part after the first dash; a value without a dash is used as is.
            Missing values (``None``, NaN, blank text) are accepted.

    Returns:
        The parsed number as ``float``, or ``1.0`` for a missing value.

    Raises:
        ValueError: If the selected part is not a valid number.
    """
    if data is None:
        return 1.0
    text = str(data).strip()
    if not text or text.lower() == 'nan':
        return 1.0
    parts = text.split('-')
    # "low-high" -> high; a lone number has no dash and is its own bound
    # (indexing [1] unconditionally raised IndexError for such values).
    upper = parts[1] if len(parts) > 1 else parts[0]
    return float(upper)
# --- Load the two CSV files -------------------------------------------------
# Columns are selected by position. The rest of the script reads 'appid' and
# 'detailed_description' from the first frame and 'appid', 'name' and 'owners'
# from the second.
print('Loading dataset...')
steam_data = pd.read_csv(
    'data/steam_description_data.csv',
    usecols=[0, 1, 3],
)
steam_data_names = pd.read_csv(
    'data/steam.csv',
    usecols=[0, 1, 16],
)
print(f'Dataset loaded. Row count: {len(steam_data)}')

# --- Vectorize the descriptions ---------------------------------------------
# norm=None keeps the rows unnormalised; smooth_idf=False selects the
# unsmoothed idf.
print('Vectorizing...')
data_column = steam_data['detailed_description']
data_column_names = steam_data_names['name']
vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
document_vectors = vectorizer.fit_transform(data_column)
print('Done.')
print()
# Popularity weights depend only on the loaded data, so they are built once
# here instead of on every query.
popularities_normalized = None
if enable_popularity:
    # Look owners up by appid VALUE. DataFrame.join(..., on='appid') would
    # match steam_data's appid against the other frame's row index rather than
    # its 'appid' column and so pick owners from unrelated rows.
    owners_by_appid = (
        steam_data_names.drop_duplicates('appid').set_index('appid')['owners']
    )
    # appids without an entry map to NaN, which parse_owners turns into 1.0.
    popularities = (
        steam_data['appid'].map(owners_by_appid).map(parse_owners).to_numpy()
    )
    popularities_normalized = popularities / np.linalg.norm(popularities)

# Interactive prompt: read a query, rank all descriptions with BM25
# (optionally weighted by popularity) and print up to five best matches.
while True:
    print('Enter query: ', end='')
    try:
        query_str = input()
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the loop without a traceback.
        print()
        break

    start_time = time.time()
    query_vector = vectorizer.transform([query_str])
    similarities = okapi_bm25(query_vector, document_vectors, vectorizer)
    if popularities_normalized is not None:
        similarities = np.multiply(similarities, popularities_normalized)
    exec_time = time.time() - start_time

    # Only documents with a positive score count as results.
    results_count = int(np.count_nonzero(similarities > 0))
    # Sort once per query; best score first.
    ranking = np.argsort(similarities)[::-1]

    print()
    print(f'Results for query \'{query_str}\'')
    for i, data_index in enumerate(ranking[:min(5, results_count)], start=1):
        print(f'{i}.')
        print(f'Game: {get_name_for_idx_from_description(data_index)}')
        print(f'Description: {data_column[data_index]}')
        print(f'URL: {get_url_for_idx(data_index)}')
        print(f'Score: {round(similarities[data_index], 3)}')
        print('-'*10)
    print()
    print(f'{results_count} results in {round(exec_time, 5)}s')