Added files
This commit is contained in:
parent
a6e8809b08
commit
5803d66f08
20
README.md
20
README.md
@ -1,3 +1,23 @@
|
|||||||
# EKS_SteamSearchEngine
|
# EKS_SteamSearchEngine
|
||||||
|
|
||||||
A task for Information Extraction which provides you an interface to query for Steam games descriptions.
|
A task for Information Extraction which provides you an interface to query for Steam games descriptions.
|
||||||
|
|
||||||
|
Dataset used: https://www.kaggle.com/datasets/nikdavis/steam-store-games
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
- `python3`
|
||||||
|
- (Optional) Kaggle setup with an API token
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
1. The dataset is available for download on Kaggle. Either download it manually from https://www.kaggle.com/datasets/nikdavis/steam-store-games and extract the `.zip` file contents into the `data` folder, or, if you have Kaggle set up with an API token on your machine, proceed directly to the next step.
|
||||||
|
2. Run `./setup.sh`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
Run `python3 search-engine.py`.
|
||||||
|
|
||||||
|
The program should load the data and provide you with a query interface.
|
||||||
|
|
||||||
|
### Additional arguments
|
||||||
|
|
||||||
|
The search engine also uses each game's popularity when ranking the query results. You can disable this by passing the `--no-popularity` argument to the script.
|
||||||
|
|
||||||
|
0
data/.keepdir
Normal file
0
data/.keepdir
Normal file
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
numpy==1.22.3
|
||||||
|
scikit-learn==1.0.2
|
||||||
|
pandas==1.4.2
|
89
search-engine.py
Normal file
89
search-engine.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
|
# Popularity weighting is on by default. `--no-popularity` is the opt-out
# switch, so the presence of the flag anywhere among the command-line
# arguments must turn the feature OFF (the previous check was inverted and
# additionally required a second argument to be present).
enable_popularity = '--no-popularity' not in sys.argv[1:]
|
||||||
|
|
||||||
|
def get_appid_for_idx(idx):
    """Return the ``appid`` stored at row position ``idx`` of ``steam_data``.

    Args:
        idx: Zero-based positional row index into the module-level
            ``steam_data`` frame.

    Returns:
        The value of the ``appid`` column for that row.
    """
    row = steam_data.iloc[idx]
    return row['appid']
|
||||||
|
|
||||||
|
|
||||||
|
def get_name_for_idx_from_description(idx):
    """Return the game name for row position ``idx`` of ``steam_data``.

    The row's ``appid`` is resolved first and then looked up in the
    module-level ``steam_data_names`` frame; the first matching row's
    ``name`` is returned.

    Args:
        idx: Zero-based positional row index into ``steam_data``.

    Raises:
        IndexError: If no row of ``steam_data_names`` carries that appid.
    """
    wanted = get_appid_for_idx(idx)
    matching_rows = steam_data_names[steam_data_names['appid'] == wanted]
    names = matching_rows['name']
    return names.iloc[0]
|
||||||
|
|
||||||
|
|
||||||
|
def get_url_for_idx(idx):
    """Build the Steam store URL for row position ``idx`` of ``steam_data``.

    Args:
        idx: Zero-based positional row index into ``steam_data``.

    Returns:
        str: Store page URL containing the row's appid.
    """
    return f'https://store.steampowered.com/app/{get_appid_for_idx(idx)}/'
|
||||||
|
|
||||||
|
|
||||||
|
def okapi_bm25(query, document_vectors, vectorizer: TfidfVectorizer):
    """Score every document against one query with an Okapi BM25-style formula.

    Args:
        query: Sparse matrix containing exactly one row (the vectorized
            query); only the column indices of its non-zero entries are used.
        document_vectors: Sparse document-term matrix, one row per document.
        vectorizer: Fitted ``TfidfVectorizer`` whose IDF vector is read.

    Returns:
        1-D array holding one score per row of ``document_vectors``.

    Raises:
        ValueError: If ``query`` does not contain exactly one row.
    """
    # NOTE(review): the caller passes the output of a TfidfVectorizer here, so
    # the per-term weights are presumably already IDF-scaled rather than raw
    # term counts -- confirm this is the intended input for BM25.
    saturation = 1.5      # BM25 "k" parameter
    length_weight = 0.6   # BM25 "b" parameter

    (query_row,) = query  # fails loudly unless there is exactly one query
    term_ids = query_row.indices

    # Keep only the columns of the terms that occur in the query.
    term_weights = document_vectors.tocsc()[:, term_ids]
    # The fitted IDF values are shifted down by one before use.
    term_idf = vectorizer._tfidf.idf_[None, term_ids] - 1.

    # Row sums serve as the document "length"; their mean is the average length.
    row_totals = document_vectors.sum(1)
    mean_length = row_totals.mean()
    lengths = row_totals.A1

    numerator = term_weights.multiply(
        np.broadcast_to(term_idf, term_weights.shape)
    ) * (saturation + 1)
    length_norm = saturation * (
        1 - length_weight + length_weight * lengths / mean_length
    )
    denominator = term_weights + length_norm[:, None]

    # Sum the per-term contributions into one score per document.
    return (numerator / denominator).sum(1).A1
|
||||||
|
|
||||||
|
|
||||||
|
def parse_owners(data):
    """Convert a raw ``owners`` cell into a single popularity figure.

    The value is expected to be a range string such as ``"10000-20000"``;
    the upper bound (the part after the last hyphen) is returned. A bare
    number without a hyphen is accepted as well instead of raising
    ``IndexError``.

    Args:
        data: Raw cell value; it is converted with ``str()`` first, so
            floats (including ``NaN``) and ``None`` are accepted.

    Returns:
        float: The upper bound of the range, or ``1.0`` when the value is
        missing (``NaN``, ``None``, ``<NA>`` or an empty string).

    Raises:
        ValueError: If the relevant part of the text is not a number.
    """
    text = str(data).strip()
    if text.lower() in ('nan', 'none', '<na>', ''):
        return 1.0

    # rpartition yields the whole text when there is no hyphen at all.
    upper_bound = text.rpartition('-')[2]
    return float(upper_bound)
|
||||||
|
|
||||||
|
# --- Dataset loading -------------------------------------------------------
print('Loading dataset...')

# Only three columns are read from each CSV. Elsewhere the script accesses
# 'appid' and 'detailed_description' on steam_data, and 'appid', 'name' and
# 'owners' on steam_data_names, so the selected positions presumably map to
# those columns -- verify against the CSV headers.
steam_data = pd.read_csv(
    'data/steam_description_data.csv',
    usecols=[0, 1, 3],
)
steam_data_names = pd.read_csv(
    'data/steam.csv',
    usecols=[0, 1, 16],
)
print(f'Dataset loaded. Row count: {steam_data.shape[0]}')

# --- Vectorization ---------------------------------------------------------
print('Vectorizing...')

data_column = steam_data['detailed_description']
data_column_names = steam_data_names['name']

# norm=None and smooth_idf=False switch off row normalisation and IDF
# smoothing in the fitted weights.
vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
document_vectors = vectorizer.fit_transform(data_column)

print('Done.')
print()
|
||||||
|
|
||||||
|
# Popularity weights depend only on the dataset, never on the query, so they
# are computed once here instead of on every loop iteration.
if enable_popularity:
    # Look the owners value up by appid. (DataFrame.join(..., on='appid')
    # would match the appid values against the *row index* of
    # steam_data_names and therefore pick up owners of unrelated rows.)
    owners_by_appid = (
        steam_data_names.drop_duplicates('appid').set_index('appid')['owners']
    )
    popularities = (
        steam_data['appid'].map(owners_by_appid).map(parse_owners).to_numpy()
    )
    popularities_normalized = popularities / np.linalg.norm(popularities)

# Interactive loop: read a query, score all documents, print the top hits.
while True:
    try:
        query_str = input('Enter query: ')
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session cleanly instead of with a traceback.
        print()
        break

    start_time = time.time()
    query_vector = vectorizer.transform([query_str])
    similarities = okapi_bm25(query_vector, document_vectors, vectorizer)
    if enable_popularity:
        similarities = np.multiply(similarities, popularities_normalized)
    exec_time = time.time() - start_time

    # Only documents with a strictly positive score count as results.
    results_count = int(np.count_nonzero(similarities > 0))
    # Sort once (best score first) rather than once per printed result.
    ranking = similarities.argsort()[::-1]

    print()
    print(f'Results for query \'{query_str}\'')
    # Show at most the five best-scoring results.
    for rank, data_index in enumerate(ranking[:min(5, results_count)], start=1):
        print(f'{rank}.')
        print(f'Game: {get_name_for_idx_from_description(data_index)}')
        print(f'Description: {data_column[data_index]}')
        print(f'URL: {get_url_for_idx(data_index)}')
        print(f'Score: {round(similarities[data_index], 3)}')
        print('-'*10)
        print()

    print(f'{results_count} results in {round(exec_time, 5)}s')
|
13
setup.sh
Normal file
13
setup.sh
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#!/usr/bin/env bash
# Project setup: fetch the Steam dataset into data/ (unless steam.csv is
# already there), rename the appid header of the description CSV, and install
# the Python dependencies.

# Abort on the first failing command so later steps never run against a
# missing directory or a missing CSV file.
set -e

# Work relative to the script's own location, not the caller's directory.
cd "$(dirname "$0")/data"

if [ ! -f "steam.csv" ]; then
    echo "Downloading dataset from Kaggle..."
    # The dataset is addressed by its <owner>/<dataset> slug; --unzip extracts
    # the CSV files so the sed step below can find them.
    kaggle datasets download -d nikdavis/steam-store-games --unzip
    echo "Done."
fi

# Rename the header column `steam_appid` to `appid` (first line only).
sed -i -e '1!b;s/steam_appid/appid/' steam_description_data.csv
cd ..

echo "Installing dependencies..."
pip3 install -r requirements.txt
|
Loading…
Reference in New Issue
Block a user