Added files

2022-04-05 23:23:00 +02:00 · 2022-04-05 23:23:00 +02:00 · 5803d66f08
commit 5803d66f08
parent a6e8809b08
6 changed files with 201 additions and 76 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,23 @@
 # EKS_SteamSearchEngine

 A task for Information Extraction which provides you an interface to query for Steam games descriptions.
+
+Dataset used: https://www.kaggle.com/datasets/nikdavis/steam-store-games
+
+## Requirements
+- `python3`
+- (Optional) Kaggle CLI set up with an API token
+
+## Setup
+1. The dataset is available for download on Kaggle. You can either download it manually from https://www.kaggle.com/datasets/nikdavis/steam-store-games and extract the `.zip` file contents into the `data` folder, or, if you have Kaggle set up with an API token on your machine, skip straight to the next step.
+2. Run `./setup.sh`
+
+## Usage
+Run `python3 search-engine.py`.
+
+The program should load the data and provide you a query interface.
+
+### Additional arguments
+
+The search engine also uses each game's popularity when ranking the query results. You can disable this by passing the `--no-popularity` argument to the script.
+
--- a/data/.keepdir
+++ b/data/.keepdir
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
+numpy==1.22.3
+scikit-learn==1.0.2
+pandas==1.4.2
--- a/search-engine.py
+++ b/search-engine.py
@ -0,0 +1,89 @@
+import numpy as np
+import pandas as pd
+import time
+import sys
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+# Popularity weighting is on by default; passing --no-popularity anywhere on
+# the command line switches it off.
+enable_popularity = '--no-popularity' not in sys.argv[1:]
+
+def get_appid_for_idx(idx):
+    """Return the 'appid' value of the steam_data row at position idx."""
+    row = steam_data.iloc[idx]
+    return row['appid']
+
+
+def get_name_for_idx_from_description(idx):
+    """Return the 'name' of the first steam_data_names row whose appid matches
+    the appid of the steam_data row at position idx."""
+    wanted_id = get_appid_for_idx(idx)
+    has_wanted_id = steam_data_names['appid'] == wanted_id
+    matching_rows = steam_data_names[has_wanted_id]
+    return matching_rows['name'].iloc[0]
+
+
+def get_url_for_idx(idx):
+    """Build the store page URL from the appid of the steam_data row at idx."""
+    game_id = get_appid_for_idx(idx)
+    store_url = f'https://store.steampowered.com/app/{game_id}/'
+    return store_url
+
+
+def okapi_bm25(query, document_vectors, vectorizer: TfidfVectorizer):
+    """Score every document against a query with an Okapi BM25-style formula.
+
+    Args:
+        query: Sparse matrix holding exactly one row (the vectorized query);
+            only the column indices of its stored entries are used.
+        document_vectors: Sparse document-term matrix, one row per document.
+        vectorizer: Fitted TfidfVectorizer that supplies the per-term idf.
+
+    Returns:
+        1-D numpy array with one score per row of ``document_vectors``.
+    """
+    b = 0.6  # strength of the document-length normalization
+    k = 1.5  # term-weight saturation
+    q, = query  # unpacking fails unless the query has exactly one row
+    tf = document_vectors.tocsc()[:, q.indices]
+    # Public accessor for the fitted idf vector; the "- 1." strips the constant
+    # 1 that scikit-learn adds to every idf value.
+    idf = vectorizer.idf_[None, q.indices] - 1.
+    # Row sums act as document lengths; compute them once and reuse them for
+    # the average length instead of summing the whole matrix twice.
+    doc_len = document_vectors.sum(1).A1
+    avdl = doc_len.mean()
+
+    top = tf.multiply(np.broadcast_to(idf, tf.shape)) * (k + 1)
+    bot = tf + (k * (1 - b + b * doc_len / avdl))[:, None]
+
+    return (top / bot).sum(1).A1
+
+
+def parse_owners(data):
+    """Convert an owners value such as '10000-20000' to a float.
+
+    The value is stringified first. A value whose string form is 'nan' yields
+    1.0; otherwise the part after the first '-' is returned as a float.
+    """
+    text = str(data)
+    if text != 'nan':
+        bounds = text.split('-')
+        return float(bounds[1])
+
+    return 1.0
+
+print('Loading dataset...')
+
+# Only a subset of columns is read from each CSV (selected by position).
+steam_data = pd.read_csv('data/steam_description_data.csv', usecols=[0, 1, 3])
+steam_data_names = pd.read_csv('data/steam.csv', usecols=[0, 1, 16])
+row_count = len(steam_data)
+print(f'Dataset loaded. Row count: {row_count}')
+
+print('Vectorizing...')
+# The detailed descriptions are the documents that get indexed.
+data_column = steam_data['detailed_description']
+data_column_names = steam_data_names['name']
+vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
+document_vectors = vectorizer.fit_transform(data_column)
+print('Done.')
+print()
+
+# Popularity weights do not depend on the query, so build them once up front.
+if enable_popularity:
+    # Look owners up by appid. DataFrame.join(on='appid') would match against
+    # the other frame's index (plain row numbers here), pairing wrong games.
+    owners_by_appid = steam_data_names.drop_duplicates('appid').set_index('appid')['owners']
+    # Games without a match become NaN, which parse_owners turns into 1.0.
+    popularities = steam_data['appid'].map(owners_by_appid).map(parse_owners).to_numpy()
+    popularities_normalized = popularities / np.linalg.norm(popularities)
+
+# Interactive prompt: read a query, score all documents, print the top 5.
+while True:
+    try:
+        query_str = input('Enter query: ')
+    except (EOFError, KeyboardInterrupt):
+        # End of input or Ctrl-C: leave the prompt loop without a traceback.
+        print()
+        break
+
+    start_time = time.time()
+    query_vector = vectorizer.transform([query_str])
+    similarities = okapi_bm25(query_vector, document_vectors, vectorizer)
+    if enable_popularity:
+        similarities = np.multiply(similarities, popularities_normalized)
+    exec_time = time.time() - start_time
+
+    # Only documents with a strictly positive score count as results.
+    results_count = int(np.count_nonzero(similarities > 0))
+    # Rank once per query (best score first) instead of re-sorting per result.
+    ranking = similarities.argsort()[::-1]
+
+    print()
+    print(f'Results for query \'{query_str}\'')
+    for i, data_index in enumerate(ranking[:min(5, results_count)], start=1):
+        print(f'{i}.')
+        print(f'Game: {get_name_for_idx_from_description(data_index)}')
+        print(f'Description: {data_column[data_index]}')
+        print(f'URL: {get_url_for_idx(data_index)}')
+        print(f'Score: {round(similarities[data_index], 3)}')
+        print('-'*10)
+        print()
+
+    print(f'{results_count} results in {round(exec_time, 5)}s')
--- a/setup.sh
+++ b/setup.sh
@ -0,0 +1,13 @@
+# Work inside the data directory; stop if it does not exist.
+cd data || exit 1
+if [ ! -f "steam.csv" ];
+then
+    echo "Downloading dataset from Kaggle..."
+    # -d takes the <owner>/<dataset-name> slug, not the full URL; --unzip
+    # extracts the CSV files so the steps below can find them.
+    kaggle datasets download -d nikdavis/steam-store-games --unzip
+    echo "Done."
+fi
+
+# Rename the steam_appid header to appid (first line only) so that
+# search-engine.py can read the 'appid' column from this file.
+sed -i -e '1!b;s/steam_appid/appid/' steam_description_data.csv
+cd ..
+
+echo "Installing dependencies..."
+pip3 install -r requirements.txt