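"""Preprocess the Steam games dataset (data/games.csv): derive a positive-rating percentage and an
owner-range width, score each game with the fuzzy popularity controller, merge the categorical
columns into tag lists, and, if the GoogleNews word2vec binary is available locally, replace the
merged tags with summed 300-dimensional embeddings."""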

import gensim
import numpy as np
import pandas as pd

from fuzzy_controllers import fuzzy_controler_popularity


def calculate_positive_percentage(positive_ratings: int, negative_ratings: int) -> float:
    """Return the share of positive ratings as a percentage, rounded to two decimals."""
    return round((100 * positive_ratings) / (positive_ratings + negative_ratings), 2)


def owners_average_max_min(owners: str) -> int:
    """Return the width of an owners bracket such as "20000-50000" (upper bound minus lower bound)."""
    lower, upper = owners.split("-")[0], owners.split("-")[-1]
    return int(upper) - int(lower)


def replace(row):
    """Split a semicolon-separated tag string into a sorted list of unique words."""
    words = list(set(row.split(';')))
    words.sort()
    return words


def vectorize(embeddings, word):
    """Look up the embedding for a word, falling back to a 300-dimensional zero vector for out-of-vocabulary words."""
    try:
        vector = embeddings[word]
    except KeyError:
        vector = np.zeros(300)
    return vector


def replace_with_vector(row, w2v):
    """Embed every unique word of a semicolon-separated tag string and return the element-wise sum of the vectors."""
    words = set(row.split(';'))
    vectors = [vectorize(w2v, word) for word in words]
    return np.array(vectors).sum(axis=0)


if __name__ == '__main__':
    df = pd.read_csv('data/games.csv')

    # Derived features: rating percentage, owner-range width and a fuzzy popularity score.
    df['positive_percentage'] = df.apply(
        lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
    df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
    df['fuzzy_popularity'] = df.apply(lambda row: fuzzy_controler_popularity(price=row.price,
                                                                             game_length=row.average_playtime,
                                                                             rating=row.positive_percentage,
                                                                             number_of_owners=row.owners), axis=1)

    # Merge the categorical columns into a single sorted, de-duplicated word list.
    df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags']
    df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower())
    df['all_categorical'] = df['all_categorical'].apply(replace)

    df.to_csv('data/games_processed.csv', index=False, encoding='utf-8')

    try:
        w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                                              binary=True)

        df2 = pd.read_csv('data/games_processed.csv')

        # Rebuild the combined tag string and replace it with the summed word2vec embedding.
        df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags']
        df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower())
        df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v))
        df2.drop('temp', inplace=True, axis=1)

        # Written with to_pickle; the .csv extension is kept so downstream paths stay unchanged.
        df2.to_pickle('data/games_processed_vectorized.csv')
    except FileNotFoundError:
        print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
              'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization.')