# fuzzy-game-recommender/process_dataset.py

import pandas as pd
from fuzzy_controllers import fuzzy_controler_popularity
import gensim
import numpy as np
def calculate_positive_percentage(positive_ratings: int, negative_ratings: int) -> float:
    """Return the share of positive ratings as a percentage.

    Args:
        positive_ratings: Number of positive ratings.
        negative_ratings: Number of negative ratings.

    Returns:
        ``100 * positive / (positive + negative)`` rounded to two decimal
        places, or ``0.0`` when there are no ratings at all.
    """
    total_ratings = positive_ratings + negative_ratings
    # An unrated entry would otherwise divide by zero.
    if total_ratings == 0:
        return 0.0
    return round((100 * positive_ratings) / total_ratings, 2)
def owners_average_max_min(owners: str) -> int:
    """Return the width of an owners range written as ``"<low>-<high>"``.

    Args:
        owners: Text containing integers separated by ``-``; the first and
            last pieces are used as the lower and upper bound.

    Returns:
        The last bound minus the first bound. A value without a ``-``
        yields ``0`` because both bounds are the same piece.

    Raises:
        ValueError: If a bound cannot be parsed as an integer.
    """
    # NOTE(review): the name suggests an average, but the computation is
    # high - low; confirm which one the fuzzy controller expects.
    bounds = owners.split("-")
    return int(bounds[-1]) - int(bounds[0])
def replace(row):
    """Split a ';'-separated string into its distinct pieces, sorted ascending.

    Duplicates collapse to a single entry; the result is a new list.
    """
    return sorted(set(row.split(';')))
def vectorize(embeddings, word):
    """Look up the vector for ``word``, falling back to zeros when unknown.

    Args:
        embeddings: Mapping-like object indexed by word (``embeddings[word]``).
        word: Key to look up.

    Returns:
        ``embeddings[word]`` when the key exists; otherwise a zero vector
        whose length is ``embeddings.vector_size`` if that attribute exists,
        else 300.
    """
    try:
        return embeddings[word]
    except KeyError:
        # Only a missing key is treated as "unknown word"; any other
        # failure is a real error and is allowed to propagate.
        return np.zeros(getattr(embeddings, 'vector_size', 300))
def replace_with_vector(row, w2v):
    """Sum the vectors of the distinct ';'-separated pieces of ``row``.

    Each distinct piece is passed to ``vectorize`` together with ``w2v``;
    the resulting vectors are stacked and added element-wise.
    """
    unique_tokens = set(row.split(';'))
    stacked = np.array([vectorize(w2v, token) for token in unique_tokens])
    return stacked.sum(axis=0)
if __name__ == '__main__':
    # Stage 1: load the raw table, drop repeated names and derive features.
    df = pd.read_csv('data/games.csv')
    df = df.drop_duplicates(subset=['name'])
    df['positive_percentage'] = df.apply(
        lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
    df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
    df['fuzzy_popularity'] = df.apply(
        lambda row: fuzzy_controler_popularity(price=row.price,
                                               game_length=row.average_playtime,
                                               rating=row.positive_percentage,
                                               number_of_owners=row.owners), axis=1)

    # Merge the three text columns into one ';'-separated, lower-cased string
    # (spaces also become separators), then keep the sorted distinct pieces.
    df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags']
    df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower())
    df['all_categorical'] = df['all_categorical'].apply(replace)

    df.to_csv('data/games_processed.csv', index=False, encoding='utf-8')

    # Stage 2 (optional): add summed word vectors when the model file exists.
    # The try covers only the load, so an error in the vectorization itself
    # is not misreported as a missing model file.
    try:
        w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                                              binary=True)
    except FileNotFoundError:
        print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
              'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')
    else:
        # Re-read the CSV written above so the pickled frame holds exactly
        # what was saved to disk.
        df2 = pd.read_csv('data/games_processed.csv')
        df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags']
        df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower())
        df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v))
        df2.drop('temp', inplace=True, axis=1)

        # NOTE(review): this writes a pickle despite the .csv suffix; the
        # path is left unchanged in case other code reads it by this name.
        df2.to_pickle('data/games_processed_vectorized.csv')