"""Pre-process the games dataset.

Run as a script, this reads ``data/games.csv``, derives a positive-rating
percentage, an owners-range value and a fuzzy popularity score, builds a
combined tag column, and writes ``data/games_processed.csv``.  When a local
copy of the GoogleNews word2vec model is present, it additionally writes a
pickled frame with one summed embedding vector per game.
"""
import gensim
import numpy as np
import pandas as pd

from fuzzy_controllers import fuzzy_controler_popularity


def calculate_positive_percentage(positive_ratings: int, negative_ratings: int) -> float:
    """Return the share of positive ratings as a percentage rounded to 2 places.

    Returns 0.0 when there are no ratings at all, instead of raising
    ``ZeroDivisionError``.
    """
    total_ratings = positive_ratings + negative_ratings
    if total_ratings == 0:
        return 0.0
    return round((100 * positive_ratings) / total_ratings, 2)


def owners_average_max_min(owners: str) -> int:
    """Return the last dash-separated field of *owners* minus the first.

    NOTE(review): despite the name, this computes the width of the range
    (max - min), not an average.  The computation is left unchanged because
    the fuzzy popularity score is fed with this value.

    Raises:
        ValueError: if the first or last field is not an integer literal.
    """
    bounds = owners.split("-")
    return int(bounds[-1]) - int(bounds[0])


def replace(row: str) -> list:
    """Split a ';'-separated string into a sorted list of its unique items."""
    return sorted(set(row.split(';')))


def vectorize(embeddings, word):
    """Look up *word* in *embeddings*, falling back to a zero vector.

    Only ``KeyError`` (word missing from the vocabulary) triggers the
    fallback; any other failure propagates instead of being swallowed.
    The zero vector has ``embeddings.vector_size`` entries when that
    attribute exists, and 300 otherwise.
    """
    try:
        return embeddings[word]
    except KeyError:
        return np.zeros(getattr(embeddings, 'vector_size', 300))


def replace_with_vector(row, w2v):
    """Return the element-wise sum of the vectors of the unique words in *row*.

    *row* is a ';'-separated string.  Words are visited in sorted order so
    the floating-point summation order (and therefore the result) is the
    same on every run, independent of string hash randomization.
    """
    vectors = [vectorize(w2v, word) for word in sorted(set(row.split(';')))]
    return np.array(vectors).sum(axis=0)


def _joined_tags(frame):
    """Join categories, genres and steamspy_tags into one normalised Series.

    The three columns are concatenated with ';', stripped, spaces are
    replaced by ';' and the result is lower-cased.
    """
    joined = frame['categories'] + ';' + frame['genres'] + ';' + frame['steamspy_tags']
    return joined.map(lambda row: row.strip().replace(' ', ';').lower())


def main() -> None:
    """Run the pre-processing pipeline and, if possible, the vectorization."""
    df = pd.read_csv('data/games.csv')
    df['positive_percentage'] = df.apply(
        lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings),
        axis=1)
    df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
    df['fuzzy_popularity'] = df.apply(
        lambda row: fuzzy_controler_popularity(price=row.price,
                                               game_length=row.average_playtime,
                                               rating=row.positive_percentage,
                                               number_of_owners=row.owners),
        axis=1)
    df['all_categorical'] = _joined_tags(df).apply(replace)
    df.to_csv('data/games_processed.csv', index=False, encoding='utf-8')

    # Vectorization is optional: it is skipped only when the model file is
    # absent.  Any other error is a real failure and must not be reported
    # as "file not found".
    try:
        w2v = gensim.models.KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin', binary=True)
    except FileNotFoundError:
        print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
              'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')
        return

    df2 = pd.read_csv('data/games_processed.csv')
    df2['all_categorical_vector'] = _joined_tags(df2).apply(
        lambda row: replace_with_vector(row, w2v))
    # NOTE(review): the output is a pickle despite the '.csv' extension; the
    # path is kept as-is because other code may read it under this name.
    df2.to_pickle('data/games_processed_vectorized.csv')


if __name__ == '__main__':
    main()