Add notebook contents (plus a bit) as .py files

Kacper 2023-01-27 18:26:45 +01:00
parent fd9f81fb3f
commit 5a8495dad3
6 changed files with 54385 additions and 0 deletions

122  fuzzy_controllers.py  Normal file

@@ -0,0 +1,122 @@
import simpful as sf
import numpy as np
import matplotlib.pylab as plt


def fuzzy_controler_popularity(price: float, game_length: int, rating: float, number_of_owners: int) -> float:
    FS = sf.FuzzySystem(show_banner=False)

    # Membership functions for the rating (percentage of positive reviews)
    S_1 = sf.FuzzySet(points=[[0., 1.], [30., 0.]], term="negative")
    S_2 = sf.FuzzySet(points=[[35., 0.], [40., 1.], [60., 1.], [70., 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[80., 0.], [90., 1.]], term="positive")
    FS.add_linguistic_variable("Rating", sf.LinguisticVariable([S_1, S_2, S_3], concept="Rating"))

    # Membership functions for the number of owners
    S_1 = sf.FuzzySet(points=[[0., 1.], [50000., 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[100000., 0.], [300000., 1.], [3000000., 1.], [5000000., 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[10000000., 0.], [30000000., 1.]], term="big")
    FS.add_linguistic_variable("Number_of_owners",
                               sf.LinguisticVariable([S_1, S_2, S_3], concept="Number_of_owners"))

    # Membership functions for the price
    S_1 = sf.FuzzySet(points=[[0., 1.], [10., 0.]], term="cheap")
    S_2 = sf.FuzzySet(points=[[10., 0.], [15., 1.], [20., 1.], [25., 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[25., 0.], [35., 1.]], term="expensive")
    FS.add_linguistic_variable("Price", sf.LinguisticVariable([S_1, S_2, S_3], concept="Price"))

    # Membership functions for the average playtime
    F_1 = sf.FuzzySet(points=[[200., 1.], [300., 0.]], term="short")
    F_2 = sf.FuzzySet(points=[[300., 0.], [360., 1.], [420., 1.], [500., 0.]], term="average")
    F_3 = sf.FuzzySet(points=[[500., 0.], [550., 1.]], term="long")
    FS.add_linguistic_variable("Game_length", sf.LinguisticVariable([F_1, F_2, F_3], concept="Game_length"))

    # Crisp output values for the Sugeno-style output variable
    FS.set_crisp_output_value("small", 0)
    FS.set_crisp_output_value("average", 0.5)
    FS.set_crisp_output_value("big", 1)

    R1 = "IF (Price IS average) OR (Game_length IS average) OR (Rating IS average) OR (Number_of_owners IS average) " \
         "THEN (Popularity IS average)"
    R2 = "IF (Price IS expensive) AND (Game_length IS long) AND (Rating IS positive) THEN (Popularity IS big)"
    R3 = "IF (Price IS expensive) AND (Game_length IS short) THEN (Popularity IS small)"
    R4 = "IF (Price IS cheap) THEN (Popularity IS big)"
    R5 = "IF (Rating IS negative) THEN (Popularity IS small)"
    R6 = "IF (Rating IS positive) AND (Number_of_owners IS small) THEN (Popularity IS average)"
    R7 = "IF (Rating IS average) AND (Price IS cheap) THEN (Popularity IS big)"
    FS.add_rules([R1, R2, R3, R4, R5, R6, R7])

    FS.set_variable("Price", price)
    FS.set_variable("Game_length", game_length)
    FS.set_variable("Rating", rating)
    FS.set_variable("Number_of_owners", number_of_owners)

    popularity = FS.Sugeno_inference(["Popularity"])
    return round(popularity["Popularity"], 2)


def fuzzy_controler_similiarity(categorical_data: float, numerical_data: float, vector_distance: float,
                                show_graph: bool) -> float:
    FSS = sf.FuzzySystem(show_banner=False)

    # Membership functions for the similarity of categorical features
    S_1 = sf.FuzzySet(points=[[.20, 1.], [.30, 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[.30, 0.], [.55, 1.], [.65, 1.], [.85, 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[.85, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Categorical_similarity",
                                sf.LinguisticVariable([S_1, S_2, S_3], concept="Categorical similarity"))

    # Membership functions for the difference in fuzzy popularity
    S_1 = sf.FuzzySet(points=[[.30, 1.], [.40, 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[.40, 0.], [.50, 1.], [.60, 1.], [.70, 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[.70, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Numerical_difference",
                                sf.LinguisticVariable([S_1, S_2, S_3], concept="Numerical difference"))

    # Membership functions for the word-vector distance (not yet used in the rules)
    S_1 = sf.FuzzySet(points=[[.20, 1.], [.30, 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[.30, 0.], [.55, 1.], [.65, 1.], [.85, 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[.85, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Word_vector_distance",
                                sf.LinguisticVariable([S_1, S_2, S_3], concept="Word vector distance"))

    # Crisp output values for the Sugeno-style output variable
    FSS.set_crisp_output_value("small", 0)
    FSS.set_crisp_output_value("average", 0.5)
    FSS.set_crisp_output_value("big", 1)

    # TODO: add Word_vector_distance to rules
    R1 = "IF (Categorical_similarity IS average) OR (Numerical_difference IS average) THEN (Similarity IS average)"
    R2 = "IF (Categorical_similarity IS small) OR (Numerical_difference IS big) THEN (Similarity IS small)"
    R3 = "IF (Categorical_similarity IS big) OR (Numerical_difference IS small) THEN (Similarity IS big)"
    FSS.add_rules([R1, R2, R3])

    # show the control surface for the two variables used in the rules
    if show_graph:
        plot_graphs(FS=FSS)

    FSS.set_variable("Categorical_similarity", categorical_data)
    FSS.set_variable("Numerical_difference", numerical_data)

    popularity = FSS.Sugeno_inference(["Similarity"])
    return round(popularity["Similarity"], 2)


def plot_graphs(FS: sf.FuzzySystem):
    # Sample the control surface over a grid of the two rule inputs
    xs = []
    ys = []
    zs = []
    for x in np.linspace(0, 1):
        for y in np.linspace(0, 1):
            FS.set_variable("Categorical_similarity", x)
            FS.set_variable("Numerical_difference", y)
            tip = FS.inference()['Similarity']
            xs.append(x)
            ys.append(y)
            zs.append(tip)
    xs = np.array(xs)
    ys = np.array(ys)
    zs = np.array(zs)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_trisurf(xs, ys, zs, vmin=0, vmax=1)  # output values lie in [0, 1]
    ax.set_xlabel("Categorical_similarity")
    ax.set_ylabel("Numerical_difference")
    ax.set_zlabel("Similarity")
    ax.set_zlim(0, 1)
    plt.tight_layout()
    plt.show()
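
A minimal smoke test for the two controllers (a sketch, not part of the commit; the input values below are made up and only chosen so that at least one rule fires in each system):

from fuzzy_controllers import fuzzy_controler_popularity, fuzzy_controler_similiarity

# made-up example inputs, not real game data
print(fuzzy_controler_popularity(price=7.99, game_length=450, rating=88.0, number_of_owners=2000000))
# made-up intermediate scores in [0, 1]
print(fuzzy_controler_similiarity(categorical_data=0.60, numerical_data=0.45, vector_distance=0.80, show_graph=False))

Each call prints a crisp score between 0 and 1, rounded to two decimals.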

27076  games.csv  Normal file

File diff suppressed because it is too large.

27076  games_processed.csv  Normal file

File diff suppressed because it is too large.

Binary file not shown.
Can't render this file because it is too large.

50  main.py  Normal file

@@ -0,0 +1,50 @@
import pandas as pd
from fuzzy_controllers import fuzzy_controler_similiarity
from numpy import dot
from numpy.linalg import norm


def find_games_categorical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    # Jaccard similarity of the combined category/genre/tag sets
    game_1_categorical = set(game_1['all_categorical'].tolist()[0])
    game_2_categorical = set(game_2['all_categorical'].tolist()[0])
    return round(len(game_1_categorical & game_2_categorical) / len(game_1_categorical | game_2_categorical), 2)


def find_games_numerical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    # Absolute difference of the fuzzy popularity scores (smaller means more similar)
    game_1_popularity = float(game_1["fuzzy_popularity"].iloc[0])
    game_2_popularity = float(game_2["fuzzy_popularity"].iloc[0])
    return round(abs(game_1_popularity - game_2_popularity), 2)


def find_games_word_vector_distance(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    # Cosine similarity of the summed word vectors of the categorical features
    game_1_vector = game_1['all_categorical_vector'].tolist()[0]
    game_2_vector = game_2['all_categorical_vector'].tolist()[0]
    return round(dot(game_1_vector, game_2_vector) / (norm(game_1_vector) * norm(game_2_vector)), 2)


def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool = False) -> float:
    game_1 = df.loc[df['name'] == title_1]
    game_2 = df.loc[df['name'] == title_2]
    categorical_similarity = find_games_categorical_similarity(game_1=game_1, game_2=game_2)
    numerical_difference = find_games_numerical_similarity(game_1=game_1, game_2=game_2)
    word_vector_distance = find_games_word_vector_distance(game_1=game_1, game_2=game_2)
    print(f"Categorical similarity: {categorical_similarity}\nNumerical difference: {numerical_difference}\n"
          f"Word vector distance: {word_vector_distance}")
    similarity_score = fuzzy_controler_similiarity(categorical_data=categorical_similarity,
                                                   numerical_data=numerical_difference,
                                                   vector_distance=word_vector_distance, show_graph=show_graph)
    return similarity_score


if __name__ == '__main__':
    # The file is a pickle despite the .csv extension (see process_dataset.py)
    df = pd.read_pickle('games_processed_vectorized.csv')
    while True:
        title_1 = input("Enter title 1: ")
        title_2 = input("Enter title 2: ")
        similarity_score = compare_games(title_1=title_1, title_2=title_2, df=df, show_graph=False)
        print(f'Similarity_score: {similarity_score}')
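
compare_games can also be called non-interactively from another script; a sketch, assuming process_dataset.py has already produced games_processed_vectorized.csv and that the two titles (placeholders here) exist in the 'name' column:

import pandas as pd
from main import compare_games

df = pd.read_pickle('games_processed_vectorized.csv')
# 'Title A' and 'Title B' are placeholders; use names present in df['name']
score = compare_games(title_1='Title A', title_2='Title B', df=df, show_graph=False)
print(score)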

61  process_dataset.py  Normal file

@@ -0,0 +1,61 @@
import pandas as pd
from fuzzy_controllers import fuzzy_controler_popularity
import gensim
import numpy as np


def calculate_positive_percentage(positive_ratings: int, negative_ratings: int) -> float:
    return round((100 * positive_ratings) / (positive_ratings + negative_ratings), 2)


def owners_average_max_min(owners: str) -> int:
    # Width of the owners range, e.g. "20000-50000" -> 30000
    return int(owners.split("-")[-1]) - int(owners.split("-")[0])


def replace(row):
    # Split the ';'-separated categorical string into a sorted list of unique terms
    words = list(set(row.split(';')))
    words.sort()
    return words


def vectorize(embeddings, word):
    # Out-of-vocabulary words get a zero vector
    try:
        vector = embeddings[word]
    except KeyError:
        vector = np.zeros(300)
    return vector


def replace_with_vector(row, w2v):
    words = set(row.split(';'))
    vectors = [vectorize(w2v, word) for word in words]
    vector_sum = np.array(vectors).sum(axis=0)
    return vector_sum


if __name__ == '__main__':
    df = pd.read_csv('games.csv')
    df['positive_percentage'] = df.apply(
        lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
    df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
    df['fuzzy_popularity'] = df.apply(lambda row: fuzzy_controler_popularity(price=row.price,
                                                                             game_length=row.average_playtime,
                                                                             rating=row.positive_percentage,
                                                                             number_of_owners=row.owners), axis=1)
    df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags']
    df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower())
    df['all_categorical'] = df['all_categorical'].apply(lambda row: replace(row))
    df.to_csv('games_processed.csv', index=False, encoding='utf-8')

    try:
        w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                                              binary=True)
        df2 = pd.read_csv('games_processed.csv')
        df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags']
        df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower())
        df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v))
        df2.drop('temp', inplace=True, axis=1)
        # Pickled (not CSV) output, read back with pd.read_pickle in main.py
        df2.to_pickle('games_processed_vectorized.csv')
    except FileNotFoundError:
        print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
              'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')
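
A short sanity check of the produced files (a sketch, assuming the script ran with the word2vec model available):

import pandas as pd

processed = pd.read_csv('games_processed.csv')
print(processed[['name', 'fuzzy_popularity']].head())

# pickled output despite the .csv extension
vectorized = pd.read_pickle('games_processed_vectorized.csv')
print(vectorized['all_categorical_vector'].iloc[0].shape)  # each row sums word vectors into a 300-dimensional vector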