From da43071f7c2a2d38ec0a2b65b184a651a19d838b Mon Sep 17 00:00:00 2001
From: Kamila Bobkowska <kambob@st.amu.edu.pl>
Date: Sun, 29 Jan 2023 14:00:16 +0100
Subject: [PATCH] add functionality to compare a game to the whole db

---
 main.py            | 38 ++++++++++++++++++++++++++++----------
 process_dataset.py |  2 +-
 results/.gitkeep   |  0
 3 files changed, 29 insertions(+), 11 deletions(-)
 create mode 100644 results/.gitkeep

diff --git a/main.py b/main.py
index 4f3901f..3b66f33 100644
--- a/main.py
+++ b/main.py
@@ -2,7 +2,7 @@ import pandas as pd
 from fuzzy_controllers import fuzzy_controler_similiarity
 from numpy import dot
 from numpy.linalg import norm
-
+import json
 
 def find_games_categorical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
     game_1_categorical = set(game_1['all_categorical'].tolist()[0])
@@ -21,6 +21,24 @@ def find_games_word_vector_distance(game_1: pd.DataFrame, game_2: pd.DataFrame)
     game_2_vector = game_2['all_categorical_vector'].tolist()[0]
     return round(dot(game_1_vector, game_2_vector) / (norm(game_1_vector) * norm(game_2_vector)), 2)
 
+def calculate_similarities(game_title, title_list, df):
+    """Rank title_list by similarity to game_title, print the top 20 and save the full ranking."""
+    if not (df['name'] == game_title).any():  # compare_games would fail on .tolist()[0] for an unknown title
+        print(f"Game '{game_title}' was not found in the database.")
+        return
+    all_games = [{"title": compared_title,
+                  "similarity": compare_games(title_1=game_title, title_2=compared_title, df=df, show_graph=False)}
+                 for compared_title in title_list if compared_title != game_title]
+    sorted_games = sorted(all_games, key=lambda k: k['similarity'], reverse=True)
+    print("\n ==== Top 20 most similar games: ====", *(f"- {game['title']}" for game in sorted_games[:20]), sep="\n")
+    save_results(game_title=game_title, game_list=sorted_games)
+
+def save_results(game_title, game_list):
+    """Dump game_list as JSON to results/similarity_list_<title>.txt; spaces and file-name-unsafe characters in the title become '_'."""
+    safe_title = "".join("_" if ch in ' \\/:*?"<>|' else ch for ch in game_title.lower())
+    with open(f"results/similarity_list_{safe_title}.txt", 'w', encoding='utf-8') as fp:
+        json.dump(game_list, fp)
+    print("The full list of similar games available in the /results directory\n")
 
 def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool = False) -> float:
     game_1 = df.loc[df['name'] == title_1]
@@ -29,9 +47,6 @@ def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool
     categorical_similarity = find_games_categorical_similarity(game_1=game_1, game_2=game_2)
     numerical_difference = find_games_numerical_similarity(game_1=game_1, game_2=game_2)
     word_vector_distance = find_games_word_vector_distance(game_1=game_1, game_2=game_2)
-    print(f"Categorical similarity: {categorical_similarity}\nNumerical difference: {numerical_difference}\n"
-          f"Word vector distance: {word_vector_distance}")
-
     similarity_score = fuzzy_controler_similiarity(categorical_data=categorical_similarity,
                                                    numerical_data=numerical_difference,
                                                    vector_distance=word_vector_distance, show_graph=show_graph)
@@ -41,10 +56,13 @@ def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool
 if __name__ == '__main__':
 
     df = pd.read_pickle('data/games_processed_vectorized.csv')
-
-    while True:
-        title_1 = input("Enter title 1: ")
-        title_2 = input("Enter title 2: ")
-        similarity_score = compare_games(title_1=title_1, title_2=title_2, df=df, show_graph=False)
-        print(f'Similarity_score: {similarity_score}')
+    title_list = df["name"].values.tolist()[:2000]
+    while True:
+        print("Welcome to Fuzzy Game Reccomender!\nType in a game title and we will find the most similar games from our database")
+        title = input("Enter the title or type 'exit' to leave: ")
+        if title.strip() == "exit":
+            break
+        # Skip blank input instead of running a similarity lookup for an empty title.
+        if title.strip():
+            calculate_similarities(game_title=title, title_list=title_list, df=df)
 
diff --git a/process_dataset.py b/process_dataset.py
index f36426e..6d1e1e9 100644
--- a/process_dataset.py
+++ b/process_dataset.py
@@ -35,7 +35,7 @@ def replace_with_vector(row, w2v):
 
 if __name__ == '__main__':
     df = pd.read_csv('data/games.csv')
-
+    df = df.drop_duplicates(subset=['name'])
     df['positive_percentage'] = df.apply(
         lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
     df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
diff --git a/results/.gitkeep b/results/.gitkeep
new file mode 100644
index 0000000..e69de29