Add notebook contents (plus a bit) as .py files

Kacper 2023-01-27 18:26:45 +01:00
parent fd9f81fb3f
commit 5a8495dad3
6 changed files with 54385 additions and 0 deletions

122  fuzzy_controllers.py  Normal file

@@ -0,0 +1,122 @@
import simpful as sf
import numpy as np
import matplotlib.pylab as plt


def fuzzy_controler_popularity(price: float, game_length: int, rating: float, number_of_owners: int) -> float:
    FS = sf.FuzzySystem(show_banner=False)

    # Membership functions for the rating (percentage of positive reviews)
    S_1 = sf.FuzzySet(points=[[0., 1.], [30., 0.]], term="negative")
    S_2 = sf.FuzzySet(points=[[35., 0.], [40., 1.], [60., 1.], [70., 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[80., 0.], [90., 1.]], term="positive")
    FS.add_linguistic_variable("Rating", sf.LinguisticVariable([S_1, S_2, S_3], concept="Rating"))

    # Membership functions for the number of owners
    S_1 = sf.FuzzySet(points=[[0., 1.], [50000., 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[100000., 0.], [300000., 1.], [3000000., 1.], [5000000., 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[10000000., 0.], [30000000., 1.]], term="big")
    FS.add_linguistic_variable("Number_of_owners",
                               sf.LinguisticVariable([S_1, S_2, S_3], concept="Number_of_owners"))

    # Membership functions for the price
    S_1 = sf.FuzzySet(points=[[0., 1.], [10., 0.]], term="cheap")
    S_2 = sf.FuzzySet(points=[[10., 0.], [15., 1.], [20., 1.], [25., 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[25., 0.], [35., 1.]], term="expensive")
    FS.add_linguistic_variable("Price", sf.LinguisticVariable([S_1, S_2, S_3], concept="Price"))

    # Membership functions for the average playtime
    F_1 = sf.FuzzySet(points=[[200., 1.], [300., 0.]], term="short")
    F_2 = sf.FuzzySet(points=[[300., 0.], [360., 1.], [420., 1.], [500., 0.]], term="average")
    F_3 = sf.FuzzySet(points=[[500., 0.], [550., 1.]], term="long")
    FS.add_linguistic_variable("Game_length", sf.LinguisticVariable([F_1, F_2, F_3], concept="Game_length"))

    # Crisp output values for the Sugeno-style output variable
    FS.set_crisp_output_value("small", 0)
    FS.set_crisp_output_value("average", 0.5)
    FS.set_crisp_output_value("big", 1)

    R1 = "IF (Price IS average) OR (Game_length IS average) OR (Rating IS average) OR (Number_of_owners IS average) " \
         "THEN (Popularity IS average)"
    R2 = "IF (Price IS expensive) AND (Game_length IS long) AND (Rating IS positive) THEN (Popularity IS big)"
    R3 = "IF (Price IS expensive) AND (Game_length IS short) THEN (Popularity IS small)"
    R4 = "IF (Price IS cheap) THEN (Popularity IS big)"
    R5 = "IF (Rating IS negative) THEN (Popularity IS small)"
    R6 = "IF (Rating IS positive) AND (Number_of_owners IS small) THEN (Popularity IS average)"
    R7 = "IF (Rating IS average) AND (Price IS cheap) THEN (Popularity IS big)"
    FS.add_rules([R1, R2, R3, R4, R5, R6, R7])

    FS.set_variable("Price", price)
    FS.set_variable("Game_length", game_length)
    FS.set_variable("Rating", rating)
    FS.set_variable("Number_of_owners", number_of_owners)

    popularity = FS.Sugeno_inference(["Popularity"])
    return round(popularity["Popularity"], 2)


def fuzzy_controler_similiarity(categorical_data: float, numerical_data: float, vector_distance: float,
                                show_graph: bool) -> float:
    FSS = sf.FuzzySystem(show_banner=False)

    # Membership functions for the similarity of categorical features
    S_1 = sf.FuzzySet(points=[[.20, 1.], [.30, 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[.30, 0.], [.55, 1.], [.65, 1.], [.85, 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[.85, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Categorical_similarity",
                                sf.LinguisticVariable([S_1, S_2, S_3], concept="Categorical similarity"))

    # Membership functions for the difference in fuzzy popularity
    S_1 = sf.FuzzySet(points=[[.30, 1.], [.40, 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[.40, 0.], [.50, 1.], [.60, 1.], [.70, 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[.70, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Numerical_difference",
                                sf.LinguisticVariable([S_1, S_2, S_3], concept="Numerical difference"))

    # Membership functions for the word-vector distance (not yet used in the rules)
    S_1 = sf.FuzzySet(points=[[.20, 1.], [.30, 0.]], term="small")
    S_2 = sf.FuzzySet(points=[[.30, 0.], [.55, 1.], [.65, 1.], [.85, 0.]], term="average")
    S_3 = sf.FuzzySet(points=[[.85, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Word_vector_distance",
                                sf.LinguisticVariable([S_1, S_2, S_3], concept="Word vector distance"))

    # Crisp output values for the Sugeno-style output variable
    FSS.set_crisp_output_value("small", 0)
    FSS.set_crisp_output_value("average", 0.5)
    FSS.set_crisp_output_value("big", 1)

    # TODO: add Word_vector_distance to rules
    R1 = "IF (Categorical_similarity IS average) OR (Numerical_difference IS average) THEN (Similarity IS average)"
    R2 = "IF (Categorical_similarity IS small) OR (Numerical_difference IS big) THEN (Similarity IS small)"
    R3 = "IF (Categorical_similarity IS big) OR (Numerical_difference IS small) THEN (Similarity IS big)"
    FSS.add_rules([R1, R2, R3])

    # show the control surface for the two variables used in the rules
    if show_graph:
        plot_graphs(FS=FSS)

    FSS.set_variable("Categorical_similarity", categorical_data)
    FSS.set_variable("Numerical_difference", numerical_data)

    popularity = FSS.Sugeno_inference(["Similarity"])
    return round(popularity["Similarity"], 2)


def plot_graphs(FS: sf.FuzzySystem):
    # Sample the control surface over a grid of the two rule inputs
    xs = []
    ys = []
    zs = []
    for x in np.linspace(0, 1):
        for y in np.linspace(0, 1):
            FS.set_variable("Categorical_similarity", x)
            FS.set_variable("Numerical_difference", y)
            tip = FS.inference()['Similarity']
            xs.append(x)
            ys.append(y)
            zs.append(tip)
    xs = np.array(xs)
    ys = np.array(ys)
    zs = np.array(zs)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_trisurf(xs, ys, zs, vmin=0, vmax=1)  # output values lie in [0, 1]
    ax.set_xlabel("Categorical_similarity")
    ax.set_ylabel("Numerical_difference")
    ax.set_zlabel("Similarity")
    ax.set_zlim(0, 1)
    plt.tight_layout()
    plt.show()
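
A minimal smoke test for the two controllers (a sketch, not part of the commit; the input values below are made up and only chosen so that at least one rule fires in each system):

from fuzzy_controllers import fuzzy_controler_popularity, fuzzy_controler_similiarity

# made-up example inputs, not real game data
print(fuzzy_controler_popularity(price=7.99, game_length=450, rating=88.0, number_of_owners=2000000))
# made-up intermediate scores in [0, 1]
print(fuzzy_controler_similiarity(categorical_data=0.60, numerical_data=0.45, vector_distance=0.80, show_graph=False))

Each call prints a crisp score between 0 and 1, rounded to two decimals.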

27076  games.csv  Normal file

File diff suppressed because it is too large.

27076  games_processed.csv  Normal file

File diff suppressed because it is too large.

Binary file not shown.
Can't render this file because it is too large.

50  main.py  Normal file

@@ -0,0 +1,50 @@
import pandas as pd
from fuzzy_controllers import fuzzy_controler_similiarity
from numpy import dot
from numpy.linalg import norm


def find_games_categorical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    # Jaccard similarity of the combined category/genre/tag sets
    game_1_categorical = set(game_1['all_categorical'].tolist()[0])
    game_2_categorical = set(game_2['all_categorical'].tolist()[0])
    return round(len(game_1_categorical & game_2_categorical) / len(game_1_categorical | game_2_categorical), 2)


def find_games_numerical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    # Absolute difference of the fuzzy popularity scores (smaller means more similar)
    game_1_popularity = float(game_1["fuzzy_popularity"].iloc[0])
    game_2_popularity = float(game_2["fuzzy_popularity"].iloc[0])
    return round(abs(game_1_popularity - game_2_popularity), 2)


def find_games_word_vector_distance(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    # Cosine similarity of the summed word vectors of the categorical features
    game_1_vector = game_1['all_categorical_vector'].tolist()[0]
    game_2_vector = game_2['all_categorical_vector'].tolist()[0]
    return round(dot(game_1_vector, game_2_vector) / (norm(game_1_vector) * norm(game_2_vector)), 2)


def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool = False) -> float:
    game_1 = df.loc[df['name'] == title_1]
    game_2 = df.loc[df['name'] == title_2]
    categorical_similarity = find_games_categorical_similarity(game_1=game_1, game_2=game_2)
    numerical_difference = find_games_numerical_similarity(game_1=game_1, game_2=game_2)
    word_vector_distance = find_games_word_vector_distance(game_1=game_1, game_2=game_2)
    print(f"Categorical similarity: {categorical_similarity}\nNumerical difference: {numerical_difference}\n"
          f"Word vector distance: {word_vector_distance}")
    similarity_score = fuzzy_controler_similiarity(categorical_data=categorical_similarity,
                                                   numerical_data=numerical_difference,
                                                   vector_distance=word_vector_distance, show_graph=show_graph)
    return similarity_score


if __name__ == '__main__':
    # The file is a pickle despite the .csv extension (see process_dataset.py)
    df = pd.read_pickle('games_processed_vectorized.csv')
    while True:
        title_1 = input("Enter title 1: ")
        title_2 = input("Enter title 2: ")
        similarity_score = compare_games(title_1=title_1, title_2=title_2, df=df, show_graph=False)
        print(f'Similarity_score: {similarity_score}')
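
compare_games can also be called non-interactively from another script; a sketch, assuming process_dataset.py has already produced games_processed_vectorized.csv and that the two titles (placeholders here) exist in the 'name' column:

import pandas as pd
from main import compare_games

df = pd.read_pickle('games_processed_vectorized.csv')
# 'Title A' and 'Title B' are placeholders; use names present in df['name']
score = compare_games(title_1='Title A', title_2='Title B', df=df, show_graph=False)
print(score)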

61  process_dataset.py  Normal file

@@ -0,0 +1,61 @@
import pandas as pd
from fuzzy_controllers import fuzzy_controler_popularity
import gensim
import numpy as np


def calculate_positive_percentage(positive_ratings: int, negative_ratings: int) -> float:
    return round((100 * positive_ratings) / (positive_ratings + negative_ratings), 2)


def owners_average_max_min(owners: str) -> int:
    # Width of the owners range, e.g. "20000-50000" -> 30000
    return int(owners.split("-")[-1]) - int(owners.split("-")[0])


def replace(row):
    # Split the ';'-separated categorical string into a sorted list of unique terms
    words = list(set(row.split(';')))
    words.sort()
    return words


def vectorize(embeddings, word):
    # Out-of-vocabulary words get a zero vector
    try:
        vector = embeddings[word]
    except KeyError:
        vector = np.zeros(300)
    return vector


def replace_with_vector(row, w2v):
    words = set(row.split(';'))
    vectors = [vectorize(w2v, word) for word in words]
    vector_sum = np.array(vectors).sum(axis=0)
    return vector_sum


if __name__ == '__main__':
    df = pd.read_csv('games.csv')
    df['positive_percentage'] = df.apply(
        lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
    df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
    df['fuzzy_popularity'] = df.apply(lambda row: fuzzy_controler_popularity(price=row.price,
                                                                             game_length=row.average_playtime,
                                                                             rating=row.positive_percentage,
                                                                             number_of_owners=row.owners), axis=1)
    df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags']
    df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower())
    df['all_categorical'] = df['all_categorical'].apply(lambda row: replace(row))
    df.to_csv('games_processed.csv', index=False, encoding='utf-8')

    try:
        w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                                              binary=True)
        df2 = pd.read_csv('games_processed.csv')
        df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags']
        df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower())
        df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v))
        df2.drop('temp', inplace=True, axis=1)
        # Pickled (not CSV) output, read back with pd.read_pickle in main.py
        df2.to_pickle('games_processed_vectorized.csv')
    except FileNotFoundError:
        print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
              'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')
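
A short sanity check of the produced files (a sketch, assuming the script ran with the word2vec model available):

import pandas as pd

processed = pd.read_csv('games_processed.csv')
print(processed[['name', 'fuzzy_popularity']].head())

# pickled output despite the .csv extension
vectorized = pd.read_pickle('games_processed_vectorized.csv')
print(vectorized['all_categorical_vector'].iloc[0].shape)  # each row sums word vectors into a 300-dimensional vector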