Add notebook contents (plus a bit) as .py files
This commit is contained in:
parent
fd9f81fb3f
commit
5a8495dad3
122
fuzzy_controllers.py
Normal file
122
fuzzy_controllers.py
Normal file
@ -0,0 +1,122 @@
|
||||
import simpful as sf
|
||||
import numpy as np
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
def fuzzy_controler_popularity(price: float, game_length: int, rating: float, number_of_owners: int) -> float:
    """Score a game's popularity with a Sugeno fuzzy inference system.

    A fresh ``simpful`` fuzzy system is built on every call, fed the four
    crisp inputs and evaluated for the ``Popularity`` output.

    Args:
        price: Value assigned to the ``Price`` variable (membership
            breakpoints lie between 0 and 35).
        game_length: Value assigned to the ``Game_length`` variable
            (membership breakpoints lie between 200 and 550).
        rating: Value assigned to the ``Rating`` variable (membership
            breakpoints lie between 0 and 90).
        number_of_owners: Value assigned to the ``Number_of_owners`` variable.

    Returns:
        The inferred ``Popularity`` rounded to two decimals. The crisp output
        levels are 0 ("small"), 0.5 ("average") and 1 ("big").
    """
    system = sf.FuzzySystem(show_banner=False)

    # Linguistic variables in registration order: name -> (term, membership points).
    variable_terms = (
        ("Rating", (
            ("negative", [[0., 1.], [30., 0.]]),
            ("average", [[35., 0.], [40., 1.], [60., 1.], [70., 0.]]),
            ("positive", [[80., 0.], [90., 1.]]),
        )),
        ("Number_of_owners", (
            ("small", [[0., 1.], [50000., 0.]]),
            ("average", [[100000., 0.], [300000., 1.], [3000000., 1.], [5000000., 0.]]),
            ("big", [[10000000., 0.], [30000000., 1.]]),
        )),
        ("Price", (
            ("cheap", [[0., 1.], [10., 0.]]),
            ("average", [[10., 0.], [15., 1.], [20., 1.], [25., 0.]]),
            ("expensive", [[25., 0.], [35., 1.]]),
        )),
        ("Game_length", (
            ("short", [[200., 1.], [300., 0.]]),
            ("average", [[300., 0.], [360., 1.], [420., 1.], [500., 0.]]),
            ("long", [[500., 0.], [550., 1.]]),
        )),
    )
    for variable_name, terms in variable_terms:
        fuzzy_sets = [sf.FuzzySet(points=points, term=term) for term, points in terms]
        system.add_linguistic_variable(variable_name,
                                       sf.LinguisticVariable(fuzzy_sets, concept=variable_name))

    # Crisp (Sugeno) output levels referenced by the rule consequents.
    for output_term, level in (("small", 0), ("average", 0.5), ("big", 1)):
        system.set_crisp_output_value(output_term, level)

    rules = [
        "IF (Price IS average) OR (Game_length IS average) OR (Rating IS average) OR (Number_of_owners IS average) "
        "THEN (Popularity IS average)",
        "IF (Price IS expensive) AND (Game_length IS long) AND (Rating IS positive) THEN (Popularity IS big)",
        "IF (Price IS expensive) AND (Game_length IS short) THEN (Popularity IS small)",
        "IF (Price IS cheap) THEN (Popularity IS big)",
        "IF (Rating IS negative) THEN (Popularity IS small)",
        "IF (Rating IS positive) AND (Number_of_owners IS small) THEN (Popularity IS average)",
        "IF (Rating IS average) AND (Price IS cheap) THEN (Popularity IS big)",
    ]
    system.add_rules(rules)

    crisp_inputs = (
        ("Price", price),
        ("Game_length", game_length),
        ("Rating", rating),
        ("Number_of_owners", number_of_owners),
    )
    for variable_name, value in crisp_inputs:
        system.set_variable(variable_name, value)

    inferred = system.Sugeno_inference(["Popularity"])
    return round(inferred["Popularity"], 2)
|
||||
|
||||
|
||||
def fuzzy_controler_similiarity(categorical_data: float, numerical_data: float, vector_distance: float,
                                show_graph: bool) -> float:
    """Combine per-pair game metrics into one similarity score via Sugeno inference.

    Args:
        categorical_data: Value assigned to ``Categorical_similarity``.
        numerical_data: Value assigned to ``Numerical_difference``.
        vector_distance: Value assigned to ``Word_vector_distance``. The
            variable is registered and set, but no rule references it yet,
            so it does not currently influence the result.
        show_graph: When true, ``plot_graphs`` is called on the configured
            system before the actual inputs are set.

    Returns:
        The inferred ``Similarity`` rounded to two decimals. The crisp output
        levels are 0 ("small"), 0.5 ("average") and 1 ("big").
    """
    FSS = sf.FuzzySystem(show_banner=False)

    cat_small = sf.FuzzySet(points=[[.20, 1.], [.30, 0.]], term="small")
    cat_average = sf.FuzzySet(points=[[.30, 0.], [.55, 1.], [.65, 1.], [.85, 0.]], term="average")
    cat_big = sf.FuzzySet(points=[[.85, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Categorical_similarity",
                                sf.LinguisticVariable([cat_small, cat_average, cat_big],
                                                      concept="Categorical similarity"))

    num_small = sf.FuzzySet(points=[[.30, 1.], [.40, 0.]], term="small")
    num_average = sf.FuzzySet(points=[[.40, 0.], [.50, 1.], [.60, 1.], [.70, 0.]], term="average")
    num_big = sf.FuzzySet(points=[[.70, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Numerical_difference",
                                sf.LinguisticVariable([num_small, num_average, num_big],
                                                      concept="Numerical difference"))

    vec_small = sf.FuzzySet(points=[[.20, 1.], [.30, 0.]], term="small")
    vec_average = sf.FuzzySet(points=[[.30, 0.], [.55, 1.], [.65, 1.], [.85, 0.]], term="average")
    vec_big = sf.FuzzySet(points=[[.85, 0.], [.90, 1.]], term="big")
    FSS.add_linguistic_variable("Word_vector_distance",
                                sf.LinguisticVariable([vec_small, vec_average, vec_big],
                                                      concept="Word vector distance"))

    FSS.set_crisp_output_value("small", 0)
    FSS.set_crisp_output_value("average", 0.5)
    FSS.set_crisp_output_value("big", 1)

    # TODO: add Word_vector_distance to rules
    R1 = "IF (Categorical_similarity IS average) OR (Numerical_difference IS average) THEN (Similarity IS average)"
    R2 = "IF (Categorical_similarity IS small) OR (Numerical_difference IS big) THEN (Similarity IS small)"
    R3 = "IF (Categorical_similarity IS big) OR (Numerical_difference IS small) THEN (Similarity IS big)"

    FSS.add_rules([R1, R2, R3])

    # show graph for two variables
    if show_graph:
        plot_graphs(FS=FSS)

    FSS.set_variable("Categorical_similarity", categorical_data)
    FSS.set_variable("Numerical_difference", numerical_data)
    # Previously this argument was silently dropped. Setting it keeps the
    # system state complete for when a rule starts using the variable.
    FSS.set_variable("Word_vector_distance", vector_distance)

    similarity = FSS.Sugeno_inference(["Similarity"])
    return round(similarity["Similarity"], 2)
|
||||
|
||||
|
||||
def plot_graphs(FS: sf.FuzzySystem):
    """Show a 3D surface of ``Similarity`` over the two rule inputs.

    ``Categorical_similarity`` and ``Numerical_difference`` are each swept
    over ``np.linspace(0, 1)`` and the system is evaluated at every pair.
    The figure is displayed with ``plt.show()``.

    Side effect: both variables are left set to the last grid point on ``FS``.

    Args:
        FS: Fuzzy system exposing the ``Categorical_similarity`` and
            ``Numerical_difference`` inputs and a ``Similarity`` output.
    """
    grid = np.linspace(0, 1)
    xs = []
    ys = []
    zs = []
    for x in grid:
        for y in grid:
            FS.set_variable("Categorical_similarity", x)
            FS.set_variable("Numerical_difference", y)
            xs.append(x)
            ys.append(y)
            zs.append(FS.inference()['Similarity'])
    xs = np.array(xs)
    ys = np.array(ys)
    zs = np.array(zs)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # The former meshgrid(xs, ys) call was removed: its result was never used
    # and it allocated two len(xs) x len(ys) arrays for nothing.
    # NOTE(review): vmax=100 looks inconsistent with the 0..1 z-range - confirm.
    ax.plot_trisurf(xs, ys, zs, vmin=0, vmax=100)
    ax.set_xlabel("Categorical_similarity")
    ax.set_ylabel("Numerical_difference")
    ax.set_zlabel("Similarity")
    ax.set_zlim(0, 1)
    plt.tight_layout()
    plt.show()
|
27076
games_processed.csv
Normal file
27076
games_processed.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
games_processed_vectorized.csv
Normal file
BIN
games_processed_vectorized.csv
Normal file
Binary file not shown.
Can't render this file because it is too large.
|
50
main.py
Normal file
50
main.py
Normal file
@ -0,0 +1,50 @@
|
||||
import pandas as pd
|
||||
from fuzzy_controllers import fuzzy_controler_similiarity
|
||||
from numpy import dot
|
||||
from numpy.linalg import norm
|
||||
|
||||
|
||||
def find_games_categorical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    """Return the Jaccard similarity of two games' ``all_categorical`` tags.

    Only the first row of each frame is used. The cell may hold a real
    collection of tags or the string form of one: a frame that went through
    a CSV round trip stores ``"['a', 'b']"`` instead of a list, and calling
    ``set()`` on that string would compare characters rather than tags.

    Args:
        game_1: Frame whose first row describes the first game.
        game_2: Frame whose first row describes the second game.

    Returns:
        ``|A & B| / |A | B|`` rounded to two decimals, or ``0.0`` when both
        tag sets are empty.

    Raises:
        IndexError: If either frame has no rows.
    """
    import ast  # local import keeps this function self-contained

    def _as_tag_set(cell) -> set:
        """Normalise a cell (collection or its string form) to a set of tags."""
        if not isinstance(cell, str):
            return set(cell)
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return set(parsed)
        # Not a serialised collection: treat it as a ';'-separated tag string.
        return set(cell.split(';'))

    game_1_categorical = _as_tag_set(game_1['all_categorical'].tolist()[0])
    game_2_categorical = _as_tag_set(game_2['all_categorical'].tolist()[0])

    union = game_1_categorical | game_2_categorical
    if not union:
        return 0.0
    return round(len(game_1_categorical & game_2_categorical) / len(union), 2)
|
||||
|
||||
|
||||
def find_games_numerical_similarity(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    """Return the absolute difference between two games' ``fuzzy_popularity``.

    Despite the name, a larger value means the games are *less* alike.
    Only the first row of each frame is used, matching the other
    ``find_games_*`` helpers. The value is read directly instead of parsing
    ``Series.to_string()``, which failed when a title matched several rows.

    Args:
        game_1: Frame whose first row describes the first game.
        game_2: Frame whose first row describes the second game.

    Returns:
        ``abs(popularity_1 - popularity_2)`` rounded to two decimals.

    Raises:
        IndexError: If either frame has no rows.
    """
    game_1_popularity = float(game_1["fuzzy_popularity"].iloc[0])
    game_2_popularity = float(game_2["fuzzy_popularity"].iloc[0])
    return round(abs(game_1_popularity - game_2_popularity), 2)
|
||||
|
||||
|
||||
def find_games_word_vector_distance(game_1: pd.DataFrame, game_2: pd.DataFrame) -> float:
    """Return the cosine similarity of two games' ``all_categorical_vector``.

    Despite "distance" in the name, the value computed is
    ``dot(a, b) / (|a| * |b|)``. Only the first row of each frame is used.

    Args:
        game_1: Frame whose first row describes the first game.
        game_2: Frame whose first row describes the second game.

    Returns:
        The cosine similarity rounded to two decimals, as a plain ``float``.
        If either vector has zero norm the ratio is undefined and ``0.0`` is
        returned instead of NaN.

    Raises:
        IndexError: If either frame has no rows.
    """
    game_1_vector = game_1['all_categorical_vector'].tolist()[0]
    game_2_vector = game_2['all_categorical_vector'].tolist()[0]

    norm_product = norm(game_1_vector) * norm(game_2_vector)
    if norm_product == 0:
        return 0.0
    return round(float(dot(game_1_vector, game_2_vector) / norm_product), 2)
|
||||
|
||||
|
||||
def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool = False) -> float:
    """Compute a fuzzy similarity score for two games looked up by name.

    The rows whose ``name`` equals each title are selected, the three
    pairwise metrics are computed and printed, and the metrics are passed to
    ``fuzzy_controler_similiarity``.

    Args:
        title_1: Exact ``name`` of the first game.
        title_2: Exact ``name`` of the second game.
        df: Dataset with a ``name`` column plus the columns read by the
            ``find_games_*`` helpers.
        show_graph: Forwarded to ``fuzzy_controler_similiarity``.

    Returns:
        The similarity score produced by the fuzzy controller.

    Raises:
        IndexError: If either title has no matching row. The type matches
            what the helpers raised before; the message now names the title.
    """
    game_1 = df.loc[df['name'] == title_1]
    game_2 = df.loc[df['name'] == title_2]

    # Fail early with a readable message instead of a bare IndexError from a helper.
    missing = [title for title, rows in ((title_1, game_1), (title_2, game_2)) if rows.empty]
    if missing:
        raise IndexError(f"Game title(s) not found in dataset: {', '.join(map(repr, missing))}")

    categorical_similarity = find_games_categorical_similarity(game_1=game_1, game_2=game_2)
    numerical_difference = find_games_numerical_similarity(game_1=game_1, game_2=game_2)
    word_vector_distance = find_games_word_vector_distance(game_1=game_1, game_2=game_2)
    print(f"Categorical similarity: {categorical_similarity}\nNumerical difference: {numerical_difference}\n"
          f"Word vector distance: {word_vector_distance}")

    similarity_score = fuzzy_controler_similiarity(categorical_data=categorical_similarity,
                                                   numerical_data=numerical_difference,
                                                   vector_distance=word_vector_distance, show_graph=show_graph)
    return similarity_score
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Despite the .csv suffix this file is a pickle (written with
    # DataFrame.to_pickle by process_dataset.py), hence read_pickle.
    # NOTE: unpickling executes arbitrary code - only load a trusted file.
    df = pd.read_pickle('games_processed_vectorized.csv')

    # Interactive loop: compare pairs of titles until the input stream ends
    # or the user interrupts.
    while True:
        try:
            title_1 = input("Enter title 1: ")
            title_2 = input("Enter title 2: ")
        except (EOFError, KeyboardInterrupt):
            print()
            break

        try:
            similarity_score = compare_games(title_1=title_1, title_2=title_2, df=df, show_graph=False)
        except LookupError as err:
            # An unknown title surfaces as an IndexError; report and keep going.
            print(f'Could not compare games: {err}')
            continue
        print(f'Similarity_score: {similarity_score}')
|
||||
|
61
process_dataset.py
Normal file
61
process_dataset.py
Normal file
@ -0,0 +1,61 @@
|
||||
import pandas as pd
|
||||
from fuzzy_controllers import fuzzy_controler_popularity
|
||||
import gensim
|
||||
import numpy as np
|
||||
|
||||
|
||||
def calculate_positive_percentage(positive_ratings: int, negative_ratings: int) -> float:
    """Return the share of positive ratings as a percentage.

    Args:
        positive_ratings: Number of positive ratings.
        negative_ratings: Number of negative ratings.

    Returns:
        ``100 * positive / (positive + negative)`` rounded to two decimals,
        or ``0.0`` when there are no ratings at all (avoids dividing by zero).
    """
    total_ratings = positive_ratings + negative_ratings
    if total_ratings == 0:
        return 0.0
    return round((100 * positive_ratings) / total_ratings, 2)
|
||||
|
||||
|
||||
def owners_average_max_min(owners: str) -> int:
    """Return the width of a ``"<min>-<max>"`` owners range.

    The argument is a string such as ``"20000-50000"``, not an int as the
    previous annotation claimed. The result is ``max - min``.

    NOTE(review): the name suggests an average, but the code has always
    returned the difference. It is kept because callers may be tuned to it;
    confirm intent.

    Args:
        owners: Range string with integer bounds separated by ``-``.

    Returns:
        Last bound minus first bound.

    Raises:
        ValueError: If a bound is not an integer literal.
    """
    bounds = owners.split("-")
    return int(bounds[-1]) - int(bounds[0])
|
||||
|
||||
|
||||
def replace(row):
    """Split a ';'-separated string into a sorted list of its unique tokens."""
    unique_tokens = set(row.split(';'))
    return sorted(unique_tokens)
|
||||
|
||||
|
||||
def vectorize(embeddings, word):
    """Look up the embedding for ``word``, falling back to a zero vector.

    Args:
        embeddings: Mapping-like object indexed by word (e.g. loaded
            word2vec vectors).
        word: Token to look up.

    Returns:
        ``embeddings[word]`` when present. Otherwise a zero vector whose
        length is ``embeddings.vector_size`` if that attribute exists, else
        300 (the previous hard-coded size).
    """
    try:
        return embeddings[word]
    except KeyError:
        # Only a missing word is expected here; any other failure should
        # surface rather than silently become a zero vector.
        return np.zeros(getattr(embeddings, "vector_size", 300))
|
||||
|
||||
|
||||
def replace_with_vector(row, w2v):
    """Sum the embeddings of the unique ';'-separated tokens in ``row``.

    Each distinct token is passed through ``vectorize`` and the resulting
    vectors are added element-wise.
    """
    unique_tokens = set(row.split(';'))
    stacked = np.array([vectorize(w2v, token) for token in unique_tokens])
    return stacked.sum(axis=0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Build the processed dataset: derived rating/owner columns, the fuzzy
    # popularity score and the merged categorical tags.
    df = pd.read_csv('games.csv')

    df['positive_percentage'] = df.apply(
        lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
    df['owners'] = df.apply(lambda row: owners_average_max_min(row.owners), axis=1)
    df['fuzzy_popularity'] = df.apply(lambda row: fuzzy_controler_popularity(price=row.price,
                                                                             game_length=row.average_playtime,
                                                                             rating=row.positive_percentage,
                                                                             number_of_owners=row.owners), axis=1)

    # One ';'-separated, lower-cased tag string per game. It is reused below
    # for the word vectors instead of being rebuilt from a re-read CSV.
    joined_tags = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags']
    joined_tags = joined_tags.map(lambda row: row.strip().replace(' ', ';').lower())
    df['all_categorical'] = joined_tags.apply(replace)
    df.to_csv('games_processed.csv', index=False, encoding='utf-8')

    # Only a missing model file is an expected, recoverable condition. Other
    # errors propagate instead of being reported as "file not found".
    try:
        w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                                              binary=True)
    except FileNotFoundError:
        print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
              'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')
    else:
        # Work on the in-memory frame: reading games_processed.csv back would
        # turn the 'all_categorical' lists into their string representation.
        df2 = df.copy()
        df2['all_categorical_vector'] = joined_tags.apply(lambda row: replace_with_vector(row, w2v))
        # Written as a pickle despite the .csv suffix; main.py reads it with read_pickle.
        df2.to_pickle('games_processed_vectorized.csv')
|
Loading…
Reference in New Issue
Block a user