diff --git a/kMedoids.py b/kMedoids.py
index 296a1b5..a6a422a 100644
--- a/kMedoids.py
+++ b/kMedoids.py
@@ -5,7 +5,7 @@ import seaborn as sns
 from numpy.random import choice, seed
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import Normalizer
 
 seed(42)
 
 
@@ -43,9 +43,8 @@ def is_finished(old_medoids, new_medoids):
 
 def kmedoids(num_samples, num_clusters):
     df = pd.read_csv('CC GENERAL.csv', index_col='CUST_ID')
-    df = df[:num_samples]
-    df_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df))
-    df_scaled = df_scaled.fillna(0)
+    df = df[:num_samples].fillna(0)
+    df_scaled = pd.DataFrame(Normalizer().fit_transform(df))
 
     # initialize medoids (at random)
     medoids = initialize_medoids(num_medoids=num_clusters, data=df_scaled)
@@ -67,4 +66,6 @@ def kmedoids(num_samples, num_clusters):
     plt.show()
 
 
-kmedoids(num_samples=500, num_clusters=3)
+for i in range(2, 8):
+    kmedoids(num_samples=500, num_clusters=i)
+    print(i)
diff --git a/27.gif b/norm27.gif
similarity index 100%
rename from 27.gif
rename to norm27.gif
diff --git a/readme.md b/readme.md
index 9e34e9a..d8b0c05 100644
--- a/readme.md
+++ b/readme.md
@@ -6,6 +6,11 @@ and Cyber-Security at AMU Poznań.
 
 dataset: https://www.kaggle.com/arjunbhasin2013/ccdata
 
-![](27.gif)
+![](reg27.gif)
+
+Visualization of clustering the first 500 data entries (with regularization).
+
+![](norm27.gif)
+
+Visualization of clustering the first 500 data entries (with normalization).
 
-It may seem that the quality is not satisfactory for bigger numbers of cluster. That is due to the fact that the data has 17 dimensions and for the purpouse of plotting it is [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis)'ed into just 2 dimensions.
\ No newline at end of file
diff --git a/reg27.gif b/reg27.gif
new file mode 100644
index 0000000..e489c55
Binary files /dev/null and b/reg27.gif differ
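
For context on the preprocessing swap (not part of the patch itself): `MinMaxScaler` rescales each feature (column) independently to the [0, 1] range, whereas `Normalizer` rescales each sample (row) to unit norm, so only the direction of a customer's feature vector matters. A minimal sketch on made-up data illustrating the difference; the toy array `X` is an assumption for illustration, not values from `CC GENERAL.csv`:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler, Normalizer

# Toy stand-in for the credit-card features (values are made up for illustration).
X = np.array([[1.0, 200.0],
              [2.0, 400.0],
              [4.0, 100.0]])

# Old approach: MinMaxScaler rescales each COLUMN independently to [0, 1].
print(MinMaxScaler().fit_transform(X))
# [[0.         0.33333333]
#  [0.33333333 1.        ]
#  [1.         0.        ]]

# New approach: Normalizer rescales each ROW to unit L2 norm,
# so each sample is represented by the direction of its feature vector.
print(Normalizer().fit_transform(X))
```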
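
The new driver loop runs the clustering for k = 2..7 but only prints the cluster count, and, as the readme notes, the 2-D PCA plots are hard to judge for larger k. A hedged sketch of how cluster counts could be compared numerically with the silhouette coefficient instead; it uses random stand-in data and scikit-learn's `KMeans` purely as a stand-in clusterer, since the repo's `kmedoids()` shows a plot rather than returning labels:

```python
import numpy as np
from sklearn.cluster import KMeans          # stand-in clusterer, not the repo's k-medoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer

# Random stand-in for the 500 x 17 scaled feature matrix (assumption, not real data).
rng = np.random.default_rng(42)
X = Normalizer().fit_transform(rng.random((500, 17)))

for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
    # Silhouette lies in [-1, 1]; higher means tighter, better-separated clusters.
    print(k, round(silhouette_score(X, labels), 3))
```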