change normalization to regularization, new gif
This commit is contained in:
parent
2c68944813
commit
de7b569a89
11
kMedoids.py
11
kMedoids.py
@ -5,7 +5,7 @@ import seaborn as sns
|
|||||||
|
|
||||||
from numpy.random import choice, seed
|
from numpy.random import choice, seed
|
||||||
from sklearn.decomposition import PCA
|
from sklearn.decomposition import PCA
|
||||||
from sklearn.preprocessing import MinMaxScaler
|
from sklearn.preprocessing import Normalizer
|
||||||
|
|
||||||
seed(42)
|
seed(42)
|
||||||
|
|
||||||
@ -43,9 +43,8 @@ def is_finished(old_medoids, new_medoids):
|
|||||||
|
|
||||||
def kmedoids(num_samples, num_clusters):
|
def kmedoids(num_samples, num_clusters):
|
||||||
df = pd.read_csv('CC GENERAL.csv', index_col='CUST_ID')
|
df = pd.read_csv('CC GENERAL.csv', index_col='CUST_ID')
|
||||||
df = df[:num_samples]
|
df = df[:num_samples].fillna(0)
|
||||||
df_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df))
|
df_scaled = pd.DataFrame(Normalizer().fit_transform(df))
|
||||||
df_scaled = df_scaled.fillna(0)
|
|
||||||
|
|
||||||
# initialize medoids (at random)
|
# initialize medoids (at random)
|
||||||
medoids = initialize_medoids(num_medoids=num_clusters, data=df_scaled)
|
medoids = initialize_medoids(num_medoids=num_clusters, data=df_scaled)
|
||||||
@ -67,4 +66,6 @@ def kmedoids(num_samples, num_clusters):
|
|||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
kmedoids(num_samples=500, num_clusters=3)
|
for i in range(2, 8):
|
||||||
|
kmedoids(num_samples=500, num_clusters=i)
|
||||||
|
print(i)
|
||||||
|
Before Width: | Height: | Size: 131 KiB After Width: | Height: | Size: 131 KiB |
@ -6,6 +6,11 @@ and Cyber-Security at AMU Poznań.
|
|||||||
dataset:
|
dataset:
|
||||||
https://www.kaggle.com/arjunbhasin2013/ccdata
|
https://www.kaggle.com/arjunbhasin2013/ccdata
|
||||||
|
|
||||||
![](27.gif)
|
![](reg27.gif)
|
||||||
|
|
||||||
|
Visualization of clustering the first 500 data entries (with regularization).
|
||||||
|
|
||||||
|
![](norm27.gif)
|
||||||
|
|
||||||
|
Visualization of clustering the first 500 data entries (with normalization).
|
||||||
|
|
||||||
It may seem that the quality is not satisfactory for larger numbers of clusters. That is because the data has 17 dimensions and, for the purpose of plotting, it is reduced with [PCA](https://en.wikipedia.org/wiki/Principal_component_analysis) to just 2 dimensions.
|
|
Loading…
Reference in New Issue
Block a user