# k-medoids/kMedoids.py
#
# 72 lines
# 2.6 KiB
# Python
# Raw Permalink Normal View History
#
# 2021-06-18 20:05:07 +02:00
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.random import choice, seed
from sklearn.decomposition import PCA
# 2021-06-25 20:28:33 +02:00
from sklearn.preprocessing import StandardScaler
# 2021-06-18 20:05:07 +02:00
# Seed NumPy's global random generator so the random medoid selection done
# through numpy.random.choice is reproducible between runs.
seed(42)
def initialize_medoids(num_medoids, data):
    """Draw the starting medoids at random from the rows of *data*.

    Args:
        num_medoids: How many distinct rows to pick.
        data: DataFrame whose rows are the candidate points.

    Returns:
        A list of ``num_medoids`` rows (each a ``pandas.Series``), sampled
        without replacement via ``numpy.random.choice``.
    """
    # Sample row positions (not labels) so that .iloc can be used below.
    row_positions = choice(len(data), size=num_medoids, replace=False)
    return [data.iloc[position] for position in row_positions]
def assign_points_to_medoids(data, medoids):
    """Label every row of *data* with the position of its closest medoid.

    Args:
        data: DataFrame whose rows are the points to assign.
        medoids: Sequence of medoid vectors, compared with ``distance_vec2vec``.

    Returns:
        A list with one entry per row of *data*: the index into *medoids*
        (as returned by ``np.argmin``) of the medoid with the smallest distance.
    """
    assignments = []
    for _, row in data.iterrows():
        distances = [distance_vec2vec(row, medoid) for medoid in medoids]
        # np.argmin returns the first minimum, so ties go to the earlier medoid.
        assignments.append(np.argmin(distances))
    return assignments
def distance_vec2vec(a, b) -> np.float64:
    """Return the squared Euclidean distance between vectors *a* and *b*.

    The result is the sum of ``abs(a_i - b_i) ** 2`` over the positions of
    *a*; no square root is taken.

    Elements are paired by position rather than by ``a[i]`` lookup, so the
    function also works for pandas Series whose index is not ``0..n-1``
    (integer indexing on such a Series is label-based and would fail).

    Args:
        a: First vector (any sized iterable of numbers).
        b: Second vector; must have at least ``len(a)`` elements. Elements
            beyond ``len(a)`` are ignored.

    Returns:
        The summed squared differences (``0`` for empty input).

    Raises:
        IndexError: If *b* has fewer elements than *a*.
    """
    if len(a) > len(b):
        raise IndexError("b has fewer elements than a")
    return sum(abs(x - y) ** 2 for x, y in zip(a, b))
def reassign_medoids(data, assignments, initial_medoids):
    """Choose a new medoid for every cluster (the k-medoids update step).

    For each cluster the new medoid is the member row whose summed squared
    Euclidean distance to all members of that cluster is smallest. The
    current medoid is kept unless a member is strictly better, so repeated
    calls with unchanged assignments return the same medoids.

    Candidates are restricted to the cluster's own members; this guarantees
    that two clusters can never elect the same row as their medoid.

    Args:
        data: DataFrame of numeric points, one row per point.
        assignments: Sequence with one cluster index per row of *data*.
        initial_medoids: Current medoid vectors, one per cluster, in cluster
            index order.

    Returns:
        A list with one medoid per cluster, in the same order as
        *initial_medoids*. Replaced medoids are rows of *data*
        (``pandas.Series``); unchanged ones are the objects passed in.
    """
    values = data.to_numpy(dtype=float)
    labels = np.asarray(assignments)

    new_medoids = []
    for cluster_id, medoid in enumerate(initial_medoids):
        member_positions = np.flatnonzero(labels == cluster_id)
        if member_positions.size == 0:
            # Nothing is assigned to this cluster, so there is no candidate.
            new_medoids.append(medoid)
            continue

        members = values[member_positions]
        # Cost of a candidate = summed squared distance to every member.
        # One vectorised pass per candidate keeps memory at O(members * dims).
        candidate_costs = [np.sum(np.abs(members - candidate) ** 2)
                           for candidate in members]
        best = int(np.argmin(candidate_costs))
        current_cost = np.sum(np.abs(members - np.asarray(medoid, dtype=float)) ** 2)

        # Strict comparison: on a tie the existing medoid stays, which lets
        # the caller's convergence check terminate.
        if candidate_costs[best] < current_cost:
            new_medoids.append(data.iloc[member_positions[best]])
        else:
            new_medoids.append(medoid)
    return new_medoids
def is_finished(old_medoids, new_medoids):
    """Report whether two medoid collections contain the same vectors.

    Each medoid is converted to a tuple of its values and the two
    collections are compared as sets, so order and duplicates are ignored.

    Args:
        old_medoids: Iterable of medoid vectors from the previous iteration.
        new_medoids: Iterable of medoid vectors from the current iteration.

    Returns:
        ``True`` if both collections hold exactly the same vectors.
    """
    previous = {tuple(medoid) for medoid in old_medoids}
    current = {tuple(medoid) for medoid in new_medoids}
    return previous == current
def kmedoids(num_samples, num_clusters, max_iterations=100):
    """Cluster the credit-card data set with k-medoids and plot the result.

    Reads ``CC GENERAL.csv`` (indexed by ``CUST_ID``), keeps the first
    *num_samples* rows, drops rows with missing values and standardises the
    remaining columns. Medoids are initialised at random and then refined by
    alternating the update step (``reassign_medoids``) with the assignment
    step (``assign_points_to_medoids``) until the medoids stop changing.
    The final clustering is shown as a scatter plot of the first two
    principal components.

    Args:
        num_samples: Number of leading CSV rows to use (before dropping NaNs).
        num_clusters: Number of medoids / clusters.
        max_iterations: Upper bound on update/assign rounds, as a safeguard
            against a fit that never settles.

    Returns:
        None. The function's output is the displayed plot.
    """
    df = pd.read_csv('CC GENERAL.csv', index_col='CUST_ID')
    df = df[:num_samples].dropna()
    df_scaled = pd.DataFrame(StandardScaler().fit_transform(df))

    # initialize medoids (at random)
    medoids = initialize_medoids(num_medoids=num_clusters, data=df_scaled)
    # assign data points to the medoids
    assignments = assign_points_to_medoids(data=df_scaled, medoids=medoids)

    # fit: alternate medoid update and point assignment until stable
    for _ in range(max_iterations):
        new_medoids = reassign_medoids(data=df_scaled, assignments=assignments, initial_medoids=medoids)
        if is_finished(old_medoids=medoids, new_medoids=new_medoids):
            break
        medoids = new_medoids
        # The assignments must follow the moved medoids; otherwise every
        # later update step would still optimise the initial clustering.
        assignments = assign_points_to_medoids(data=df_scaled, medoids=medoids)

    # Project to 2-D only for plotting; clustering used all scaled columns.
    data = pd.DataFrame(PCA(n_components=2).fit_transform(df_scaled), columns=['0', '1'])
    data['cluster'] = assignments
    sns.relplot(x='0', y='1', hue='cluster', data=data, palette=sns.color_palette("husl", num_clusters))
    plt.show()
# Run the clustering demo once for every cluster count from 2 to 7,
# printing the count after each plot has been shown.
for cluster_count in range(2, 8):
    kmedoids(num_samples=500, num_clusters=cluster_count)
    print(cluster_count)