init
This commit is contained in:
commit
e8fcca18c3
8951
CC GENERAL.csv
Normal file
8951
CC GENERAL.csv
Normal file
File diff suppressed because it is too large
Load Diff
70
kMedoids.py
Normal file
70
kMedoids.py
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
|
||||||
|
from numpy.random import choice, seed
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
|
||||||
|
# Seed NumPy's global random generator so the random choice of initial
# medoids is repeatable from run to run.
seed(42)
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_medoids(num_medoids, data):
    """Pick ``num_medoids`` distinct rows of ``data`` at random.

    Row positions are drawn without replacement, so no row is selected
    twice. Returns a list holding one row (``data.iloc[position]``) per
    selected position, in the order the positions were drawn.
    """
    row_positions = choice(len(data), size=num_medoids, replace=False)
    selected_rows = []
    for position in row_positions:
        selected_rows.append(data.iloc[position])
    return selected_rows
|
||||||
|
|
||||||
|
|
||||||
|
def assign_points_to_medoids(data, medoids):
    """Label every row of ``data`` with the position of its nearest medoid.

    Distances are measured with ``distance_vec2vec``. Returns a list with
    one entry per row of ``data`` (in row order); each entry is the index
    into ``medoids`` of the closest medoid, the first one winning ties.
    """
    nearest = []
    for _, row in data.iterrows():
        distances = [distance_vec2vec(row, medoid) for medoid in medoids]
        nearest.append(np.argmin(distances))
    return nearest
|
||||||
|
|
||||||
|
|
||||||
|
def distance_vec2vec(a, b) -> np.float64:
    """Return the squared Euclidean distance between two vectors.

    Elements are paired by position, so any 1-D sequence works (lists,
    NumPy arrays, pandas Series regardless of their index labels).

    Args:
        a: First vector of numbers.
        b: Second vector of numbers, same length as ``a``.

    Returns:
        The sum of squared element-wise differences, as ``np.float64``.

    Raises:
        ValueError: If ``a`` and ``b`` do not have the same shape.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)
    # Reject mismatched inputs explicitly instead of letting NumPy
    # broadcast them (or silently ignoring surplus elements).
    if vec_a.shape != vec_b.shape:
        raise ValueError(
            f"vectors must have the same shape, got {vec_a.shape} and {vec_b.shape}"
        )
    diff = (vec_a - vec_b).ravel()
    return np.float64(np.dot(diff, diff))
|
||||||
|
|
||||||
|
|
||||||
|
def reassign_medoids(data, assignments, initial_medoids):
    """Choose, for every cluster, the row that best represents its members.

    A candidate's cost for a cluster is the sum of squared Euclidean
    distances (the same metric as ``distance_vec2vec``) from the candidate
    to every row currently assigned to that cluster. Every row of ``data``
    is considered as a candidate. The current medoid is replaced only when
    some row has a strictly lower cost, so ties keep the current medoid.

    Args:
        data: DataFrame of numeric feature rows.
        assignments: One cluster index per row of ``data``, in row order.
        initial_medoids: Current medoids, one numeric vector per cluster;
            position ``k`` in this list is the medoid of cluster ``k``.

    Returns:
        A list with one medoid per cluster, in the same order as
        ``initial_medoids``. A replaced medoid is the chosen row of
        ``data`` (``data.iloc[i]``); an unchanged one is the object that
        was passed in. Clusters with no assigned rows keep their medoid.
    """
    points = data.to_numpy(dtype=float)
    labels = np.asarray(assignments)

    new_medoids = []
    for idm, medoid in enumerate(initial_medoids):
        members = points[labels == idm]
        if members.shape[0] == 0:
            # Nothing is assigned to this cluster, so there is no cost to
            # improve on; leave its medoid where it is.
            new_medoids.append(medoid)
            continue

        current_cost = ((members - np.asarray(medoid, dtype=float)) ** 2).sum()
        # One cost per candidate row; each entry is vectorised over the
        # cluster members, keeping memory at O(members * features).
        costs = np.array([((members - candidate) ** 2).sum() for candidate in points])
        best = int(np.argmin(costs))

        if costs[best] < current_cost:
            new_medoids.append(data.iloc[best])
        else:
            new_medoids.append(medoid)
    return new_medoids
|
||||||
|
|
||||||
|
|
||||||
|
def is_finished(old_medoids, new_medoids):
    """Tell whether two medoid collections contain the same vectors.

    Each medoid is converted to a tuple of its values and the two
    collections are compared as sets, so order and duplicates are ignored.
    Returns ``True`` when the sets are equal.
    """
    previous = {tuple(medoid) for medoid in old_medoids}
    current = {tuple(medoid) for medoid in new_medoids}
    return previous == current
|
||||||
|
|
||||||
|
|
||||||
|
def kmedoids(num_samples, num_clusters, max_iter=100):
    """Cluster the first rows of 'CC GENERAL.csv' with k-medoids and plot them.

    The data is read with 'CUST_ID' as the index, truncated to
    ``num_samples`` rows, scaled with ``MinMaxScaler``, and any remaining
    missing values are replaced by 0. Medoids start at random rows and are
    then refined by alternating two steps until they stop changing:
    assign every row to its nearest medoid, then re-pick each cluster's
    medoid. The final clusters are shown as a scatter plot of the first
    two PCA components, coloured by cluster.

    Args:
        num_samples: Number of leading rows of the CSV to cluster.
        num_clusters: Number of medoids / clusters.
        max_iter: Upper bound on assign/update rounds; a safety cap in
            case the medoids never settle.
    """
    df = pd.read_csv('CC GENERAL.csv', index_col='CUST_ID')
    df = df[:num_samples]
    df_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df))
    df_scaled = df_scaled.fillna(0)

    # initialize medoids (at random)
    medoids = initialize_medoids(num_medoids=num_clusters, data=df_scaled)

    # fit
    for _ in range(max_iter):
        # Cluster membership must be refreshed against the current medoids
        # on every round; reusing the initial assignments would make each
        # update optimise for clusters that no longer exist.
        assignments = assign_points_to_medoids(data=df_scaled, medoids=medoids)
        new_medoids = reassign_medoids(data=df_scaled, assignments=assignments, initial_medoids=medoids)
        converged = is_finished(old_medoids=medoids, new_medoids=new_medoids)
        medoids = new_medoids
        if converged:
            break

    final_assignments = assign_points_to_medoids(data=df_scaled, medoids=medoids)

    # Project to two dimensions purely for visualisation.
    data = pd.DataFrame(PCA(n_components=2).fit_transform(df_scaled), columns=['0', '1'])
    data['cluster'] = final_assignments

    sns.relplot(x='0', y='1', hue='cluster', data=data, palette=sns.color_palette("husl", num_clusters))
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the clustering demo on the first 500 rows of the CSV with 3 clusters.
# NOTE: this executes whenever the module is run or imported.
kmedoids(num_samples=500, num_clusters=3)
|
Loading…
Reference in New Issue
Block a user