This commit is contained in:
jakubknczny 2021-06-18 20:05:07 +02:00
commit e8fcca18c3
3 changed files with 9025 additions and 0 deletions

8951
CC GENERAL.csv Normal file

File diff suppressed because it is too large Load Diff

70
kMedoids.py Normal file
View File

@ -0,0 +1,70 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.random import choice, seed
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
# Seed NumPy's global random generator so the random draw made by `choice`
# in initialize_medoids is reproducible from run to run.
seed(42)
def initialize_medoids(num_medoids, data):
    """Pick ``num_medoids`` distinct rows of ``data`` to serve as starting medoids.

    Row positions are drawn without replacement through ``numpy.random.choice``,
    so the outcome depends on the state of NumPy's global random generator.

    Args:
        num_medoids: How many rows to draw.
        data: DataFrame to draw rows from (accessed positionally via ``iloc``).

    Returns:
        A list with one row (as returned by ``data.iloc[position]``) per medoid.
    """
    row_positions = choice(len(data), size=num_medoids, replace=False)
    return [data.iloc[position] for position in row_positions]
def assign_points_to_medoids(data, medoids):
    """Label every row of ``data`` with the position of its closest medoid.

    Closeness is measured with ``distance_vec2vec``; when several medoids are
    equally close, ``np.argmin`` selects the first of them.

    Args:
        data: DataFrame whose rows are the points to label.
        medoids: Sequence of medoid vectors.

    Returns:
        A list with one entry per row of ``data`` (in row order), each being
        the index into ``medoids`` of the nearest medoid.
    """
    labels = []
    for _, row in data.iterrows():
        distances = [distance_vec2vec(row, medoid) for medoid in medoids]
        labels.append(np.argmin(distances))
    return labels
def distance_vec2vec(a, b) -> np.float64:
    """Return the squared Euclidean distance between ``a`` and ``b``.

    No square root is taken. Elements are read as ``a[i]`` / ``b[i]`` for
    ``i`` in ``range(len(a))``, so only the first ``len(a)`` entries of ``b``
    take part in the result.

    NOTE(review): for pandas Series the integer keys go through ``Series[...]``
    lookup; this presumably relies on the rows carrying 0..n-1 labels - confirm
    against callers.
    """
    total = 0
    for i in range(len(a)):
        gap = abs(a[i] - b[i])
        total = total + gap ** 2
    return total
def reassign_medoids(data, assignments, initial_medoids):
    """Choose, for every cluster, the data point that minimises the cluster cost.

    The cost of a candidate medoid is the sum of squared Euclidean distances
    from the candidate to every point currently assigned to the cluster. Every
    row of ``data`` is considered as a candidate. The current medoid is kept
    unless some row is strictly cheaper, and among equally cheap rows the
    first one wins.

    Args:
        data: Numeric DataFrame holding one point per row.
        assignments: Sequence with one cluster index per row of ``data``.
        initial_medoids: Current medoid vectors, one per cluster, in cluster
            index order.

    Returns:
        A list of medoids in the same order as ``initial_medoids``. Unchanged
        medoids are returned as the very objects that were passed in; replaced
        ones are rows of ``data`` (``data.iloc[position]``).
    """
    points = data.to_numpy(dtype=float)
    labels = np.asarray(assignments)

    new_medoids = []
    for idm, medoid in enumerate(initial_medoids):
        members = points[labels == idm]
        if len(members) == 0:
            # Nothing is assigned to this cluster, so every candidate costs 0
            # and there is no reason to move the medoid.
            new_medoids.append(medoid)
            continue

        best_medoid = medoid
        best_score = ((members - np.asarray(medoid, dtype=float)) ** 2).sum()
        for position, candidate in enumerate(points):
            candidate_score = ((members - candidate) ** 2).sum()
            if candidate_score < best_score:
                # Track the running minimum so the cheapest candidate wins,
                # not merely the last one that beats the starting medoid.
                best_score = candidate_score
                best_medoid = data.iloc[position]
        new_medoids.append(best_medoid)
    return new_medoids
def is_finished(old_medoids, new_medoids):
    """Tell whether two medoid collections contain the same vectors.

    Each medoid is converted to a tuple and the two collections are compared
    as sets, so neither ordering nor duplicates influence the result.

    Args:
        old_medoids: Medoids from the previous iteration.
        new_medoids: Medoids from the current iteration.

    Returns:
        True when both collections hold the same set of vectors, else False.
    """
    previous = {tuple(medoid) for medoid in old_medoids}
    current = {tuple(medoid) for medoid in new_medoids}
    return previous == current
def kmedoids(num_samples, num_clusters, csv_path='CC GENERAL.csv', max_iterations=100):
    """Cluster the first rows of a CSV file with k-medoids and plot the result.

    The file is read with ``CUST_ID`` as index column, truncated to
    ``num_samples`` rows, min-max scaled, and remaining NaNs are replaced by 0.
    Medoids start as randomly drawn rows; the fit then alternates between
    moving each medoid and reassigning every point to its nearest medoid until
    the medoid set stops changing. The clusters are shown on the first two
    principal components of the scaled data.

    Args:
        num_samples: Number of leading rows of the file to cluster.
        num_clusters: Number of medoids / clusters.
        csv_path: Path of the CSV file to read.
        max_iterations: Upper bound on medoid-update passes, as a safeguard
            against a fit that fails to settle.

    Returns:
        None. The scatter plot is displayed through ``plt.show()``.
    """
    df = pd.read_csv(csv_path, index_col='CUST_ID')
    df = df[:num_samples]

    df_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df))
    df_scaled = df_scaled.fillna(0)

    # initialize medoids (at random)
    medoids = initialize_medoids(num_medoids=num_clusters, data=df_scaled)

    # assign data points to the medoids
    assignments = assign_points_to_medoids(data=df_scaled, medoids=medoids)

    # fit: alternate medoid update and point reassignment until stable
    for _ in range(max_iterations):
        new_medoids = reassign_medoids(data=df_scaled, assignments=assignments, initial_medoids=medoids)
        converged = is_finished(old_medoids=medoids, new_medoids=new_medoids)
        medoids = new_medoids
        # Refresh the labels on every pass; otherwise each medoid update would
        # keep optimising against the very first clustering.
        assignments = assign_points_to_medoids(data=df_scaled, medoids=medoids)
        if converged:
            break

    data = pd.DataFrame(PCA(n_components=2).fit_transform(df_scaled), columns=['0', '1'])
    data['cluster'] = assignments

    sns.relplot(x='0', y='1', hue='cluster', data=data, palette=sns.color_palette("husl", num_clusters))
    plt.show()
# Script entry point: cluster the first 500 rows into 3 groups and show the plot.
# NOTE(review): this also runs when the module is imported; consider an
# `if __name__ == "__main__":` guard.
kmedoids(num_samples=500, num_clusters=3)

4
readme.md Normal file
View File

@ -0,0 +1,4 @@
# K-medoids (Partitioning Around Medoids)
A PAM k-medoids implementation written for the Mathematical Foundations of Artificial Intelligence
and Cyber-Security classes at AMU Poznań.