svd_mpsic/kMedoids.py

164 lines
5.0 KiB
Python
Raw Normal View History

2022-06-15 22:16:06 +02:00
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
class TrainModel:
    """K-medoids clustering driven by exhaustive medoid/non-medoid swaps.

    Constructing an instance immediately runs :meth:`kmedoids` on ``data``,
    which shows a scatter plot of the clusters and prints the silhouette
    score. After the run, the results are also available as the attributes
    ``medoids``, ``clusters``, ``labels`` and ``silhouette``.
    """

    # Colours for the plotted clusters; the first two match the historical
    # red/green output, later clusters cycle through the rest.
    _CLUSTER_COLOURS = (
        'red', 'green', 'blue', 'orange', 'purple', 'cyan', 'magenta', 'brown',
    )

    def __init__(self, data, k_value):
        """Store the inputs and run the clustering.

        Args:
            data: Sequence of sample rows; converted with ``np.asarray`` so
                that rows support element-wise arithmetic and comparison.
            k_value: Number of clusters (medoids) to build.

        Raises:
            ValueError: If ``k_value`` is not between 1 and ``len(data)``.
        """
        data = np.asarray(data)
        if not 1 <= k_value <= len(data):
            raise ValueError(
                'k_value must be between 1 and the number of samples '
                '({}), got {!r}'.format(len(data), k_value)
            )
        # No feature scaling is applied here; the data is used as given.
        self.data = data
        self.k_value = k_value
        self.kmedoids(self.data)

    def get_random_medoids(self, data):
        """Return ``k_value`` distinct rows of ``data`` chosen at random."""
        indices = random.sample(range(len(data)), self.k_value)
        # Index with the sampled positions (not 0..k-1) so that the initial
        # medoids really are random.
        return [data[index] for index in indices]

    def get_closest_medoids(self, sample_point, medoids):
        """Return the index of the medoid nearest to ``sample_point``.

        Ties go to the medoid with the lowest index. Returns ``None`` when
        ``medoids`` is empty.
        """
        min_distance = float('inf')
        closest_medoid = None
        for index, medoid in enumerate(medoids):
            distance = self.calculateDistance(sample_point, medoid)
            if distance < min_distance:
                min_distance = distance
                closest_medoid = index
        return closest_medoid

    def get_clusters(self, data_points, medoids):
        """Group every data point under its nearest medoid.

        Returns:
            A list of ``k_value`` lists; list ``i`` holds the points whose
            closest medoid is ``medoids[i]``.
        """
        clusters = [[] for _ in range(self.k_value)]
        for point in data_points:
            clusters[self.get_closest_medoids(point, medoids)].append(point)
        return clusters

    def calculate_cost(self, data_points, clusters, medoids):
        """Return the summed distance of every point to its cluster's medoid.

        ``data_points`` is accepted for interface compatibility but not used.
        """
        return sum(
            self.calculateDistance(medoid, point)
            for medoid, cluster in zip(medoids, clusters)
            for point in cluster
        )

    def get_non_medoids(self, data_points, medoids):
        """Return the data points that are not equal to any current medoid."""
        return [
            sample
            for sample in data_points
            if not any(np.array_equal(sample, medoid) for medoid in medoids)
        ]

    def get_clusters_label(self, data_points, clusters):
        """Return, for each data point, the index of the cluster holding it.

        Each point is looked up by value, so duplicated points all receive
        the label of the cluster they were placed in. A point that is found
        in no cluster gets label 0.
        """
        labels = []
        for point in data_points:
            label = 0
            for index, cluster in enumerate(clusters):
                if any(np.array_equal(point, member) for member in cluster):
                    label = index
                    break
            labels.append(label)
        return labels

    def kmedoids(self, data):
        """Cluster ``data``, store the results, plot them and print the score.

        Sets ``self.medoids``, ``self.clusters``, ``self.labels`` and
        ``self.silhouette``. The plot uses the first two features of each
        point.
        """
        medoids = self._optimize_medoids(data)
        final_clusters = self.get_clusters(data, medoids)
        cluster_labels = self.get_clusters_label(data, final_clusters)
        silhouette_avg = self._silhouette(data, cluster_labels)

        self.medoids = medoids
        self.clusters = final_clusters
        self.labels = cluster_labels
        self.silhouette = silhouette_avg

        self._plot_clusters(final_clusters, medoids)
        print('Sylwetka (ang. Silhouette) dla algorytmu k-medoid dla k =', self.k_value, 10 * '-', silhouette_avg)

    def _optimize_medoids(self, data):
        """Run the swap search and return the final list of medoids."""
        medoids = self.get_random_medoids(data)
        current_cost = self.calculate_cost(
            data, self.get_clusters(data, medoids), medoids
        )
        while True:
            best_medoids, lowest_cost = medoids, current_cost
            # The candidate set only depends on the current medoids, so it
            # is computed once per pass rather than once per medoid.
            candidates = self.get_non_medoids(data, medoids)
            for position in range(len(medoids)):
                for candidate in candidates:
                    trial = list(medoids)
                    trial[position] = candidate
                    trial_cost = self.calculate_cost(
                        data, self.get_clusters(data, trial), trial
                    )
                    if trial_cost < lowest_cost:
                        best_medoids, lowest_cost = trial, trial_cost
            # Written as "not <" so that a NaN cost also ends the loop.
            if not lowest_cost < current_cost:
                return medoids
            medoids, current_cost = best_medoids, lowest_cost

    @staticmethod
    def _silhouette(data, labels):
        """Return the mean silhouette score, or NaN when it is undefined.

        The score is only computed when the number of distinct labels lies
        between 2 and ``len(data) - 1``; outside that range NaN is returned
        instead of calling ``silhouette_score``.
        """
        distinct = len(set(labels))
        if 2 <= distinct <= len(data) - 1:
            return silhouette_score(data, labels)
        return float('nan')

    def _plot_clusters(self, clusters, medoids):
        """Scatter-plot every non-empty cluster and mark the medoids."""
        colours = self._CLUSTER_COLOURS
        for index, cluster in enumerate(clusters):
            if not cluster:
                continue
            # Force a (n_points, n_features) shape so that a cluster with a
            # single point can still be sliced by column.
            points = np.asarray(cluster).reshape(len(cluster), -1)
            plt.scatter(points[:, 0], points[:, 1], c=colours[index % len(colours)])
        # Draw medoids
        mx = [medoid[0] for medoid in medoids]
        my = [medoid[1] for medoid in medoids]
        plt.scatter(mx, my, c='yellow', marker='*')
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.title("K-medoids clusters")
        plt.show()

    def calculateDistance(self, x, y):
        """Return the Euclidean distance between ``x`` and ``y``."""
        return np.linalg.norm(x - y)
# Toy two-feature dataset: ten points with integer coordinates.
dataset = np.array([
    [5, 6], [4, 7], [4, 8], [4, 6], [5, 7],
    [5, 8], [7, 6], [8, 8], [7, 7], [7, 8],
])
column_values = ['x', 'y']
df = pd.DataFrame(dataset, columns=column_values)

# Show the raw points before clustering.
sns.set_theme(style='darkgrid')
sns.scatterplot(x='x', y='y', data=df)
plt.show()

# Cluster the points into two groups; constructing TrainModel runs the
# algorithm, which plots the clusters and prints the silhouette score.
model = TrainModel(dataset, 2)

# Alternative input, currently disabled: the iris CSV with its last and
# first columns dropped, converted to a plain array.
# dataset = pd.read_csv('iris.csv')
# dataset = dataset.iloc[:,:-1]
# dataset = dataset.iloc[: , 1:]
# dataset = dataset.values