import random import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from sklearn.metrics import silhouette_score from sklearn.preprocessing import MinMaxScaler class TrainModel: def __init__(self, data, k_value): self.data = data scaler = MinMaxScaler() # self.data = scaler.fit_transform(self.data) self.k_value = k_value self.kmedoids(self.data) def get_random_medoids(self, data): points = random.sample(range(0, len(data)), self.k_value) medoids = [] for i in range(self.k_value): medoids.append(data[i]) return medoids def get_closest_medoids(self, sample_point, medoids): min_distance = float('inf') closest_medoid = None for i in range(len(medoids)): distance = self.calculateDistance(sample_point, medoids[i]) if distance < min_distance: min_distance = distance closest_medoid = i return closest_medoid def get_clusters(self, data_points, medoids): clusters = [[] for _ in range(self.k_value)] for i in range(len(data_points)): x = self.get_closest_medoids(data_points[i], medoids) clusters[x].append(data_points[i]) return clusters def calculate_cost(self, data_points, clusters, medoids): cost = 0 for i in range(len(clusters)): for j in range(len(clusters[i])): cost += self.calculateDistance(medoids[i], clusters[i][j]) return cost def get_non_medoids(self, data_points, medoids): non_medoids = [] for sample in data_points: flag = False for m in medoids: if (sample == m).all(): flag = True if flag == False: non_medoids.append(sample) return non_medoids def get_clusters_label(self, data_points, clusters): labels = [] for i in range(len(data_points)): labels.append(0) for i in range(len(clusters)): cluster = clusters[i] for j in range(len(cluster)): for k in range(len(data_points)): if (cluster[j] == data_points[k]).all(): labels[k] = i break return labels def kmedoids(self, data): medoids = self.get_random_medoids(data) clusters = self.get_clusters(data, medoids) initial_cost = self.calculate_cost(data, clusters, medoids) while True: best_medoids = medoids lowest_cost = initial_cost for i in range(len(medoids)): non_medoids = self.get_non_medoids(data, medoids) for j in range(len(non_medoids)): new_medoids = medoids.copy() for k in range(len(new_medoids)): if (new_medoids[k] == medoids[i]).all(): new_medoids[k] = non_medoids[j] new_clusters = self.get_clusters(data, new_medoids) new_cost = self.calculate_cost(data, new_clusters, new_medoids) if new_cost < lowest_cost: lowest_cost = new_cost best_medoids = new_medoids if lowest_cost < initial_cost: initial_cost = lowest_cost medoids = best_medoids else: break final_clusters = self.get_clusters(data, medoids) cluster_labels = self.get_clusters_label(data, final_clusters) silhouette_avg = silhouette_score(data, cluster_labels) # First cluster x0 = np.squeeze(final_clusters[0])[:, 0] y0 = np.squeeze(final_clusters[0])[:, 1] # Second cluster x1 = np.squeeze(final_clusters[1])[:, 0] y1 = np.squeeze(final_clusters[1])[:, 1] plt.scatter(x0, y0, c='red') plt.scatter(x1, y1, c='green') # Draw medoids mx = [] my = [] for m in medoids: mx.append(m[0]) my.append(m[1]) plt.scatter(mx, my, c='yellow', marker='*') plt.xlabel("X") plt.ylabel("Y") plt.title("K-medoids clusters") plt.show() print('Sylwetka (ang. Silhouette) dla algorytmu k-medoid dla k =', self.k_value, 10 * '-', silhouette_avg) def calculateDistance(self, x, y): return np.linalg.norm(x - y) # Prepare dataset dataset = np.array([ [5, 6], [4, 7], [4, 8], [4, 6], [5, 7], [5, 8], [7, 6], [8, 8], [7, 7], [7, 8]] ) column_values = ['x', 'y'] df = pd.DataFrame(data=dataset, columns=column_values, index=None) # Draw data distribution sns.set_theme(style='darkgrid') sns.scatterplot(data=df, x='x', y='y') plt.show() # Run K-Medoids algorithm model = TrainModel(dataset, 2) # dataset = pd.read_csv('iris.csv') # dataset = dataset.iloc[:,:-1] # dataset = dataset.iloc[: , 1:] # dataset = dataset.values