164 lines
5.0 KiB
Python
164 lines
5.0 KiB
Python
|
import random
|
||
|
|
||
|
import matplotlib.pyplot as plt
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
import seaborn as sns
|
||
|
from sklearn.metrics import silhouette_score
|
||
|
from sklearn.preprocessing import MinMaxScaler
|
||
|
|
||
|
|
||
|
class TrainModel:
|
||
|
def __init__(self, data, k_value):
|
||
|
self.data = data
|
||
|
scaler = MinMaxScaler()
|
||
|
# self.data = scaler.fit_transform(self.data)
|
||
|
self.k_value = k_value
|
||
|
self.kmedoids(self.data)
|
||
|
|
||
|
def get_random_medoids(self, data):
|
||
|
points = random.sample(range(0, len(data)), self.k_value)
|
||
|
medoids = []
|
||
|
for i in range(self.k_value):
|
||
|
medoids.append(data[i])
|
||
|
return medoids
|
||
|
|
||
|
def get_closest_medoids(self, sample_point, medoids):
|
||
|
min_distance = float('inf')
|
||
|
closest_medoid = None
|
||
|
for i in range(len(medoids)):
|
||
|
distance = self.calculateDistance(sample_point, medoids[i])
|
||
|
if distance < min_distance:
|
||
|
min_distance = distance
|
||
|
closest_medoid = i
|
||
|
return closest_medoid
|
||
|
|
||
|
def get_clusters(self, data_points, medoids):
|
||
|
clusters = [[] for _ in range(self.k_value)]
|
||
|
for i in range(len(data_points)):
|
||
|
x = self.get_closest_medoids(data_points[i], medoids)
|
||
|
clusters[x].append(data_points[i])
|
||
|
return clusters
|
||
|
|
||
|
def calculate_cost(self, data_points, clusters, medoids):
|
||
|
cost = 0
|
||
|
for i in range(len(clusters)):
|
||
|
for j in range(len(clusters[i])):
|
||
|
cost += self.calculateDistance(medoids[i], clusters[i][j])
|
||
|
return cost
|
||
|
|
||
|
def get_non_medoids(self, data_points, medoids):
|
||
|
non_medoids = []
|
||
|
for sample in data_points:
|
||
|
flag = False
|
||
|
for m in medoids:
|
||
|
if (sample == m).all():
|
||
|
flag = True
|
||
|
if flag == False:
|
||
|
non_medoids.append(sample)
|
||
|
return non_medoids
|
||
|
|
||
|
def get_clusters_label(self, data_points, clusters):
|
||
|
labels = []
|
||
|
for i in range(len(data_points)):
|
||
|
labels.append(0)
|
||
|
for i in range(len(clusters)):
|
||
|
cluster = clusters[i]
|
||
|
for j in range(len(cluster)):
|
||
|
for k in range(len(data_points)):
|
||
|
if (cluster[j] == data_points[k]).all():
|
||
|
labels[k] = i
|
||
|
break
|
||
|
return labels
|
||
|
|
||
|
def kmedoids(self, data):
|
||
|
medoids = self.get_random_medoids(data)
|
||
|
clusters = self.get_clusters(data, medoids)
|
||
|
initial_cost = self.calculate_cost(data, clusters, medoids)
|
||
|
while True:
|
||
|
best_medoids = medoids
|
||
|
lowest_cost = initial_cost
|
||
|
for i in range(len(medoids)):
|
||
|
non_medoids = self.get_non_medoids(data, medoids)
|
||
|
for j in range(len(non_medoids)):
|
||
|
new_medoids = medoids.copy()
|
||
|
for k in range(len(new_medoids)):
|
||
|
if (new_medoids[k] == medoids[i]).all():
|
||
|
new_medoids[k] = non_medoids[j]
|
||
|
new_clusters = self.get_clusters(data, new_medoids)
|
||
|
new_cost = self.calculate_cost(data, new_clusters, new_medoids)
|
||
|
if new_cost < lowest_cost:
|
||
|
lowest_cost = new_cost
|
||
|
best_medoids = new_medoids
|
||
|
if lowest_cost < initial_cost:
|
||
|
initial_cost = lowest_cost
|
||
|
medoids = best_medoids
|
||
|
else:
|
||
|
break
|
||
|
final_clusters = self.get_clusters(data, medoids)
|
||
|
cluster_labels = self.get_clusters_label(data, final_clusters)
|
||
|
silhouette_avg = silhouette_score(data, cluster_labels)
|
||
|
|
||
|
# First cluster
|
||
|
x0 = np.squeeze(final_clusters[0])[:, 0]
|
||
|
y0 = np.squeeze(final_clusters[0])[:, 1]
|
||
|
|
||
|
# Second cluster
|
||
|
x1 = np.squeeze(final_clusters[1])[:, 0]
|
||
|
y1 = np.squeeze(final_clusters[1])[:, 1]
|
||
|
|
||
|
plt.scatter(x0, y0, c='red')
|
||
|
plt.scatter(x1, y1, c='green')
|
||
|
|
||
|
# Draw medoids
|
||
|
mx = []
|
||
|
my = []
|
||
|
for m in medoids:
|
||
|
mx.append(m[0])
|
||
|
my.append(m[1])
|
||
|
plt.scatter(mx, my, c='yellow', marker='*')
|
||
|
|
||
|
plt.xlabel("X")
|
||
|
plt.ylabel("Y")
|
||
|
plt.title("K-medoids clusters")
|
||
|
plt.show()
|
||
|
|
||
|
|
||
|
print('Sylwetka (ang. Silhouette) dla algorytmu k-medoid dla k =', self.k_value, 10 * '-', silhouette_avg)
|
||
|
|
||
|
def calculateDistance(self, x, y):
|
||
|
return np.linalg.norm(x - y)
|
||
|
|
||
|
|
||
|
# Prepare dataset
|
||
|
dataset = np.array([
|
||
|
[5, 6],
|
||
|
[4, 7],
|
||
|
[4, 8],
|
||
|
[4, 6],
|
||
|
[5, 7],
|
||
|
[5, 8],
|
||
|
[7, 6],
|
||
|
[8, 8],
|
||
|
[7, 7],
|
||
|
[7, 8]]
|
||
|
)
|
||
|
column_values = ['x', 'y']
|
||
|
df = pd.DataFrame(data=dataset, columns=column_values, index=None)
|
||
|
|
||
|
# Draw data distribution
|
||
|
sns.set_theme(style='darkgrid')
|
||
|
sns.scatterplot(data=df, x='x', y='y')
|
||
|
plt.show()
|
||
|
|
||
|
# Run K-Medoids algorithm
|
||
|
model = TrainModel(dataset, 2)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# dataset = pd.read_csv('iris.csv')
|
||
|
# dataset = dataset.iloc[:,:-1]
|
||
|
# dataset = dataset.iloc[: , 1:]
|
||
|
# dataset = dataset.values
|