2024-11-03 23:54:46 +01:00

892 KiB

Raw Blame History

# Importowanie potrzebnych bibliotek
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from collections import defaultdict
import warnings
from sklearn.utils import Bunch
warnings.simplefilter("ignore")

Metryka euklidesowa z 3 klasami

iris = datasets.load_iris()

feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
X = pd.DataFrame(iris.data, columns=feature_names)
y = pd.DataFrame(iris.target, columns=['Classes'])

# Build the species column in one assignment from the numeric class label.
# Chained assignment (X['species'][mask] = ...) writes to a temporary object
# under pandas copy-on-write, which would leave the column unchanged.
species_by_class = {0: 'Setosa', 1: 'versicolor', 2: 'virginica'}
X['species'] = y['Classes'].map(species_by_class)

# Number of clusters = number of distinct class labels
clusters = len(np.unique(y))

# Euclidean distance function
def euclidean_dis(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The result is the square root of the sum of the squared element-wise
    differences.
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# KMeans implementation
class KMeans:
    """Minimal k-means style clusterer using Euclidean distance by default.

    Centroids are seeded with the first ``k`` rows of ``data``. Each iteration
    assigns every point to its nearest centroid and then moves each centroid
    to the mean of the points assigned to it.

    Args:
        data: Indexable, iterable collection of numeric points (for example a
            2-D NumPy array).
        k: Number of clusters.
        max_ite: Maximum number of assignment/update iterations.
        distance: Optional ``callable(point, centroid) -> float``. When
            omitted, the module-level ``euclidean_dis`` is used.
    """

    # Iteration stops once, for every centroid, the summed relative movement
    # of its coordinates (in percent) is at most this value.
    TOLERANCE_PERCENT = 0.001

    def __init__(self, data, k, max_ite, distance=None):
        self.data = data
        self.k = k
        self.max_ite = max_ite
        self.distance = distance

    def _empty_clusters(self):
        """Return a mapping with one empty member list per cluster index."""
        clusters = defaultdict(list)
        for key in range(self.k):
            clusters[key] = []
        return clusters

    def _has_converged(self, new_centroids, old_centroids):
        """Return True when no centroid moved by more than the tolerance."""
        for t in range(self.k):
            old = np.asarray(old_centroids[t], dtype=float)
            new = np.asarray(new_centroids[t], dtype=float)
            # Absolute values are used so that opposite-signed shifts cannot
            # cancel out and a net-negative shift is not read as convergence.
            shift = np.abs(new - old)
            # A zero coordinate would make the relative change undefined, so
            # the absolute change is used for that coordinate instead.
            scale = np.abs(old)
            scale = np.where(scale == 0, 1.0, scale)
            if np.sum(shift / scale * 100) > self.TOLERANCE_PERCENT:
                return False
        return True

    def predict(self):
        """Cluster ``self.data``.

        Returns:
            tuple: ``(classes, centroids)`` where ``classes`` maps each
            cluster index to the list of points assigned to it in the last
            iteration and ``centroids`` maps each cluster index to its
            centroid.
        """
        # Resolved here so the module-level default is only looked up when no
        # custom metric was supplied.
        distance = euclidean_dis if self.distance is None else self.distance

        centroids = {i: np.asarray(self.data[i], dtype=float) for i in range(self.k)}
        # Defined before the loop so the method still returns when max_ite is 0.
        classes = self._empty_clusters()

        for _ in range(self.max_ite):
            classes = self._empty_clusters()

            for datapoint in self.data:
                distances = [distance(datapoint, centroids[j]) for j in range(self.k)]
                min_distance_index = distances.index(min(distances))
                classes[min_distance_index].append(datapoint)

            old_centroids = dict(centroids)
            for t in range(self.k):
                # An empty cluster keeps its previous centroid; the mean of
                # an empty list would be NaN.
                if classes[t]:
                    centroids[t] = np.mean(classes[t], axis=0)

            if self._has_converged(centroids, old_centroids):
                break

        return classes, centroids

# Run the custom KMeans implementation on the four iris feature columns
kmeans = KMeans(iris.data[:, :4], clusters, 10000)
classes, centroids = kmeans.predict()

# Print the size of every cluster. The loop is driven by `clusters` rather
# than a hard-coded 3 so it stays correct if the number of clusters changes.
for i in range(clusters):
    classes[i] = np.array(classes[i]).tolist()
    print(f"Liczba elementów w klasie {i}: {len(classes[i])}")

print("Centroidy:")
for centroid in centroids.values():
    print(centroid)

# Helper used when building the confusion matrix
def subset(array1, array2):
    """Return True if ``array2`` equals any element of ``array1``.

    Equality is decided by ``np.array_equal``.
    """
    for candidate in array1:
        if np.array_equal(array2, candidate):
            return True
    return False

def confusion_matrix(a, b, c, classes):
    """Count how the samples of each true class are spread over the clusters.

    Args:
        a, b, c: Collections of points belonging to true class 0, 1 and 2.
            Trailing empty collections are ignored, so a two-class problem
            can pass ``[]`` for ``c``.
        classes: Mapping of cluster index to the list of points in that
            cluster.

    Returns:
        list[list[int]]: ``cm[i][j]`` is the number of points of true class
        ``i`` found in the ``j``-th cluster, in ``classes.values()`` order.

    Note:
        Columns follow the cluster order, which need not correspond to the
        order of the true classes; diagonal-based metrics are only meaningful
        once clusters have been matched to classes.
    """
    true_groups = [a, b, c]
    # The matrix is sized from the arguments instead of module-level state so
    # the result does not depend on what a global label frame holds.
    while true_groups and len(true_groups[-1]) == 0:
        true_groups.pop()

    clusters_in_order = list(classes.values())
    cm = [[0] * len(clusters_in_order) for _ in true_groups]
    for idx, data_class in enumerate(true_groups):
        for element in data_class:
            for i, cluster in enumerate(clusters_in_order):
                if any(np.array_equal(element, member) for member in cluster):
                    cm[idx][i] += 1
    return cm

# Performance metrics computed from a confusion matrix
class Metrics:
    """Accuracy, macro recall, macro precision and F1 from a confusion matrix.

    Args:
        confusion_m: Square matrix (nested lists or array). Recall divides the
            diagonal by row sums and precision by column sums, so rows are
            treated as true classes and columns as predicted classes.

    Ratios with a zero denominator are reported as 0 instead of NaN.
    """

    def __init__(self, confusion_m):
        self.confusion_m = np.array(confusion_m)
        self.total = np.sum(confusion_m)
        self.diagonal = np.sum(np.diag(confusion_m))

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Element-wise ``numerator / denominator`` with 0 where the denominator is 0."""
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros_like(denominator, dtype=float)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def accuracy(self):
        """Return the share of samples lying on the diagonal."""
        if self.total == 0:
            return 0.0
        return self.diagonal / self.total

    def recall(self):
        """Return the mean over classes of diagonal / row sum."""
        row_totals = np.sum(self.confusion_m, axis=1)
        return np.mean(self._safe_ratio(np.diag(self.confusion_m), row_totals))

    def precision(self):
        """Return the mean over classes of diagonal / column sum."""
        column_totals = np.sum(self.confusion_m, axis=0)
        return np.mean(self._safe_ratio(np.diag(self.confusion_m), column_totals))

    def f1_score(self, precision, recall):
        """Return the harmonic mean of ``precision`` and ``recall`` (0 if both are 0)."""
        denominator = precision + recall
        if denominator == 0:
            return 0.0
        return (2 * precision * recall) / denominator

# Split the iris samples by their true label using boolean masks
class0 = iris.data[iris.target == 0]
class1 = iris.data[iris.target == 1]
class2 = iris.data[iris.target == 2]

# Build the confusion matrix and derive the metrics from it
matrix = confusion_matrix(class0, class1, class2, classes)
performance = Metrics(matrix)

accuracy = performance.accuracy()
recall = performance.recall()
precision = performance.precision()
f1_score = performance.f1_score(precision, recall)

# Report the results
print('Macierz pomyłek:')
print(np.array(matrix))

print(f"Dokładność modelu: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precyzja: {precision * 100:.2f}%")
print(f"F1-score: {f1_score * 100:.2f}%")

# Use the built-in KMeans from scikit-learn for comparison
from sklearn.cluster import KMeans as SklearnKMeans

# n_init is passed explicitly because its default differs between
# scikit-learn releases; 10 matches the value used elsewhere in this notebook.
kmeans_sklearn = SklearnKMeans(n_clusters=3, n_init=10, random_state=42)
y_kmeans = kmeans_sklearn.fit_predict(iris.data)

print("Klasyfikacja przy użyciu wbudowanego KMeans:", y_kmeans)

Liczba elementów w klasie 0: 39
Liczba elementów w klasie 1: 61
Liczba elementów w klasie 2: 50
Centroidy:
[6.85384615 3.07692308 5.71538462 2.05384615]
[5.88360656 2.74098361 4.38852459 1.43442623]
[5.006 3.428 1.462 0.246]
Macierz pomyłek:
[[ 0  0 50]
 [ 3 47  0]
 [36 14  0]]
Dokładność modelu: 31.33%
Recall: 31.33%
Precyzja: 25.68%
F1-score: 28.23%
Klasyfikacja przy użyciu wbudowanego KMeans: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]

Metryka L1 z 3 klasami

# Manhattan distance
def manhattan_dis(x1, x2):
    """Return the Manhattan (L1) distance between ``x1`` and ``x2``.

    The result is the sum of the absolute element-wise differences.
    """
    delta = x1 - x2
    return np.sum(np.abs(delta))

# KMeans implementation
class KMeans:
    """Minimal k-means style clusterer using Manhattan distance by default.

    Centroids are seeded with the first ``k`` rows of ``data``. Each iteration
    assigns every point to its nearest centroid and then moves each centroid
    to the arithmetic mean of the points assigned to it.

    Args:
        data: Indexable, iterable collection of numeric points (for example a
            2-D NumPy array).
        k: Number of clusters.
        max_ite: Maximum number of assignment/update iterations.
        distance: Optional ``callable(point, centroid) -> float``. When
            omitted, the module-level ``manhattan_dis`` is used.
    """

    # Iteration stops once, for every centroid, the summed relative movement
    # of its coordinates (in percent) is at most this value.
    TOLERANCE_PERCENT = 0.001

    def __init__(self, data, k, max_ite, distance=None):
        self.data = data
        self.k = k
        self.max_ite = max_ite
        self.distance = distance

    def _empty_clusters(self):
        """Return a mapping with one empty member list per cluster index."""
        clusters = defaultdict(list)
        for key in range(self.k):
            clusters[key] = []
        return clusters

    def _has_converged(self, new_centroids, old_centroids):
        """Return True when no centroid moved by more than the tolerance."""
        for t in range(self.k):
            old = np.asarray(old_centroids[t], dtype=float)
            new = np.asarray(new_centroids[t], dtype=float)
            # Absolute values are used so that opposite-signed shifts cannot
            # cancel out and a net-negative shift is not read as convergence.
            shift = np.abs(new - old)
            # A zero coordinate would make the relative change undefined, so
            # the absolute change is used for that coordinate instead.
            scale = np.abs(old)
            scale = np.where(scale == 0, 1.0, scale)
            if np.sum(shift / scale * 100) > self.TOLERANCE_PERCENT:
                return False
        return True

    def predict(self):
        """Cluster ``self.data``.

        Returns:
            tuple: ``(classes, centroids)`` where ``classes`` maps each
            cluster index to the list of points assigned to it in the last
            iteration and ``centroids`` maps each cluster index to its
            centroid.
        """
        # Resolved here so the module-level default is only looked up when no
        # custom metric was supplied.
        distance = manhattan_dis if self.distance is None else self.distance

        centroids = {i: np.asarray(self.data[i], dtype=float) for i in range(self.k)}
        # Defined before the loop so the method still returns when max_ite is 0.
        classes = self._empty_clusters()

        for _ in range(self.max_ite):
            classes = self._empty_clusters()

            for datapoint in self.data:
                distances = [distance(datapoint, centroids[j]) for j in range(self.k)]
                min_distance_index = distances.index(min(distances))
                classes[min_distance_index].append(datapoint)

            old_centroids = dict(centroids)
            for t in range(self.k):
                # An empty cluster keeps its previous centroid; the mean of
                # an empty list would be NaN.
                if classes[t]:
                    centroids[t] = np.mean(classes[t], axis=0)

            if self._has_converged(centroids, old_centroids):
                break

        return classes, centroids

# Run the custom KMeans implementation on the four iris feature columns
kmeans = KMeans(iris.data[:, :4], clusters, 10000)
classes, centroids = kmeans.predict()

# Print the size of every cluster. The loop is driven by `clusters` rather
# than a hard-coded 3 so it stays correct if the number of clusters changes.
for i in range(clusters):
    classes[i] = np.array(classes[i]).tolist()
    print(f"Liczba elementów w klasie {i}: {len(classes[i])}")

print("Centroidy:")
for centroid in centroids.values():
    print(centroid)

# Helper used when building the confusion matrix
def subset(array1, array2):
    """Return True if ``array2`` equals any element of ``array1``.

    Equality is decided by ``np.array_equal``.
    """
    for candidate in array1:
        if np.array_equal(array2, candidate):
            return True
    return False

def confusion_matrix(a, b, c, classes):
    """Count how the samples of each true class are spread over the clusters.

    Args:
        a, b, c: Collections of points belonging to true class 0, 1 and 2.
            Trailing empty collections are ignored, so a two-class problem
            can pass ``[]`` for ``c``.
        classes: Mapping of cluster index to the list of points in that
            cluster.

    Returns:
        list[list[int]]: ``cm[i][j]`` is the number of points of true class
        ``i`` found in the ``j``-th cluster, in ``classes.values()`` order.

    Note:
        Columns follow the cluster order, which need not correspond to the
        order of the true classes; diagonal-based metrics are only meaningful
        once clusters have been matched to classes.
    """
    true_groups = [a, b, c]
    # The matrix is sized from the arguments instead of module-level state so
    # the result does not depend on what a global label frame holds.
    while true_groups and len(true_groups[-1]) == 0:
        true_groups.pop()

    clusters_in_order = list(classes.values())
    cm = [[0] * len(clusters_in_order) for _ in true_groups]
    for idx, data_class in enumerate(true_groups):
        for element in data_class:
            for i, cluster in enumerate(clusters_in_order):
                if any(np.array_equal(element, member) for member in cluster):
                    cm[idx][i] += 1
    return cm

# Performance metrics computed from a confusion matrix
class Metrics:
    """Accuracy, macro recall, macro precision and F1 from a confusion matrix.

    Args:
        confusion_m: Square matrix (nested lists or array). Recall divides the
            diagonal by row sums and precision by column sums, so rows are
            treated as true classes and columns as predicted classes.

    Ratios with a zero denominator are reported as 0 instead of NaN.
    """

    def __init__(self, confusion_m):
        self.confusion_m = np.array(confusion_m)
        self.total = np.sum(confusion_m)
        self.diagonal = np.sum(np.diag(confusion_m))

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Element-wise ``numerator / denominator`` with 0 where the denominator is 0."""
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        result = np.zeros_like(denominator, dtype=float)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def accuracy(self):
        """Return the share of samples lying on the diagonal."""
        if self.total == 0:
            return 0.0
        return self.diagonal / self.total

    def recall(self):
        """Return the mean over classes of diagonal / row sum."""
        row_totals = np.sum(self.confusion_m, axis=1)
        return np.mean(self._safe_ratio(np.diag(self.confusion_m), row_totals))

    def precision(self):
        """Return the mean over classes of diagonal / column sum."""
        column_totals = np.sum(self.confusion_m, axis=0)
        return np.mean(self._safe_ratio(np.diag(self.confusion_m), column_totals))

    def f1_score(self, precision, recall):
        """Return the harmonic mean of ``precision`` and ``recall`` (0 if both are 0)."""
        denominator = precision + recall
        if denominator == 0:
            return 0.0
        return (2 * precision * recall) / denominator

# Split the iris samples by their true label using boolean masks
class0 = iris.data[iris.target == 0]
class1 = iris.data[iris.target == 1]
class2 = iris.data[iris.target == 2]

# Build the confusion matrix and derive the metrics from it
matrix = confusion_matrix(class0, class1, class2, classes)
performance = Metrics(matrix)

accuracy = performance.accuracy()
recall = performance.recall()
precision = performance.precision()
f1_score = performance.f1_score(precision, recall)

# Report the results
print('Macierz pomyłek:')
print(np.array(matrix))

print(f"Dokładność modelu: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precyzja: {precision * 100:.2f}%")
print(f"F1-score: {f1_score * 100:.2f}%")

# Use the built-in KMeans from scikit-learn for comparison
from sklearn.cluster import KMeans as SklearnKMeans

# n_init is passed explicitly because its default differs between
# scikit-learn releases; 10 matches the value used elsewhere in this notebook.
kmeans_sklearn = SklearnKMeans(n_clusters=3, n_init=10, random_state=42)
y_kmeans = kmeans_sklearn.fit_predict(iris.data)

print("Klasyfikacja przy użyciu wbudowanego KMeans:", y_kmeans)

Liczba elementów w klasie 0: 63
Liczba elementów w klasie 1: 37
Liczba elementów w klasie 2: 50
Centroidy:
[5.9047619  2.74603175 4.41269841 1.43333333]
[6.87027027 3.08648649 5.74594595 2.08918919]
[5.006 3.428 1.462 0.246]
Macierz pomyłek:
[[ 0  0 50]
 [48  2  0]
 [15 35  0]]
Dokładność modelu: 1.33%
Recall: 1.33%
Precyzja: 1.80%
F1-score: 1.53%
Klasyfikacja przy użyciu wbudowanego KMeans: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]

Metryka euklidesowa z 2 klasami

# Remove one class (class 2). The mask is computed once from the unfiltered
# labels and applied to both frames; the index is renumbered afterwards.
keep = y['Classes'] != 2
X = X[keep].reset_index(drop=True)
y = y[keep].reset_index(drop=True)

# Number of clusters after removing the class
clusters = len(np.unique(y))

# Euclidean distance function
def euclidean_dis(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The result is the square root of the sum of the squared element-wise
    differences.
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# KMeans implementation with the Euclidean metric
class KMeans:
    """Minimal k-means style clusterer using Euclidean distance by default.

    Centroids are seeded with the first ``k`` rows of ``data``. Each iteration
    assigns every point to its nearest centroid and then moves each centroid
    to the mean of the points assigned to it.

    Args:
        data: Indexable, iterable collection of numeric points (for example a
            2-D NumPy array).
        k: Number of clusters.
        max_ite: Maximum number of assignment/update iterations.
        distance: Optional ``callable(point, centroid) -> float``. When
            omitted, the module-level ``euclidean_dis`` is used.
    """

    # Iteration stops once, for every centroid, the summed relative movement
    # of its coordinates (in percent) is at most this value.
    TOLERANCE_PERCENT = 0.001

    def __init__(self, data, k, max_ite, distance=None):
        self.data = data
        self.k = k
        self.max_ite = max_ite
        self.distance = distance

    def _empty_clusters(self):
        """Return a mapping with one empty member list per cluster index."""
        clusters = defaultdict(list)
        for key in range(self.k):
            clusters[key] = []
        return clusters

    def _has_converged(self, new_centroids, old_centroids):
        """Return True when no centroid moved by more than the tolerance."""
        for t in range(self.k):
            old = np.asarray(old_centroids[t], dtype=float)
            new = np.asarray(new_centroids[t], dtype=float)
            # Absolute values are used so that opposite-signed shifts cannot
            # cancel out and a net-negative shift is not read as convergence.
            shift = np.abs(new - old)
            # A zero coordinate would make the relative change undefined, so
            # the absolute change is used for that coordinate instead.
            scale = np.abs(old)
            scale = np.where(scale == 0, 1.0, scale)
            if np.sum(shift / scale * 100) > self.TOLERANCE_PERCENT:
                return False
        return True

    def predict(self):
        """Cluster ``self.data``.

        Returns:
            tuple: ``(classes, centroids)`` where ``classes`` maps each
            cluster index to the list of points assigned to it in the last
            iteration and ``centroids`` maps each cluster index to its
            centroid.
        """
        # Resolved here so the module-level default is only looked up when no
        # custom metric was supplied.
        distance = euclidean_dis if self.distance is None else self.distance

        centroids = {i: np.asarray(self.data[i], dtype=float) for i in range(self.k)}
        # Defined before the loop so the method still returns when max_ite is 0.
        classes = self._empty_clusters()

        for _ in range(self.max_ite):
            classes = self._empty_clusters()

            for datapoint in self.data:
                distances = [distance(datapoint, centroids[j]) for j in range(self.k)]
                min_distance_index = distances.index(min(distances))
                classes[min_distance_index].append(datapoint)

            old_centroids = dict(centroids)
            for t in range(self.k):
                # An empty cluster keeps its previous centroid; the mean of
                # an empty list would be NaN.
                if classes[t]:
                    centroids[t] = np.mean(classes[t], axis=0)

            if self._has_converged(centroids, old_centroids):
                break

        return classes, centroids

# Cluster on the numeric measurements only. X also carries a 'species'
# column, which is not a feature and must not enter the distance computation.
features = X.drop(columns=['species'], errors='ignore').to_numpy(dtype=float)

# Run the KMeans implementation on the two remaining classes
kmeans = KMeans(features, clusters, 10000)
classes, centroids = kmeans.predict()

# Print the size of every cluster
for i in range(clusters):
    classes[i] = np.array(classes[i]).tolist()
    print(f"Liczba elementów w klasie {i}: {len(classes[i])}")

print("Centroidy:")
for centroid in centroids.values():
    print(centroid)

# Split the samples by true label, using the same feature matrix that was
# clustered so that points can be matched against cluster members.
labels = y['Classes'].to_numpy()
class0 = features[labels == 0]
class1 = features[labels == 1]

matrix = confusion_matrix(class0, class1, [], classes)
performance = Metrics(matrix)

accuracy = performance.accuracy()
recall = performance.recall()
precision = performance.precision()
f1_score = performance.f1_score(precision, recall)

print('Macierz pomyłek:')
print(np.array(matrix))

print(f"Dokładność modelu: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precyzja: {precision * 100:.2f}%")
print(f"F1-score: {f1_score * 100:.2f}%")

Liczba elementów w klasie 0: 50
Liczba elementów w klasie 1: 50
Centroidy:
[5.006 3.428 1.462 0.246]
[5.936 2.77  4.26  1.326]
Macierz pomyłek:
[[50  0]
 [ 0 50]]
Dokładność modelu: 100.00%
Recall: 100.00%
Precyzja: 100.00%
F1-score: 100.00%

Porównanie wyników klasyfikacji przy użyciu metryki euklidesowej

1. Wyniki z metryką euklidesową dla wszystkich klas:

Liczba elementów w klasach:
- Klasa 0: 39
- Klasa 1: 61
- Klasa 2: 50
Centroidy:
- Klasa 0: [6.8538, 3.0769, 5.7154, 2.0538]
- Klasa 1: [5.8836, 2.7410, 4.3885, 1.4344]
- Klasa 2: [5.006, 3.428, 1.462, 0.246]
Macierz pomyłek:
- [[0, 0, 50], [3, 47, 0], [36, 14, 0]]
Metryki wydajności:
- Dokładność: 31.33%
- Recall: 31.33%
- Precyzja: 25.68%
- F1-score: 28.23%
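Komentarz: Model wykazuje niską jakość klasyfikacji, szczególnie dla klasy 2, która została całkowicie pominięta w prognozach. Chociaż klasa 1 ma relatywnie wysoką precyzję, wyniki ogólne są słabe, co sugeruje problemy z klastrowaniem przy pełnym zbiorze danych. Uwaga: numery klastrów są nadawane arbitralnie i nie muszą odpowiadać numerom klas, dlatego metryki liczone z przekątnej macierzy pomyłek zaniżają faktyczną jakość grupowania. Z macierzy wynika, że wszystkie 50 próbek klasy 0 trafiło do klastra 2, 47 próbek klasy 1 do klastra 1, a 36 próbek klasy 2 do klastra 0; po dopasowaniu klastrów do klas poprawnie zgrupowane są 133 próbki ze 150 (ok. 88,7%).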

2. Wyniki z metryką euklidesową po usunięciu jednej klasy:

Liczba elementów w klasach:
- Klasa 0: 50
- Klasa 1: 50
Centroidy:
- Klasa 0: [5.006, 3.428, 1.462, 0.246]
- Klasa 1: [5.936, 2.770, 4.260, 1.326]
Macierz pomyłek:
- [[50, 0], [0, 50]]
Metryki wydajności:
- Dokładność: 100.00%
- Recall: 100.00%
- Precyzja: 100.00%
- F1-score: 100.00%
Komentarz: Po usunięciu jednej klasy model osiągnął perfekcyjne wyniki. Wszystkie punkty danych zostały poprawnie sklasyfikowane, co pokazuje, że zredukowanie liczby klas poprawiło znacząco wydajność algorytmu.

3. Wyniki z metryką L1 dla wszystkich klas:

Liczba elementów w klasach:
- Klasa 0: 63
- Klasa 1: 37
- Klasa 2: 50
Centroidy:
- Klasa 0: [5.9048, 2.7460, 4.4127, 1.4333]
- Klasa 1: [6.8703, 3.0865, 5.7459, 2.0892]
- Klasa 2: [5.006, 3.428, 1.462, 0.246]
Macierz pomyłek:
- [[0, 0, 50], [48, 2, 0], [15, 35, 0]]
Metryki wydajności:
- Dokładność: 1.33%
- Recall: 1.33%
- Precyzja: 1.80%
- F1-score: 1.53%
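Komentarz: Podobnie jak w przypadku metryki euklidesowej z 3 klasami, model z tą metryką wykazuje bardzo niską dokładność. Klasyfikacja nie działa dobrze przy wykorzystaniu metryki L1 dla pełnego zbioru danych. Uwaga: tak niska wartość wynika z innej numeracji klastrów niż klas, a nie z samej metryki. Z macierzy wynika, że 50 próbek klasy 0 trafiło do klastra 2, 48 próbek klasy 1 do klastra 0, a 35 próbek klasy 2 do klastra 1; po dopasowaniu klastrów do klas poprawnie zgrupowane są 133 próbki ze 150 (ok. 88,7%).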

Podsumowanie:

Metryka euklidesowa dla wszystkich klas: Niska wydajność z punktami błędnie klasyfikowanymi. Klasa 2 została całkowicie pominięta.
Metryka euklidesowa po usunięciu klasy: Perfekcyjne wyniki. Algorytm działa doskonale przy dwóch klasach.
Metryka L1 dla wszystkich klas: Bardzo niska jakość klasyfikacji, co wskazuje na nieodpowiedniość metryki do struktury danych w tym przypadku.

Wnioski:

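Zastosowanie metryki euklidesowej przy pełnym zbiorze danych wykazuje lepsze rezultaty w porównaniu do metryki L1, ale po usunięciu jednej klasy algorytm KMeans osiąga idealne wyniki. To podkreśla znaczenie doboru metryki oraz liczby klas w analizie klastrów. Uwaga: różnica między metryką euklidesową a L1 przy trzech klasach wynika tu z odmiennej numeracji klastrów — po dopasowaniu klastrów do klas obie metryki dają 133 poprawnie zgrupowane próbki ze 150 (ok. 88,7%), więc porównanie metryk powinno być wykonywane dopiero po takim dopasowaniu.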

# Load the iris CSV and keep the first four columns as the feature array
iris = pd.read_csv("iris.csv")
x = iris.iloc[:, :4].values

# Show the frame summary and the first ten rows
iris.info()
iris.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
5	5.4	3.9	1.7	0.4	setosa
6	4.6	3.4	1.4	0.3	setosa
7	5.0	3.4	1.5	0.2	setosa
8	4.4	2.9	1.4	0.2	setosa
9	4.9	3.1	1.5	0.1	setosa

# Count how many samples each species has; the single column is named "count"
species_column = iris["species"]
iris_outcome = pd.crosstab(index=species_column, columns="count")

iris_outcome

col_0	count
species
setosa	50
versicolor	50
virginica	50

# Per-species subsets. The species values in this CSV are "setosa",
# "versicolor" and "virginica" (no "Iris-" prefix), so the filters must use
# those exact strings; otherwise every subset is empty.
iris_setosa = iris.loc[iris["species"] == "setosa"]
iris_virginica = iris.loc[iris["species"] == "virginica"]
iris_versicolor = iris.loc[iris["species"] == "versicolor"]

# Per-species distribution of three measurements. sns.distplot is deprecated,
# so the axes-level histplot (density histogram with a KDE curve) is used.
for column in ("petal_length", "petal_width", "sepal_length"):
    grid = sns.FacetGrid(iris, hue="species")
    grid.map(sns.histplot, column, kde=True, stat="density").add_legend()
plt.show()

# Box plot of petal length per species
sns.boxplot(data=iris, x="species", y="petal_length", palette="Set2")
plt.show()

# Violin plot of petal length per species
sns.violinplot(data=iris, x="species", y="petal_length")
plt.show()

# Pairwise scatter plots coloured by species. `height` is the current name
# of the facet-size parameter; `size` is its deprecated alias.
sns.set_style("whitegrid")
sns.pairplot(iris, hue="species", height=3)
plt.show()

from sklearn.cluster import KMeans

# Fit KMeans for 1..10 clusters and record the within-cluster sum of squares
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters to look for the elbow
plt.plot(cluster_counts, wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within cluster sum of squares
plt.show()

kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(x)

# Visualise the clusters using the first two feature columns.
# NOTE(review): the species names are attached to cluster ids 0/1/2 as
# written; whether each id really corresponds to that species should be
# confirmed against the true labels.
cluster_styles = [
    (0, 'purple', 'Iris-setosa'),
    (1, 'orange', 'Iris-versicolour'),
    (2, 'green', 'Iris-virginica'),
]
for cluster_id, colour, name in cluster_styles:
    members = x[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=name)

# Plot the centroids of the clusters
centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], s=100, c='red', label='Centroids')

plt.legend()

<matplotlib.legend.Legend at 0x189f13ee3f0>

# 3d scatterplot using matplotlib

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')

# Draw on the 3-D axes with three feature columns; passing only two
# coordinates would place every point in a single plane.
# NOTE(review): the species names are attached to cluster ids 0/1/2 as
# written; confirm the correspondence against the true labels.
cluster_styles = [
    (0, 'purple', 'Iris-setosa'),
    (1, 'orange', 'Iris-versicolour'),
    (2, 'green', 'Iris-virginica'),
]
for cluster_id, colour, name in cluster_styles:
    members = x[y_kmeans == cluster_id]
    ax.scatter(members[:, 0], members[:, 1], members[:, 2], s=100, c=colour, label=name)

# Plot the centroids of the clusters
centres = kmeans.cluster_centers_
ax.scatter(centres[:, 0], centres[:, 1], centres[:, 2], s=100, c='red', label='Centroids')
ax.legend()
plt.show()

892 KiB Raw Blame History