1.8 MiB
1.8 MiB
Metryka euklidesowa z 3 klasami
iris = pd.read_csv("iris.csv")
x = iris.iloc[:, [0, 1, 2, 3]].values
iris.info()
iris[0:10]
iris_outcome = pd.crosstab(index=iris["species"], # Make a crosstab
columns="count") # Name the count column
iris_outcome
iris_setosa=iris.loc[iris["species"]=="Iris-setosa"]
iris_virginica=iris.loc[iris["species"]=="Iris-virginica"]
iris_versicolor=iris.loc[iris["species"]=="Iris-versicolor"]
sns.FacetGrid(iris,hue="species").map(sns.distplot,"petal_length").add_legend()
sns.FacetGrid(iris,hue="species").map(sns.distplot,"petal_width").add_legend()
sns.FacetGrid(iris,hue="species").map(sns.distplot,"sepal_length").add_legend()
plt.show()
sns.boxplot(x="species",y="petal_length",data=iris, palette="Set2")
plt.show()
sns.violinplot(x="species",y="petal_length",data=iris)
plt.show()
sns.set_style("whitegrid")
sns.pairplot(iris,hue="species",size=3);
plt.show()
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, metric='manhattan'init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
kmeans.fit(x)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') #within cluster sum of squares
plt.show()
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_kmeans = kmeans.fit_predict(x)
#Visualising the clusters
plt.scatter(x[y_kmeans == 0, 0], x[y_kmeans == 0, 1], s = 100, c = 'purple', label = 'Iris-setosa')
plt.scatter(x[y_kmeans == 1, 0], x[y_kmeans == 1, 1], s = 100, c = 'orange', label = 'Iris-versicolour')
plt.scatter(x[y_kmeans == 2, 0], x[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Iris-virginica')
#Plotting the centroids of the clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], s = 100, c = 'red', label = 'Centroids')
plt.legend()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal_length 150 non-null float64 1 sepal_width 150 non-null float64 2 petal_length 150 non-null float64 3 petal_width 150 non-null float64 4 species 150 non-null object dtypes: float64(4), object(1) memory usage: 6.0+ KB
<matplotlib.legend.Legend at 0x1b44dcc1f40>
Metryka euklidesowa z 2 klasami
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
# Load the Iris dataset
iris_data = load_iris()
iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
iris['species'] = iris_data.target
# Map target integers to class names
iris['species'] = iris['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
# Remove the 'virginica' class
iris = iris[iris['species'] != 'virginica']
# Liczba elementów w klasach
class_counts = iris['species'].value_counts()
print("Liczba elementów w klasach:\n", class_counts)
# Centroidy (actual centroids will be computed by KMeans later)
centroids = {
'setosa': [5.006, 3.418, 1.464, 0.244],
'versicolor': [5.936, 2.770, 4.260, 1.326]
}
print("\nCentroidy:\n", centroids)
# FacetGrid z histogramem
g = sns.FacetGrid(iris, hue="species", height=5)
g.map(sns.histplot, "petal length (cm)", bins=20, kde=True)
g.add_legend()
plt.title('Histogram petal length')
plt.show()
# Boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(x="species", y="petal length (cm)", data=iris, palette="Set2")
plt.title('Boxplot petal length')
plt.show()
# Violinplot
plt.figure(figsize=(8, 6))
sns.violinplot(x="species", y="petal length (cm)", data=iris)
plt.title('Violinplot petal length')
plt.show()
# Pairplot
sns.pairplot(iris, hue="species", height=3)
plt.suptitle('Pairplot of Features by Species', y=1.02)
plt.show()
# KMeans Elbow Method
X = iris.iloc[:, :4].values
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)
wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') # Within Cluster Sum of Squares
plt.show()
# KMeans Clustering
kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X)
# Wykres rozrzutu z klastrami
plt.figure(figsize=(10, 6))
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='purple', label='setosa')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='orange', label='versicolor')
# Plotting the centroids of the clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red', label='Centroids')
plt.title('Clusters and Centroids')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.legend()
plt.show()
Liczba elementów w klasach: species setosa 50 versicolor 50 Name: count, dtype: int64 Centroidy: {'setosa': [5.006, 3.418, 1.464, 0.244], 'versicolor': [5.936, 2.77, 4.26, 1.326]}
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
# Load the Iris dataset
iris_data = load_iris()
iris = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
iris['species'] = iris_data.target
# Map target integers to class names
iris['species'] = iris['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
# Liczba elementów w klasach
class_counts = iris['species'].value_counts()
print("Liczba elementów w klasach:\n", class_counts)
# FacetGrid z histogramem
g = sns.FacetGrid(iris, hue="species", height=5)
g.map(sns.histplot, "petal length (cm)", bins=20, kde=True, stat="density")
g.add_legend()
plt.title('Histogram petal length')
plt.show()
# Histogramy dla petal width i sepal length
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Histogram dla petal width
sns.histplot(data=iris, x="petal width (cm)", hue="species", bins=20, kde=True, stat="density", ax=axes[0])
axes[0].set_title('Histogram petal width')
# Histogram dla sepal length
sns.histplot(data=iris, x="sepal length (cm)", hue="species", bins=20, kde=True, stat="density", ax=axes[1])
axes[1].set_title('Histogram sepal length')
plt.tight_layout()
plt.show()
# Boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(x="species", y="petal length (cm)", data=iris, palette="Set2")
plt.title('Boxplot petal length')
plt.show()
# Violinplot
plt.figure(figsize=(8, 6))
sns.violinplot(x="species", y="petal length (cm)", data=iris)
plt.title('Violinplot petal length')
plt.show()
# Pairplot
sns.pairplot(iris, hue="species", height=3)
plt.suptitle('Pairplot of Features by Species', y=1.02)
plt.show()
# KMeans with L1 metric
X = iris.iloc[:, :4].values
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)
# KMeans with L1 distance (Manhattan)
def kmeans_l1(X, n_clusters, max_iter=300):
# Randomly initialize centroids
centroids = X[np.random.choice(X.shape[0], n_clusters, replace=False)]
for _ in range(max_iter):
# Compute distances using L1 metric
distances = np.abs(X[:, np.newaxis] - centroids).sum(axis=2)
# Assign clusters based on the closest centroid
labels = np.argmin(distances, axis=1)
# Update centroids
new_centroids = np.array([X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i] for i in range(n_clusters)])
if np.all(centroids == new_centroids):
break
centroids = new_centroids
return labels, centroids
# Execute KMeans with L1 distance
y_l1_kmeans, centroids_l1 = kmeans_l1(X, n_clusters=3)
# Wykres rozrzutu z klastrami
plt.figure(figsize=(10, 6))
plt.scatter(X[y_l1_kmeans == 0, 0], X[y_l1_kmeans == 0, 1], s=100, c='purple', label='setosa')
plt.scatter(X[y_l1_kmeans == 1, 0], X[y_l1_kmeans == 1, 1], s=100, c='orange', label='versicolor')
plt.scatter(X[y_l1_kmeans == 2, 0], X[y_l1_kmeans == 2, 1], s=100, c='green', label='virginica')
# Plotting the centroids of the clusters
plt.scatter(centroids_l1[:, 0], centroids_l1[:, 1], s=300, c='red', label='Centroids')
plt.title('Clusters and Centroids with L1 Metric')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.legend()
plt.show()
Liczba elementów w klasach: species setosa 50 versicolor 50 virginica 50 Name: count, dtype: int64