uczenie_maszynowe_zadania/cw_8/main.ipynb
2023-07-04 20:42:14 +02:00

907 KiB

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Formatowanie danych

data = pd.read_csv('flats_for_clustering.tsv',sep='\t')
data['Piętro'] = data['Piętro'].replace('parter','0')
data['Piętro'] = data['Piętro'].replace('niski parter','0')
data["Piętro"] = data["Piętro"].apply(pd.to_numeric, errors="coerce")
data = data.dropna()
scaler = StandardScaler()
scaler.fit(data)
scaled_data = scaler.transform(data)
scaled_data = pd.DataFrame(scaled_data)
scaled_data.head()
0 1 2 3 4
0 -0.388640 -0.499154 -0.618683 -0.193055 -0.986574
1 0.481205 0.089217 -0.618683 -0.737328 -0.263285
2 0.072479 0.994405 0.503142 2.800447 0.821648
3 0.208721 0.270255 0.503142 -1.009464 -0.263285
4 0.150328 0.089217 0.503142 1.984038 3.353159
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x7f0d6a5819d0>

Algorytm K średnich

kmeans = KMeans(n_clusters=5,init='k-means++')
kmeans = kmeans.fit(data)
kmeans.cluster_centers_
array([[5.34232145e+05, 7.85017921e+01, 3.25448029e+00, 4.91756272e+00,
        2.63082437e+00],
       [2.30703732e+05, 4.16620939e+01, 1.98700361e+00, 5.51841155e+00,
        2.55379061e+00],
       [1.61545162e+06, 1.51384615e+02, 4.03846154e+00, 2.69230769e+00,
        1.23076923e+00],
       [3.52262093e+05, 5.68337226e+01, 2.64128912e+00, 6.15226530e+00,
        2.90565156e+00],
       [9.08821504e+05, 1.11654135e+02, 3.74436090e+00, 4.48120301e+00,
        2.38345865e+00]])
data['Clusters'] = kmeans.labels_
data.head()
cena Powierzchnia w m2 Liczba pokoi Liczba pięter w budynku Piętro Clusters
0 290386 46 2 5.0 0.0 1
1 450000 59 2 3.0 2.0 0
2 375000 79 3 16.0 5.0 3
3 400000 63 3 2.0 2.0 3
4 389285 59 3 13.0 12.0 3
data['Clusters'].value_counts()
3    2141
1    1385
0     558
4     133
2      26
Name: Clusters, dtype: int64
sns.pairplot(data,hue='Clusters',palette='Spectral')
<seaborn.axisgrid.PairGrid at 0x7f0d7826b250>

Algorytm PCA

del data['Clusters']
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
x_pca = pd.DataFrame(x_pca)
x_pca.columns = ['X','Y']
x_pca.head()
X Y
0 -0.733842 -0.981792
1 0.118314 -0.727320
2 0.433828 2.663883
3 0.725795 -0.774859
4 -0.195982 3.810580
sns.scatterplot(x_pca,x='X',y='Y')
<AxesSubplot:xlabel='X', ylabel='Y'>