907 KiB
907 KiB
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Formatowanie danych
# Load the tab-separated flat listings.
data = pd.read_csv('flats_for_clustering.tsv', sep='\t')

# Both ground-floor labels become '0' so the floor column can be parsed as a
# number; any value that still fails to parse turns into NaN.
data['Piętro'] = pd.to_numeric(
    data['Piętro'].replace(['parter', 'niski parter'], '0'),
    errors="coerce",
)

# Discard every row that has a missing value in any column.
data = data.dropna()

# Standardise the columns (zero mean, unit variance by StandardScaler's
# defaults). Wrapping the result in a plain DataFrame yields default integer
# column labels and a fresh 0..n-1 index.
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit(data).transform(data))
scaled_data.head()
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | -0.388640 | -0.499154 | -0.618683 | -0.193055 | -0.986574 |
1 | 0.481205 | 0.089217 | -0.618683 | -0.737328 | -0.263285 |
2 | 0.072479 | 0.994405 | 0.503142 | 2.800447 | 0.821648 |
3 | 0.208721 | 0.270255 | 0.503142 | -1.009464 | -0.263285 |
4 | 0.150328 | 0.089217 | 0.503142 | 1.984038 | 3.353159 |
# Pairwise scatter-plot matrix of the columns of the cleaned (unscaled) data.
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x7f0d6a5819d0>
Algorytm k-średnich
# Partition the rows into five clusters, seeding the centroids with k-means++.
# NOTE(review): the model is fitted on the raw `data`, not on `scaled_data`, so
# the column with the largest numeric range presumably dominates the distances
# - confirm this is intended. No random_state is set, so cluster numbering can
# differ between runs.
kmeans = KMeans(init='k-means++', n_clusters=5).fit(data)
kmeans.cluster_centers_
array([[5.34232145e+05, 7.85017921e+01, 3.25448029e+00, 4.91756272e+00, 2.63082437e+00], [2.30703732e+05, 4.16620939e+01, 1.98700361e+00, 5.51841155e+00, 2.55379061e+00], [1.61545162e+06, 1.51384615e+02, 4.03846154e+00, 2.69230769e+00, 1.23076923e+00], [3.52262093e+05, 5.68337226e+01, 2.64128912e+00, 6.15226530e+00, 2.90565156e+00], [9.08821504e+05, 1.11654135e+02, 3.74436090e+00, 4.48120301e+00, 2.38345865e+00]])
# Store each row's k-means cluster label in a new column, then preview the result.
data['Clusters'] = kmeans.labels_
data.head()
cena | Powierzchnia w m2 | Liczba pokoi | Liczba pięter w budynku | Piętro | Clusters | |
---|---|---|---|---|---|---|
0 | 290386 | 46 | 2 | 5.0 | 0.0 | 1 |
1 | 450000 | 59 | 2 | 3.0 | 2.0 | 0 |
2 | 375000 | 79 | 3 | 16.0 | 5.0 | 3 |
3 | 400000 | 63 | 3 | 2.0 | 2.0 | 3 |
4 | 389285 | 59 | 3 | 13.0 | 12.0 | 3 |
# Number of rows assigned to each cluster label.
data['Clusters'].value_counts()
3 2141 1 1385 0 558 4 133 2 26 Name: Clusters, dtype: int64
# Pairwise scatter-plot matrix with points coloured by their cluster label.
sns.pairplot(data,hue='Clusters',palette='Spectral')
<seaborn.axisgrid.PairGrid at 0x7f0d7826b250>
Algorytm PCA
# Remove the cluster-label column from `data` again.
del data['Clusters']

# Fit a two-component PCA on the standardised features and project them onto
# those components; the two resulting columns are labelled 'X' and 'Y'.
pca = PCA(n_components=2).fit(scaled_data)
x_pca = pd.DataFrame(pca.transform(scaled_data), columns=['X', 'Y'])
x_pca.head()
X | Y | |
---|---|---|
0 | -0.733842 | -0.981792 |
1 | 0.118314 | -0.727320 |
2 | 0.433828 | 2.663883 |
3 | 0.725795 | -0.774859 |
4 | -0.195982 | 3.810580 |
# Scatter plot of the two principal components. The DataFrame is passed via the
# `data` keyword because only newer seaborn releases accept it as the first
# positional argument; the keyword form works across versions.
sns.scatterplot(data=x_pca, x='X', y='Y')
<AxesSubplot:xlabel='X', ylabel='Y'>