"""Cluster flats with k-means and plot the clusters in a 2-D PCA projection.

The script reads ``flats_for_clustering.tsv``, cleans the floor ("Piętro")
and price ("cena") columns, drops outliers and incomplete rows, standardises
the remaining columns, assigns every flat to one of five k-means clusters and
shows a scatter plot of the first two principal components coloured by
cluster.
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data = pd.read_csv("flats_for_clustering.tsv", header=0, sep="\t")

# Ground-floor labels are textual; map them to floor number 0 so that they
# survive the numeric conversion below instead of being coerced to NaN.
ground_floor = data["Piętro"].isin(["parter", "niski parter"])
data["Piętro"] = data["Piętro"].mask(ground_floor, 0)

# Any floor value that still is not a number becomes NaN (errors="coerce")
# and is removed by dropna() further down.
data["Piętro"] = pd.to_numeric(data["Piętro"], errors="coerce")

# A price of 0 is treated as a missing value.  `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
data["cena"] = data["cena"].mask(data["cena"] == 0, np.nan)

# Get rid of outliers: keep only flats smaller than 500 m2.
data = data.loc[data["Powierzchnia w m2"] < 500]

# Drop every row that has a missing value in any column.
data = data.dropna()

# NOTE(review): the scaler is fitted on every column of `data`; this assumes
# all remaining columns are numeric -- confirm against the TSV schema.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# NOTE(review): no random_state is passed, so the cluster labels (and hence
# the plot colours) may differ from run to run.
kmeans = KMeans(n_clusters=5)
clusters = kmeans.fit_predict(scaled_data)

# Project the standardised data onto two principal components for plotting.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters)
plt.title('Wizualizacja klastrów po zastosowaniu algorytmu PCA')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()