#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fit a linear regression of flat prices on the numeric listing features.

The script loads ``mieszkania.csv``, plots the raw data so outliers can be
spotted by eye, filters the outliers out, fits a ``LinearRegression`` on a
70 % training split, and reports RMSE and R squared on the held-out 30 %.
"""

import numpy as np
import pandas as pd
import sklearn
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

dane = pd.read_csv("mieszkania.csv")
print(dane.head())
print(dane.columns)

# Check the data for outliers: price against living area.
plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
plt.show()
# Decision from the plot: keep only flats with an expected price of at most
# 500,000 and a living area of at most 200 square meters.

# Check the data for outliers: price against number of rooms.
plt.scatter(dane['Rooms'], dane['Expected'], color='g')
plt.show()
# Decision from the plot: keep only flats with fewer than 10 rooms.
# NOTE(review): the original comment said "more than 8 rooms" are removed,
# but the filter below has always been `Rooms < 10`; confirm which threshold
# is intended. The code is left unchanged.

flats = dane[
    (dane['Rooms'] < 10)
    & (dane['SqrMeters'] <= 200)
    & (dane['Expected'] <= 500000)
]
print(flats.head(20))

# Target is the expected price; features are whatever columns remain after
# dropping the identifier, the target itself and the listed columns.
y = flats['Expected']
X = flats.drop(
    ['Id', 'Expected', 'Floor', 'Location', 'Description',
     'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'],
    axis=1,
)
print(y.head())
print(X.head())

train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=38, shuffle=True
)

model = LinearRegression()
# Fit on the training split only. Fitting on the full (X, y) would let the
# model see the test rows and make the metrics below meaningless as a
# held-out evaluation.
model.fit(train_X, train_y)

predicted = model.predict(test_X)
print("Predictions:", predicted[:5])

# model.coef_ holds one slope per feature column (not intercepts).
for column, coefficient in zip(train_X.columns, model.coef_):
    print("Coefficient for {}: {:.3}".format(column, coefficient))

# scikit-learn metrics take (y_true, y_pred) in that order.
rmse = np.sqrt(mean_squared_error(test_y, predicted))
print("RMSE:", rmse)

r2 = model.score(test_X, test_y)
print("R squared:", r2)
# Historical note: R squared was reported as 0.54 after cleaning versus 0.02
# before. Those figures were measured while the model was fitted on the full
# dataset, so they should be re-measured with the train-only fit.