forked from tdwojak/Python2017
87 lines
2.6 KiB
Python
87 lines
2.6 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""*(dodatkowe)*: Korzystając z pakietu *sklearn* zbuduj model regresji liniowej,
|
|
która będzie wyznaczać cenę mieszkania na podstawie wielkości mieszkania i liczby pokoi."""
|
|
|
|
#import bibliotek
|
|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import sklearn
|
|
from sklearn import linear_model
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import mean_squared_error
|
|
import math
|
|
|
|
# Load the apartment listings from the CSV file in the working directory.
# read_csv already returns a DataFrame, so no further conversion is needed.
dane = pd.read_csv('mieszkania.csv', sep=',', encoding='utf-8')
|
|
|
|
# Correlation analysis.
# numeric_only=True (pandas >= 1.5) restricts the matrix to numeric columns;
# since pandas 2.0 corr() raises instead of silently skipping text columns.
# NOTE(review): the CSV schema is not visible from this script -- this guards
# against non-numeric columns being present; confirm against the data file.
print(dane.corr(numeric_only=True))
# Values recorded by the original author for their data set:
#   weak positive correlation between SqrMeters and Expected = 0.109640
#   weak positive correlation between Rooms and Expected     = 0.081177
# The scatter plot below illustrates the weak correlation.
dane.plot.scatter(x='SqrMeters', y='Expected')
plt.show()
|
|
|
|
# Data preparation
# X - independent variables (apartment size and number of rooms)
# Y - dependent variable (the 'Expected' column)
# Selecting with a list of column names already yields a DataFrame, so it is
# not wrapped in pd.DataFrame again.
X = dane[['SqrMeters', 'Rooms']]
Y = dane['Expected']

# Split the data into a training set and a test set: 20% of the rows are held
# out for testing, and random_state pins the shuffle so the split is
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
|
|
|
|
# Fit a LinearRegression model on the training split only
# (fit() returns the estimator itself, so the call can be chained).
lm = linear_model.LinearRegression().fit(X_train, Y_train)

# Report one coefficient per feature column, in column order.
for col_name, coef in zip(X_train.columns, lm.coef_):
    print("The coefficient for {} is {}".format(col_name, coef))

# Report the intercept of the fitted model.
intercept = lm.intercept_
print("The intercept for our model is {}".format(intercept))

# Fit recorded by the original author (from an earlier run, not recomputed):
#   Expected = 123969.05 - 34261.86 * X1 + 5299.47 * X2
# NOTE(review): X1 appears to be Rooms and X2 SqrMeters -- that assignment is
# the one consistent with the 259659.47 prediction noted for 45 m2 / 3 rooms
# at the end of the script.

# R^2 is the proportion of the variability in Y explained by X; it is
# evaluated here on the held-out test split.
# The original author noted that about 0.78% of the variability in Y is
# explained by X (value from their run -- TODO confirm).
print(lm.score(X_test, Y_test))
|
|
|
|
# Prediction: compare the model's output for the test set (data not used for
# training) with the ground truth for that same test set.
y_predict = lm.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric
# so the value is unchanged, but the documented order is used.
lm_mse = mean_squared_error(Y_test, y_predict)

# The first number printed is the mean squared error; the second is its square
# root (RMSE), which is in the same units as the price.
# The original author recorded being on average 1148825.67 away from the
# ground truth on the test set (presumably the RMSE of their run).
print(lm_mse)
print(math.sqrt(lm_mse))
|
|
|
|
print('--------')
# Plot the model against the test data: actual prices as black points and
# predicted prices as a blue line, both over apartment size.
# Each artist is drawn once (the original issued the same two calls twice).
# NOTE(review): X_test is not sorted by SqrMeters and the predictions also
# depend on Rooms, so the blue line joins points in row order -- consider
# sorting by SqrMeters or drawing predictions as points for a cleaner figure.
plt.scatter(X_test['SqrMeters'], Y_test, color='black')
plt.plot(X_test['SqrMeters'], y_predict, color='blue', linewidth=2)

# Hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())

plt.show()
|
|
|
|
# Prediction using made-up data:
#   SqrMeters: 45
#   Rooms: 3
# The model was fitted on a DataFrame, so the sample is passed as a DataFrame
# with the same column names and order; a bare nested list triggers the
# "X does not have valid feature names" warning on scikit-learn >= 1.0.
print(lm.predict(pd.DataFrame([[45, 3]], columns=X_train.columns)))
# Price recorded by the original author for this input: 259659.47