1
0
forked from tdwojak/Python2017

task02 **dodatkowe (labs06) done

This commit is contained in:
Ewelina 2017-12-27 12:55:29 +01:00
parent 8424251b3c
commit c440fa5c25

87
labs06/model.py Normal file
View File

@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""*(dodatkowe)*: Korzystając z pakietu *sklearn* zbuduj model regresji liniowej,
która będzie wyznaczać cenę mieszkania na podstawie wielkości mieszkania i liczby pokoi."""
#import bibliotek
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
# Load the apartment listings from the CSV file in the working directory.
dane = pd.read_csv('mieszkania.csv', sep=',', encoding='utf-8')
# Correlation analysis.
# Only numeric/bool columns are passed to corr(): pandas >= 2.0 raises on
# non-numeric columns instead of silently dropping them, so the explicit
# selection keeps the script working across pandas versions.
print(dane.select_dtypes(include=['number', 'bool']).corr())
# Values recorded from the author's run:
#   small positive correlation between SqrMeters and Expected = 0.109640
#   small positive correlation between Rooms and Expected = 0.081177
# The scatter plot below illustrates how weak the relationship is.
dane.plot.scatter(x='SqrMeters', y='Expected')
plt.show()
# Data preparation.
# X - independent variables (features): apartment size and number of rooms.
# Y - dependent variable (target): the expected price.
X = dane[['SqrMeters', 'Rooms']]
Y = dane['Expected']
# Split the data into a training set (80%) and a test set (20%);
# random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# Train the LinearRegression model on the training set only.
lm = linear_model.LinearRegression()
lm.fit(X_train, Y_train)
# Coefficients of the model, paired with the feature column they belong to
# (lm.coef_ follows the column order of X_train).
for col_name, coef in zip(X_train.columns, lm.coef_):
    print("The coefficient for {} is {}".format(col_name, coef))
# Intercept of the model.
intercept = lm.intercept_
print("The intercept for our model is {}".format(intercept))
# Fitted model recorded from the author's run:
#   Expected = 123969.05 + 5299.47 * SqrMeters - 34261.86 * Rooms
# NOTE(review): the original note wrote the terms as X1/X2 without naming the
# features; this assignment is the one that reproduces the sample prediction
# for 45 m^2 / 3 rooms (~259659) recorded at the end of the script.
# R^2: the proportion of variability in Y that is explained by X, computed
# here on the held-out test set via lm.score.
# NOTE(review): the author recorded that about 0.78% of the variability in Y
# is explained by X - confirm against the printed score when re-running.
print(lm.score(X_test, Y_test))
# Prediction.
# Compare the predictions for the test set (data not used for training) with
# the ground truth for the same rows.
y_predict = lm.predict(X_test)
# scikit-learn's signature is mean_squared_error(y_true, y_pred); the value is
# the same either way, but the documented order is used to avoid confusion.
lm_mse = mean_squared_error(Y_test, y_predict)
# Mean squared error (in squared price units).
print(lm_mse)
# Root mean squared error (in price units), i.e. the typical distance from
# the ground truth.
# NOTE(review): the author's recorded figure of 1148825.67 "away from the
# ground truth" is presumably this RMSE rather than the MSE - confirm.
print(math.sqrt(lm_mse))
print('--------')
# Plot how the model's predictions compare with the test data.
# The predicted points are ordered by SqrMeters before being joined, so the
# line runs left to right instead of jumping between unsorted x values.
# The predictions also depend on Rooms, so the line need not be straight.
order = np.argsort(X_test['SqrMeters'].values)
sqr_meters_sorted = X_test['SqrMeters'].values[order]
plt.scatter(X_test['SqrMeters'], Y_test, color='black')
plt.plot(sqr_meters_sorted, y_predict[order], color='blue', linewidth=2)
# Hide the axis ticks.
plt.xticks(())
plt.yticks(())
plt.show()
# Prediction for a made-up apartment: SqrMeters = 45, Rooms = 3.
# The sample is passed as a DataFrame with the same column names the model
# was fitted on, so the features are matched explicitly by name and
# scikit-learn does not warn about missing feature names.
sample = pd.DataFrame([[45, 3]], columns=['SqrMeters', 'Rooms'])
print(lm.predict(sample))
# Expected price recorded from the author's run: 259659.47