#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""*(optional)*: Using the *sklearn* package, build a linear regression model
that estimates the price of a flat from its size and its number of rooms.

The script reads ``mieszkania.csv`` from the current working directory, prints
a correlation table, fits ``LinearRegression`` on an 80/20 train/test split
(``random_state=1``), prints the coefficients, the intercept, R^2, MSE and
RMSE, shows two plots and finally predicts the price for one made-up flat.
"""

# library imports
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# load the data (read_csv already returns a DataFrame, no re-wrapping needed)
dane = pd.read_csv('mieszkania.csv', sep=',', encoding='utf-8')

# correlation analysis
# Only numeric/bool columns are correlated. Older pandas dropped other columns
# silently; pandas >= 2.0 raises on them instead, so the selection is made
# explicit here. NOTE(review): whether the CSV really contains text columns is
# not visible from this script.
print(dane.select_dtypes(include=['number', 'bool']).corr())
# recorded run: weak positive correlation between SqrMeters and Expected = 0.109640
# recorded run: weak positive correlation between Rooms and Expected = 0.081177
# the scatter plot illustrates the low correlation
dane.plot.scatter(x='SqrMeters', y='Expected')
plt.show()

# data preparation
# X - independent variables
# Y - dependent variable
X = dane[['SqrMeters', 'Rooms']]
Y = dane['Expected']

# split the data into a training set (80%) and a test set (20%)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=1
)

# train the LinearRegression model on the training set
lm = linear_model.LinearRegression()
lm.fit(X_train, Y_train)

# coefficients of the model, one per feature column
for col_name, coef in zip(X_train.columns, lm.coef_):
    print("The coefficient for {} is {}".format(col_name, coef))

# intercept of the model
intercept = lm.intercept_
print("The intercept for our model is {}".format(intercept))

# linear model (recorded run):
#   Expected = 123969.05 + 5299.47 * SqrMeters - 34261.86 * Rooms
# NOTE(review): the original comment attached -34261.86 to X1 and 5299.47 to
# X2. With X = [SqrMeters, Rooms] that pairing cannot reproduce the 259659.47
# recorded below for (SqrMeters=45, Rooms=3); the pairing above gives
# ~259659.62 (difference explained by rounding). Confirm against a fresh run.

# R^2: proportion of the variability in Y that is explained by X in the model
# recorded run: only about 0.78% of the variability in Y is explained by X
print(lm.score(X_test, Y_test))

# prediction
# compare the prediction for the test set (data not used for training) with
# the ground truth of the test set
y_predict = lm.predict(X_test)

# documented argument order is (y_true, y_pred)
lm_mse = mean_squared_error(Y_test, y_predict)

# The MSE is in squared price units; its square root (RMSE) is in the units of
# Expected and is the value to read as the typical distance from the truth.
# recorded run: 1148825.67 -- NOTE(review): the original comment did not say
# which of the two printed values this figure refers to.
print(lm_mse)
print(math.sqrt(lm_mse))

print('--------')
# plot of the model against the test data
# The prediction line is drawn in increasing SqrMeters order; plotting in the
# (shuffled) test-split order would draw a line that jumps back and forth.
# The model also depends on Rooms, so the line need not be straight.
order = np.argsort(X_test['SqrMeters'].values)
plt.scatter(X_test['SqrMeters'], Y_test, color='black')
plt.plot(
    X_test['SqrMeters'].values[order],
    y_predict[order],
    color='blue',
    linewidth=2,
)

# hide the axis ticks
plt.xticks(())
plt.yticks(())

plt.show()

# prediction using made-up data
# SqrMeters: 45
# Rooms: 3
# Passed as a DataFrame with the training column names so the values are
# matched to features by name (a bare list triggers a feature-name warning in
# scikit-learn >= 1.0 for a model fitted on a DataFrame).
print(lm.predict(pd.DataFrame([[45, 3]], columns=X_train.columns)))
# recorded run, expected price: 259659.47