From 325549ab4a37f4640e9812bd375fb80cce3f14e5 Mon Sep 17 00:00:00 2001
From: "wagner.agnieszka"
Date: Sat, 23 Jun 2018 01:00:53 +0200
Subject: [PATCH] passed

---
Review note: the added regression script now fits on the training split only
(was model.fit(X,y)), labels model.coef_ values as coefficients, and has its
filter comments aligned with the code. Hunk sizes are unchanged; the post-image
blob ids on the index lines (b0e7783, ee61315) were not regenerated.

 labs06/linearModel.py | 56 +++++++++++++++++++++++++++++++++++++++++++
 labs06/task02.py      | 56 +++++++++++++++++++++++++++++++++++++++++++
 labs06/tasks.py       |  4 ++--
 3 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 labs06/linearModel.py

diff --git a/labs06/linearModel.py b/labs06/linearModel.py
new file mode 100644
index 0000000..b0e7783
--- /dev/null
+++ b/labs06/linearModel.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sklearn
+import pandas as pd
+import numpy as np
+
+dane = pd.read_csv("mieszkania.csv")
+print(dane.head())
+print(dane.columns)
+
+# check data for outliers
+from matplotlib import pyplot as plt
+plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
+plt.show()
+# keep only flats with expected price <= 500,000 and living area <= 200 square meters
+plt.scatter(dane['Rooms'], dane['Expected'], color='g')
+plt.show()
+# keep only flats with fewer than 10 rooms (all three limits are applied in the filter below)
+
+flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
+print(flats.head(20))
+
+y = flats['Expected']
+X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
+                'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
+print(y.head())
+print(X.head())
+
+
+from sklearn.model_selection import train_test_split
+
+train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)
+
+from sklearn.linear_model import LinearRegression
+model = LinearRegression()
+model.fit(train_X, train_y)  # fit on the training split only, so the test rows stay unseen
+
+
+predicted = model.predict(test_X)
+print("Predictions:", predicted[:5])
+
+for p in zip(train_X.columns, model.coef_):
+    print("Coefficient for {}: {:.3}".format(p[0], p[1]))  # model.coef_ holds per-feature slopes
+
+from sklearn.metrics import mean_squared_error
+rmse = np.sqrt(mean_squared_error(test_y, predicted))  # documented argument order: (y_true, y_pred)
+print("RMSE:", rmse)
+
+r2 = model.score(test_X, test_y)
+
+print("R squared:", r2)  # NOTE: the earlier 0.54 (vs 0.02 before cleaning) was measured with the model fit on all rows; re-measure
+
+
+
+
diff --git a/labs06/task02.py b/labs06/task02.py
index 5210c0d..ee61315 100755
--- a/labs06/task02.py
+++ b/labs06/task02.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
 import pandas as pd
 import matplotlib.pyplot as plt
 
@@ -106,3 +107,58 @@ def main():
 
 if __name__ == "__main__":
     main()
+
+
+
+
+# extra task
+
+import sklearn
+import pandas as pd
+import numpy as np
+
+dane = pd.read_csv("mieszkania.csv")
+print(dane.head())
+print(dane.columns)
+
+# check data for outliers
+from matplotlib import pyplot as plt
+plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
+plt.show()
+# keep only flats with expected price <= 500,000 and living area <= 200 square meters
+plt.scatter(dane['Rooms'], dane['Expected'], color='g')
+plt.show()
+# keep only flats with fewer than 10 rooms (all three limits are applied in the filter below)
+
+flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
+print(flats.head(20))
+
+y = flats['Expected']
+X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
+                'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
+print(y.head())
+print(X.head())
+
+
+from sklearn.model_selection import train_test_split
+
+train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)
+
+from sklearn.linear_model import LinearRegression
+model = LinearRegression()
+model.fit(train_X, train_y)  # fit on the training split only, so the test rows stay unseen
+
+
+predicted = model.predict(test_X)
+print("Predictions:", predicted[:5])
+
+for p in zip(train_X.columns, model.coef_):
+    print("Coefficient for {}: {:.3}".format(p[0], p[1]))  # model.coef_ holds per-feature slopes
+
+from sklearn.metrics import mean_squared_error
+rmse = np.sqrt(mean_squared_error(test_y, predicted))  # documented argument order: (y_true, y_pred)
+print("RMSE:", rmse)
+
+r2 = model.score(test_X, test_y)
+
+print("R squared:", r2)  # NOTE: the earlier 0.54 (vs 0.02 before cleaning) was measured with the model fit on all rows; re-measure
\ No newline at end of file
diff --git a/labs06/tasks.py b/labs06/tasks.py
index 7725734..1eaa598 100755
--- a/labs06/tasks.py
+++ b/labs06/tasks.py
@@ -7,10 +7,10 @@ import pandas as pd
 
 
 """
-2. Wczytaj zbiór danych `311.csv` do zniennej data.
+2. Wczytaj zbiór danych `311.csv` do zmiennej data.
 """
 
-data = pd.read_csv("labs06/311.csv")
+data = pd.read_csv("311.csv", low_memory=False)
 
 """
 3. Wyświetl 5 pierwszych wierszy z data.