From 325549ab4a37f4640e9812bd375fb80cce3f14e5 Mon Sep 17 00:00:00 2001
From: "wagner.agnieszka" <wagner.agnieszka@gmail.com>
Date: Sat, 23 Jun 2018 01:00:53 +0200
Subject: [PATCH] passed

---
 labs06/linearModel.py | 56 +++++++++++++++++++++++++++++++++++++++++++
 labs06/task02.py      | 56 +++++++++++++++++++++++++++++++++++++++++++
 labs06/tasks.py       |  4 ++--
 3 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 labs06/linearModel.py

diff --git a/labs06/linearModel.py b/labs06/linearModel.py
new file mode 100644
index 0000000..b0e7783
--- /dev/null
+++ b/labs06/linearModel.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Fit a linear regression of flat prices ('Expected') on mieszkania.csv."""
+
+import sklearn
+import pandas as pd
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+dane = pd.read_csv("mieszkania.csv")
+print(dane.head())
+print(dane.columns)
+
+# Check the data for outliers: plot price against living area ...
+plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
+plt.show()
+# ... and price against the number of rooms.
+plt.scatter(dane['Rooms'], dane['Expected'], color='g')
+plt.show()
+
+# Keep only flats with fewer than 10 rooms, at most 200 square meters of
+# living area and an expected price of at most 500,000; everything else is
+# treated as an outlier and removed.
+flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
+print(flats.head(20))
+
+# The target is the expected price; every column listed below is excluded
+# from the feature matrix.
+y = flats['Expected']
+X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
+               'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
+print(y.head())
+print(X.head())
+
+train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)
+
+# Fit on the training split only, so the test split stays unseen and the
+# metrics below measure generalization instead of rows the model was fitted on.
+model = LinearRegression()
+model.fit(train_X, train_y)
+
+predicted = model.predict(test_X)
+print("Predictions:", predicted[:5])
+
+for name, coef in zip(train_X.columns, model.coef_):
+    print("Coefficient for {}: {:.3}".format(name, coef))
+
+# scikit-learn metrics take (y_true, y_pred) in that order.
+rmse = np.sqrt(mean_squared_error(test_y, predicted))
+print("RMSE:", rmse)
+
+r2 = model.score(test_X, test_y)
+print("R squared:", r2)  # NOTE: 0.54 (vs 0.02 before cleaning) was measured with the model fitted on all rows; re-measure
diff --git a/labs06/task02.py b/labs06/task02.py
index 5210c0d..ee61315 100755
--- a/labs06/task02.py
+++ b/labs06/task02.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
 import pandas as pd
 import matplotlib.pyplot as plt
 
@@ -106,3 +107,58 @@ def main():
 
 if __name__ == "__main__":
     main()
+
+
+
+
+# Extra task: linear regression of flat prices ('Expected') on mieszkania.csv.
+# NOTE(review): this also runs on import, because it sits outside the __main__ guard.
+import sklearn
+import pandas as pd
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import train_test_split
+
+dane = pd.read_csv("mieszkania.csv")
+print(dane.head())
+print(dane.columns)
+
+# Check the data for outliers: plot price against living area ...
+plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
+plt.show()
+# ... and price against the number of rooms.
+plt.scatter(dane['Rooms'], dane['Expected'], color='g')
+plt.show()
+
+# Keep only flats with fewer than 10 rooms, at most 200 square meters of living
+# area and an expected price of at most 500,000; the rest is removed as outliers.
+flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
+print(flats.head(20))
+
+# The target is the expected price; every column listed below is excluded from the features.
+y = flats['Expected']
+X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
+               'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
+print(y.head())
+print(X.head())
+
+train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)
+
+# Fit on the training split only, so the test split stays unseen by the model.
+model = LinearRegression()
+model.fit(train_X, train_y)
+
+predicted = model.predict(test_X)
+print("Predictions:", predicted[:5])
+
+for name, coef in zip(train_X.columns, model.coef_):
+    print("Coefficient for {}: {:.3}".format(name, coef))
+
+# scikit-learn metrics take (y_true, y_pred) in that order.
+rmse = np.sqrt(mean_squared_error(test_y, predicted))
+print("RMSE:", rmse)
+
+r2 = model.score(test_X, test_y)
+print("R squared:", r2)  # NOTE: 0.54 (vs 0.02 before cleaning) was measured with the model fitted on all rows; re-measure
\ No newline at end of file
diff --git a/labs06/tasks.py b/labs06/tasks.py
index 7725734..1eaa598 100755
--- a/labs06/tasks.py
+++ b/labs06/tasks.py
@@ -7,10 +7,10 @@
 import pandas as pd
 
 """
-2. Wczytaj zbiór danych `311.csv` do zniennej data.
+2. Wczytaj zbiór danych `311.csv` do zmiennej data.
 """
 
-data = pd.read_csv("labs06/311.csv")
+data = pd.read_csv("311.csv", low_memory=False)
 
 """
 3. Wyświetl 5 pierwszych wierszy z data.