forked from tdwojak/Python2018
passed
This commit is contained in:
parent
49fe26305c
commit
325549ab4a
56
labs06/linearModel.py
Normal file
56
labs06/linearModel.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sklearn
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
dane = pd.read_csv("mieszkania.csv")
|
||||||
|
print(dane.head())
|
||||||
|
print(dane.columns)
|
||||||
|
|
||||||
|
# check data for outliers
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
|
||||||
|
plt.show()
|
||||||
|
# keep only data points with expected price <= 500,000 and living area <= 200 square meters (everything above is treated as an outlier)
|
||||||
|
plt.scatter(dane['Rooms'], dane['Expected'], color='g')
|
||||||
|
plt.show()
|
||||||
|
# remove all data points that represent flats with 10 or more rooms (the filter below keeps Rooms < 10)
|
||||||
|
|
||||||
|
flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
|
||||||
|
print(flats.head(20))
|
||||||
|
|
||||||
|
y = flats['Expected']
|
||||||
|
X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
|
||||||
|
'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
|
||||||
|
print(y.head())
|
||||||
|
print(X.head())
|
||||||
|
|
||||||
|
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)
|
||||||
|
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
model = LinearRegression()
|
||||||
|
# Fit on the training split only: fitting on the full X, y would leak the test rows into the model and inflate the RMSE / R squared computed on test_X below.
model.fit(train_X, train_y)
|
||||||
|
|
||||||
|
|
||||||
|
predicted = model.predict(test_X)
|
||||||
|
print("Predictions:", predicted[:5])
|
||||||
|
|
||||||
|
for p in zip(train_X.columns, model.coef_):
|
||||||
|
# model.coef_ holds one fitted coefficient per feature column; the intercept is a separate value (model.intercept_).
print("Coefficient for {}: {:.3}".format(p[0], p[1]))
|
||||||
|
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
rmse = np.sqrt(mean_squared_error(predicted, test_y))
|
||||||
|
print("RMSE:", rmse)
|
||||||
|
|
||||||
|
r2 = model.score(test_X, test_y)
|
||||||
|
|
||||||
|
print("R squared:", r2) # 0.54 comparing to 0.02 before cleaning the data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
@ -106,3 +107,58 @@ def main():
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# zadanie dodatkowe
|
||||||
|
|
||||||
|
import sklearn
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
dane = pd.read_csv("mieszkania.csv")
|
||||||
|
print(dane.head())
|
||||||
|
print(dane.columns)
|
||||||
|
|
||||||
|
# check data for outliers
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
|
||||||
|
plt.show()
|
||||||
|
# keep only data points with expected price <= 500,000 and living area <= 200 square meters (everything above is treated as an outlier)
|
||||||
|
plt.scatter(dane['Rooms'], dane['Expected'], color='g')
|
||||||
|
plt.show()
|
||||||
|
# remove all data points that represent flats with 10 or more rooms (the filter below keeps Rooms < 10)
|
||||||
|
|
||||||
|
flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
|
||||||
|
print(flats.head(20))
|
||||||
|
|
||||||
|
y = flats['Expected']
|
||||||
|
X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
|
||||||
|
'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
|
||||||
|
print(y.head())
|
||||||
|
print(X.head())
|
||||||
|
|
||||||
|
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)
|
||||||
|
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
model = LinearRegression()
|
||||||
|
# Fit on the training split only: fitting on the full X, y would leak the test rows into the model and inflate the RMSE / R squared computed on test_X below.
model.fit(train_X, train_y)
|
||||||
|
|
||||||
|
|
||||||
|
predicted = model.predict(test_X)
|
||||||
|
print("Predictions:", predicted[:5])
|
||||||
|
|
||||||
|
for p in zip(train_X.columns, model.coef_):
|
||||||
|
# model.coef_ holds one fitted coefficient per feature column; the intercept is a separate value (model.intercept_).
print("Coefficient for {}: {:.3}".format(p[0], p[1]))
|
||||||
|
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
rmse = np.sqrt(mean_squared_error(predicted, test_y))
|
||||||
|
print("RMSE:", rmse)
|
||||||
|
|
||||||
|
r2 = model.score(test_X, test_y)
|
||||||
|
|
||||||
|
print("R squared:", r2) # 0.54 comparing to 0.02 before cleaning the data
|
@ -7,10 +7,10 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
"""
|
"""
|
||||||
2. Wczytaj zbiór danych `311.csv` do zniennej data.
|
2. Wczytaj zbiór danych `311.csv` do zmiennej data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
data = pd.read_csv("labs06/311.csv")
|
data = pd.read_csv("311.csv", low_memory=False)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
3. Wyświetl 5 pierwszych wierszy z data.
|
3. Wyświetl 5 pierwszych wierszy z data.
|
||||||
|
Loading…
Reference in New Issue
Block a user