forked from tdwojak/Python2018
164 lines
4.1 KiB
Python
Executable File
164 lines
4.1 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
def wczytaj_dane():
    """Load the flat listings from ``mieszkania.csv``.

    The file is looked up relative to the current working directory.
    The first rows of the loaded table are printed as a quick preview.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    frame = pd.read_csv("mieszkania.csv")
    preview = frame.head()
    print(preview)
    return frame
|
|
|
|
def most_common_room_number(dane):
    """Return the value that occurs most often in the ``Rooms`` column.

    Args:
        dane: DataFrame with a ``Rooms`` column.

    Returns:
        The ``Rooms`` value with the highest occurrence count.
    """
    room_frequencies = dane['Rooms'].value_counts()
    return room_frequencies.idxmax()
|
|
|
|
|
|
|
|
def cheapest_flats(dane, n):
    """Return the ``n`` cheapest flats.

    Args:
        dane: DataFrame with an ``Expected`` (price) column.
        n: Number of rows to return.

    Returns:
        A DataFrame with at most ``n`` rows, ordered from the lowest to the
        highest ``Expected`` price.
    """
    # Ascending order puts the cheapest offers first; the caller-supplied
    # ``n`` (not a fixed number) decides how many of them are returned.
    return dane.sort_values('Expected', ascending=True).head(n)
|
|
|
|
def find_borough(desc):
    """Return the name of a known borough contained in ``desc``.

    Args:
        desc: Text to search (substring match, case-sensitive).

    Returns:
        The matching borough name, or ``"Inne"`` when none of the known
        names occurs in ``desc``. When several names occur, the one that
        comes last in the list below is returned.
    """
    known_boroughs = (
        'Stare Miasto',
        'Wilda',
        'Jeżyce',
        'Rataje',
        'Piątkowo',
        'Winogrady',
        'Miłostowo',
        'Dębiec',
        'Grunwald',
        'Nowe Miasto',
    )

    hits = [name for name in known_boroughs if name in desc]
    if not hits:
        return "Inne"
    # Later list entries take precedence over earlier ones.
    return hits[-1]
|
|
|
|
|
|
|
|
def add_borough(dane):
    """Add a ``Borough`` column derived from the ``Location`` column.

    Each location is searched for one of the known borough names
    (substring match). When several names occur, the one that comes last
    in the list below is used; when none occurs, or the location is not a
    string (e.g. a missing value), the borough is ``"Inne"``.

    The column is added to ``dane`` in place, the resulting table is
    printed, and the same DataFrame is returned.

    Args:
        dane: DataFrame with a ``Location`` column.

    Returns:
        ``dane`` with the additional ``Borough`` column.
    """
    dzielnice = (
        'Stare Miasto',
        'Wilda',
        'Jeżyce',
        'Rataje',
        'Piątkowo',
        'Winogrady',
        'Miłostowo',
        'Dębiec',
        'Grunwald',
        'Nowe Miasto',
    )

    boroughs = []
    for location in dane['Location']:
        matched = "Inne"
        # Missing locations are read as float NaN; ``in`` would raise on them.
        if isinstance(location, str):
            for dzielnica in dzielnice:
                if dzielnica in location:
                    matched = dzielnica
        boroughs.append(matched)

    # Assigning a plain list is positional, so the frame's index is kept
    # as-is, and the column gets the name that write_plot() groups by.
    dane['Borough'] = boroughs
    print(dane)
    return dane
|
|
|
|
def write_plot(dane, filename):
    """Plot the number of flats per borough and save the chart to a file.

    Args:
        dane: DataFrame with ``Borough`` and ``Id`` columns.
        filename: Path of the image file to write.
    """
    # Number of distinct offer ids in every borough.
    flats_per_borough = dane.groupby('Borough')['Id'].nunique()
    axes = flats_per_borough.plot(kind='bar')

    # Save through the figure that owns the axes, and do it before show():
    # once the window shown by show() is closed, a later savefig() would
    # typically write an empty image.
    axes.get_figure().savefig(filename)
    plt.show()
|
|
|
|
def mean_price(dane, room_number):
    """Return the mean ``Expected`` price of flats with ``room_number`` rooms.

    Args:
        dane: DataFrame with ``Rooms`` and ``Expected`` columns.
        room_number: Value of ``Rooms`` to select.

    Returns:
        The mean of ``Expected`` over the rows whose ``Rooms`` equals
        ``room_number``.
    """
    has_requested_rooms = dane['Rooms'] == room_number
    prices = dane.loc[has_requested_rooms, 'Expected']
    return prices.mean()
|
|
|
|
def find_13(dane):
    """Return the distinct locations that have a flat on the 13th floor.

    Args:
        dane: DataFrame with ``Floor`` and ``Location`` columns.

    Returns:
        An array of the unique ``Location`` values among the rows whose
        ``Floor`` equals 13, in order of first appearance.
    """
    on_13th_floor = dane['Floor'] == 13
    # The result has to be returned; computing it alone has no effect.
    return dane.loc[on_13th_floor, 'Location'].unique()
|
|
|
|
def find_best_flats(dane):
    """Return the 3-room, first-floor flats located in Winogrady.

    The selected rows are printed and also returned.

    Args:
        dane: DataFrame with ``Location``, ``Rooms`` and ``Floor`` columns.

    Returns:
        A DataFrame with the rows whose ``Location`` contains
        ``"Winogrady"``, ``Rooms`` equals 3 and ``Floor`` equals 1.
    """
    # na=False treats a missing location as "no match"; without it the mask
    # contains NaN and cannot be used for boolean indexing.
    in_winogrady = dane['Location'].str.contains('Winogrady', na=False)
    best_flats = dane[in_winogrady & (dane['Rooms'] == 3) & (dane['Floor'] == 1)]
    print(best_flats)
    return best_flats
|
|
|
|
def main():
    """Load the listings and print three summary lines about them."""
    dane = wczytaj_dane()

    popular_rooms = most_common_room_number(dane)
    print("Najpopularniejsza liczba pokoi w mieszkaniu to: {}"
          .format(popular_rooms))

    nicest_borough = find_borough("Grunwald i Jeżyce")
    print("{} to najładniejsza dzielnica w Poznaniu."
          .format(nicest_borough))

    three_room_mean = mean_price(dane, 3)
    print("Średnia cena mieszkania 3-pokojowego, to: {}"
          .format(three_room_mean))


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
# Additional task: prepare the data for a price-prediction model.

import sklearn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

dane = pd.read_csv("mieszkania.csv")
print(dane.head())
print(dane.columns)

# Check the data for outliers: plot the expected price against the living
# area, then against the number of rooms (one window per plot).
for _feature in ('SqrMeters', 'Rooms'):
    plt.scatter(dane[_feature], dane['Expected'], color='g')
    plt.show()

# Keep only the typical flats: fewer than 10 rooms, a living area of at most
# 200 square meters and an expected price of at most 500,000.
# NOTE(review): an earlier note spoke of dropping flats with more than
# 8 rooms, while the filter uses "< 10" - confirm the intended threshold.
flats = dane.loc[
    (dane['Rooms'] < 10)
    & (dane['SqrMeters'] <= 200)
    & (dane['Expected'] <= 500000)
]
print(flats.head(20))

# Target: the expected price. Features: whatever is left after dropping the
# identifier, the target, the listed descriptive columns and the unnamed
# columns of the CSV.
y = flats['Expected']
X = flats.drop(
    columns=[
        'Id', 'Expected', 'Floor', 'Location', 'Description',
        'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11',
    ]
)
print(y.head())
print(X.head())
|
|
|
|
|
|
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 30% of the cleaned data for evaluation (fixed seed, so the split
# is reproducible).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=38, shuffle=True)

# Fit on the training split only. Fitting on the full X, y would let the
# model see the test rows and make the scores below overly optimistic.
model = LinearRegression()
model.fit(train_X, train_y)

predicted = model.predict(test_X)
print("Predictions:", predicted[:5])

# One fitted coefficient per feature column (these are slopes, not the
# intercept).
for p in zip(train_X.columns, model.coef_):
    print("Coefficient for {}: {:.3}".format(p[0], p[1]))

# scikit-learn's convention is (y_true, y_pred).
rmse = np.sqrt(mean_squared_error(test_y, predicted))
print("RMSE:", rmse)

r2 = model.score(test_X, test_y)

# NOTE(review): the figures noted here earlier (0.54 after cleaning vs 0.02
# before) were measured with the model fitted on all of the data, test rows
# included - re-measure before quoting them.
print("R squared:", r2)