# Python2018/labs06/task02.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt

def wczytaj_dane(filename="mieszkania.csv"):
    """Load the flats data set from a CSV file.

    The first rows of the loaded table are printed as a quick sanity check.

    Args:
        filename: Path of the CSV file to read. Defaults to
            ``"mieszkania.csv"`` in the current working directory.

    Returns:
        pandas.DataFrame holding the contents of the file.
    """
    dane = pd.read_csv(filename)
    print(dane.head())
    return dane

def most_common_room_number(dane):
    """Return the value of the ``'Rooms'`` column that occurs most often.

    Args:
        dane: DataFrame with a ``'Rooms'`` column.

    Returns:
        The room count with the highest number of occurrences.
    """
    room_counts = dane['Rooms'].value_counts()
    return room_counts.idxmax()


def cheapest_flats(dane, n):
    """Return the ``n`` flats with the lowest expected price.

    Args:
        dane: DataFrame with an ``'Expected'`` price column.
        n: Number of rows to return.

    Returns:
        DataFrame with at most ``n`` rows of ``dane``, ordered by
        ``'Expected'`` from the cheapest upwards.
    """
    # Ascending order puts the cheapest offers first, so head(n) picks them.
    return dane.sort_values('Expected', ascending=True).head(n)

def find_borough(desc):
    """Find which of the known borough names occurs in a piece of text.

    Args:
        desc: Text that may contain a borough name.

    Returns:
        The borough name found in ``desc``. When several names occur, the
        one placed latest in the list below is returned. ``"Inne"`` is
        returned when none of the names occurs.
    """
    dzielnice = (
        'Stare Miasto',
        'Wilda',
        'Jeżyce',
        'Rataje',
        'Piątkowo',
        'Winogrady',
        'Miłostowo',
        'Dębiec',
        'Grunwald',
        'Nowe Miasto',
    )

    # Walking the list backwards and stopping at the first hit gives the
    # same answer as walking forwards and remembering the last hit.
    for nazwa in reversed(dzielnice):
        if nazwa in desc:
            return nazwa
    return "Inne"


def add_borough(dane):
    """Return a copy of ``dane`` extended with a ``'Borough'`` column.

    Each value of the ``'Location'`` column is matched against the list of
    known borough names, using the same rule as ``find_borough``: if several
    names occur in the text, the one placed latest in the list wins, and
    ``"Inne"`` is used when no name occurs. The resulting table is printed
    and returned; the input DataFrame is not modified.

    Args:
        dane: DataFrame with a ``'Location'`` column.

    Returns:
        DataFrame with a fresh 0..n-1 index and the additional
        ``'Borough'`` column.
    """
    dzielnice = ['Stare Miasto',
                 'Wilda',
                 'Jeżyce',
                 'Rataje',
                 'Piątkowo',
                 'Winogrady',
                 'Miłostowo',
                 'Dębiec',
                 'Grunwald',
                 'Nowe Miasto']

    def _borough_of(location):
        # Missing locations come out of pandas as NaN (a float); treat them
        # as "no borough found" instead of failing on the ``in`` test.
        if not isinstance(location, str):
            return "Inne"
        found = "Inne"
        for dzielnica in dzielnice:
            if dzielnica in location:
                found = dzielnica
        return found

    boroughs = [_borough_of(item) for item in dane['Location']]

    # The column has to be called 'Borough' so that later steps can group
    # on it; reset_index() hands back a new frame, leaving the input alone.
    dane = dane.reset_index(drop=True).assign(Borough=boroughs)
    print(dane)
    return dane

def write_plot(dane, filename):
    """Plot the number of distinct flats per borough and save the chart.

    Args:
        dane: DataFrame with ``'Borough'`` and ``'Id'`` columns.
        filename: Path of the image file the bar chart is written to.
    """
    dane.groupby('Borough')['Id'].nunique().plot(kind='bar')
    # Save before showing: with interactive backends the figure is released
    # once the window opened by show() is closed, so a later savefig()
    # would write an empty image.
    plt.savefig(filename)
    plt.show()

def mean_price(dane, room_number):
    """Compute the average expected price of flats with a given room count.

    Args:
        dane: DataFrame with ``'Rooms'`` and ``'Expected'`` columns.
        room_number: Room count used to select the flats.

    Returns:
        Mean of ``'Expected'`` over the rows whose ``'Rooms'`` equals
        ``room_number``.
    """
    has_room_count = dane['Rooms'] == room_number
    return dane.loc[has_room_count, 'Expected'].mean()

def find_13(dane):
    """Return the distinct locations that have a flat on the 13th floor.

    Args:
        dane: DataFrame with ``'Floor'`` and ``'Location'`` columns.

    Returns:
        Array of unique ``'Location'`` values taken from the rows whose
        ``'Floor'`` equals 13, in order of first appearance.
    """
    on_13th_floor = dane['Floor'] == 13
    return dane.loc[on_13th_floor, 'Location'].unique()

def find_best_flats(dane):
    """Select 3-room, first-floor flats whose location mentions Winogrady.

    The selected rows are printed and returned.

    Args:
        dane: DataFrame with ``'Location'``, ``'Rooms'`` and ``'Floor'``
            columns.

    Returns:
        DataFrame with the rows of ``dane`` that satisfy all three
        conditions.
    """
    # na=False: rows with a missing location count as "not in Winogrady";
    # without it the mask contains NaN and cannot be used for indexing.
    in_winogrady = dane['Location'].str.contains('Winogrady', na=False)
    best_flats = dane[in_winogrady & (dane['Rooms'] == 3) & (dane['Floor'] == 1)]
    print(best_flats)
    return best_flats

def main():
    """Load the data set and print three summary facts about it."""
    dane = wczytaj_dane()

    # Each value is computed right before it is printed, one fact at a time.
    popular_rooms = most_common_room_number(dane)
    print("Najpopularniejsza liczba pokoi w mieszkaniu to: {}"
          .format(popular_rooms))

    nicest_borough = find_borough("Grunwald i Jeżyce")
    print("{} to najładniejsza dzielnica w Poznaniu."
          .format(nicest_borough))

    three_room_mean = mean_price(dane, 3)
    print("Średnia cena mieszkania 3-pokojowego, to: {}"
          .format(three_room_mean))


# Run the summary only when the file is executed as a script.
if __name__ == "__main__":
    main()


# extra task: predict the expected price with a linear regression

import sklearn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

dane = pd.read_csv("mieszkania.csv")
print(dane.head())
print(dane.columns)

# Check the data for outliers: expected price against living area ...
plt.scatter(dane['SqrMeters'], dane['Expected'], color='g')
plt.show()
# ... and expected price against the number of rooms.
plt.scatter(dane['Rooms'], dane['Expected'], color='g')
plt.show()

# Keep only flats with fewer than 10 rooms, a living area of at most
# 200 square meters and an expected price of at most 500,000; everything
# outside these limits is treated as an outlier and dropped.
flats = dane[(dane['Rooms'] < 10) & (dane['SqrMeters'] <= 200) & (dane['Expected'] <= 500000)]
print(flats.head(20))

# Target is the expected price; the features are whatever columns remain
# after dropping identifiers, text fields and the unnamed CSV columns.
y = flats['Expected']
X = flats.drop(['Id', 'Expected', 'Floor', 'Location',
               'Description', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], axis=1)
print(y.head())
print(X.head())

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=38, shuffle=True)

model = LinearRegression()
# Fit on the training split only. Fitting on all of X, y would let the model
# see the held-out rows and make the scores below look better than they are.
model.fit(train_X, train_y)

predicted = model.predict(test_X)
print("Predictions:", predicted[:5])

# coef_ holds one slope per feature column (the intercept is a separate
# attribute), so label the values as coefficients.
for name, coef in zip(train_X.columns, model.coef_):
    print("Coefficient for {}: {:.3}".format(name, coef))

# scikit-learn metrics take the true values first, then the predictions.
rmse = np.sqrt(mean_squared_error(test_y, predicted))
print("RMSE:", rmse)

r2 = model.score(test_X, test_y)

# NOTE(review): an earlier comment reported R squared of 0.54 after cleaning
# the data versus 0.02 before. Those figures were obtained with the model
# fitted on the full data set, so the value printed here will differ.
print("R squared:", r2)