umz21/lab/wzor.py

#! /usr/bin/env python3
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from seaborn import relplot


def safeSigmoid(x, eps=0):
    y = 1.0/(1.0 + np.exp(-x))
    if eps > 0:
        y[y < eps] = eps
        y[y > 1 - eps] = 1 - eps
    return y

def h(theta, X, eps=0.0):
    return safeSigmoid(X*theta, eps)

def J(h, theta, X, y):
    m = len(y)
    h_val = h(theta, X)
    s1 = np.multiply(y, np.log(h_val))
    s2 = np.multiply((1 - y), np.log(1 - h_val))
    return -np.sum(s1 + s2, axis=0) / m

def dJ(h, theta, X, y):
    return 1.0 / y.shape[0] * (X.T * (h(theta, X) - y))

def GD(h, fJ, fdJ, theta, X, y, alpha=0.01, eps=10**-3, maxSteps=10000):
    errorCurr = fJ(h, theta, X, y)
    errors = [[errorCurr, theta]]
    while True:
        # oblicz nowe theta
        theta = theta - alpha * fdJ(h, theta, X, y)
        # raportuj poziom błędu
        errorCurr, errorPrev = fJ(h, theta, X, y), errorCurr
        # kryteria stopu
        if errorCurr > errorPrev:
            raise Exception('Zbyt duży krok!')
        if abs(errorPrev - errorCurr) <= eps:
            break
        if len(errors) > maxSteps:
            break
        errors.append([errorCurr, theta])
    return theta, errors

def classifyBi(h, theta, X):
    probs = h(theta, X)
    result = np.array(probs > 0.5, dtype=int)
    return result, probs

def plot_data_for_classification(X, Y, xlabel, ylabel):
    fig = plt.figure(figsize=(16*.6, 9*.6))
    ax = fig.add_subplot(111)
    fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)
    X = X.tolist()
    Y = Y.tolist()
    X1n = [x[1] for x, y in zip(X, Y) if y[0] == 0]
    X1p = [x[1] for x, y in zip(X, Y) if y[0] == 1]
    X2n = [x[2] for x, y in zip(X, Y) if y[0] == 0]
    X2p = [x[2] for x, y in zip(X, Y) if y[0] == 1]
    ax.scatter(X1n, X2n, c='r', marker='x', s=50, label='Dane')
    ax.scatter(X1p, X2p, c='g', marker='o', s=50, label='Dane')

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.margins(.05, .05)
    return fig

def powerme(x1,x2,n):
    X = []
    for m in range(n+1):
        for i in range(m+1):
            X.append(np.multiply(np.power(x1,i),np.power(x2,(m-i))))
    return np.hstack(X)

def plot_decision_boundary(fig, h, theta, degree):
    ax = fig.axes[0]

    xmin = 1860.0
    xmax = 2020.0
    xstep = 1.0
    xspan = int((xmax - xmin) / xstep)

    ymin = 0.0
    ymax = 1200.0
    ystep = 1.0
    yspan = int((ymax - ymin) / ystep)

    xx, yy = np.meshgrid(np.arange(xmin, xmax, xstep),
                         np.arange(ymin, ymax, ystep))
    l = len(xx.ravel())
    C = powerme(yy.reshape(l, 1), xx.reshape(l, 1), degree)
    z = h(theta, C).reshape(yspan, xspan)
    # z = classifyBi(h, theta, C)[0].reshape(yspan, xspan)

    # plt.contour(xx, yy, z, levels=[0.1,0.3,0.5,0.7,0.9], colors='m', lw=3)
    print(z)
    plt.contour(xx, yy, z)

data = pd.read_csv('wyk/mieszkania4.tsv', sep='\t')
data['Czy blok'] = (data['Typ zabudowy'] == 'blok')
print(data.columns)

data = data[
    (data['Powierzchnia w m2'] < 10000)
    & (data['cena'] < 10000000)
    ]

relplot(data=data, x='Rok budowy', y='Powierzchnia w m2', hue='Czy blok')
plt.show()

# X_columns = ['cena', 'Powierzchnia w m2', 'Rok budowy']
X_columns = ['Rok budowy', 'Powierzchnia w m2']
Y_column = ['Czy blok']

data = data[X_columns + Y_column].dropna()

m = len(data)
n = len(X_columns)

X = np.matrix(np.concatenate((np.ones((m, 1)), data[X_columns].values), axis=1))
Y = np.matrix(data[Y_column].values, dtype=int)

split_point = int(0.8 * m)

X_train = X[:split_point]
X_test = X[split_point:]
Y_train = Y[:split_point]
Y_test = Y[split_point:]

thetaStartMx = np.ones((n + 1, 1))
thetaBest, errors = GD(h, J, dJ, thetaStartMx, X_train, Y_train,
                       alpha=0.1, eps=10**-7, maxSteps=10000)
print(thetaBest)

Y_predicted, Y_probs = classifyBi(h, thetaBest, X_test)

print(Y_predicted.sum())
print(Y_test.sum())

accuracy = np.array(Y_predicted == Y_test, dtype=int).sum() / Y_test.shape[0]
print(accuracy)

fig = plot_data_for_classification(X, Y, xlabel=u'Rok budowy', ylabel=u'Powierzchnia w m2')
plot_decision_boundary(fig, h, thetaBest, 1)
plt.show()


# More dimensions

dim = 6

X_train2 = powerme(X_train[:,1], X_train[:,2], dim)
X_test2 = powerme(X_test[:,1], X_test[:,2], dim)
thetaStart2 = np.ones((X_train2.shape[1], 1))
thetaBest2, errors2 = GD(h, J, dJ, thetaStart2, X_train2, Y_train,
                         alpha=0.1, eps=10**-7, maxSteps=100000)

print(thetaBest2)
Y_predicted2, Y_probs2 = classifyBi(h, thetaBest2, X_test2)

print(Y_predicted2.sum())
print(Y_test.sum())

accuracy2 = np.array(Y_predicted2 == Y_test, dtype=int).sum() / Y_test.shape[0]
print(accuracy2)

fig2 = plot_data_for_classification(X, Y, xlabel=u'Rok budowy', ylabel=u'Powierzchnia w m2')
plot_decision_boundary(fig2, h, thetaBest2, dim)
plt.show()