Regresja wielomianowa

Celem regresji wielomianowej jest zamodelowanie relacji między zmienną zależną od zmiennych niezależnych jako funkcję wielomianu n-tego stopnia.

Postać ogólna regresji wielomianowej:

$$ h_{\theta}(x) = \sum_{i=0}^{n} \theta_i x^i $$

Gdzie:

$$ \theta - \text{wektor parametrów modelu} $$

Funkcja kosztu

W celu odpowiedniego dobrania parametrów modelu, trzeba znaleźć minimum funkcji kosztu zdefiniowanej poniższym wzorem:

$$ J = \frac{1}{2m} (X \theta - y)^T (X \theta - y) $$

Gdzie:

$$ m - \text{ilość przykładów} $$

Za funkcję kosztu przyjmuje się zmodyfikowaną wersję błędu średniokwadratowego. Dodatkowo dodaje się dzielenie przez 2*m zamiast m, aby gradient z funkcji był lepszej postaci.

Metoda gradientu prostego

Do znalezienia minimum funckji kosztu można użyć metody gradientu prostego. W tym celu iteracyjnie liczy się gradient funkcji kosztu i aktualizuje się za jego pomocą wektor parametrów modelu aż do uzyskania zbieżności (Różnica między obliczoną funkcją kosztu a funkcją kosztu w poprzedniej iteracji będzie mniejsza od ustalonej wcześniej wartości $\varepsilon$).

Gradient funkcji kosztu:

$$ \dfrac{\partial J(\theta)}{\partial \theta} = \frac{1}{m}X^T(X \theta - y)$$

Modyfikacja parametrów modelu:

$$ \theta_{new} = \theta - \alpha * \dfrac{\partial J(\theta)}{\partial \theta}$$

Gdzie:

$$ \alpha - \text{współczynnik uczenia} $$

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas

%matplotlib inline

# Przydatne funkcje
cost_functions = dict()

def cost(theta, X, y):
    """Wersja macierzowa funkcji kosztu"""
    m = len(y)
    J = 1.0 / (2.0 * m) * ((X * theta - y).T * (X * theta - y))
    return J.item()

def gradient(theta, X, y):
    """Wersja macierzowa gradientu funkcji kosztu"""
    return 1.0 / len(y) * (X.T * (X * theta - y)) 

def gradient_descent(fJ, fdJ, theta, X, y, alpha=0.1, eps=10**-7):
    """Algorytm gradientu prostego"""
    current_cost = fJ(theta, X, y)
    logs = [[current_cost, theta]]
    while True:
        theta = theta - alpha * fdJ(theta, X, y)
        current_cost, prev_cost = fJ(theta, X, y), current_cost
        # print(current_cost)
        if abs(prev_cost - current_cost) > 10**15:
            print('Algorytm nie jest zbieżny!')
            break
        if abs(prev_cost - current_cost) <= eps:
            break
        logs.append([current_cost, theta]) 
    return theta, logs

def plot_data(X, y, xlabel, ylabel):
    """Wykres danych"""
    fig = plt.figure(figsize=(16*.6, 9*.6))
    ax = fig.add_subplot(111)
    fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)
    ax.scatter([X[:, 1]], [y], c='r', s=50, label='Dane')
    
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.margins(.05, .05)
    plt.ylim(y.min() - 1, y.max() + 1)
    plt.xlim(np.min(X[:, 1]) - 1, np.max(X[:, 1]) + 1)
    return fig

def plot_data_cost(X, y, xlabel, ylabel):
    """Wykres funkcji kosztu"""
    fig = plt.figure(figsize=(16 * .6, 9 * .6))
    ax = fig.add_subplot(111)
    fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)
    ax.scatter([X], [y], c='r', s=50, label='Dane')

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.margins(.05, .05)
    plt.ylim(min(y) - 1, max(y) + 1)
    plt.xlim(np.min(X) - 1, np.max(X) + 1)
    return fig

def plot_fun(fig, fun, X):
    """Wykres funkcji `fun`"""
    ax = fig.axes[0]
    x0 = np.min(X[:, 1]) - 1.0
    x1 = np.max(X[:, 1]) + 1.0
    Arg = np.arange(x0, x1, 0.1)
    Val = fun(Arg)
    return ax.plot(Arg, Val, linewidth='2')

def MSE(Y_true, Y_pred):
    """Błąd średniokwadratowy - Mean Squared Error"""
    return np.square(np.subtract(Y_true,Y_pred)).mean()

# Funkcja regresji wielomianowej

def h_poly(Theta, x):
    """Funkcja wielomianowa"""
    return sum(theta * np.power(x, i) for i, theta in enumerate(Theta.tolist()))

def get_poly_data(data, deg):
    """Przygotowanie danych do regresji wielomianowej"""
    m, n_plus_1 = data.shape
    n = n_plus_1 - 1

    X1 = data[:, 0:n]
    X1 /= np.amax(X1, axis=0)

    Xs = [np.ones((m, 1)), X1]

    for i in range(2, deg+1):
        Xn = np.power(X1, i)
        Xn /= np.amax(Xn, axis=0)
        Xs.append(Xn)

    X = np.matrix(np.concatenate(Xs, axis=1)).reshape(m, deg * n + 1)

    y = np.matrix(data[:, -1]).reshape(m, 1)

    return X, y


def polynomial_regression(X, y, n):
    """Funkcja regresji wielomianowej"""
    theta_start = np.matrix([0] * (n+1)).reshape(n+1, 1)
    theta, logs = gradient_descent(cost, gradient, theta_start, X, y)
    return lambda x: h_poly(theta, x), logs

def predict_values(model, data, n):
    """Funkcja predykcji"""
    x, y = get_poly_data(np.array(data), n)
    preprocessed_x = []
    for i in x:
        preprocessed_x.append(i.item(1))
    return y, model(preprocessed_x), MSE(y, model(preprocessed_x))

def plot_and_mse(data, data_test, n):
    """Wykres wraz z MSE"""
    x, y = get_poly_data(np.array(data), n)
    model, logs = polynomial_regression(x, y, n)
    cost_function = [[element[0], i] for i, element in enumerate(logs)]
    cost_functions[n] = cost_function
    
    fig = plot_data(x, y, xlabel='x', ylabel='y')
    plot_fun(fig, model, x)

    y_true, Y_pred, mse = predict_values(model, data_test, n)
    print(f'Wielomian {n} stopnia, MSE = {mse}')

# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas

alldata = pandas.read_csv('data_flats.tsv', header=0, sep='\t',
                          usecols=['price', 'rooms', 'sqrMetres'])
alldata = alldata[['sqrMetres', 'price']]
alldata = alldata.sample(frac=1)
alldata

	sqrMetres	price
318	62	399000.0
1294	50	247705.0
593	15	349668.0
1486	20	347633.0
224	42	289000.0
...	...	...
1410	50	315000.0
424	14	330255.0
103	21	327584.0
983	78	496573.0
418	16	309978.0

1674 rows × 2 columns

# alldata = np.matrix(alldata[['sqrMetres', 'price']])
data_train = alldata[0:1600]
data_test = alldata[1600:]
data_train = np.matrix(data_train).astype(float)
data_test = np.matrix(data_test).astype(float)

cost_fun_slices = []
for n in range(1, 4):
    plot_and_mse(data_train, data_test, n)
    
    cost_data = cost_functions.get(n)
    cost_x = [line[1] for line in cost_data[:250]]
    cost_y = [line[0] for line in cost_data[:250]]
    cost_fun_slices.append((cost_x, cost_y))

Wielomian 1 stopnia, MSE = 35514312788.779144
Wielomian 2 stopnia, MSE = 90239672317.31158
Wielomian 3 stopnia, MSE = 89217346258.50258

# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas

alldata_p2 = pandas.read_csv('polynomial-regression_2.csv')
alldata_p2 = alldata_p2.sample(frac=1)
alldata_p2

	araba_fiyat	araba_max_hiz
7	250	240
12	1000	365
13	2000	365
3	100	200
11	750	360
2	80	200
8	300	300
0	60	180
9	400	350
4	120	200
1	70	180
10	500	350
6	200	240
5	150	220
14	3000	365

# alldata = np.matrix(alldata[['sqrMetres', 'price']])
data_train_p2 = alldata_p2[0:12]
data_test_p2 = alldata_p2[12:]
data_train_p2 = np.matrix(data_train_p2).astype(float)
data_test_p2 = np.matrix(data_test_p2).astype(float)

cost_fun_slices = []
for n in range(1, 4):
    plot_and_mse(data_train_p2, data_test_p2, n)
    
    cost_data = cost_functions.get(n)
    cost_x = [line[1] for line in cost_data[:250]]
    cost_y = [line[0] for line in cost_data[:250]]
    cost_fun_slices.append((cost_x, cost_y))

Wielomian 1 stopnia, MSE = 13592.268823271697
Wielomian 2 stopnia, MSE = 9030.04916509696
Wielomian 3 stopnia, MSE = 9795.339194991013

#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
    cost_x, cost_y = fig
    fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")

#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
    cost_x, cost_y = fig
    fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")

# Ilość nauki do oceny

data_marks_all = pandas.read_csv('Student_Marks.csv')
data_marks_all

	number_courses	time_study	Marks
0	3	4.508	19.202
1	4	0.096	7.734
2	4	3.133	13.811
3	6	7.909	53.018
4	8	7.811	55.299
...	...	...	...
95	6	3.561	19.128
96	3	0.301	5.609
97	4	7.163	41.444
98	7	0.309	12.027
99	3	6.335	32.357

100 rows × 3 columns

data_marks_all = data_marks_all[['time_study', 'Marks']]
# data_marks_all = data_marks_all.sample(frac=1)
data_marks_train = data_marks_all[0:70]
data_marks_test = data_marks_all[70:]
data_marks_train = np.matrix(data_marks_train).astype(float)
data_marks_test = np.matrix(data_marks_test).astype(float)

cost_fun_slices = []

for n in range(1, 4):
    plot_and_mse(data_marks_train, data_marks_test, n)
    
    cost_data = cost_functions.get(n)
    cost_x = [line[1] for line in cost_data[:250]]
    cost_y = [line[0] for line in cost_data[:250]]
    cost_fun_slices.append((cost_x, cost_y))

Wielomian 1 stopnia, MSE = 381.16937283505433
Wielomian 2 stopnia, MSE = 394.1863119057109
Wielomian 3 stopnia, MSE = 391.5017110730558

#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
    cost_x, cost_y = fig
    fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")

data_ins = pandas.read_csv('insurance.csv')
data_ins = data_ins.sample(frac=1)
data_ins

	age	sex	bmi	children	smoker	region	charges
27	55	female	32.775	2	no	northwest	12268.63225
412	26	female	17.195	2	yes	northeast	14455.64405
547	54	female	46.700	2	no	southwest	11538.42100
983	27	female	30.590	1	no	northeast	16796.41194
661	57	female	23.980	1	no	southeast	22192.43711
...	...	...	...	...	...	...	...
827	36	male	28.025	1	yes	northeast	20773.62775
212	24	male	28.500	2	no	northwest	3537.70300
940	18	male	23.210	0	no	southeast	1121.87390
476	24	male	28.500	0	yes	northeast	35147.52848
1027	23	male	18.715	0	no	northwest	21595.38229

1338 rows × 7 columns

data_ins = data_ins[['age', 'charges']]
data_ins_train = data_ins[0:1200]
data_ins_test = data_ins[1200:]
data_ins_train = np.matrix(data_ins_train).astype(float)
data_ins_test = np.matrix(data_ins_test).astype(float)

cost_fun_slices = []

for n in range(1, 4):
    plot_and_mse(data_ins_train, data_ins_test, n)
    
    cost_data = cost_functions.get(n)
    cost_x = [line[1] for line in cost_data[:250]]
    cost_y = [line[0] for line in cost_data[:250]]
    cost_fun_slices.append((cost_x, cost_y))

Wielomian 1 stopnia, MSE = 178360681.2241408
Wielomian 2 stopnia, MSE = 179170879.99336478

[0;31m---------------------------------------------------------------------------[0m
[0;31mKeyboardInterrupt[0m                         Traceback (most recent call last)
[0;32m/tmp/ipykernel_5896/1100858333.py[0m in [0;36m<module>[0;34m[0m
[1;32m      2[0m [0;34m[0m[0m
[1;32m      3[0m [0;32mfor[0m [0mn[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0;36m4[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m     [0mplot_and_mse[0m[0;34m([0m[0mdata_ins_train[0m[0;34m,[0m [0mdata_ins_test[0m[0;34m,[0m [0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      5[0m [0;34m[0m[0m
[1;32m      6[0m     [0mcost_data[0m [0;34m=[0m [0mcost_functions[0m[0;34m.[0m[0mget[0m[0;34m([0m[0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/tmp/ipykernel_5896/3773219878.py[0m in [0;36mplot_and_mse[0;34m(data, data_test, n)[0m
[1;32m     10[0m     [0;34m"""Wykres wraz z MSE"""[0m[0;34m[0m[0;34m[0m[0m
[1;32m     11[0m     [0mx[0m[0;34m,[0m [0my[0m [0;34m=[0m [0mget_poly_data[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0marray[0m[0;34m([0m[0mdata[0m[0;34m)[0m[0;34m,[0m [0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 12[0;31m     [0mmodel[0m[0;34m,[0m [0mlogs[0m [0;34m=[0m [0mpolynomial_regression[0m[0;34m([0m[0mx[0m[0;34m,[0m [0my[0m[0;34m,[0m [0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     13[0m     [0mcost_function[0m [0;34m=[0m [0;34m[[0m[0;34m[[0m[0melement[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mi[0m[0;34m][0m [0;32mfor[0m [0mi[0m[0;34m,[0m [0melement[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mlogs[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[1;32m     14[0m     [0mcost_functions[0m[0;34m[[0m[0mn[0m[0;34m][0m [0;34m=[0m [0mcost_function[0m[0;34m[0m[0;34m[0m[0m

[0;32m/tmp/ipykernel_5896/1168100012.py[0m in [0;36mpolynomial_regression[0;34m(X, y, n)[0m
[1;32m     30[0m     [0;34m"""Funkcja regresji wielomianowej"""[0m[0;34m[0m[0;34m[0m[0m
[1;32m     31[0m     [0mtheta_start[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmatrix[0m[0;34m([0m[0;34m[[0m[0;36m0[0m[0;34m][0m [0;34m*[0m [0;34m([0m[0mn[0m[0;34m+[0m[0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0mn[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 32[0;31m     [0mtheta[0m[0;34m,[0m [0mlogs[0m [0;34m=[0m [0mgradient_descent[0m[0;34m([0m[0mcost[0m[0;34m,[0m [0mgradient[0m[0;34m,[0m [0mtheta_start[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     33[0m     [0;32mreturn[0m [0;32mlambda[0m [0mx[0m[0;34m:[0m [0mh_poly[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mx[0m[0;34m)[0m[0;34m,[0m [0mlogs[0m[0;34m[0m[0;34m[0m[0m

[0;32m/tmp/ipykernel_5896/1641407440.py[0m in [0;36mgradient_descent[0;34m(fJ, fdJ, theta, X, y, alpha, eps)[0m
[1;32m     17[0m     [0mlogs[0m [0;34m=[0m [0;34m[[0m[0;34m[[0m[0mcurrent_cost[0m[0;34m,[0m [0mtheta[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[1;32m     18[0m     [0;32mwhile[0m [0;32mTrue[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 19[0;31m         [0mtheta[0m [0;34m=[0m [0mtheta[0m [0;34m-[0m [0malpha[0m [0;34m*[0m [0mfdJ[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     20[0m         [0mcurrent_cost[0m[0;34m,[0m [0mprev_cost[0m [0;34m=[0m [0mfJ[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m,[0m [0mcurrent_cost[0m[0;34m[0m[0;34m[0m[0m
[1;32m     21[0m         [0;31m# print(current_cost)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m

[0;32m/tmp/ipykernel_5896/1641407440.py[0m in [0;36mgradient[0;34m(theta, X, y)[0m
[1;32m     10[0m [0;32mdef[0m [0mgradient[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m     11[0m     [0;34m"""Wersja macierzowa gradientu funkcji kosztu"""[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 12[0;31m     [0;32mreturn[0m [0;36m1.0[0m [0;34m/[0m [0mlen[0m[0;34m([0m[0my[0m[0;34m)[0m [0;34m*[0m [0;34m([0m[0mX[0m[0;34m.[0m[0mT[0m [0;34m*[0m [0;34m([0m[0mX[0m [0;34m*[0m [0mtheta[0m [0;34m-[0m [0my[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     13[0m [0;34m[0m[0m
[1;32m     14[0m [0;32mdef[0m [0mgradient_descent[0m[0;34m([0m[0mfJ[0m[0;34m,[0m [0mfdJ[0m[0;34m,[0m [0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0malpha[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m [0meps[0m[0;34m=[0m[0;36m10[0m[0;34m**[0m[0;34m-[0m[0;36m7[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m

[0;32m~/.local/lib/python3.8/site-packages/numpy/matrixlib/defmatrix.py[0m in [0;36m__array_finalize__[0;34m(self, obj)[0m
[1;32m    165[0m         [0;32mreturn[0m [0mret[0m[0;34m[0m[0;34m[0m[0m
[1;32m    166[0m [0;34m[0m[0m
[0;32m--> 167[0;31m     [0;32mdef[0m [0m__array_finalize__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mobj[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    168[0m         [0mself[0m[0;34m.[0m[0m_getitem[0m [0;34m=[0m [0;32mFalse[0m[0;34m[0m[0;34m[0m[0m
[1;32m    169[0m         [0;32mif[0m [0;34m([0m[0misinstance[0m[0;34m([0m[0mobj[0m[0;34m,[0m [0mmatrix[0m[0;34m)[0m [0;32mand[0m [0mobj[0m[0;34m.[0m[0m_getitem[0m[0;34m)[0m[0;34m:[0m [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m

[0;31mKeyboardInterrupt[0m:

#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
    cost_x, cost_y = fig
    fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")

419 KiB Raw Blame History Unescape Escape

Regresja wielomianowa

Funkcja kosztu

Metoda gradientu prostego

419 KiB

Raw Blame History