419 KiB
Regresja wielomianowa
Celem regresji wielomianowej jest zamodelowanie relacji między zmienną zależną od zmiennych niezależnych jako funkcję wielomianu n-tego stopnia.
Postać ogólna regresji wielomianowej:
$$ h_{\theta}(x) = \sum_{i=0}^{n} \theta_i x^i $$
Gdzie:
$$ \theta - \text{wektor parametrów modelu} $$
Funkcja kosztu
W celu odpowiedniego dobrania parametrów modelu, trzeba znaleźć minimum funkcji kosztu zdefiniowanej poniższym wzorem:
$$ J = \frac{1}{2m} (X \theta - y)^T (X \theta - y) $$
Gdzie:
$$ m - \text{ilość przykładów} $$
Za funkcję kosztu przyjmuje się zmodyfikowaną wersję błędu średniokwadratowego. Dodatkowo dodaje się dzielenie przez 2*m zamiast m, aby gradient z funkcji był lepszej postaci.
Do znalezienia minimum funckji kosztu można użyć metody gradientu prostego. W tym celu iteracyjnie liczy się gradient funkcji kosztu i aktualizuje się za jego pomocą wektor parametrów modelu aż do uzyskania zbieżności (Różnica między obliczoną funkcją kosztu a funkcją kosztu w poprzedniej iteracji będzie mniejsza od ustalonej wcześniej wartości $\varepsilon$).
Gradient funkcji kosztu:
$$ \dfrac{\partial J(\theta)}{\partial \theta} = \frac{1}{m}X^T(X \theta - y)$$
Modyfikacja parametrów modelu:
$$ \theta_{new} = \theta - \alpha * \dfrac{\partial J(\theta)}{\partial \theta}$$
Gdzie:
$$ \alpha - \text{współczynnik uczenia} $$
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas
%matplotlib inline
# Przydatne funkcje
cost_functions = dict()
def cost(theta, X, y):
"""Wersja macierzowa funkcji kosztu"""
m = len(y)
J = 1.0 / (2.0 * m) * ((X * theta - y).T * (X * theta - y))
return J.item()
def gradient(theta, X, y):
"""Wersja macierzowa gradientu funkcji kosztu"""
return 1.0 / len(y) * (X.T * (X * theta - y))
def gradient_descent(fJ, fdJ, theta, X, y, alpha=0.1, eps=10**-7):
"""Algorytm gradientu prostego"""
current_cost = fJ(theta, X, y)
logs = [[current_cost, theta]]
while True:
theta = theta - alpha * fdJ(theta, X, y)
current_cost, prev_cost = fJ(theta, X, y), current_cost
# print(current_cost)
if abs(prev_cost - current_cost) > 10**15:
print('Algorytm nie jest zbieżny!')
break
if abs(prev_cost - current_cost) <= eps:
break
logs.append([current_cost, theta])
return theta, logs
def plot_data(X, y, xlabel, ylabel):
"""Wykres danych"""
fig = plt.figure(figsize=(16*.6, 9*.6))
ax = fig.add_subplot(111)
fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)
ax.scatter([X[:, 1]], [y], c='r', s=50, label='Dane')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.margins(.05, .05)
plt.ylim(y.min() - 1, y.max() + 1)
plt.xlim(np.min(X[:, 1]) - 1, np.max(X[:, 1]) + 1)
return fig
def plot_data_cost(X, y, xlabel, ylabel):
"""Wykres funkcji kosztu"""
fig = plt.figure(figsize=(16 * .6, 9 * .6))
ax = fig.add_subplot(111)
fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)
ax.scatter([X], [y], c='r', s=50, label='Dane')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.margins(.05, .05)
plt.ylim(min(y) - 1, max(y) + 1)
plt.xlim(np.min(X) - 1, np.max(X) + 1)
return fig
def plot_fun(fig, fun, X):
"""Wykres funkcji `fun`"""
ax = fig.axes[0]
x0 = np.min(X[:, 1]) - 1.0
x1 = np.max(X[:, 1]) + 1.0
Arg = np.arange(x0, x1, 0.1)
Val = fun(Arg)
return ax.plot(Arg, Val, linewidth='2')
def MSE(Y_true, Y_pred):
"""Błąd średniokwadratowy - Mean Squared Error"""
return np.square(np.subtract(Y_true,Y_pred)).mean()
# Funkcja regresji wielomianowej
def h_poly(Theta, x):
"""Funkcja wielomianowa"""
return sum(theta * np.power(x, i) for i, theta in enumerate(Theta.tolist()))
def get_poly_data(data, deg):
"""Przygotowanie danych do regresji wielomianowej"""
m, n_plus_1 = data.shape
n = n_plus_1 - 1
X1 = data[:, 0:n]
X1 /= np.amax(X1, axis=0)
Xs = [np.ones((m, 1)), X1]
for i in range(2, deg+1):
Xn = np.power(X1, i)
Xn /= np.amax(Xn, axis=0)
Xs.append(Xn)
X = np.matrix(np.concatenate(Xs, axis=1)).reshape(m, deg * n + 1)
y = np.matrix(data[:, -1]).reshape(m, 1)
return X, y
def polynomial_regression(X, y, n):
"""Funkcja regresji wielomianowej"""
theta_start = np.matrix([0] * (n+1)).reshape(n+1, 1)
theta, logs = gradient_descent(cost, gradient, theta_start, X, y)
return lambda x: h_poly(theta, x), logs
def predict_values(model, data, n):
"""Funkcja predykcji"""
x, y = get_poly_data(np.array(data), n)
preprocessed_x = []
for i in x:
preprocessed_x.append(i.item(1))
return y, model(preprocessed_x), MSE(y, model(preprocessed_x))
def plot_and_mse(data, data_test, n):
"""Wykres wraz z MSE"""
x, y = get_poly_data(np.array(data), n)
model, logs = polynomial_regression(x, y, n)
cost_function = [[element[0], i] for i, element in enumerate(logs)]
cost_functions[n] = cost_function
fig = plot_data(x, y, xlabel='x', ylabel='y')
plot_fun(fig, model, x)
y_true, Y_pred, mse = predict_values(model, data_test, n)
print(f'Wielomian {n} stopnia, MSE = {mse}')
# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas
alldata = pandas.read_csv('data_flats.tsv', header=0, sep='\t',
usecols=['price', 'rooms', 'sqrMetres'])
alldata = alldata[['sqrMetres', 'price']]
alldata = alldata.sample(frac=1)
alldata
sqrMetres | price | |
---|---|---|
318 | 62 | 399000.0 |
1294 | 50 | 247705.0 |
593 | 15 | 349668.0 |
1486 | 20 | 347633.0 |
224 | 42 | 289000.0 |
... | ... | ... |
1410 | 50 | 315000.0 |
424 | 14 | 330255.0 |
103 | 21 | 327584.0 |
983 | 78 | 496573.0 |
418 | 16 | 309978.0 |
1674 rows × 2 columns
# alldata = np.matrix(alldata[['sqrMetres', 'price']])
data_train = alldata[0:1600]
data_test = alldata[1600:]
data_train = np.matrix(data_train).astype(float)
data_test = np.matrix(data_test).astype(float)
cost_fun_slices = []
for n in range(1, 4):
plot_and_mse(data_train, data_test, n)
cost_data = cost_functions.get(n)
cost_x = [line[1] for line in cost_data[:250]]
cost_y = [line[0] for line in cost_data[:250]]
cost_fun_slices.append((cost_x, cost_y))
Wielomian 1 stopnia, MSE = 35514312788.779144 Wielomian 2 stopnia, MSE = 90239672317.31158 Wielomian 3 stopnia, MSE = 89217346258.50258
# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas
alldata_p2 = pandas.read_csv('polynomial-regression_2.csv')
alldata_p2 = alldata_p2.sample(frac=1)
alldata_p2
araba_fiyat | araba_max_hiz | |
---|---|---|
7 | 250 | 240 |
12 | 1000 | 365 |
13 | 2000 | 365 |
3 | 100 | 200 |
11 | 750 | 360 |
2 | 80 | 200 |
8 | 300 | 300 |
0 | 60 | 180 |
9 | 400 | 350 |
4 | 120 | 200 |
1 | 70 | 180 |
10 | 500 | 350 |
6 | 200 | 240 |
5 | 150 | 220 |
14 | 3000 | 365 |
# alldata = np.matrix(alldata[['sqrMetres', 'price']])
data_train_p2 = alldata_p2[0:12]
data_test_p2 = alldata_p2[12:]
data_train_p2 = np.matrix(data_train_p2).astype(float)
data_test_p2 = np.matrix(data_test_p2).astype(float)
cost_fun_slices = []
for n in range(1, 4):
plot_and_mse(data_train_p2, data_test_p2, n)
cost_data = cost_functions.get(n)
cost_x = [line[1] for line in cost_data[:250]]
cost_y = [line[0] for line in cost_data[:250]]
cost_fun_slices.append((cost_x, cost_y))
Wielomian 1 stopnia, MSE = 13592.268823271697 Wielomian 2 stopnia, MSE = 9030.04916509696 Wielomian 3 stopnia, MSE = 9795.339194991013
#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
cost_x, cost_y = fig
fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")
#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
cost_x, cost_y = fig
fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")
# Ilość nauki do oceny
data_marks_all = pandas.read_csv('Student_Marks.csv')
data_marks_all
number_courses | time_study | Marks | |
---|---|---|---|
0 | 3 | 4.508 | 19.202 |
1 | 4 | 0.096 | 7.734 |
2 | 4 | 3.133 | 13.811 |
3 | 6 | 7.909 | 53.018 |
4 | 8 | 7.811 | 55.299 |
... | ... | ... | ... |
95 | 6 | 3.561 | 19.128 |
96 | 3 | 0.301 | 5.609 |
97 | 4 | 7.163 | 41.444 |
98 | 7 | 0.309 | 12.027 |
99 | 3 | 6.335 | 32.357 |
100 rows × 3 columns
data_marks_all = data_marks_all[['time_study', 'Marks']]
# data_marks_all = data_marks_all.sample(frac=1)
data_marks_train = data_marks_all[0:70]
data_marks_test = data_marks_all[70:]
data_marks_train = np.matrix(data_marks_train).astype(float)
data_marks_test = np.matrix(data_marks_test).astype(float)
cost_fun_slices = []
for n in range(1, 4):
plot_and_mse(data_marks_train, data_marks_test, n)
cost_data = cost_functions.get(n)
cost_x = [line[1] for line in cost_data[:250]]
cost_y = [line[0] for line in cost_data[:250]]
cost_fun_slices.append((cost_x, cost_y))
Wielomian 1 stopnia, MSE = 381.16937283505433 Wielomian 2 stopnia, MSE = 394.1863119057109 Wielomian 3 stopnia, MSE = 391.5017110730558
#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
cost_x, cost_y = fig
fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")
data_ins = pandas.read_csv('insurance.csv')
data_ins = data_ins.sample(frac=1)
data_ins
age | sex | bmi | children | smoker | region | charges | |
---|---|---|---|---|---|---|---|
27 | 55 | female | 32.775 | 2 | no | northwest | 12268.63225 |
412 | 26 | female | 17.195 | 2 | yes | northeast | 14455.64405 |
547 | 54 | female | 46.700 | 2 | no | southwest | 11538.42100 |
983 | 27 | female | 30.590 | 1 | no | northeast | 16796.41194 |
661 | 57 | female | 23.980 | 1 | no | southeast | 22192.43711 |
... | ... | ... | ... | ... | ... | ... | ... |
827 | 36 | male | 28.025 | 1 | yes | northeast | 20773.62775 |
212 | 24 | male | 28.500 | 2 | no | northwest | 3537.70300 |
940 | 18 | male | 23.210 | 0 | no | southeast | 1121.87390 |
476 | 24 | male | 28.500 | 0 | yes | northeast | 35147.52848 |
1027 | 23 | male | 18.715 | 0 | no | northwest | 21595.38229 |
1338 rows × 7 columns
data_ins = data_ins[['age', 'charges']]
data_ins_train = data_ins[0:1200]
data_ins_test = data_ins[1200:]
data_ins_train = np.matrix(data_ins_train).astype(float)
data_ins_test = np.matrix(data_ins_test).astype(float)
cost_fun_slices = []
for n in range(1, 4):
plot_and_mse(data_ins_train, data_ins_test, n)
cost_data = cost_functions.get(n)
cost_x = [line[1] for line in cost_data[:250]]
cost_y = [line[0] for line in cost_data[:250]]
cost_fun_slices.append((cost_x, cost_y))
Wielomian 1 stopnia, MSE = 178360681.2241408 Wielomian 2 stopnia, MSE = 179170879.99336478
[0;31m---------------------------------------------------------------------------[0m [0;31mKeyboardInterrupt[0m Traceback (most recent call last) [0;32m/tmp/ipykernel_5896/1100858333.py[0m in [0;36m<module>[0;34m[0m [1;32m 2[0m [0;34m[0m[0m [1;32m 3[0m [0;32mfor[0m [0mn[0m [0;32min[0m [0mrange[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0;36m4[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m----> 4[0;31m [0mplot_and_mse[0m[0;34m([0m[0mdata_ins_train[0m[0;34m,[0m [0mdata_ins_test[0m[0;34m,[0m [0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 5[0m [0;34m[0m[0m [1;32m 6[0m [0mcost_data[0m [0;34m=[0m [0mcost_functions[0m[0;34m.[0m[0mget[0m[0;34m([0m[0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m/tmp/ipykernel_5896/3773219878.py[0m in [0;36mplot_and_mse[0;34m(data, data_test, n)[0m [1;32m 10[0m [0;34m"""Wykres wraz z MSE"""[0m[0;34m[0m[0;34m[0m[0m [1;32m 11[0m [0mx[0m[0;34m,[0m [0my[0m [0;34m=[0m [0mget_poly_data[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0marray[0m[0;34m([0m[0mdata[0m[0;34m)[0m[0;34m,[0m [0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m---> 12[0;31m [0mmodel[0m[0;34m,[0m [0mlogs[0m [0;34m=[0m [0mpolynomial_regression[0m[0;34m([0m[0mx[0m[0;34m,[0m [0my[0m[0;34m,[0m [0mn[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 13[0m [0mcost_function[0m [0;34m=[0m [0;34m[[0m[0;34m[[0m[0melement[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mi[0m[0;34m][0m [0;32mfor[0m [0mi[0m[0;34m,[0m [0melement[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mlogs[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [1;32m 14[0m [0mcost_functions[0m[0;34m[[0m[0mn[0m[0;34m][0m [0;34m=[0m [0mcost_function[0m[0;34m[0m[0;34m[0m[0m [0;32m/tmp/ipykernel_5896/1168100012.py[0m in [0;36mpolynomial_regression[0;34m(X, y, n)[0m [1;32m 30[0m [0;34m"""Funkcja regresji wielomianowej"""[0m[0;34m[0m[0;34m[0m[0m [1;32m 31[0m [0mtheta_start[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0mmatrix[0m[0;34m([0m[0;34m[[0m[0;36m0[0m[0;34m][0m [0;34m*[0m [0;34m([0m[0mn[0m[0;34m+[0m[0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mreshape[0m[0;34m([0m[0mn[0m[0;34m+[0m[0;36m1[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m---> 32[0;31m [0mtheta[0m[0;34m,[0m [0mlogs[0m [0;34m=[0m [0mgradient_descent[0m[0;34m([0m[0mcost[0m[0;34m,[0m [0mgradient[0m[0;34m,[0m [0mtheta_start[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 33[0m [0;32mreturn[0m [0;32mlambda[0m [0mx[0m[0;34m:[0m [0mh_poly[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mx[0m[0;34m)[0m[0;34m,[0m [0mlogs[0m[0;34m[0m[0;34m[0m[0m [0;32m/tmp/ipykernel_5896/1641407440.py[0m in [0;36mgradient_descent[0;34m(fJ, fdJ, theta, X, y, alpha, eps)[0m [1;32m 17[0m [0mlogs[0m [0;34m=[0m [0;34m[[0m[0;34m[[0m[0mcurrent_cost[0m[0;34m,[0m [0mtheta[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [1;32m 18[0m [0;32mwhile[0m [0;32mTrue[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m---> 19[0;31m [0mtheta[0m [0;34m=[0m [0mtheta[0m [0;34m-[0m [0malpha[0m [0;34m*[0m [0mfdJ[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 20[0m [0mcurrent_cost[0m[0;34m,[0m [0mprev_cost[0m [0;34m=[0m [0mfJ[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m,[0m [0mcurrent_cost[0m[0;34m[0m[0;34m[0m[0m [1;32m 21[0m [0;31m# print(current_cost)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [0;32m/tmp/ipykernel_5896/1641407440.py[0m in [0;36mgradient[0;34m(theta, X, y)[0m [1;32m 10[0m [0;32mdef[0m [0mgradient[0m[0;34m([0m[0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [1;32m 11[0m [0;34m"""Wersja macierzowa gradientu funkcji kosztu"""[0m[0;34m[0m[0;34m[0m[0m [0;32m---> 12[0;31m [0;32mreturn[0m [0;36m1.0[0m [0;34m/[0m [0mlen[0m[0;34m([0m[0my[0m[0;34m)[0m [0;34m*[0m [0;34m([0m[0mX[0m[0;34m.[0m[0mT[0m [0;34m*[0m [0;34m([0m[0mX[0m [0;34m*[0m [0mtheta[0m [0;34m-[0m [0my[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 13[0m [0;34m[0m[0m [1;32m 14[0m [0;32mdef[0m [0mgradient_descent[0m[0;34m([0m[0mfJ[0m[0;34m,[0m [0mfdJ[0m[0;34m,[0m [0mtheta[0m[0;34m,[0m [0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0malpha[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m [0meps[0m[0;34m=[0m[0;36m10[0m[0;34m**[0m[0;34m-[0m[0;36m7[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m~/.local/lib/python3.8/site-packages/numpy/matrixlib/defmatrix.py[0m in [0;36m__array_finalize__[0;34m(self, obj)[0m [1;32m 165[0m [0;32mreturn[0m [0mret[0m[0;34m[0m[0;34m[0m[0m [1;32m 166[0m [0;34m[0m[0m [0;32m--> 167[0;31m [0;32mdef[0m [0m__array_finalize__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mobj[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 168[0m [0mself[0m[0;34m.[0m[0m_getitem[0m [0;34m=[0m [0;32mFalse[0m[0;34m[0m[0;34m[0m[0m [1;32m 169[0m [0;32mif[0m [0;34m([0m[0misinstance[0m[0;34m([0m[0mobj[0m[0;34m,[0m [0mmatrix[0m[0;34m)[0m [0;32mand[0m [0mobj[0m[0;34m.[0m[0m_getitem[0m[0;34m)[0m[0;34m:[0m [0;32mreturn[0m[0;34m[0m[0;34m[0m[0m [0;31mKeyboardInterrupt[0m:
#WYKRESY FUNKCJI KOSZTU
for fig in cost_fun_slices:
cost_x, cost_y = fig
fig = plot_data_cost(cost_x, cost_y, "Iteration", "Cost function value")