Regresja wielomianowa

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas

%matplotlib inline

# Przydatne funkcje

def cost(theta, X, y):
    """Wersja macierzowa funkcji kosztu"""
    m = len(y)
    J = 1.0 / (2.0 * m) * ((X * theta - y).T * (X * theta - y))
    return J.item()

def gradient(theta, X, y):
    """Wersja macierzowa gradientu funkcji kosztu"""
    return 1.0 / len(y) * (X.T * (X * theta - y)) 

def gradient_descent(fJ, fdJ, theta, X, y, alpha=0.1, eps=10**-7):
    """Algorytm gradientu prostego (wersja macierzowa)"""
    current_cost = fJ(theta, X, y)
    logs = [[current_cost, theta]]
    while True:
        theta = theta - alpha * fdJ(theta, X, y)
        current_cost, prev_cost = fJ(theta, X, y), current_cost
        if abs(prev_cost - current_cost) > 10**15:
            print('Algorithm does not converge!')
            break
        if abs(prev_cost - current_cost) <= eps:
            break
        logs.append([current_cost, theta]) 
    return theta, logs

def plot_data(X, y, xlabel, ylabel):
    """Wykres danych (wersja macierzowa)"""
    fig = plt.figure(figsize=(16*.6, 9*.6))
    ax = fig.add_subplot(111)
    fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)
    ax.scatter([X[:, 1]], [y], c='r', s=50, label='Dane')
    
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.margins(.05, .05)
    plt.ylim(y.min() - 1, y.max() + 1)
    plt.xlim(np.min(X[:, 1]) - 1, np.max(X[:, 1]) + 1)
    return fig

def plot_fun(fig, fun, X):
    """Wykres funkcji `fun`"""
    ax = fig.axes[0]
    x0 = np.min(X[:, 1]) - 1.0
    x1 = np.max(X[:, 1]) + 1.0
    Arg = np.arange(x0, x1, 0.1)
    Val = fun(Arg)
    return ax.plot(Arg, Val, linewidth='2')

def MSE(Y_true, Y_pred):
    return np.square(np.subtract(Y_true,Y_pred)).mean()

# Funkcja regresji wielomianowej

def h_poly(Theta, x):
    """Funkcja wielomianowa"""
    return sum(theta * np.power(x, i) for i, theta in enumerate(Theta.tolist()))

def get_poly_data(data, deg):
    m, n_plus_1 = data.shape
    n = n_plus_1 - 1

    X1 = data[:, 0:n]
    X1 /= np.amax(X1, axis=0)

    Xs = [np.ones((m, 1)), X1]

    for i in range(2, deg+1):
        Xn = np.power(X1, i)
        Xn /= np.amax(Xn, axis=0)
        Xs.append(Xn)

    X = np.matrix(np.concatenate(Xs, axis=1)).reshape(m, deg * n + 1)

    y = np.matrix(data[:, -1]).reshape(m, 1)

    return X, y


def polynomial_regression(X, y, n):
    """Funkcja regresji wielomianowej"""
    theta_start = np.matrix([0] * (n+1)).reshape(n+1, 1)
    theta, logs = gradient_descent(cost, gradient, theta_start, X, y)
    return lambda x: h_poly(theta, x)

def predict_values(model, data, n):
    x, y = get_poly_data(np.array(data), n)
    preprocessed_x = []
    for i in x:
        preprocessed_x.append(i.item(1))
    return y, model(preprocessed_x), MSE(y, model(preprocessed_x))

def plot_and_mse(data, data_test, n):
    x, y = get_poly_data(np.array(data), n)
    model = polynomial_regression(x, y, n)
    
    fig = plot_data(x, y, xlabel='x', ylabel='y')
    plot_fun(fig, polynomial_regression(x, y, n), x)

    y_true, Y_pred, mse = predict_values(model, data_test, n)
    print(f'Wielomian {n} stopnia, MSE = {mse}')

# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas

alldata = pandas.read_csv('data_flats.tsv', header=0, sep='\t',
                          usecols=['price', 'rooms', 'sqrMetres'])
alldata = alldata[['sqrMetres', 'price']]
alldata = alldata.sample(frac=1)
alldata

	sqrMetres	price
49	37	338000.00
1171	90	855000.00
368	16	399000.00
1206	58	359602.00
1500	20	424977.14
...	...	...
50	78	420000.00
396	52	275000.00
1367	55	192750.00
771	62	558745.00
337	55	246330.00

1674 rows × 2 columns

# alldata = np.matrix(alldata[['sqrMetres', 'price']])
data_train = alldata[0:1600]
data_test = alldata[1600:]
data_train = np.matrix(data_train).astype(float)
data_test = np.matrix(data_test).astype(float)

for n in range(1, 4):
    plot_and_mse(data_train, data_test, n)

Wielomian 1 stopnia, MSE = 41910519165.43458
Wielomian 2 stopnia, MSE = 60658890503.01548
Wielomian 3 stopnia, MSE = 63228721451.021095

# Ilość nauki do oceny

data_marks_all = pandas.read_csv('Student_Marks.csv')
data_marks_all

	number_courses	time_study	Marks
0	3	4.508	19.202
1	4	0.096	7.734
2	4	3.133	13.811
3	6	7.909	53.018
4	8	7.811	55.299
...	...	...	...
95	6	3.561	19.128
96	3	0.301	5.609
97	4	7.163	41.444
98	7	0.309	12.027
99	3	6.335	32.357

100 rows × 3 columns

data_marks_all = data_marks_all[['time_study', 'Marks']]
# data_marks_all = data_marks_all.sample(frac=1)
data_marks_train = data_marks_all[0:70]
data_marks_test = data_marks_all[70:]
data_marks_train = np.matrix(data_marks_train).astype(float)
data_marks_test = np.matrix(data_marks_test).astype(float)

for n in range(1, 4):
    plot_and_mse(data_marks, data_marks_test, n)

Wielomian 1 stopnia, MSE = 383.05506630630464
Wielomian 2 stopnia, MSE = 394.48126522686164
Wielomian 3 stopnia, MSE = 392.8631214454169

data_ins = pandas.read_csv('insurance.csv')
data_ins = data_ins.sample(frac=1)
data_ins

	age	sex	bmi	children	smoker	region	charges
238	19	male	29.070	0	yes	northwest	17352.68030
809	25	male	25.840	1	no	northeast	3309.79260
1053	47	male	29.800	3	yes	southwest	25309.48900
1177	40	female	27.400	1	no	southwest	6496.88600
964	52	male	36.765	2	no	northwest	26467.09737
...	...	...	...	...	...	...	...
374	20	male	33.330	0	no	southeast	1391.52870
950	57	male	18.335	0	no	northeast	11534.87265
954	34	male	27.835	1	yes	northwest	20009.63365
521	32	female	44.220	0	no	southeast	3994.17780
963	46	male	24.795	3	no	northeast	9500.57305

1338 rows × 7 columns

data_ins = data_ins[['age', 'charges']]
data_ins_train = data_ins[0:1200]
data_ins_test = data_ins[1200:]
data_ins_train = np.matrix(data_ins_train).astype(float)
data_ins_test = np.matrix(data_ins_test).astype(float)

for n in range(1, 4):
    plot_and_mse(data_ins_train, data_ins_test, n)

Wielomian 1 stopnia, MSE = 146688971.1828306
Wielomian 2 stopnia, MSE = 146881616.236919
Wielomian 3 stopnia, MSE = 146891792.9142127

268 KiB Raw Blame History Unescape Escape

Regresja wielomianowa

268 KiB

Raw Blame History