Algorytm najszybszego spadku dla regresji wielomianowej.

Skład grupy:

Nowak Anna,
Łaźna Patrycja,
Bregier Damian

0. Podstawowe informacje o zbiorze danych

Ze względu na specyfikę wytycznych projektu zakładającego wykorzystanie jedynie dwóch cech: x i y oraz modelowanie zależności y od x za pomocą funkcji wielomianowej dobrany został przez nas odpowiedni dataset. Obejmuje on jedynie trzy kolumny, czyli: płeć, wzrost w calach oraz wagę w funtach, z czego ze względu na specyfikę projektu wykorzystywane są jedynie wzrost oraz waga. Każdy z parametrów zawiera po 10 tysięcy unikalnych wartości.

Dokładne informacje na temat tego zbioru, jego zawartości i samych danych można znaleźć pod adresem: https://www.kaggle.com/mustafaali96/weight-height.

import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets as widgets
np.set_printoptions(suppress=True)

1. Wczytywanie i preprocessing danych

# Stopień wielomianu
degree = 4
X_plot = np.linspace(0, 100, 1000)
initial_theta = np.matrix([0] * (degree + 1)).reshape(degree + 1, 1)

# Wybór dwóch kolumn - dotyczących wzrostu i wagi
data = pd.read_csv('weight-height.csv')[["Height", "Weight"]]
# Czyszczenie tabeli i wartości pustych
data = data.dropna()
data_matrix = np.matrix(data)

m, n_plus_1 = data_matrix.shape
n = n_plus_1 - 1
X = (np.ones((m, 1)))

for i in range(1, degree + 1):
    Xn = np.power(data_matrix[:, 0:n], i)
    Xn /= np.amax(Xn, axis=0)
    X = np.concatenate((X, Xn), axis=1)

X = np.matrix(X).reshape(m, degree * n + 1)
Y = np.matrix(data_matrix[:, -1])

2. Metody regresji wielomianowej

def print_theta(theta):
    print("f(x) = ", end="")
    for i,x in enumerate(theta.tolist()[:-1]):
        x = x[0]
        print(f"{x}x^{i}", end=" + ")
    print(f"{theta.tolist()[-1][0]}^{len(theta) - 1}")

# Implementacja wzrosu na regresję wielomianową
def polynomial_regression(theta, x):
    x = x/data["Height"].max()
    return sum(theta * np.power(x, i) for i, theta in enumerate(theta.tolist()))

# Implementacja wzoru na RMSE, czyli pierwiastek z błędu średniokwadratowego
def mean_squared_error(theta, X, Y):
    J = 1.0 / (2.0 * m) * ((X * theta - Y).T * (X * theta - Y))
    return J.item()

3. Wybrane rodzaje gradientów

Metoda gradientu prostego to algorytm numeryczny mający na celu znalezienie minimum lokalnego dla zadanej funkcji celu. Jak sama nazwa wskazuje jest to jedna z prostszych metod, która opiera się na iteracyjnym poszukiwaniu minimum. Proces ten (dla pierwszych czterech kroków), został pokazany na poniższym rysunku.

# Wzór na gradient prosty
def gradient(theta, X, Y):
    return 1.0 / len(Y) * (X.T * (X * theta - Y))

# Batch gradient descent (BGD)
def BGD(X, Y, theta, cost_function = mean_squared_error, learning_rate=0.1, eps=10**-5, max_steps = 10000000000):
    cost = cost_function(theta, X, Y)
    logs = [[cost, theta]]
    
    for i in range(max_steps):
        theta = theta - learning_rate * gradient(theta, X, Y)
        next_cost = cost_function(theta, X, Y)
        logs.append([next_cost, theta])
        if abs(cost - next_cost) <= eps:
            break
        cost = next_cost
    return theta, logs

# BGD z momentum
def momentum(X, Y, theta, cost_function = mean_squared_error, learning_rate=0.1, momentum = 0.9, epochs=30):
    cost = cost_function(theta, X, Y)
    logs = [[cost, theta]]
    delta_history = [np.matrix([0]*(degree+1)).T]
    for i in range(epochs):
        delta_history.append(momentum * delta_history[-1] + learning_rate * gradient(theta, X, Y))
        theta = theta - delta_history[-1]
        next_cost = cost_function(theta, X, Y)
        logs.append([next_cost, theta])
        cost = next_cost
    return theta, logs

# Mini-batch gradient descent (MBGD)
def MBGD(X, Y, theta, cost_function = mean_squared_error, learning_rate=0.1, epochs=5, batch_size=16):
    cost = cost_function(theta, X, Y)
    logs = [[cost, theta]]
    start, end = 0, batch_size
    
    steps = m / batch_size
    for i in range(epochs):
        zipped_XY = list(zip(X, Y))
        random.shuffle(zipped_XY)
        X_shuffled, Y_shuffled = zip(*zipped_XY)
        X_shuffled = np.concatenate(X_shuffled, axis=0) 
        Y_shuffled = np.concatenate(Y_shuffled, axis=0) 
        for j in range(int(steps)):
            batch =  X_shuffled[start:end,:], Y_shuffled[start:end,:]
            theta = theta - learning_rate * gradient(theta, batch[0], batch[1])
            cost = cost_function(theta, X, Y)
            logs.append([cost, theta])

            if start + batch_size < batch_size:
                start += batch_size
            else:
                start = 0
            end = min(start + batch_size, m)
    return theta, logs

# Stochastic gradient descent (SGD)
def SGD(X, Y, theta, cost_function = mean_squared_error, learning_rate=0.1, epochs=5):
    return MBGD(X, Y, theta, cost_function, learning_rate, epochs, 1)

# Steepest descent
def steepest_descent(X, Y, theta, cost_function = mean_squared_error, single_step=0.01, epochs=30):
    cost = cost_function(theta, X, Y)
    logs = [[cost, theta]]
    for i in range(epochs):
        direction = gradient(theta, X, Y)
        j = 0
        while(True):
            j+=1
            next_theta = theta - single_step * direction
            next_cost = cost_function(next_theta, X, Y)
            if(next_cost >= cost or type(next_cost) != float):
                break
            cost = next_cost
            theta = next_theta
        logs.append([next_cost, theta])
    return theta, logs

4. Zestawienie wyników

final_theta_BGD, logs_1 = BGD(X, Y, initial_theta)
print("BGD:")
print_theta(final_theta_BGD)
print(f"{len(logs_1)} updates\n")

final_theta_MBGD, logs_2 = MBGD(X, Y, initial_theta, epochs = 60)
print("MBGD:")
print_theta(final_theta_MBGD)
print(f"{len(logs_2)} updates\n")

final_theta_SGD, logs_3 = SGD(X, Y, initial_theta, epochs = 60)
print("SGD:")
print_theta(final_theta_SGD)
print(f"{len(logs_3)} updates\n")

final_theta_momentum, logs_4 = momentum(X, Y, initial_theta, epochs = 60)
print("momentum:")
print_theta(final_theta_momentum)
print(f"{len(logs_4)} updates\n")

final_steepest_descent, logs_5 = steepest_descent(X, Y, initial_theta, epochs = 60)
print("steepest descent:")
print_theta(final_steepest_descent)
print(f"{len(logs_5)} updates\n")

BGD:
f(x) = -65.64506277392503x^0 + 68.77648241255996x^1 + 114.75349304333463x^2 + 102.20675310280693x^3 + 52.80014652046723^4
82705 updates

MBGD:
f(x) = -20.49538398624665x^0 + 43.83203531549893x^1 + 74.9580517022499x^2 + 83.08157339221718x^3 + 75.75845309811228^4
37501 updates

SGD:
f(x) = 38.980419583838696x^0 + 42.24440889357842x^1 + 43.5350614491982x^2 + 43.437875831269686x^3 + 42.39005292817251^4
600001 updates

momentum:
f(x) = 8.015549563834483x^0 + 38.129978604712036x^1 + 58.338707783566186x^2 + 71.220198958028x^3 + 78.7251186900262^4
61 updates

steepest descent:
f(x) = 13.638648831593645x^0 + 39.092493619820296x^1 + 55.99541645170152x^2 + 66.58927282539155x^3 + 72.5680972400324^4
61 updates

Metoda gradientu prostego

4. Reprezentacja graficzna regresji wielomianowej i funkcji kosztu

def plot_polynomial_regression(theta):
    fig = plt.figure(figsize=(10,5))
    Y_plot = polynomial_regression(theta, X_plot).tolist()
    chart = fig.add_subplot()
    chart.plot(data["Height"], Y ,"go")
    chart.plot(X_plot, Y_plot, color="red", lw=2, label=f"degree {len(theta)}")
    plt.ylim([0,250])
    plt.show()
    

def plot_cost_function(logs):
    no_iter = no_iter = [x for x in range(1, len(logs) + 1)]
    all_cost = [row[0] for row in logs]
    
    fig = plt.figure(figsize=(10,5))  
    plt.plot(no_iter, all_cost)
    plt.xlabel('No. of iterations')
    plt.ylabel('Cost')
    plt.show()
    

print("BGD:")
plot_polynomial_regression(final_theta_BGD)
plot_cost_function(logs_1)
print("MBGD:")
plot_polynomial_regression(final_theta_MBGD)
plot_cost_function(logs_2)
print("SGD:")
plot_polynomial_regression(final_theta_SGD)
plot_cost_function(logs_3)
print("Momentum:")
plot_polynomial_regression(final_theta_momentum)
plot_cost_function(logs_4)
print("Metoda najszybszego spadku:")
plot_polynomial_regression(final_steepest_descent)
plot_cost_function(logs_5)

BGD:

MBGD:

SGD:

Momentum:

Metoda najszybszego spadku:

5. Regresja wielomianowa z wykorzystaniem gotowych bibliotek

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, LinearRegression

model = make_pipeline(PolynomialFeatures(degree=degree, include_bias=True), 
                       LinearRegression())
model.fit(data[["Height"]],Y)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=4)),
                ('linearregression', LinearRegression())])

Y_plot_2 = model.predict([[x] for x in X_plot])

fig = plt.figure(figsize=(10,5))
chart = fig.add_subplot()
chart.plot(data["Height"], Y ,"go")
chart.plot(X_plot, Y_plot_2, color="red", lw=2, label=f"degree {degree}")
plt.ylim([0,250])
plt.xlim([0,100])
degree

polyfit_theta = np.polyfit(data["Height"]/data["Height"].max(), Y, degree)
plot_polynomial_regression(polyfit_theta)

362 KiB Raw Blame History