diff --git a/.ipynb_checkpoints/05_Regresja_wielomianowa-checkpoint.ipynb b/.ipynb_checkpoints/05_Regresja_wielomianowa-checkpoint.ipynb
new file mode 100644
index 0000000..6eee57e
--- /dev/null
+++ b/.ipynb_checkpoints/05_Regresja_wielomianowa-checkpoint.ipynb
@@ -0,0 +1,1053 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Regresja wielomianowa"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "notes"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import ipywidgets as widgets\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas\n",
+ "\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "notes"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Przydatne funkcje\n",
+ "\n",
+ "def cost(theta, X, y):\n",
+ " \"\"\"Wersja macierzowa funkcji kosztu\"\"\"\n",
+ " m = len(y)\n",
+ " J = 1.0 / (2.0 * m) * ((X * theta - y).T * (X * theta - y))\n",
+ " return J.item()\n",
+ "\n",
+ "def gradient(theta, X, y):\n",
+ " \"\"\"Wersja macierzowa gradientu funkcji kosztu\"\"\"\n",
+ " return 1.0 / len(y) * (X.T * (X * theta - y)) \n",
+ "\n",
+ "def gradient_descent(fJ, fdJ, theta, X, y, alpha=0.1, eps=10**-7):\n",
+ " \"\"\"Algorytm gradientu prostego (wersja macierzowa)\"\"\"\n",
+ " current_cost = fJ(theta, X, y)\n",
+ " logs = [[current_cost, theta]]\n",
+ " while True:\n",
+ " theta = theta - alpha * fdJ(theta, X, y)\n",
+ " current_cost, prev_cost = fJ(theta, X, y), current_cost\n",
+ " if abs(prev_cost - current_cost) > 10**15:\n",
+ " print('Algorithm does not converge!')\n",
+ " break\n",
+ " if abs(prev_cost - current_cost) <= eps:\n",
+ " break\n",
+ " logs.append([current_cost, theta]) \n",
+ " return theta, logs\n",
+ "\n",
+ "def plot_data(X, y, xlabel, ylabel):\n",
+ " \"\"\"Wykres danych (wersja macierzowa)\"\"\"\n",
+ " fig = plt.figure(figsize=(16*.6, 9*.6))\n",
+ " ax = fig.add_subplot(111)\n",
+ " fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)\n",
+ " ax.scatter([X[:, 1]], [y], c='r', s=50, label='Dane')\n",
+ " \n",
+ " ax.set_xlabel(xlabel)\n",
+ " ax.set_ylabel(ylabel)\n",
+ " ax.margins(.05, .05)\n",
+ " plt.ylim(y.min() - 1, y.max() + 1)\n",
+ " plt.xlim(np.min(X[:, 1]) - 1, np.max(X[:, 1]) + 1)\n",
+ " return fig\n",
+ "\n",
+ "def plot_fun(fig, fun, X):\n",
+ " \"\"\"Wykres funkcji `fun`\"\"\"\n",
+ " ax = fig.axes[0]\n",
+ " x0 = np.min(X[:, 1]) - 1.0\n",
+ " x1 = np.max(X[:, 1]) + 1.0\n",
+ " Arg = np.arange(x0, x1, 0.1)\n",
+ " Val = fun(Arg)\n",
+ " return ax.plot(Arg, Val, linewidth='2')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def MSE(Y_true, Y_pred):\n",
+ " return np.square(np.subtract(Y_true,Y_pred)).mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "fragment"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Funkcja regresji wielomianowej\n",
+ "\n",
+ "def h_poly(Theta, x):\n",
+ " \"\"\"Funkcja wielomianowa\"\"\"\n",
+ " return sum(theta * np.power(x, i) for i, theta in enumerate(Theta.tolist()))\n",
+ "\n",
+ "def get_poly_data(data, deg):\n",
+ " m, n_plus_1 = data.shape\n",
+ " n = n_plus_1 - 1\n",
+ "\n",
+ " X1 = data[:, 0:n]\n",
+ " X1 /= np.amax(X1, axis=0)\n",
+ "\n",
+ " Xs = [np.ones((m, 1)), X1]\n",
+ "\n",
+ " for i in range(2, deg+1):\n",
+ " Xn = np.power(X1, i)\n",
+ " Xn /= np.amax(Xn, axis=0)\n",
+ " Xs.append(Xn)\n",
+ "\n",
+ " X = np.matrix(np.concatenate(Xs, axis=1)).reshape(m, deg * n + 1)\n",
+ "\n",
+ " y = np.matrix(data[:, -1]).reshape(m, 1)\n",
+ "\n",
+ " return X, y\n",
+ "\n",
+ "\n",
+ "def polynomial_regression(X, y, n):\n",
+ " \"\"\"Funkcja regresji wielomianowej\"\"\"\n",
+ " theta_start = np.matrix([0] * (n+1)).reshape(n+1, 1)\n",
+ " theta, logs = gradient_descent(cost, gradient, theta_start, X, y)\n",
+ " return lambda x: h_poly(theta, x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict_values(model, data, n):\n",
+ " x, y = get_poly_data(np.array(data), n)\n",
+ " preprocessed_x = []\n",
+ " for i in x:\n",
+ " preprocessed_x.append(i.item(1))\n",
+ " return y, model(preprocessed_x), MSE(y, model(preprocessed_x))\n",
+ "\n",
+ "def plot_and_mse(data, data_test, n):\n",
+ " x, y = get_poly_data(np.array(data), n)\n",
+ " model = polynomial_regression(x, y, n)\n",
+ " \n",
+ " fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ " plot_fun(fig, polynomial_regression(x, y, n), x)\n",
+ "\n",
+ " y_true, Y_pred, mse = predict_values(model, data_test, n)\n",
+ " print(f'Wielomian {n} stopnia, MSE = {mse}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "notes"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sqrMetres | \n",
+ " price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 78 | \n",
+ " 476118.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 62 | \n",
+ " 459531.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15 | \n",
+ " 411557.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 14 | \n",
+ " 496416.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 15 | \n",
+ " 406032.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1669 | \n",
+ " 51 | \n",
+ " 299000.0 | \n",
+ "
\n",
+ " \n",
+ " 1670 | \n",
+ " 53 | \n",
+ " 339000.0 | \n",
+ "
\n",
+ " \n",
+ " 1671 | \n",
+ " 65 | \n",
+ " 320000.0 | \n",
+ "
\n",
+ " \n",
+ " 1672 | \n",
+ " 67 | \n",
+ " 364000.0 | \n",
+ "
\n",
+ " \n",
+ " 1673 | \n",
+ " 50 | \n",
+ " 209000.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1674 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sqrMetres price\n",
+ "0 78 476118.0\n",
+ "1 62 459531.0\n",
+ "2 15 411557.0\n",
+ "3 14 496416.0\n",
+ "4 15 406032.0\n",
+ "... ... ...\n",
+ "1669 51 299000.0\n",
+ "1670 53 339000.0\n",
+ "1671 65 320000.0\n",
+ "1672 67 364000.0\n",
+ "1673 50 209000.0\n",
+ "\n",
+ "[1674 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas\n",
+ "\n",
+ "alldata = pandas.read_csv('data_flats.tsv', header=0, sep='\\t',\n",
+ " usecols=['price', 'rooms', 'sqrMetres'])\n",
+ "alldata = alldata[['sqrMetres', 'price']]\n",
+ "alldata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# alldata = np.matrix(alldata[['sqrMetres', 'price']])\n",
+ "data_train = alldata[0:1600]\n",
+ "data_test = alldata[1600:]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[3.97959184e-01 4.76118000e+05]\n",
+ " [3.16326531e-01 4.59531000e+05]\n",
+ " [7.65306122e-02 4.11557000e+05]\n",
+ " ...\n",
+ " [3.31632653e-01 3.20000000e+05]\n",
+ " [3.41836735e-01 3.64000000e+05]\n",
+ " [2.55102041e-01 2.09000000e+05]]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plot_and_mse(data, data_marks_test, 1) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[3.97959184e-01 4.76118000e+05]\n",
+ " [3.16326531e-01 4.59531000e+05]\n",
+ " [7.65306122e-02 4.11557000e+05]\n",
+ " ...\n",
+ " [3.31632653e-01 3.20000000e+05]\n",
+ " [3.41836735e-01 3.64000000e+05]\n",
+ " [2.55102041e-01 2.09000000e+05]]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "n = 2\n",
+ "x, y = get_poly_data(data, n)\n",
+ "print(data)\n",
+ "fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ "plot_fun(fig, polynomial_regression(x, y, n), x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Ilość nauki do oceny"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " number_courses | \n",
+ " time_study | \n",
+ " Marks | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 4.508 | \n",
+ " 19.202 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 0.096 | \n",
+ " 7.734 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 3.133 | \n",
+ " 13.811 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 7.909 | \n",
+ " 53.018 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8 | \n",
+ " 7.811 | \n",
+ " 55.299 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 95 | \n",
+ " 6 | \n",
+ " 3.561 | \n",
+ " 19.128 | \n",
+ "
\n",
+ " \n",
+ " 96 | \n",
+ " 3 | \n",
+ " 0.301 | \n",
+ " 5.609 | \n",
+ "
\n",
+ " \n",
+ " 97 | \n",
+ " 4 | \n",
+ " 7.163 | \n",
+ " 41.444 | \n",
+ "
\n",
+ " \n",
+ " 98 | \n",
+ " 7 | \n",
+ " 0.309 | \n",
+ " 12.027 | \n",
+ "
\n",
+ " \n",
+ " 99 | \n",
+ " 3 | \n",
+ " 6.335 | \n",
+ " 32.357 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " number_courses time_study Marks\n",
+ "0 3 4.508 19.202\n",
+ "1 4 0.096 7.734\n",
+ "2 4 3.133 13.811\n",
+ "3 6 7.909 53.018\n",
+ "4 8 7.811 55.299\n",
+ ".. ... ... ...\n",
+ "95 6 3.561 19.128\n",
+ "96 3 0.301 5.609\n",
+ "97 4 7.163 41.444\n",
+ "98 7 0.309 12.027\n",
+ "99 3 6.335 32.357\n",
+ "\n",
+ "[100 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_marks_all = pandas.read_csv('archive(1)/Student_Marks.csv')\n",
+ "data_marks_all"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "data_marks_all = data_marks_all[['time_study', 'Marks']]\n",
+ "data_marks_all = data_marks_all.sample(frac=1)\n",
+ "data_marks = data_marks_all[0:70]\n",
+ "data_marks_test = data_marks_all[70:]\n",
+ "data_marks = np.matrix(data_marks).astype(float)\n",
+ "n = 1 # Wielomian pierwszego stopnia\n",
+ "\n",
+ "x, y = get_poly_data(np.array(data_marks), n)\n",
+ "fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ "plot_fun(fig, polynomial_regression(x, y, n), x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "n = 2 # Wielomian drugiego stopnia\n",
+ "\n",
+ "x, y = get_poly_data(np.array(data_marks), n)\n",
+ "fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ "plot_fun(fig, polynomial_regression(x, y, n), x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "40.024\n",
+ "[[1. 0.80130703 0.64209295]]\n",
+ "[0.8013070252607767]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([37.16059534])"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "n = 2 # Wielomian drugiego stopnia\n",
+ "x, y = get_poly_data(np.array(data_marks), n)\n",
+ "model = polynomial_regression(x, y, n)\n",
+ "\n",
+ "index = 2\n",
+ "print(data_marks[index].item(1))\n",
+ "print(x[index])\n",
+ "print([x[index].item(1)])\n",
+ "model([x[index].item(1)])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wielomian 1 stopnia, MSE = 465.8122515203192\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "# data_marks_test\n",
+ "plot_and_mse(data_marks, data_marks_test, 1) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Archive: archive.zip\n",
+ "replace insurance.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C\n"
+ ]
+ }
+ ],
+ "source": [
+ "!unzip archive.zip"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " age | \n",
+ " sex | \n",
+ " bmi | \n",
+ " children | \n",
+ " smoker | \n",
+ " region | \n",
+ " charges | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 309 | \n",
+ " 41 | \n",
+ " female | \n",
+ " 33.060 | \n",
+ " 2 | \n",
+ " no | \n",
+ " northwest | \n",
+ " 7749.15640 | \n",
+ "
\n",
+ " \n",
+ " 696 | \n",
+ " 53 | \n",
+ " female | \n",
+ " 32.300 | \n",
+ " 2 | \n",
+ " no | \n",
+ " northeast | \n",
+ " 29186.48236 | \n",
+ "
\n",
+ " \n",
+ " 261 | \n",
+ " 20 | \n",
+ " female | \n",
+ " 26.840 | \n",
+ " 1 | \n",
+ " yes | \n",
+ " southeast | \n",
+ " 17085.26760 | \n",
+ "
\n",
+ " \n",
+ " 937 | \n",
+ " 39 | \n",
+ " female | \n",
+ " 24.225 | \n",
+ " 5 | \n",
+ " no | \n",
+ " northwest | \n",
+ " 8965.79575 | \n",
+ "
\n",
+ " \n",
+ " 891 | \n",
+ " 36 | \n",
+ " female | \n",
+ " 29.040 | \n",
+ " 4 | \n",
+ " no | \n",
+ " southeast | \n",
+ " 7243.81360 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 584 | \n",
+ " 19 | \n",
+ " male | \n",
+ " 20.700 | \n",
+ " 0 | \n",
+ " no | \n",
+ " southwest | \n",
+ " 1242.81600 | \n",
+ "
\n",
+ " \n",
+ " 1066 | \n",
+ " 48 | \n",
+ " male | \n",
+ " 37.290 | \n",
+ " 2 | \n",
+ " no | \n",
+ " southeast | \n",
+ " 8978.18510 | \n",
+ "
\n",
+ " \n",
+ " 1025 | \n",
+ " 21 | \n",
+ " female | \n",
+ " 34.600 | \n",
+ " 0 | \n",
+ " no | \n",
+ " southwest | \n",
+ " 2020.17700 | \n",
+ "
\n",
+ " \n",
+ " 831 | \n",
+ " 36 | \n",
+ " female | \n",
+ " 25.840 | \n",
+ " 0 | \n",
+ " no | \n",
+ " northwest | \n",
+ " 5266.36560 | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " 36 | \n",
+ " male | \n",
+ " 35.200 | \n",
+ " 1 | \n",
+ " yes | \n",
+ " southeast | \n",
+ " 38709.17600 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1338 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " age sex bmi children smoker region charges\n",
+ "309 41 female 33.060 2 no northwest 7749.15640\n",
+ "696 53 female 32.300 2 no northeast 29186.48236\n",
+ "261 20 female 26.840 1 yes southeast 17085.26760\n",
+ "937 39 female 24.225 5 no northwest 8965.79575\n",
+ "891 36 female 29.040 4 no southeast 7243.81360\n",
+ "... ... ... ... ... ... ... ...\n",
+ "584 19 male 20.700 0 no southwest 1242.81600\n",
+ "1066 48 male 37.290 2 no southeast 8978.18510\n",
+ "1025 21 female 34.600 0 no southwest 2020.17700\n",
+ "831 36 female 25.840 0 no northwest 5266.36560\n",
+ "49 36 male 35.200 1 yes southeast 38709.17600\n",
+ "\n",
+ "[1338 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data_ins = pandas.read_csv('insurance.csv')\n",
+ "data_ins = data_ins.sample(frac=1)\n",
+ "data_ins"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[33.06 41. ]\n",
+ " [32.3 53. ]\n",
+ " [26.84 20. ]\n",
+ " ...\n",
+ " [34.6 21. ]\n",
+ " [25.84 36. ]\n",
+ " [35.2 36. ]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_ins = data_ins[['bmi', 'age']]\n",
+ "data_ins = np.matrix(data_ins).astype(float)\n",
+ "print(data_ins)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "n = 1 # Wielomian pierwszego stopnia\n",
+ "\n",
+ "x, y = get_poly_data(np.array(data_ins), n)\n",
+ "fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ "plot_fun(fig, polynomial_regression(x, y, n), x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "29.735 1.0\n"
+ ]
+ },
+ {
+ "ename": "NameError",
+ "evalue": "name 'a' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipykernel_6535/4031094360.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_ins\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m: name 'a' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "n = 1 # Wielomian pierwszego stopnia\n",
+ "\n",
+ "x, y = get_poly_data(np.array(data_ins), n)\n",
+ "model = polynomial_regression(x, y, n)\n",
+ "\n",
+ "index = 10\n",
+ "print(data_ins.item(index), x.item(index))\n",
+ "a([x.item(index)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n = 2 # Wielomian 2 stopnia\n",
+ "x, y = get_poly_data(np.array(data_ins), n)\n",
+ "fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ "plot_fun(fig, polynomial_regression(x, y, n), x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n = 3 # Wielomian 3 stopnia\n",
+ "x, y = get_poly_data(np.array(data_ins), n)\n",
+ "fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ "plot_fun(fig, polynomial_regression(x, y, n), x)"
+ ]
+ }
+ ],
+ "metadata": {
+ "author": "Paweł Skórzewski",
+ "celltoolbar": "Slideshow",
+ "email": "pawel.skorzewski@amu.edu.pl",
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "lang": "pl",
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.12"
+ },
+ "livereveal": {
+ "start_slideshow_at": "selected",
+ "theme": "white"
+ },
+ "subtitle": "5.Regresja wielomianowa. Problem nadmiernego dopasowania[wykład]",
+ "title": "Uczenie maszynowe",
+ "year": "2021"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/05_Regresja_wielomianowa.ipynb b/05_Regresja_wielomianowa.ipynb
index 19e922f..01f7001 100644
--- a/05_Regresja_wielomianowa.ipynb
+++ b/05_Regresja_wielomianowa.ipynb
@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 88,
"metadata": {
"slideshow": {
"slide_type": "notes"
@@ -31,7 +31,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 89,
"metadata": {
"slideshow": {
"slide_type": "notes"
@@ -92,24 +92,17 @@
},
{
"cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "slideshow": {
- "slide_type": "notes"
- }
- },
+ "execution_count": 90,
+ "metadata": {},
"outputs": [],
"source": [
- "# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas\n",
- "\n",
- "alldata = pandas.read_csv('data_flats.tsv', header=0, sep='\\t',\n",
- " usecols=['price', 'rooms', 'sqrMetres'])\n",
- "data = np.matrix(alldata[['sqrMetres', 'price']])"
+ "def MSE(Y_true, Y_pred):\n",
+ " return np.square(np.subtract(Y_true,Y_pred)).mean()"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 91,
"metadata": {
"slideshow": {
"slide_type": "fragment"
@@ -153,38 +146,871 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict_values(model, data, n):\n",
+ " x, y = get_poly_data(np.array(data), n)\n",
+ " preprocessed_x = []\n",
+ " for i in x:\n",
+ " preprocessed_x.append(i.item(1))\n",
+ " return y, model(preprocessed_x), MSE(y, model(preprocessed_x))\n",
+ "\n",
+ "def plot_and_mse(data, data_test, n):\n",
+ " x, y = get_poly_data(np.array(data), n)\n",
+ " model = polynomial_regression(x, y, n)\n",
+ " \n",
+ " fig = plot_data(x, y, xlabel='x', ylabel='y')\n",
+ " plot_fun(fig, polynomial_regression(x, y, n), x)\n",
+ "\n",
+ " y_true, Y_pred, mse = predict_values(model, data_test, n)\n",
+ " print(f'Wielomian {n} stopnia, MSE = {mse}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
"metadata": {
"slideshow": {
- "slide_type": "subslide"
+ "slide_type": "notes"
}
},
"outputs": [
{
- "output_type": "execute_result",
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " sqrMetres | \n",
+ " price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 78 | \n",
+ " 476118.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 62 | \n",
+ " 459531.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15 | \n",
+ " 411557.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 14 | \n",
+ " 496416.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 15 | \n",
+ " 406032.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1669 | \n",
+ " 51 | \n",
+ " 299000.0 | \n",
+ "
\n",
+ " \n",
+ " 1670 | \n",
+ " 53 | \n",
+ " 339000.0 | \n",
+ "
\n",
+ " \n",
+ " 1671 | \n",
+ " 65 | \n",
+ " 320000.0 | \n",
+ "
\n",
+ " \n",
+ " 1672 | \n",
+ " 67 | \n",
+ " 364000.0 | \n",
+ "
\n",
+ " \n",
+ " 1673 | \n",
+ " 50 | \n",
+ " 209000.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1674 rows × 2 columns
\n",
+ "
"
+ ],
"text/plain": [
- "[]"
+ " sqrMetres price\n",
+ "0 78 476118.0\n",
+ "1 62 459531.0\n",
+ "2 15 411557.0\n",
+ "3 14 496416.0\n",
+ "4 15 406032.0\n",
+ "... ... ...\n",
+ "1669 51 299000.0\n",
+ "1670 53 339000.0\n",
+ "1671 65 320000.0\n",
+ "1672 67 364000.0\n",
+ "1673 50 209000.0\n",
+ "\n",
+ "[1674 rows x 2 columns]"
]
},
+ "execution_count": 101,
"metadata": {},
- "execution_count": 10
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Wczytanie danych (mieszkania) przy pomocy biblioteki pandas\n",
+ "\n",
+ "alldata = pandas.read_csv('data_flats.tsv', header=0, sep='\\t',\n",
+ " usecols=['price', 'rooms', 'sqrMetres'])\n",
+ "alldata = alldata[['sqrMetres', 'price']]\n",
+ "alldata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# alldata = np.matrix(alldata[['sqrMetres', 'price']])\n",
+ "data_train = alldata[0:1600]\n",
+ "data_test = alldata[1600:]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wielomian 1 stopnia, MSE = 31777996749.774563\n",
+ "Wielomian 2 stopnia, MSE = 80047128653.54173\n"
+ ]
},
{
- "output_type": "display_data",
"data": {
- "text/plain": "