1203 lines
153 KiB
Plaintext
1203 lines
153 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"## Uczenie maszynowe UMZ 2017/2018\n",
|
|||
|
"# 4. Algorytm KNN, uczenie nienadzorowane\n",
|
|||
|
"### Część 2"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"## 4.2. Uczenie nienadzorowane – Algorytm $k$ średnich"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Przydatne importy\n",
|
|||
|
"\n",
|
|||
|
"import ipywidgets as widgets\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas\n",
|
|||
|
"import random\n",
|
|||
|
"import seaborn\n",
|
|||
|
"\n",
|
|||
|
"%matplotlib inline"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Wczytanie danych (gatunki kosaćców)\n",
|
|||
|
"\n",
|
|||
|
"data_iris = pandas.read_csv('iris.csv', header=0, usecols=['łod.dł.', 'łod.sz.', 'pł.dł.', 'pł.sz.'])\n",
|
|||
|
"data_iris.columns=['x1', 'x2', 'x3', 'x4']\n",
|
|||
|
"\n",
|
|||
|
"X = data_iris.values\n",
|
|||
|
"Xs = data_iris.values[:, 2:4]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Wykres danych\n",
|
|||
|
"def plot_unlabeled_data(X, col1=0, col2=1, x1label=r'$x_1$', x2label=r'$x_2$'):\n",
"    \"\"\"Draw a scatter plot of two columns of an unlabeled data set.\n",
"\n",
"    Args:\n",
"        X: 2-D array supporting `X[:, j]` indexing whose columns have `.tolist()`.\n",
"        col1: index of the column shown on the horizontal axis.\n",
"        col2: index of the column shown on the vertical axis.\n",
"        x1label: label of the horizontal axis.\n",
"        x2label: label of the vertical axis.\n",
"\n",
"    Returns:\n",
"        The matplotlib figure that contains the plot.\n",
"    \"\"\"\n",
"    figure, axes = plt.subplots(figsize=(16*.7, 9*.7))\n",
"    figure.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)\n",
"    horizontal = X[:, col1].tolist()\n",
"    vertical = X[:, col2].tolist()\n",
"    # 'Dane' is Polish for 'Data'; the runtime label is kept as is.\n",
"    axes.scatter(horizontal, vertical, c='k', marker='o', s=50, label='Dane')\n",
"    axes.set_xlabel(x1label)\n",
"    axes.set_ylabel(x2label)\n",
"    axes.margins(.05, .05)\n",
"    return figure"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Przygotowanie interaktywnego wykresu\n",
|
|||
|
"\n",
|
|||
|
"dropdown_arg1 = widgets.Dropdown(options=[0, 1, 2, 3], value=2, description='arg1')\n",
|
|||
|
"dropdown_arg2 = widgets.Dropdown(options=[0, 1, 2, 3], value=3, description='arg2')\n",
|
|||
|
"\n",
|
|||
|
"def interactive_unlabeled_data(arg1, arg2):\n",
"    \"\"\"Plot the two columns of `X` selected in the dropdown widgets.\n",
"\n",
"    Args:\n",
"        arg1: 0-based column index of `X` for the horizontal axis.\n",
"        arg2: 0-based column index of `X` for the vertical axis.\n",
"    \"\"\"\n",
"    # The columns were named 'x1'..'x4' when the data was loaded, and\n",
"    # plot_unlabeled_data labels column 0 as x_1 by default, so the 0-based\n",
"    # index is shifted by one to keep the axis labels consistent.\n",
"    plot_unlabeled_data(\n",
"        X, col1=arg1, col2=arg2,\n",
"        x1label='$x_{}$'.format(arg1 + 1),\n",
"        x2label='$x_{}$'.format(arg2 + 1))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
"model_id": "4196201adbb34356bf7a5f511a8206f9",
|
|||
|
"version_major": 2,
|
|||
|
"version_minor": 0
|
|||
|
},
|
|||
|
"text/html": [
|
|||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
|||
|
"<p>\n",
|
|||
|
" If you're reading this message in Jupyter Notebook or JupyterLab, it may mean\n",
|
|||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
|||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
|||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
|||
|
" Widgets Documentation</a> for setup instructions.\n",
|
|||
|
"</p>\n",
|
|||
|
"<p>\n",
|
|||
|
" If you're reading this message in another notebook frontend (for example, a static\n",
|
|||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
|||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
|||
|
"</p>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"interactive(children=(Dropdown(description=u'arg1', index=2, options=(0, 1, 2, 3), value=2), Dropdown(description=u'arg2', index=3, options=(0, 1, 2, 3), value=3), Output()), _dom_classes=('widget-interact',))"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<function __main__.interactive_unlabeled_data>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 6,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"widgets.interact(interactive_unlabeled_data, arg1=dropdown_arg1, arg2=dropdown_arg2)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<seaborn.axisgrid.PairGrid at 0x7f68c678b8d0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAuUAAAGoCAYAAADhFJvRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzsnXucFNWZ93+nqu8zgzAj8EKQII6Q\nvCA0zChLNC4hRsTsh/AJS5xJuCSbgOF1M/AajGt0s6yrcVmJkUnyopBkI2KGxOASdlfEZJU1Gw3K\nZUDZBBwREXG5zIhMd09fquq8f/RUTVfVqZ7u6e7pyzzfz2c+MD1V1aernvOcp5/zXBjnHARBEARB\nEARBFA+p2AMgCIIgCIIgiKEOGeUEQRAEQRAEUWTIKCcIgiAIgiCIIkNGOUEQBEEQBEEUGTLKCYIg\nCIIgCKLIkFFOEARBEARBEEWGjHKCIAiCIAiCKDJklBMEQRAEQRBEkSGjnCAIgiAIgiCKTNkZ5bfc\ncgsHQD/0k4+fokPyTD95/Ck6JM/0k8efokPyTD95/MmIkjDKGWP/lzF2lDH2BmOsjTHmczr2woUL\ngzk0gigoJM9EJUHyTFQSJM/EYFN0o5wx9hEALQAaOedTAcgAmoo7KoIgCIIgCIIYPIpulPfiAuBn\njLkABACcKfJ4CIIgCIIgCGLQKLpRzjl/D8AGAKcAvA/gQ87588UdFUEQQx1N4wjFFGi8918t47BA\nghgS0BwhSolKkMeiG+WMsREAPgfgSgBjAVQxxpZYjlnJGNvPGNt//vz5YgyTIPIGyXPpo2kcneE4\nVjyxH5Pu3Y0VT+xHZzhelkq+0JA8D00qdY6QPJcnlSKPRTfKAdwE4G3O+XnOeQLAMwA+kXoA53wz\n57yRc944cuTIogySIPIFyXPpE0moaGk7hFdOdELROF450YmWtkOIJNRiD63kIHkemlTqHCF5Lk8q\nRR5LwSg/BeDPGGMBxhgD8GkAfyzymAiCGMIEPDJeO9lleu21k10IeOQijYggSguaI0QpUSny6Cr2\nADjn+xhjvwJwEIAC4BCAzcUdFUEQQHJLMJJQEfDIiMRVBNwyJIkN+Lhsjy0WkbiKlrn1mDd1DOpH\nVaPjXAh73ngfkbiKam/R1SZBDCqiORtJqLh2Qi1eOdFpHHfthFpjjqiqhkhCRZXXhXBMgd8lI6pq\nJT3vifIlEk8vj1ZEMs05N8lswC1DlgfXd10KnnJwzv+Oc/4xzvlUzvlSznms2GMiiKFOpjF62cTy\naRpHdzSBC90xcA5c6I6hO5ooubg/v0tC03XjsW7XUUy+bzfW7TqKpuvGw+8qCZVJEIOG0/z2uyS0\nNs/A7Il1cEkMsyfWobV5BvwuCaFoAkxi6AzFcecv2vHP//U2uiLlH+9LlC5+l4SNTUGTPG5sCsLv\nkmwJoKqq2WS6O5pAZySOlVsPYNK9u7Fy6wF0huNQVW1QPwfjvLwmRWNjI9+/f3+xh0FUBkV305Sy\nPIdiClY8sd/keZg9sQ5bljeaPA+ZHgcAkbiCrnAcdz19BK+d7MK1E2rx8OJpqK3yIOApHQ90Np+p\nhCB5JvJOurmge8x1b6PfJaErkkBL2yFjfq9fNA2yBKx9+ki284nkmciYUEzBT393wra7+bUbJyIc\nU00yubE5iO37TuGR375pnL937Rzc88zrNhndvKwBNT53PoaYkTyT24cgCCGZxuhlE8unacBdvYuz\nnoxz19NHoA2uM6JfKiU+kSByJd1ckCSGaq8LEkv+26NotmS7u3ccwWV+D80noqAEPDJaX+jAvEdf\nwlXffhbzHn0JrS90QNNgk8nVbe2YN3WM6fwragNCGa0aZCcMGeUEQQjRY/RS0WP0BnIcAAS8Dgu8\nt7QW52w+E0FUMlnNbycD3ivTfCIKipOcOq05
9aOqTa+92xURnh+OKYUZsANklBMEISTgloUxowG3\nbDvusSUzsXftHLz13Vuxd+0cPLZkpu04AIjEHBb4WGktzpl+doKodJJzIWiZC0Hx/HYwjM5+GLXF\n+9J8InLBGifulOPgtOaEY4rp2BEBNzY222PSB1tGKaacGMpQzGI/ZFIpJZkIFkNLW7sRs9faHERd\nlTenY4tNOVSJsVD0wZW6PBPZoydnfxBJ4IraAN7timBEwI0an9thfsdN8butzUFUeVzwyFK21VdI\nngkhYjmbgdqAGz2KWcYAZHxsgauvZCTPZJQTQxlS+nkgm0QwXUmWmbFbLhT9JlaCPBNmRPP7zpuu\nxlduuBJVXpdtDufxyyzJMwHALlMSA776s8wT8UvEwZLRG5ZsGQGCIMoDpzhSv1sSeijqqjyG4izh\nSiYEQcA+vxdMH4uFM8Zh5dYDtnktScxI/gRofhO547T7MnqY13RcusThcpLJ0h4dQRAlT7qmDXrW\nOwCj7fGWZY0AQ9EaDZWI14QgSgrRvACAcFzB8Qfm41I0gWqvC6GYglXbDtrndWmXCyVKjEz1cCSh\nom3fO1i3YIpR6rBt3ymsuWkSdrafMY5L1yiomOPPFkr0JAgiJ5ySIqu8LrEH3SNl1Dwom6ZEmVKI\naxJEuWOdFz/93Qn0JFR0hmPJZir37caqbQdx5mIU1Q7zmsobEpmSjR72uyUsnDHO1Mht4YxxuKLW\nn3HisDUpNFd9X8h1hIxygiByQpIY6qo82LK8EccfnI8tyxtRV+URVmJomVuPzlAc9zzzOibftxv3\nPPM6umMKooq9+kokodrqy7a0HUIkMfBKLYW4JkGUO6nz4tZrxmDhjHE4351MyLbWHA/FFCpvSORE\nNno4Eldx944jNjmMxFXbmiPyVBfCgC7kOkJGOUEQOWNtIiJJTOhB//L1V2L19vaMmgcVooEPNQUi\nypl8e/x0UufFHZ+qx907jjg2U6nxubF+0TQqb0jYyFQ+s9HDTjuuVV6Xbc0RUQgDupDrCAWAEQRR\nEFI96EbcXZrmIlb0+rK2WPWYimrfwFRXuvh3ioclShmnMnBOHsJsSJ0X9aOq8drJLnScCwnnit6+\nfPOyBmH1FWJoko18ZqOHc9XZhTCgC7mOkKecIIiCYfWgOzVyiMbVZGw5T9ZEVhQNkgR87wvTTR65\n731hOqQctBY1BSLKFSePXzje55lUVc3RU+nkxdQ0DnDgqRWzsHftHJy52INrJ9TiRy922Dzi6xdN\nw5433kfzrI+iytO/l5IYOqTzSGfa6Eekh3PV2ZG4ipa59diz5ka89d1bsWfNjWiZWy8Mt1JVzbQO\nqapgCzcPY0oH1SknhjJFX0mGmjyLmgc9tmQmYqqG1SmvbWwKojbgwQeROMJx1WhaUuWRUVvtgZyD\nZV7B1VeK/iGGmjwPJhrnmHTvbigphrZLYjj2wHxMvm83WubWo+m68Vi9vd3mqQScG6h0RRKm11Pn\n4+hhXqy5aRLG1wWSzVQ8MnoS2mDNGZLnMsJJPo8/OB+docya9xSiCpeqaugMx03zYmNTEHVVHlNj\noEyPy2FMGQ2YPOUEQeRMprGEksRQG/Bg87IGHH9wPjYva4AkMay2JJSt3t6OHkVF26unEFOS3oqY\noqHt1VM5J5SJ4t8JotRxamHfcS4EReOYN3WMLV9D91Sm82K27XvH9PrXtx3EMK8Lm5c14JHbgqir\n9oBrHDU+N2RJojlDCHGSz3BMEcpej6JlrIdz0dk9imabF8n1xewFjyRUbH/1FNYtmIJjD8zHugVT\nsP3VU46x54VaR8goJwgiJ7LJbtc0jq5IIllm7d7dWLn1QNpEHlEpLErKJIYioi3z9Yum4UcvdgCA\nEQueih476xxXm5xjC6aPNV4fPcyL7phimqNdEXHZUoLQ8bskbGwKmuRzY1Ow6Mn1mb5/wCOXxHpD\nmU0EQeS0
PZjqhQPSNxMRHXuqMyJMmgnHFKMUln7s3TuO4CfLG6FyFVVeV3JL3S0LtxcJohxxmovJ\nXSY3Ni9rQMDjQnc0ga0
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f68c678bad0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"seaborn.pairplot(data_iris, vars=data_iris.columns, size=1.5, aspect=1.75)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Odległość euklidesowa\n",
|
|||
|
"def euclidean_distance(x1, x2):\n",
"    \"\"\"Return the Euclidean distance between the points `x1` and `x2`.\n",
"\n",
"    The operands are subtracted with `-`, so at least one of them has to be\n",
"    a NumPy array (the other may be a plain list of the same length).\n",
"    \"\"\"\n",
"    difference = x1 - x2\n",
"    return np.linalg.norm(difference)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Algorytm k średnich\n",
|
|||
|
"def k_means(X, k, distance=euclidean_distance):\n",
"    \"\"\"Cluster the rows of `X` into at most `k` groups with the k-means algorithm.\n",
"\n",
"    Args:\n",
"        X: 2-D NumPy array with one example per row and one feature per column.\n",
"        k: number of centroids to start with.\n",
"        distance: callable `distance(centroid, x)` returning a number.\n",
"\n",
"    Returns:\n",
"        A pair `(result, history)`. `history` holds one `(centroids, Y)` entry\n",
"        per iteration: the centroids after the update and the cluster index\n",
"        assigned to every row. `result` is the `Y` of the last entry.\n",
"    \"\"\"\n",
"    history = []\n",
"    Y = []\n",
"    n_features = X.shape[1]\n",
"\n",
"    # Draw every centroid uniformly from the bounding box of the data.\n",
"    lower, upper = X.min(axis=0), X.max(axis=0)\n",
"    centroids = [[random.uniform(lower[f], upper[f]) for f in range(n_features)]\n",
"                 for c in range(k)]\n",
"\n",
"    # Repeat for as long as the cluster assignments keep changing.\n",
"    while True:\n",
"        distances = [[distance(centroids[c], x) for c in range(k)] for x in X]\n",
"        Y_new = [d.index(min(d)) for d in distances]\n",
"        if Y_new == Y:\n",
"            break\n",
"        Y = Y_new\n",
"        labels = np.asarray(Y)\n",
"        # Select the members of each cluster with a boolean mask instead of a\n",
"        # hard-coded label column, so any number of features works. A cluster\n",
"        # without members keeps its previous centroid instead of becoming NaN.\n",
"        centroids = [list(X[labels == c].mean(axis=0)) if (labels == c).any()\n",
"                     else centroids[c]\n",
"                     for c in range(k)]\n",
"        history.append((centroids, Y))\n",
"\n",
"    result = history[-1][1]\n",
"    return result, history"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Wykres danych - klastrowanie\n",
|
|||
|
"def plot_clusters(X, Y, k, centroids=None):\n",
"    \"\"\"Scatter-plot the first two columns of `X`, coloured by cluster.\n",
"\n",
"    Args:\n",
"        X: 2-D array; column 0 is drawn on the horizontal axis, column 1 on\n",
"            the vertical axis.\n",
"        Y: cluster index (0..k-1) of every row of `X`.\n",
"        k: number of clusters to draw.\n",
"        centroids: optional sequence of k points; when given, each centroid\n",
"            is drawn as a large '+' in the colour of its cluster.\n",
"\n",
"    Returns:\n",
"        The matplotlib figure that contains the plot.\n",
"    \"\"\"\n",
"    color = ['r', 'g', 'b', 'c', 'm', 'y', 'k']\n",
"    fig = plt.figure(figsize=(16*.7, 9*.7))\n",
"    ax = fig.add_subplot(111)\n",
"    fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)\n",
"\n",
"    X1 = X[:, 0].tolist()\n",
"    X2 = X[:, 1].tolist()\n",
"    # An explicit emptiness test also accepts centroids passed as a NumPy\n",
"    # array, whose truth value would be ambiguous.\n",
"    has_centroids = centroids is not None and len(centroids) > 0\n",
"\n",
"    for c in range(k):\n",
"        # Cycle through the palette so that k > len(color) does not raise IndexError.\n",
"        cluster_color = color[c % len(color)]\n",
"        members = [(x1, x2) for x1, x2, y in zip(X1, X2, Y) if y == c]\n",
"        ax.scatter([p[0] for p in members], [p[1] for p in members],\n",
"                   c=cluster_color, marker='o', s=25, label='Dane')\n",
"        if has_centroids:\n",
"            ax.scatter([centroids[c][0]], [centroids[c][1]],\n",
"                       c=cluster_color, marker='+', s=500, label='Centroid')\n",
"\n",
"    ax.set_xlabel(r'$x_1$')\n",
"    ax.set_ylabel(r'$x_2$')\n",
"    ax.margins(.05, .05)\n",
"    return fig"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAr0AAAGdCAYAAAAfYMtzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3X+M2/d93/HX2zrf7JL+cYnlnXBx\n4hpqAtTOSma0uEpNalRIL/QMZ8ACzC7cLUJXD1nTJWu2NWtvClpd0QZDu6VxmyxNoiRz5mSL087T\nzN0C2HE9HUKLCtnasZtAc5vaB12s2rJsMvIIUe/98bmTjxKloyiS3y8/fD4A4Xv83Effz5sfKu3L\nX36+n6+5uwAAAICYXZZ0AQAAAMCwEXoBAAAQPUIvAAAAokfoBQAAQPQIvQAAAIgeoRcAAADRI/QC\nAAAgeoReAAAARI/QCwAAgOhNJV1Av6677jq/8cYbky4DAAAACTp8+PDfuPvWzfqNbei98cYbVa1W\nky4DAAAACTKz7/fSj+UNAAAAiB6hFwAAANEj9AIAACB6hF4AAABEj9ALAACA6BF6AQAAED1CLwAA\nAKJH6AUAAED0CL0AAACIHqEXAAAA0SP0AgAAIHqEXgAAAERv6KHXzG4ws0fN7Gkz+46ZfahLn9vM\n7ISZ1df+7B12XQAAAJgcUyMY45Skj7j7t83sKkmHzewb7v70Wf0ed/c7RlAPAABIQrstlctSrSbl\n81KpJG3Zko4xk6gNIzX00OvuRyUdXfv5VTN7RtKcpLNDLwAAiFW7Lc3PS5WK1GxKmYxULEpLS8ML\nl72OmURtGLmRruk1sxsl5SVVuvz6J83sz8ysbGY3j7IuAAAwZOVyCJWNhuQejpVKaE96zCRqw8iN\nLPSaWVbSg5I+7O6vnPXrb0t6i7v/hKRPSvqT85zjXjOrmln12LFjwy0YAAAMTq0WrqJu1GxK9Xry\nYyZRG0ZuJKHXzC5XCLxfdvevn/17d3/F3RtrPz8s6XIzu65Lv8+4e8HdC1u3bh163QAAYEDy+bBs\nYKNMRsrlkh8zidowcqPYvcEkfU7SM+7+e+fpM7vWT2a2Y62uF4ddGwAAGJFSKayTzWYls3AsFkN7\n0mMmURtGbhS7N+yS9POSnjSz9e8Jfk3SmyXJ3T8t6X2SPmBmpySdlHSXu/sIagMAAKOwZUu4Maxc\nDssGcrnh75DQ65hJ1IaRs3HNloVCwavVatJlAAAAIEFmdtjdC5v144lsAAAAiB6hFwAAANEj9AIA\nACB6hF4AAABEj9ALAACA6BF6AQBISrstHTgg7dsXju120hX1p9WS9u6Vdu8Ox1Yr6YqAc4xin14A\nAHC2dluan5cqlfDI20wmPBBhaWm89odttaTZWen48fD6kUek++6TVlel6elkawM24EovAABJKJdD\n4G00JPdwrFRC+zhZXHw98K47fjy0AylC6AUAIAm1WrjCu1GzGZ4INk4OHuzevrw82jqATRB6AQBI\nQj4fljRslMmER+COk127urfv3DnaOoBNEHoBAEhCqRTW8Gazklk4FouhfZwsLEgzM51tMzOhHUgR\nbmQDACAJW7aEm9bK5bCkIZcLgXecbmKTws1qq6thDe/ycrjCu7DATWxIHXP3pGvoS6FQ8Gq1mnQZ\nAAAASJCZHXb3wmb9WN4AAACA6BF6AQAAED1CLwAAAKJH6AUAAED0CL0AAACIHqEXAAAA0WOfXgAA\nktJuh316a7XwhLbz7dObVL9BG+S4g34PSc1JDMZk7gi9AAAkod2W5uelSkVqNsMjiIvF8MCKjYEh\nqX5Jvd9Rn2sY55skYzR3LG8AACAJ5XIICo2G5B6OlUpoT0O/pN7vqM81jPNNkjGaO0IvAABJqNXC\nlbGNms3wSOI09Bu0QY476PeQ1JzEYIzmjtAL
AEAS8vnwVfBGmYyUy6Wj36ANctxBv4ek5iQGYzR3\nhF4AAJJQKoW1j9msZBaOxWJoT0O/pN7vqM81jPNNkjGaO3P3pGvoS6FQ8Gq1mnQZAAD0b/2u93o9\nXBnbbLeFUfcbtEGOO+j3kNScxCDhuTOzw+5e2LQfoRcAAADjqtfQy/IGAAAARI/QCwAAgOgRegEA\nABA9Qi8AAACiR+gFAABA9KaSLgAAAIy59S2rarXwsIJBbEU2iHPh/CZwngm9AACgf+22ND8vVSrh\n8bOZTHg4wdLSxYeoQZ4L5zeh88zyBgAA0L9yOYSnRkNyD8dKJbQneS6c34TOM6EXAAD0r1YLVws3\najbD07mSPBfOb0LnmdALAAD6l8+Hr8c3ymTC42iTPBfOb0LnmdALAAD6VyqF9aDZrGQWjsViaE/y\nXDi/CZ1nc/eka+hLoVDwarWadBkAAGB9J4B6PVwtHMTuDYM4F84vonk2s8PuXti0H6EXAAAA46rX\n0MvyBgAAAESP0AsAAIDoEXoBAAAQPUIvAAAAokfoBQAAQPQIvQAAAIjeVNIFAACAEVvfo7VWC0/n\nOt8erUn1G+R7mDTMy3kRegEAmCTttjQ/L1UqUrMZHj9bLEpLS53hKKl+g3wPk4Z5uSCWNwAAMEnK\n5RCKGg3JPRwrldCehn6DfA+Thnm5IEIvAACTpFYLVwE3ajbD42jT0K8XgzxXTJiXCyL0AgAwSfL5\n8LX3RpmMlMulo18vBnmumDAvF0ToBQBgkpRKYZ1nNiuZhWOxGNrT0G+Q72HSMC8XZO6edA19KRQK\nXq1Wky4DAIDxs36Hf70ergJuttvCqPsN8j1MmgmcFzM77O6FTfsRegEAADCueg29LG8AAABA9Ai9\nAAAAiB6hFwAAANEj9AIAACB6hF4AAABEb+ih18xuMLNHzexpM/uOmX2oSx8zs983syNm9udm9o5h\n1wUAAIDJMTWCMU5J+oi7f9vMrpJ02My+4e5Pb+hTkvRja3+Kkj61dgQAID3W90Ct1cLTrzbbj3bU\n/TB+kvpsJ/Df1NBDr7sflXR07edXzewZSXOSNobe90r6kodNg79lZtea2ba1vwsAQPLabWl+XqpU\npGYzPN61WJSWljrDQlL9MH6S+mwn9N/USNf0mtmNkvKSKmf9ak7ScxteP7/WBgBAOpTLISQ0GpJ7\nOFYqoT0N/TB+kvpsJ/Tf1MhCr5llJT0o6cPu/kqf57jXzKpmVj127NhgCwQA4EJqtXBVbKNmMzzu\nNQ39MH6S+mwn9N/USEKvmV2uEHi/7O5f79JlRdING16/aa2tg7t/xt0L7l7YunXrcIoFAKCbfD58\nDbxRJiPlcunoh/GT1Gc7of+mRrF7g0n6nKRn3P33ztPtIUn/eG0Xh78n6QTreQEAqVIqhXWP2axk\nFo7FYmhPQz+Mn6Q+2wn9N2Xh3rEhDmD2U5Iel/SkpNNrzb8m6c2S5O6fXgvG90l6j6QfStrj7tUL\nnbdQKHi1esEuAAAM1vod7/V6uCq22W4Lo+6H8ZPUZxvRvykzO+zuhU37DTv0DguhFwAAAL2GXp7I\nBgAAgOgRegEAABA9Qi8AAACiR+gFAABA9Ai9AAAAiN5U0gUAABCd9e2garXwIIDzbQfVakmLi9LB\ng9KuXdLCgjQ9PfxxcS7mLnqEXgAABqndlubnpUolPNo1kwkb/y8tdYaoVkuanZWOHw+vH3lEuu8+\naXW1v+Db67g4F3M3EVjeAADAIJXLITw1GpJ7OFYqoX2jxcXXA++648dD+zDHxbmYu4lA6AUAYJBq\ntXC1cKNmMzz5aqODB7v//eXl4Y6LczF3E4HQCwDAIOXz4evxjTKZ8KjXjXbt6v73d+4c7rg4F3M3\nEQi9AAAMUqkU1oNms5JZOBaLoX2jhQVpZqazbWYmtA9zXJyLuZsI5u5J19CXQqHg1Wo16TIAADjX\n+k4A9Xq4
WrjZ7g3Ly+EK76B2b9hsXJyLuRtbZnbY3Qub9iP0AgAAYFz1GnpZ3gAAAIDoEXoBAAAQ\nPUIvAAAAokfoBQAAQPQ
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f68c6003910>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"Ys, history = k_means(Xs, 2)\n",
|
|||
|
"fig = plot_clusters(Xs, Ys, 2, centroids=history[-1][0])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Przygotowanie interaktywnego wykresu\n",
|
|||
|
"\n",
|
|||
|
"slider_k = widgets.IntSlider(min=1, max=7, step=1, value=2, description=r'$k$', width=300)\n",
|
|||
|
"\n",
|
|||
|
"def interactive_kmeans_k(steps, history, k):\n",
"    \"\"\"Plot the clustering stored at index `steps` of a k-means history.\n",
"\n",
"    Args:\n",
"        steps: index into `history`; out-of-range values and the value 10\n",
"            select the last entry.\n",
"        history: list of `(centroids, Y)` entries.\n",
"        k: number of clusters to draw.\n",
"    \"\"\"\n",
"    # NOTE(review): 10 looks like the maximum of the `steps` slider, used as\n",
"    # 'jump to the final state' - confirm if the slider range ever changes.\n",
"    show_last = steps >= len(history) or steps == 10\n",
"    index = len(history) - 1 if show_last else steps\n",
"    snapshot = history[index]\n",
"    plot_clusters(Xs, snapshot[1], k, centroids=snapshot[0])\n",
|
|||
|
" \n",
|
|||
|
"def interactive_kmeans(k):\n",
"    \"\"\"Run k-means on `Xs` with `k` clusters and show a slider for browsing its history.\"\"\"\n",
"    slider_steps = widgets.IntSlider(\n",
"        min=1, max=10, step=1, value=1, description=r'steps', width=300)\n",
"    # Only the recorded history is needed here; the final labels are discarded.\n",
"    _, history = k_means(Xs, k)\n",
"    widgets.interact(\n",
"        interactive_kmeans_k,\n",
"        steps=slider_steps,\n",
"        history=widgets.fixed(history),\n",
"        k=widgets.fixed(k))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
"model_id": "41f26926f3cc4de6acf9239e228e775a",
|
|||
|
"version_major": 2,
|
|||
|
"version_minor": 0
|
|||
|
},
|
|||
|
"text/html": [
|
|||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
|||
|
"<p>\n",
|
|||
|
" If you're reading this message in Jupyter Notebook or JupyterLab, it may mean\n",
|
|||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
|||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
|||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
|||
|
" Widgets Documentation</a> for setup instructions.\n",
|
|||
|
"</p>\n",
|
|||
|
"<p>\n",
|
|||
|
" If you're reading this message in another notebook frontend (for example, a static\n",
|
|||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
|||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
|||
|
"</p>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"interactive(children=(IntSlider(value=2, description=u'$k$', max=7, min=1), Button(description=u'Run Interact', style=ButtonStyle()), Output()), _dom_classes=('widget-interact',))"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<function __main__.interactive_kmeans>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 13,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"widgets.interact_manual(interactive_kmeans, k=slider_k)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Algorytm $k$ średnich – dane wejściowe\n",
|
|||
|
"\n",
|
|||
|
"* $k$ – liczba klastrów\n",
|
|||
|
"* zbiór uczący $X = \\{ x^{(1)}, x^{(2)}, \\ldots, x^{(m)} \\}$, $x^{(i)} \\in \\mathbb{R}^n$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"Na wejściu nie ma zbioru $Y$, ponieważ jest to uczenie nienadzorowane!"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Algorytm $k$ średnich – pseudokod\n",
|
|||
|
"\n",
|
|||
|
"1. Zainicjalizuj losowo $k$ centroidów (środków ciężkości klastrów): $\\mu_1, \\ldots, \\mu_k$.\n",
|
|||
|
"1. Powtarzaj, dopóki przyporządkowania klastrów się zmieniają:\n",
|
|||
|
" 1. Dla $i = 1$ do $m$:\n",
|
|||
|
" za $y^{(i)}$ przyjmij klasę najbliższego centroidu.\n",
|
|||
|
" 1. Dla $c = 1$ do $k$:\n",
|
|||
|
" za $\\mu_c$ przyjmij średnią wszystkich punktów $x^{(i)}$ takich, że $y^{(i)} = c$."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Algorytm k średnich\n",
|
|||
|
"def k_means(X, k, distance=euclidean_distance):\n",
"    \"\"\"Slide version of k-means: return the cluster index of every row of `X`.\n",
"\n",
"    NOTE(review): this redefinition replaces the earlier `k_means`, which\n",
"    returns `(result, history)`. `interactive_kmeans` unpacks two values, so\n",
"    it stops working once this cell has run - confirm that this is intended.\n",
"    \"\"\"\n",
"    Y = []\n",
"    lower, upper = X.min(axis=0), X.max(axis=0)\n",
"    centroids = [[random.uniform(lower[f], upper[f])\n",
"                  for f in range(X.shape[1])]\n",
"                 for c in range(k)]  # Draw random centroids\n",
"    while True:\n",
"        distances = [[distance(centroids[c], x) for c in range(k)]\n",
"                     for x in X]  # Compute the distances\n",
"        Y_new = [d.index(min(d)) for d in distances]\n",
"        if Y_new == Y:\n",
"            break  # Stop when nothing changes\n",
"        Y = Y_new\n",
"        labels = np.asarray(Y)\n",
"        # Mask by label (works for any number of features); an empty\n",
"        # cluster keeps its previous centroid instead of becoming NaN.\n",
"        centroids = [list(X[labels == c].mean(axis=0))\n",
"                     if (labels == c).any() else centroids[c]\n",
"                     for c in range(k)]  # Move the centroids\n",
"    return Y"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Liczba klastrów jest określona z góry i wynosi $k$."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Jeżeli w którymś kroku algorytmu jedna z klas nie zostanie przyporządkowana żadnemu z przykładów, pomija się ją – w ten sposób wynikiem działania algorytmu może być mniej niż $k$ klastrów."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Funkcja kosztu dla problemu klastrowania\n",
|
|||
|
"\n",
|
|||
|
"$$ J \\left( y^{(1)}, \\ldots, y^{(m)}, \\mu_{1}, \\ldots, \\mu_{k} \\right) = \\frac{1}{m} \\sum_{i=1}^{m} || x^{(i)} - \\mu_{y^{(i)}} || ^2 $$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Zauważmy, że z każdym krokiem algorytmu $k$ średnich koszt się zmniejsza (lub ewentualnie pozostaje taki sam)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Wielokrotna inicjalizacja\n",
|
|||
|
"\n",
|
|||
|
"* Algorytm $k$ średnich zawsze znajdzie lokalne minimum funkcji kosztu $J$, ale nie zawsze będzie to globalne minimum."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Aby temu zaradzić, można uruchomić algorytm $k$ średnich wiele razy, za każdym razem z innym losowym położeniem centroidów (tzw. **wielokrotna losowa inicjalizacja** – _multiple random initialization_).\n",
|
|||
|
"* Za każdym razem obliczamy koszt $J$. Wybieramy ten wynik, który ma najniższy koszt."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Wybór liczby klastrów $k$\n",
|
|||
|
"\n",
|
|||
|
"* Najlepiej wybrać $k$ ręcznie w zależności od kształtu danych i celu, który chcemy osiągnąć."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"## 4.3. Metryki"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def powerme(x1,x2,n):\n",
"    \"\"\"Build all monomials x1**i * x2**(m-i) for 0 <= i <= m <= n.\n",
"\n",
"    The terms are ordered by total degree m, then by i, and are joined\n",
"    with `np.hstack`.\n",
"    \"\"\"\n",
"    monomials = [\n",
"        np.multiply(np.power(x1, i), np.power(x2, degree - i))\n",
"        for degree in range(n + 1)\n",
"        for i in range(degree + 1)\n",
"    ]\n",
"    return np.hstack(monomials)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Wykres danych\n",
|
|||
|
"def plot_data_for_classification(X, Y, xlabel=None, ylabel=None, Y_predicted=None, highlight=None):\n",
"    \"\"\"Scatter-plot a binary-labelled data set, optionally against predictions.\n",
"\n",
"    Columns 1 and 2 of `X` are the plot coordinates. Element 0 of every row\n",
"    of `Y` (and of `Y_predicted`) is read as the class, 0 or 1.\n",
"\n",
"    Args:\n",
"        X: 2-D NumPy array/matrix (needs `.tolist()`) with at least 3 columns.\n",
"        Y: 2-D NumPy array/matrix of true classes, one row per example.\n",
"        xlabel: optional label of the horizontal axis.\n",
"        ylabel: optional label of the vertical axis.\n",
"        Y_predicted: optional NumPy array/matrix of predicted classes, one\n",
"            row per example. When given, the points are split into true\n",
"            negatives, false negatives, true positives and false positives.\n",
"        highlight: 'tn', 'fn', 'tp' or 'fp' keeps only that group coloured\n",
"            and draws the others in black; any other value colours all\n",
"            groups. Ignored when there are no predictions.\n",
"\n",
"    Returns:\n",
"        The matplotlib figure that contains the plot.\n",
"    \"\"\"\n",
"    fig = plt.figure(figsize=(16*.6, 9*.6))\n",
"    ax = fig.add_subplot(111)\n",
"    fig.subplots_adjust(left=0.1, right=0.9, bottom=0.1, top=0.9)\n",
"    X = X.tolist()\n",
"    Y = Y.tolist()\n",
"\n",
"    # `Y_predicted != []` is ambiguous for NumPy arrays, so test for None and\n",
"    # for emptiness explicitly (an explicitly passed [] still means 'none').\n",
"    has_predictions = Y_predicted is not None and len(Y_predicted) > 0\n",
"\n",
"    if has_predictions:\n",
"        Y_predicted = Y_predicted.tolist()\n",
"        # (name, true class, predicted class, marker, colour), in drawing order.\n",
"        groups = [\n",
"            ('tn', 0, 0, 'x', 'r'),\n",
"            ('fn', 1, 0, 'o', 'r'),\n",
"            ('tp', 1, 1, 'o', 'g'),\n",
"            ('fp', 0, 1, 'x', 'g'),\n",
"        ]\n",
"        dim_others = highlight in ('tn', 'fn', 'tp', 'fp')\n",
"        for name, actual, predicted, marker, colour in groups:\n",
"            rows = [x for x, y, yp in zip(X, Y, Y_predicted)\n",
"                    if y[0] == actual and yp[0] == predicted]\n",
"            if dim_others and name != highlight:\n",
"                colour = 'k'  # every group except the highlighted one is black\n",
"            ax.scatter([x[1] for x in rows], [x[2] for x in rows],\n",
"                       c=colour, marker=marker, s=50, label='Dane')\n",
"    else:\n",
"        # No predictions: class 0 as red crosses, class 1 as green circles.\n",
"        for actual, marker, colour in [(0, 'x', 'r'), (1, 'o', 'g')]:\n",
"            rows = [x for x, y in zip(X, Y) if y[0] == actual]\n",
"            ax.scatter([x[1] for x in rows], [x[2] for x in rows],\n",
"                       c=colour, marker=marker, s=50, label='Dane')\n",
"\n",
"    if xlabel:\n",
"        ax.set_xlabel(xlabel)\n",
"    if ylabel:\n",
"        ax.set_ylabel(ylabel)\n",
"\n",
"    ax.margins(.05, .05)\n",
"    return fig"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Wczytanie danych\n",
|
|||
|
"import pandas\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"alldata = pandas.read_csv('data.tsv', sep='\\t')\n",
|
|||
|
"data = np.matrix(alldata)\n",
|
|||
|
"\n",
|
|||
|
"m, n_plus_1 = data.shape\n",
|
|||
|
"n = n_plus_1 - 1\n",
|
|||
|
"Xn = data[:, 1:]\n",
|
|||
|
"\n",
|
|||
|
"X2 = powerme(data[:, 1], data[:, 2], n)\n",
|
|||
|
"Y2 = np.matrix(data[:, 0]).reshape(m, 1)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAnAAAAFpCAYAAAAcIhVtAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3X90XOV95/HPV2A5G1kttnGpYzCQ\nSm0SYA9QlU0bn4oQfkV/YFmhkQk5JQ1ZNtmkcYF2cU7OJjmkOSXpnihKy6blOGnSXZ8wlMiKu1XK\n8nO7bkOKYIHYcIgUsgFjElybZMdKIwnmu3/ce+2r0Yw0g2bunXvn/TpnjuY+9478zJ0Zz0fPfX6Y\nuwsAAADZ0ZF2BQAAAFAfAhwAAEDGEOAAAAAyhgAHAACQMQQ4AACAjCHAAQAAZAwBDgAAIGMIcAAA\nABlDgAMAAMiYk9OuQBpOPfVUP+uss9KuBgAAwAKPPvrov7j7huWOa8sAd9ZZZ2lycjLtagAAACxg\nZj+s5TguoQIAAGQMAQ4AACBjCHAAAAAZQ4ADAADIGAIcAABAxrREgDOzr5jZS2a2v8p+M7Mvmtm0\nmT1pZhfG9l1nZlPh7brkag0AAJCOlghwkr4q6col9r9TUm94u0HSlyTJzNZJ+qSkfyfpIkmfNLO1\nTa0pAABAyloiwLn7P0g6usQhWyX9tQcelnSKmW2UdIWke939qLu/LOleLR0EAQAAMq8lAlwNNkl6\nPrZ9MCyrVr6Imd1gZpNmNnn48OGmVRQAAKDZshLgVszd73D3Pnfv27Bh2RUqAAAAWlZWAtwLks6I\nbZ8ellUrBxDnLu3ZE/yspRwA0NKyEuD2SvrdcDTqWyX91N1flHSPpMvNbG04eOHysAxA3Pi4NDQk\n3XjjibDmHmwPDQX7AQCZ0RKL2ZvZ1yVdLOlUMzuoYGTpKkly97+QNCFpQNK0pJ9J+r1w31Ez+7Sk\nR8Jfdau7LzUYAmhPg4PSjh3S6GiwPTIShLfR0aB8cDDd+gEA6mLehpdO+vr6fHJyMu1qAMmKWtyi\nECcF4W1kRDJLr14AgOPM7FF371v2OAIc0EbcpY5Yz4lSifAGAC2k1gCXlT5wAFYqaoGLi/eJAwBk\nBgEOaAfxy6c7dgQtb1GfOEIcAGROSwxiANBk4+MnwlvU521kJNg3Oir190vbtqVbRwBAzQhwQDsY\nHJTGxoKfUZ+3KMT19zMKFQAyhgAHtAOzyi1s1coBAC2NPnAAAAAZQ4ADAADIGAIcAABAxhDgAAAA\nMoYABwAAkDEEOAAAgIwhwAEAAGQM88ABAFCmOFtU4UBBU0em1Lu+V8PnDKt7dXfa1QKOI8ABABCz\n77l9Gtg9oJKXNDM/o65VXbrpnps0ce2Etmzeknb1AElcQgUA4LjibFEDuwdUnCtqZn5GkjQzP6Pi\nXFB+bO5YyjUEAgQ4AABChQMFlbxUcV/JSyrsLyRcI6AyAhwAAKGpI1PHW97KzczPaProdMI1Aioj\nwAEAEOpd16MuW11xX5etVs+6X0m4RkBlBDgAAELDP+hSx89nK+7r+Pmshp99fcI1AiojwAEAEOp+\n1zWamL1a3bNSlwcTNXT5yeqelSZmr9aad12Tcg2BANOIAAAQMdOW/3KXDt34YRX+/kuaXif1HH1F\nw7/9Ia0ZuV0yS7uGgCTJ3D3tOiSur6/PJycn064GAKBVuUsdsYtUpRLhDYkws0fdvW+547iECgBA\nnLt0440Ly268MSgHWgQBDgCASBTeRkelHTuClrcdO4JtQhxaCH3gAACIjI+fCG8jI8Fl05GRYN/o\nqNTfL23blm4dAbVIgDOzKyWNSjpJ0i53v61s/4ikt4ebr5f0S+5+SrjvVUnfDfc95+5XJVNrAEDu\nDA5KY2PBz6jPWxTi+vuDcqAFpD6IwcxOkvQ9
SZdJOijpEUnXuPtTVY7/fUkXuPv7w+1j7r6mnn+T\nQQwAAKAVZWkQw0WSpt39WXefk3SnpK1LHH+NpK8nUjMAAIAW1AoBbpOk52PbB8OyRczsTElnS3og\nVvw6M5s0s4fNjLZtAMlxl/bsWdyxvVo5ADRIKwS4emyXdLe7vxorOzNsanyPpC+YWcWF6szshjDo\nTR4+fDiJugLIu/FxaWho4ejEaBTj0FCwHwCaoBUC3AuSzohtnx6WVbJdZZdP3f2F8Oezkh6SdEGl\nB7r7He7e5+59GzZsWGmdASDo0F4+xUR8Cgo6vANoklYYhfqIpF4zO1tBcNuuoDVtATN7k6S1kr4d\nK1sr6WfuPmtmp0p6m6TPJVJrACifYmJ0NLgfn4ICAJog9RY4d39F0kck3SPpaUl3ufsBM7vVzOJT\ngmyXdKcvHDb7ZkmTZvaEpAcl3VZt9CoANEU8xEUIbwCarBVa4OTuE5Imyso+Ubb9qQqP+ydJ5zW1\ncgCwlGrLLhHiADRR6i1wAJBZLLsEICUt0QIHAJnEsksAUkKAA4DXimWXAKSEAAcAr5VZ5Ra2auUA\n0CD0gQMAAMgYAhwAAEDGEOAAAAAyhgAHAACQMQQ4AACAjCHAAQAAZAwBDgAAIGMIcAAAABlDgAMA\nAMgYAhwAAEDGEOAAAAAyhgAHAACQMQQ4AACAjCHAAQAAZAwBDgAAIGMIcAAAABlDgAMAAMgYAhwA\nAEDGEOAAAAAyhgAHAACQMQQ4AACAjCHAAQAAZAwBDgAAIGNaIsCZ2ZVm9oyZTZvZzgr732dmh83s\n8fD2gdi+68xsKrxdl2zNAQAAkndy2hUws5Mk3S7pMkkHJT1iZnvd/amyQwvu/pGyx66T9ElJfZJc\n0qPhY19OoOoAAACpaIUWuIskTbv7s+4+J+lOSVtrfOwVku5196NhaLtX0pVNqicAAEBLaIUAt0nS\n87Htg2FZuXeZ2ZNmdreZnVHnY2VmN5jZpJlNHj58uBH1BgAASEUrBLha/K2ks9z93ypoZftavb/A\n3e9w9z5379uwYUPDKwgkpThb1K7HdumWe2/Rrsd2qThbTLtKAICEpd4HTtILks6IbZ8elh3n7kdi\nm7skfS722IvLHvtQw2sItIh9z+3TwO4BlbykmfkZda3q0k333KSJaye0ZfOWtKsHAEhIK7TAPSKp\n18zONrNOSdsl7Y0fYGYbY5tXSXo6vH+PpMvNbK2ZrZV0eVgG5E5xtqiB3QMqzhU1Mz8jSZqZn1Fx\nLig/Nncs5RoCAJKSeoBz91ckfURB8Hpa0l3ufsDMbjWzq8LDPmpmB8zsCUkflfS+8LFHJX1aQQh8\nRNKtYRmQO4UDBZW8VHFfyUsq7C8kXCMAQFpa4RKq3H1C0kRZ2Sdi9z8m6WNVHvsVSV9pagWBFjB1\nZOp4y1u5mfkZTR+dTrhGAIC0pN4CB6A2vet71bWqq+K+rlVd6lnXk3CNAABpIcABGTF8zrA6rPJH\ntsM6NHzucMI1qsBd2rMn+FlLOQDgNSHAARnRvbpbE9dOqLuz+3hLXNeqLnV3BuVrOtekXENJ4+PS\n0JB0440nwpp7sD00FOwHAKxYS/SBA1CbLZu36NDNh1TYX9D00Wn1rOvR8LnDrRHeJGlwUNqxQxod\nDbZHRoLwNjoalA8Opls/AMgJ8za8pNHX1+eTk5NpVwPIp6jFLQpxUhDeRkYks/TqBQAZYGaPunvf\nsscR4AA0nLvUEeuhUSoR3gCgBrUGOPrA5RkdypGGqAUuLt4nDgCwYgS4PKNDOZIWv3y6Y0fQ8hb1\niSPEAUDDMIghz+hQjqSNj594f0V93kZGgn2jo1J/v7RtW7p1BIAcoA9c3tGhHElyD0Lc4ODC91e1\ncgDAAgxiWEJbBTiJDuUAAGQEgxgQoEM5AAC5Q4DLMzqUAwCQSwxiyDM6lAMAkEsEuDwbHJTGxhZ2\nHI9CXH8/
o1ABAMgoLqHmmVnQwlY+YKFaOdobEz8DQGYQ4ACoOFvUri9/WLf81yHtuuUyFX/+/4Id\nTPyce8XZonY9tku33Hu
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f68c603de50>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"fig = plot_data_for_classification(X2, Y2, xlabel=r'$x_1$', ylabel=r'$x_2$')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def safeSigmoid(x, eps=0):\n",
|
|||
|
" y = 1.0/(1.0 + np.exp(-x))\n",
|
|||
|
" if eps > 0:\n",
|
|||
|
" y[y < eps] = eps\n",
|
|||
|
" y[y > 1 - eps] = 1 - eps\n",
|
|||
|
" return y\n",
|
|||
|
"\n",
|
|||
|
"def h(theta, X, eps=0.0):\n",
|
|||
|
" return safeSigmoid(X*theta, eps)\n",
|
|||
|
"\n",
|
|||
|
"def J(h,theta,X,y, lamb=0):\n",
|
|||
|
" m = len(y)\n",
|
|||
|
" f = h(theta, X, eps=10**-7)\n",
|
|||
|
" j = -np.sum(np.multiply(y, np.log(f)) + \n",
|
|||
|
" np.multiply(1 - y, np.log(1 - f)), axis=0)/m\n",
|
|||
|
" if lamb > 0:\n",
|
|||
|
" j += lamb/(2*m) * np.sum(np.power(theta[1:],2))\n",
|
|||
|
" return j\n",
|
|||
|
"\n",
|
|||
|
"def dJ(h,theta,X,y,lamb=0):\n",
|
|||
|
" g = 1.0/y.shape[0]*(X.T*(h(theta,X)-y))\n",
|
|||
|
" if lamb > 0:\n",
|
|||
|
" g[1:] += lamb/float(y.shape[0]) * theta[1:] \n",
|
|||
|
" return g\n",
|
|||
|
"\n",
|
|||
|
"def classifyBi(theta, X):\n",
|
|||
|
" prob = h(theta, X)\n",
|
|||
|
" return prob"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Metoda gradientu prostego dla regresji logistycznej\n",
|
|||
|
"def GD(h, fJ, fdJ, theta, X, y, alpha=0.01, eps=10**-3, maxSteps=10000):\n",
|
|||
|
" errorCurr = fJ(h, theta, X, y)\n",
|
|||
|
" errors = [[errorCurr, theta]]\n",
|
|||
|
" while True:\n",
|
|||
|
" # oblicz nowe theta\n",
|
|||
|
" theta = theta - alpha * fdJ(h, theta, X, y)\n",
|
|||
|
" # raportuj poziom błędu\n",
|
|||
|
" errorCurr, errorPrev = fJ(h, theta, X, y), errorCurr\n",
|
|||
|
" # kryteria stopu\n",
|
|||
|
" if abs(errorPrev - errorCurr) <= eps:\n",
|
|||
|
" break\n",
|
|||
|
" if len(errors) > maxSteps:\n",
|
|||
|
" break\n",
|
|||
|
" errors.append([errorCurr, theta]) \n",
|
|||
|
" return theta, errors"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"theta = [[ 1.37136167]\n",
|
|||
|
" [ 0.90128948]\n",
|
|||
|
" [ 0.54708112]\n",
|
|||
|
" [-5.9929264 ]\n",
|
|||
|
" [ 2.64435168]\n",
|
|||
|
" [-4.27978238]]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Uruchomienie metody gradientu prostego dla regresji logistycznej\n",
|
|||
|
"theta_start = np.matrix(np.zeros(X2.shape[1])).reshape(X2.shape[1],1)\n",
|
|||
|
"theta, errors = GD(h, J, dJ, theta_start, X2, Y2, \n",
|
|||
|
" alpha=0.1, eps=10**-7, maxSteps=10000)\n",
|
|||
|
"print('theta = {}'.format(theta))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Wykres granicy klas\n",
|
|||
|
"def plot_decision_boundary(fig, theta, X):\n",
|
|||
|
" ax = fig.axes[0]\n",
|
|||
|
" xx, yy = np.meshgrid(np.arange(-1.0, 1.0, 0.02),\n",
|
|||
|
" np.arange(-1.0, 1.0, 0.02))\n",
|
|||
|
" l = len(xx.ravel())\n",
|
|||
|
" C = powerme(xx.reshape(l, 1), yy.reshape(l, 1), n)\n",
|
|||
|
" z = classifyBi(theta, C).reshape(int(np.sqrt(l)), int(np.sqrt(l)))\n",
|
|||
|
"\n",
|
|||
|
" plt.contour(xx, yy, z, levels=[0.5], lw=3);"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"matrix([[0],\n",
|
|||
|
" [1],\n",
|
|||
|
" [0],\n",
|
|||
|
" [0],\n",
|
|||
|
" [0],\n",
|
|||
|
" [0],\n",
|
|||
|
" [1],\n",
|
|||
|
" [1],\n",
|
|||
|
" [1],\n",
|
|||
|
" [0]])"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"Y_predicted = (classifyBi(theta, X2) > 0.5).astype(int)\n",
|
|||
|
"Y_predicted[:10]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "notes"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Przygotowanie interaktywnego wykresu\n",
|
|||
|
"\n",
|
|||
|
"dropdown_highlight = widgets.Dropdown(options=['all', 'tp', 'fp', 'tn', 'fn'], value='all', description='highlight')\n",
|
|||
|
"\n",
|
|||
|
"def interactive_classification(highlight):\n",
|
|||
|
" fig = plot_data_for_classification(X2, Y2, xlabel=r'$x_1$', ylabel=r'$x_2$',\n",
|
|||
|
" Y_predicted=Y_predicted, highlight=highlight)\n",
|
|||
|
" plot_decision_boundary(fig, theta, X2)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"application/vnd.jupyter.widget-view+json": {
|
|||
|
"model_id": "d66ed3534beb4ceb998f948bbb7f348c",
|
|||
|
"version_major": 2,
|
|||
|
"version_minor": 0
|
|||
|
},
|
|||
|
"text/html": [
|
|||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
|||
|
"<p>\n",
|
|||
|
" If you're reading this message in Jupyter Notebook or JupyterLab, it may mean\n",
|
|||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
|||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
|||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
|||
|
" Widgets Documentation</a> for setup instructions.\n",
|
|||
|
"</p>\n",
|
|||
|
"<p>\n",
|
|||
|
" If you're reading this message in another notebook frontend (for example, a static\n",
|
|||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
|||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
|||
|
"</p>\n"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"interactive(children=(Dropdown(description=u'highlight', options=('all', 'tp', 'fp', 'tn', 'fn'), value='all'), Output()), _dom_classes=('widget-interact',))"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<function __main__.interactive_classification>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"widgets.interact(interactive_classification, highlight=dropdown_highlight)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"Dokładność (_accuracy_):\n",
|
|||
|
"$$ accuracy = \\frac{tp + tn}{tp + fp + tn + fn} $$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"Precyzja (_precision_):\n",
|
|||
|
" $$ precision = \\frac{tp}{tp + fp} $$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"Pokrycie (_recall_):\n",
|
|||
|
" $$ recall = \\frac{tp}{tp + fn} $$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"_$F$-measure_:\n",
|
|||
|
"$$ F = \\frac{2 \\cdot precision \\cdot recall}{precision + recall} $$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"_$F_\\beta$-measure_:\n",
|
|||
|
"$$ F_\\beta = \\frac{(1 + \\beta^2) \\cdot precision \\cdot recall}{\\beta^2 \\cdot precision + recall} $$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* $F = F_1$"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "slide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"## 4.4. Analiza głównych składowych"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Redukcja liczby wymiarów\n",
|
|||
|
"\n",
|
|||
|
"Z jakich powodów chcemy redukować liczbę wymiarów?"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Chcemy pozbyć się nadmiarowych cech, np. „długość w cm” / „długość w calach”, „długość” i „szerokość” / „powierzchnia”."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Chcemy znaleźć bardziej optymalną kombinację cech."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Chcemy przyspieszyć działanie algorytmów."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "fragment"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Chcemy zwizualizować dane."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Błąd rzutowania\n",
|
|||
|
"\n",
|
|||
|
"**Błąd rzutowania** – błąd średniokwadratowy pomiędzy danymi oryginalnymi a danymi zrzutowanymi."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Sformułowanie problemu\n",
|
|||
|
"\n",
|
|||
|
"**Analiza głównych składowych** (_Principal Component Analysis_, PCA):\n",
|
|||
|
"\n",
|
|||
|
"Zredukować liczbę wymiarów z $n$ do $k$, czyli znaleźć $k$ wektorów $u^{(1)}, u^{(2)}, \\ldots, u^{(k)}$ takich, że rzutowanie danych na podprzestrzeń rozpiętą na tych wektorach minimalizuje błąd rzutowania."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"* Analiza głównych składowych to zupełnie inne zagadnienie niż regresja liniowa!"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"### Algorytm PCA\n",
|
|||
|
"\n",
|
|||
|
"1. Dany jest zbiór składający się z $x^{(1)}, x^{(2)}, \\ldots, x^{(m)} \\in \\mathbb{R}^n$.\n",
|
|||
|
"1. Chcemy zredukować liczbę wymiarów z $n$ do $k$ ($k < n$).\n",
|
|||
|
"1. W ramach wstępnego przetwarzania dokonujemy skalowania i normalizacji średniej.\n",
|
|||
|
"1. Znajdujemy macierz kowariancji:\n",
|
|||
|
" $$ \\Sigma = \\frac{1}{m} \\sum_{i=1}^{m} \\left( x^{(i)} \\right) \\left( x^{(i)} \\right)^T $$\n",
|
|||
|
"1. Znajdujemy wektory własne macierzy $\\Sigma$ (rozkład SVD):\n",
|
|||
|
" $$ (U, S, V) := \\mathop{\\rm SVD}(\\Sigma) $$\n",
|
|||
|
"1. Pierwszych $k$ kolumn macierzy $U$ to szukane wektory."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"# Algorytm PCA - implementacja\n",
|
|||
|
"def pca(X, k):\n",
|
|||
|
" X_std = StandardScaler().fit_transform(X) # normalizacja\n",
|
|||
|
" mean_vec = np.mean(X_std, axis=0)\n",
|
|||
|
" cov_mat = np.cov(X_std.T) # macierz kowariancji\n",
|
|||
|
" n = cov_mat.shape[0]\n",
|
|||
|
" eig_vals, eig_vecs = np.linalg.eig(cov_mat) # wektory własne\n",
|
|||
|
" eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])\n",
|
|||
|
" for i in range(len(eig_vals))]\n",
|
|||
|
" eig_pairs.sort()\n",
|
|||
|
" eig_pairs.reverse()\n",
|
|||
|
" matrix_w = np.hstack([eig_pairs[i][1].reshape(n, 1)\n",
|
|||
|
" for i in range(k)]) # wybór\n",
|
|||
|
" return X_std.dot(matrix_w) # transformacja"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {
|
|||
|
"slideshow": {
|
|||
|
"slide_type": "subslide"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAArwAAAGdCAYAAADwoqBNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3X1wZFd55/Hf0bzJavVuajGLh/Z4\nbSJlEufFpBhYyCq7YbGx3ZOCDR7TcajFLsQ6AYdFVigkQyrsLktmMqmEVTFOEdOkMFsmNDVOKoQR\nBrtgk+mtIptxyiQYPJbAIXZk1gYqy+0ez4zHffaPmRYa6fb7vffce+73UzVlq7vVffpKOvfpc5/z\nPMZaKwAAAMBXY64HAAAAAMSJgBcAAABeI+AFAACA1wh4AQAA4DUCXgAAAHiNgBcAAABeI+AFAACA\n1wh4AQAA4DUCXgAAAHhtu+sBDOPSSy+1V155pethAAAAwKGHH374u9baF/d6XCYD3iuvvFInTpxw\nPQwAAAA4ZIz5dj+PI6UBAAAAXiPgBQAAgNcIeAEAAOA1Al4AAAB4jYAXAAAAXiPgBQAAgNcIeAEA\nAOA1Al4AAAB4jYAXAAAAXiPgBQAAgNcIeAEAAOC17a4HAABpFQSBarWaVlZWND09rUqlomKx6HpY\nAIABEfACQIh6va5yuaxWq6Vms6lCoaD5+XktLy9rZmbG9fAAAAMgpQEANgmCQOVyWUEQqNlsSpKa\nzeb67Y1Gw/EIAQCDIOAFgE1qtZparVbofa1WS7VaLeERAQBGQcALAJusrKysr+xu1mw2tbq6mvCI\nAACjIOAFgE2mp6dVKBRC7ysUCpqamkp4RACAURDwAsAmlUpFY2Ph0+PY2JgqlUrCIwIAjIKAFwA2\nKRaLWl5eVrFYXF/pLRQK67dPTk46HiEAYBCUJQOAEDMzM1pbW1OtVtPq6qqmpqZUqVQIdgEggwh4\nAaCDyclJzc7Ouh4GAGBEpDQAAADAawS8AAAA8BoBLwAAALxGwAsAAACvEfACAADAawS8AAAA8BoB\nLwAAALxGwAsAAACvEfACAADAawS8AAAA8BqtheFMEASq1WpaWVnR9PS0KpWKisWi62EBAADPEPDC\niXq9rnK5rFarpWazqUKhoPn5eS0vL2tmZsb18AAAgEdIaUDigiBQuVxWEARqNpuSpGazuX57o9Fw\nPEIAAOATAl4krlarqdVqhd7XarVUq9USHhEAAPAZAS8St7Kysr6yu1mz2dTq6mrCIwIAAD4j4EXi\npqenVSgUQu8rFAqamppKeEQAAMBnBLxIXKVS0dhY+K/e2NiYKpVKwiMCAAA+I+BF4orFopaXl1Us\nFtdXeguFwvrtk5OTjkcIAAB8QlkyODEzM6O1tTXVajWtrq5qampKlUqFYBcAAESOgBfOTE5OanZ2\n1vUwAACA50hpAAAAgNcIeAEAAOA1Al4AAAB4jYAXAAAAXiPgBQAAgNcIeAEAAOA1Al4AAAB4jYAX\nAAAAXiPgBQAAgNfotAYgs4IgUK1W08rKiqanp1WpVFQsFl0PCwCQMgS8ADKpXq+rXC6r1Wqp2Wyq\nUChofn5ey8vLmpmZcT08AECKOE9pMMbsMcZ82RjzdWPMo8aYd7seE4B0C4JA5XJZQRCo2WxKkprN\n5vrtjUbD8QgBAGniPOCVdE7Sb1hrr5b0akl3GGOudjwmAClWq9XUarVC72u1WqrVagmPCACQZs4D\nXmvt09bav7nw/4Gkb0gquR0VgDRbWVlZX9ndrNlsanV1NeERAQDSzHnAu5Ex5kpJPyvpr0Luu90Y\nc8IYc+LZZ59NemgAUmR6elqFQiH0vkKhoKmpqYRHBABIM2OtdT0GSZIxZlLSX0j6kLX2T7o9dt++\nffbEiRPJDAxA6gRBoFKppCAIttxXLBa1tram
yclJByMDACTJGPOwtXZfr8elYoXXGLND0v2S7usV\n7AJAsVjU8vKyisXi+kpvoVBYv51gFwCwkfOyZMYYI+njkr5hrf191+MBkA0zMzNaW1tTrVbT6uqq\npqamVKlUCHYBAFs4D3gl/RtJ/1HS3xljHrlw2/ustcsOxwQgAyYnJzU7O+t6GACAlHMe8Fpr65KM\n63EAAADAT84DXgBoo1UwACAOBLwAUoFWwQCAuKSiSgOAfKNVMAAgTgS8AJyjVTAAIE6kNABwjlbB\n6UVeNQAfEPACcK7dKjgs6KVV8OiGDVrJqwbgi9S0Fh4ErYUBv9AqOD5hQevY2FjPoJWfCYAsyFRr\nYQD5RqvgeIyyGZC8agA+IaUBQCrQKjh6/QStnTrVkVe9FfnMQHYR8AJIDVoFR2uUoJW86ouRzwxk\nGykNAOCpdtAaplfQWqlUNDYWfooYGxtTpVKJZIxZQJ1oIPsIeAHAU6MEreRV/xD5zED2kdIAAJ5q\nB6edqjT0ClrJqz6PfGYg+wh4AcBjowatec+rDoJA3/nOd7R9+3adO3duy/15zGcGsog6vAAAhGhv\nVHvhhRd06tSp0MdQkxhwq986vKzwAgCwycaNamEmJia0bdu23OUzA1lFwAsAEaJWqx+6bVTbsWOH\nbr75Zh05coRgF8gIAl4AiMggtVrjDIwJukfXbaPa888/r927dxPsAhlCwAsAEQi7BN4OmMrl8kV5\nnnE2MaBBQjRovAH4hTq8AFIrCAJVq1UtLCyoWq12zKdMg35rtcbZxIAGCdGh8QbgFwJeAKlUr9dV\nKpU0Nzenw4cPa25uTqVSSfV63fXQQvVbqzXOJgY0SIgOjTcAv5DSACB1BkkPSIt+L4HH2cSABgnR\novEG4A9WeAGkThZXKvu9BN4OjMOMmhsa53PnVbvxxsGDBzU7O0uwC2QUAS+A1MniSmW/l8DjzA0l\n7xQAwpHSACB1srpDvp9L4O0AeHMlhbGxsZFzQ+N8bgDIMloLA0idIAhUKpVCqzL40sq10WjElhsa\n53MDQJr021qYgBdAKoXVk22vVFJPFgAg9R/wktIAIJXYIQ8AiAoBL4DUau+QBwZFe2UAGxHwAgC8\nQntlAJtRlgwA4A3aKwMIQ8ALABkTBIGq1aoWFhZUrVZDq1nkVRablgCIHykNAJAhXK7vLotNSwDE\nj4AXAFKgn01WGy/Xt7WDu3K57EV94lFltWkJgHiR0gAAjtXrdZVKJc3Nzenw4cOam5tTqVRSvV6/\n6HFcru+N9soAwhDwAoBDg2yy4nJ9b+32ysViUYVCQdL5ld327XlfAQfyipQGAHCon1Xbdi1iLtf3\nh6YlADYj4AUAhwZZta1UKpqfnw99LJfrL0bTEgAbkdIAAA61V23DbF615XI9AAzHWGtdj2Fg+/bt\nsydOnHA9DAAYWRAEKpVKobV0i8ViaOWFRqPB5XoAkGSMedhau6/X40hpAACH2quzm2vrjo2NdVy1\n5XI9AAyGgBcAHGOTVTb0UysZQDqR0gAAQA9hHe7aq/B0uAPc6TelgU1rAAB0MUitZADpRMALABkT\nBIGq1aoWFhZUrVZDN7whOnS4A7KPHF4AyJCwS+vz8/NcWo8RHe6A7CPg9QSbKQD/bby03tYOxMrl\ncmgJM4yODndA9pHS4IF6va5SqaS5uTkdPnxYc3NzKpVKqtfrrocGIEJcWnejUqlobCz8dEmHOyAb\nWOHNOFZ8ADdcXFXh0robw9RKBpAuBLwZ18+KDwXqgWi5yqPl0ro71EoGso2AN+NY8QGS1e2qyo03\n3qiDBw/qySefjGXVt1KpaH5+PvQ+Lq3Hjw53QHaRw5tx7RWfMKz4ANHrdlWl0WjoPe95T2y59O1L\n68Vicf3vvlAorN/OaiMAhHPeac0Y80eSflHSM9ban+rne+i09kNBEKhUKoXW4SwWi+TwAhFbWFjQ\n4cOH+358
HH+HjUZjy6V1ay2VWgDkTr+d1tKQ0vAJSUckfdLxODKJzRRAsrrl0YaJI5d+86V1avMC\nQHfOA15r7V8aY650PY4
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.figure.Figure at 0x7f0ead9d0a10>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X_pca = pca(X, 2)\n",
|
|||
|
"fig = plot_unlabeled_data(X_pca)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"celltoolbar": "Slideshow",
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 2
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython2",
|
|||
|
"version": "2.7.15rc1"
|
|||
|
},
|
|||
|
"livereveal": {
|
|||
|
"start_slideshow_at": "selected",
|
|||
|
"theme": "amu"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|