Python2018/labs07/sklearn.ipynb

486 lines
25 KiB
Plaintext
Raw Normal View History

2018-06-03 08:21:45 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Analiza danych w Pythonie: sklearn\n",
"\n",
"### Tomasz Dwojak\n",
"\n",
"### 3 czerwca 2018"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
" * Pierwsza część: pandas\n",
" * Druga część: sklearn"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Przypomnienie z UMZ\n",
" * przygotowanie i czyszczenie danych\n",
" * wybór i trening modelu\n",
" * tuning\n",
" * ewaluacja"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import sklearn\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"data = pd.read_csv(\"./gapminder.csv\", index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>female_BMI</th>\n",
" <th>male_BMI</th>\n",
" <th>gdp</th>\n",
" <th>population</th>\n",
" <th>under5mortality</th>\n",
" <th>life_expectancy</th>\n",
" <th>fertility</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Afghanistan</th>\n",
" <td>21.07402</td>\n",
" <td>20.62058</td>\n",
" <td>1311.0</td>\n",
" <td>26528741.0</td>\n",
" <td>110.4</td>\n",
" <td>52.8</td>\n",
" <td>6.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Albania</th>\n",
" <td>25.65726</td>\n",
" <td>26.44657</td>\n",
" <td>8644.0</td>\n",
" <td>2968026.0</td>\n",
" <td>17.9</td>\n",
" <td>76.8</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Algeria</th>\n",
" <td>26.36841</td>\n",
" <td>24.59620</td>\n",
" <td>12314.0</td>\n",
" <td>34811059.0</td>\n",
" <td>29.5</td>\n",
" <td>75.5</td>\n",
" <td>2.73</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Angola</th>\n",
" <td>23.48431</td>\n",
" <td>22.25083</td>\n",
" <td>7103.0</td>\n",
" <td>19842251.0</td>\n",
" <td>192.0</td>\n",
" <td>56.7</td>\n",
" <td>6.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Antigua and Barbuda</th>\n",
" <td>27.50545</td>\n",
" <td>25.76602</td>\n",
" <td>25736.0</td>\n",
" <td>85350.0</td>\n",
" <td>10.9</td>\n",
" <td>75.5</td>\n",
" <td>2.16</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" female_BMI male_BMI gdp population \\\n",
"Afghanistan 21.07402 20.62058 1311.0 26528741.0 \n",
"Albania 25.65726 26.44657 8644.0 2968026.0 \n",
"Algeria 26.36841 24.59620 12314.0 34811059.0 \n",
"Angola 23.48431 22.25083 7103.0 19842251.0 \n",
"Antigua and Barbuda 27.50545 25.76602 25736.0 85350.0 \n",
"\n",
" under5mortality life_expectancy fertility \n",
"Afghanistan 110.4 52.8 6.20 \n",
"Albania 17.9 76.8 1.76 \n",
"Algeria 29.5 75.5 2.73 \n",
"Angola 192.0 56.7 6.43 \n",
"Antigua and Barbuda 10.9 75.5 2.16 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"# Feature matrix: every column except the regression target.\n",
"X = data.drop(columns=['life_expectancy'])\n",
"# Target vector: the life_expectancy column.\n",
"y = data['life_expectancy']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.\n",
"train_X, test_X, train_y, test_y = train_test_split(\n",
"    X, y, test_size=0.2, random_state=123, shuffle=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"\n",
"# Fit on the training split only. The following cells evaluate on test_X/test_y,\n",
"# so fitting on the full X, y would leak the held-out rows into training and\n",
"# make the reported RMSE / R^2 overly optimistic.\n",
"model = LinearRegression()\n",
"model.fit(train_X, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([67.56279809, 76.25840076, 50.21126326, 59.21303855, 72.06348723])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predicted = model.predict(test_X)\n",
"predicted[:5]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 3.5179543848147863\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
"# mean_squared_error expects (y_true, y_pred); RMSE is the square root of the MSE,\n",
"# expressed in the same units as the target (years of life expectancy).\n",
"rmse = np.sqrt(mean_squared_error(test_y, predicted))\n",
"print(\"RMSE:\", rmse)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.795295000468209"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Coefficient of determination (R^2) of the model on the held-out test split.\n",
"r2 = model.score(test_X, test_y)\n",
"# Echo the value so the cell displays it, matching the stored output.\n",
"r2"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"#### API\n",
" * model\n",
" * `fit`\n",
" * `predict`"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"female_BMI: -1.18\n",
"male_BMI: 1.46\n",
"gdp: 5.11e-05\n",
"population: 7.21e-10\n",
"under5mortality: -0.159\n",
"fertility: 0.421\n"
]
}
],
"source": [
"# Print each feature name next to its fitted coefficient (3 significant digits).\n",
"for feature_name, coefficient in zip(train_X.columns, model.coef_):\n",
"    print(\"{}: {:.3}\".format(feature_name, coefficient))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead\n",
" \n"
]
},
{
"data": {
"text/plain": [
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Single-feature model: life expectancy explained by male_BMI alone.\n",
"# Series.reshape is deprecated in pandas (see the FutureWarning), so reshape the\n",
"# underlying ndarray into the (n_samples, 1) matrix that fit() expects.\n",
"model2 = LinearRegression()\n",
"model2.fit(train_X['male_BMI'].values.reshape(-1, 1), train_y)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.5852413468462743"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model2.intercept_"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3.6/site-packages/ipykernel_launcher.py:5: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead\n",
" \"\"\"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xt8VPWZ+PHPMyGxAZQ73hOsom6VQhVv21arWF4KrkVpXWyw/LCVtfqr4OXX2k1XUUtrtStod7Wya2teJLogQmm7oAXUpVsrysUAai1qCa0XkABBJAJJnt8fc2EmmcuZmTNz5px53r58kXzn9j0zmed8z3Oe7/eIqmKMMcb/Ql53wBhjjDssoBtjTEBYQDfGmICwgG6MMQFhAd0YYwLCAroxxgSEBXRjjAkIC+jGGBMQFtCNMSYgehXzxQYPHqzDhg0r5ksaY4zvrV27doeqDsl0v6IG9GHDhrFmzZpivqQxxvieiLQ4uZ+lXIwxJiAsoBtjTEBYQDfGmICwgG6MMQFhAd0YYwLCAroxxgSEBXRjjAkIC+jGGFMgqsodd9zB8uXLi/J6RZ1YZIwx5aCjo4OpU6fS2NgIgIjQ1dVV8Ne1Eboxxrikvb2dsWPHUllZGQvm55xzDm1tbUV5fRuhG2NMntra2rjgggtobm6OtV1++eUsWLCAww47rGj9sBG6Mcbk6IMPPuCYY46hf//+sWB+3XXX0dHRwZIlS4oazMECujHGZO3tt9+mV69eHH300bz//vsA1NfX09XVxdy5c6moqPCkX5ZyMcYYh5qbmxk1alRC25w5c5g+fbpHPUpkAd0YYzJYtWoVF1xwQUJbY2MjdXV1HvUoOQvoxhiTwq9+9SuuuOKKhLalS5dy6aWXetSj9CyHbowx3Tz22GOISEIwf/HFF1HVkg3mYAHdGGNi7r33XkSEb33rW7G21157DVXlvPPO87BnzljKxRhT1lSVW2+9ldmzZ8faDj/8cDZt2kRNTY2HPcueBXRjTFnq6OhgypQpPPHEE7G2E088kZdeeonBgwd72LPcWUA3xpSV9vZ2Lr/8clasWBFrO++88/jd735H3759PexZ/iygG2PKwu7duzn//PPZuHFjrG3ChAnMnz+fqqoqD3vmHjspaowJtPfff5+jjjqKAQMGxIL5tGnT6OzsZPHixYEJ5mAB3RgTUG+99RahUIhjjjmGbdu2AfAv//IvdHV18eijjxIKBS/8BW+LjAGaNjYxbM4wQneFGDZnGE0bm7zuUsGU07Y6sX79ekSE4cOHo6oAPPTQQ6gqd999NyLicQ8Lx3LoJnCaNjYx7TfT2HdwHwAtbS1M+800AOpGlNZU7XyV07Zm8sILL3DhhRcmtD3xxBNcffXVHvWo+GyEbgKnfmV9LMBF7Tu4j/qV9UV5/WKOmL3e1lKwePFiRCQhmD/zzDOoalkFc7ARugmgrW1bs2p3U7FHzF5uq9f+4z/+g2nTpiW0/fGPf+Tcc8/1qEfesxG6CZyafsln96Vqd5OTEbObI3in2xqkPPuPfvQjRCQhmL/++uuoalkHc7CAbgJo1phZ9K7sndDWu7I3s8bMKvhrZxoxR0fwLW0tKBobwecaYJ1sq9uv6QVVZcaMGYgI9fXhnWO/fv3YunUrqsrf/d3fedzD0mAB3QRO3Yg65v7DXGr71SIItf1qmfsPc11JeWQa6WYaMbud83ayrdOXTfdtnr2jo4Orr76aUCjEgw8+CMDw4cPZsWMHu3fv5vjjj/e4h6VFomU9xTB69Ghds2ZN0V7PGDd1z49DeDQcH0Az3Sd0Vwil53dOELru7HK9v9OXTae1vTXp7YV4Tbe0t7dz2WWX8dxzz8XaPv/5z/PMM8/4fnp+LkRkraqOznQ/G6Eb45CT0XWmEXOx8vtNG5u4dsm1KYM5QEhCJZdT3717NyNGjKB3796xYH7FFVewf/9+/vd//7csg3k2LK
Ab45DTipK6EXVsmbGFrju72DJjS0L6w838frr0z/Rl0znQeSDt4zu1s2Ry6u+99x5Dhw5lwIABbNq0CYDrr7+ezs5OFi1aFKjp+YVkAd0Yh1KNorMZ6daNqGPKyClUSPiq8BVSwZSRU7LO72c60ZluZJ7MvoP7mLJ4StGD+ubNmxERjj32WD788EMA7rjjDrq6unjkkUcCOT2/kBy9WyJys4i8JiKbRORJEfmUiJwgIqtF5C0RmS8itgs1gZZsdA3ZjXSbNjbR0NxAp3bGHtvQ3JB1IE2V/pm+LPerz3dqZ9FG6uvWrUNEOPnkk2Nt//Zv/4aqctdddwV6en4hZQzoInIscBMwWlVPByqAScBPgNmqehKwC/hmITtqjNe658ejo+x4mapH3KpySZX+aW1vpWljE4OqB2X1fPn0JRvPP/88IsKZZ54Za3vyySdRVW688caCvW65cHo80wuoFpFeQG/gfeAiYGHk9gZggvvdM6a0xOfHuzR5hUi6WZotbS1p251OAEp3ErV+ZT1XnXZVj/bKUKWjQF+IWaZPP/00IsJFF10Ua3v22WdRVSZNmuT665WrjAFdVd8FfgpsJRzI24C1wG5V7Yjc7W/AsckeLyLTRGSNiKyJ5siMCYJcKlaSjeohXEI4+L7BTF402dEEoHQnUVvaWmhobujx/N8641vsbN+Z8nFO+p+tuXPnIiJ89atfjbWtXr0aVWXs2LGuvY4Jc5JyGQB8BTgBOAboA1zi9AVUda6qjlbV0UOGDMm5o8Z4Id2IOZeKlWjuvDtFk57ITJUCqRtRl3K0XSEVPdI6irJ089KMwdqtGbWzZs1CRPinf/qnWNsbb7yBqnL22Wfn/fwmOScpl4uBv6jqh6p6EFgEfB7oH0nBABwHvFugPhrjiUyVJLnMSK3tV5t1P1KlQB689MGkO5RUO42tbVuT7oQEifUtnxm1XV1d3HTTTYgIP/jBDwDo378/f/3rX1FVTj311Jye1zjnJKBvBc4Vkd4SPvU8BngdeB6IHkdNAZYUpovGeMPpRKJoTn3WmFnUr6xPm/9OVSmTTqpRdaodSqqdRk2/mqSPmXflPPRO7VEz79TBgweZNGkSFRUV/OxnPwPg5JNPZseOHezatYvjjjsu6+c0ucm4fK6qrhaRhcA6oANYD8wF/hv4LxH5YaTtsUJ21Jhiy2ZpWqfL5kZ/rl9Zz9a2rdT0q2Hvgb0p68YzpUDqRtQlDcLdlx8QhHHDx6V9TLb27dvHZZddxvPPPx9r++IXv8iyZcvo06dP3s9vsueoykVV71TVU1X1dFW9RlX3q+o7qnq2qp6kql9T1f2F7qwpvCAts5qvbE56ZlOO2H0mabLUCcCg6kE91olx8tlEJy9FUykQzqHnUu+ezK5duzjttNPo06dPLJhPnDiR/fv3s2rVKgvmHrJpWCbGT8usFmPHk81Jz3wuNJEsDdJ4ZSM7vrujx6JfTj+bpZuX9lgELN8a8/fee48hQ4YwcOBAXn/9dQBuuOEGOjs7WbhwoU3PLwG22qKJGTZnWNI66dp+tWyZsaX4HUrByaqHbr5WfHpk1phZSV+j0O9dts/v5qqOf/7znznllFMS2q789pUs/PeFNqOzSGy1RZM1v1zOrJjX0Uy30FY8NxbdSnfUke6zSfY4N1Z1XLt2LSKSGMzHATPhmeOe4YlNTzh+LlMcFtBNjJeXbstGKe54cr2oRjQYy13CNYuuSZlSSfUZDKwemDQVM274uKx2MPE7hSNvPBIRYfTouAHhV4GZQKSE3C8XyCg3FtBNjJeXbstGqe54uo/mgbR5/vi8OJA2553qs4ner/vjlm5e6ngHE+vHiy3oTGX7w9tjty1fvhyZKXB6z+0t5A7UTs7nJmPZoikfyUrqUuWMvTRrzKykOfRS2vE4KWNMljrqLho0U3021yy6JuXjnJYn3nT3Texb2K0f10HtZ2q5+OKLqdlUkzR/X6gdqNMSUNOTjdBNAqc5Yy8V8pqhbn
GS53cywo0Pmsk+m0xHK6lGuqrKPffcg4iwc2Hc+i7/l3Bq5dhD/ct05Ob2aLqY50iCxqpcjCmAVFUmEK5M2dq2lZC
"text/plain": [
"<matplotlib.figure.Figure at 0x7fa465870198>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from matplotlib import pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"# Training points in green, fitted regression line in black.\n",
"# Series.reshape is deprecated in pandas, so reshape the underlying ndarray\n",
"# into the (n_samples, 1) matrix that predict() expects.\n",
"plt.scatter(train_X['male_BMI'], train_y, color='g')\n",
"plt.plot(train_X['male_BMI'], model2.predict(train_X['male_BMI'].values.reshape(-1, 1)), color='k')\n",
"plt.xlabel('male_BMI')\n",
"plt.ylabel('life_expectancy')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}