aitech-eks-pub-22/cw/07_regresja_liniowa.ipynb

1087 lines
120 KiB
Plaintext
Raw Normal View History

2022-04-26 14:03:39 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
"<div class=\"alert alert-block alert-info\">\n",
"<h1> Ekstrakcja informacji </h1>\n",
"<h2> 7. <i>Regresja liniowa</i> [ćwiczenia]</h2> \n",
"<h3> Jakub Pokrywka (2021)</h3>\n",
"</div>\n",
"\n",
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regresja liniowa"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## import bibliotek"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"plt.rcParams['figure.figsize'] = [10, 5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zbiór \n",
"\n",
"https://git.wmi.amu.edu.pl/kubapok/mieszkania2-below1m-public"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ładowanie zbioru train"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = Path('/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/mieszkania2')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR / 'names') as f_names:\n",
" names = f_names.read().rstrip('\\n').split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_train = pd.read_csv(DATA_DIR/'train/in.tsv', sep ='\\t', names=names)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>isNew</th>\n",
" <th>rooms</th>\n",
" <th>floor</th>\n",
" <th>location</th>\n",
" <th>sqrMetres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Centrum</td>\n",
" <td>78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sołacz</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" isNew rooms floor location sqrMetres\n",
"0 False 3 1 Centrum 78\n",
"1 False 3 2 Sołacz 62\n",
"2 False 3 0 Sołacz 15\n",
"3 False 4 0 Sołacz 14\n",
"4 False 3 0 Sołacz 15"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mieszkania_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR/'train'/'expected.tsv','r') as train_exp_f:\n",
" Y_train = np.array([float(x.rstrip('\\n')) for x in train_exp_f.readlines()])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([476118., 459531., 411557., ..., 320000., 364000., 209000.])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_train"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_train['price'] = Y_train"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"X_train = mieszkania_train['sqrMetres'].to_numpy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wizualizacja danych"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>isNew</th>\n",
" <th>rooms</th>\n",
" <th>floor</th>\n",
" <th>location</th>\n",
" <th>sqrMetres</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Centrum</td>\n",
" <td>78</td>\n",
" <td>476118.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sołacz</td>\n",
" <td>62</td>\n",
" <td>459531.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" <td>411557.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>14</td>\n",
" <td>496416.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" <td>406032.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1652</th>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>Grunwald</td>\n",
" <td>51</td>\n",
" <td>299000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1653</th>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Centrum</td>\n",
" <td>53</td>\n",
" <td>339000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1654</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>Stare</td>\n",
" <td>65</td>\n",
" <td>320000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1655</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Nowe</td>\n",
" <td>67</td>\n",
" <td>364000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1656</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>Grunwald</td>\n",
" <td>50</td>\n",
" <td>209000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1657 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" isNew rooms floor location sqrMetres price\n",
"0 False 3 1 Centrum 78 476118.0\n",
"1 False 3 2 Sołacz 62 459531.0\n",
"2 False 3 0 Sołacz 15 411557.0\n",
"3 False 4 0 Sołacz 14 496416.0\n",
"4 False 3 0 Sołacz 15 406032.0\n",
"... ... ... ... ... ... ...\n",
"1652 True 2 0 Grunwald 51 299000.0\n",
"1653 True 2 2 Centrum 53 339000.0\n",
"1654 True 3 4 Stare 65 320000.0\n",
"1655 True 3 1 Nowe 67 364000.0\n",
"1656 True 3 3 Grunwald 50 209000.0\n",
"\n",
"[1657 rows x 6 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mieszkania_train"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='sqrMetres', ylabel='price'>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAFICAYAAAAYvikoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAABIGUlEQVR4nO3dfZgU1Z0v8O9vhgEHGBx0cBAGYXgbeVExTDRIUEjAl4UrZpV7TbJiNMZr2M1Fs3qjCxezLCTZNc+Kuxvi4xKMxGRNMAoGNCouriyicYjI2zgMOiDDywDCwACDjMy5f1R3c7q6q6a6pqqruuv7eR6eYrqrq0+d6un6zXn5HVFKgYiIiIiyqyDoAhARERFFEYMwIiIiogAwCCMiIiIKAIMwIiIiogAwCCMiIiIKAIMwIiIiogDkZBAmIktF5KCIbHW4//8Uke0isk1EfuN3+YiIiIg6IrmYJ0xErgVwAsAypdToDvYdBuB3AL6ilDoqIhcppQ5mo5xEREREVnKyJUwp9RaAI/pjIjJERP4oIhtFZJ2IXBp76jsAfqaUOhp7LQMwIiIiClxOBmEWngLwPaXUWAAPAlgce3w4gOEisl5E3hGRGwMrIREREVFMl6AL4AUR6QngGgDLRST+cLfYtguAYQAmAqgAsE5ERiulmrNcTCIiIqKEvAjCYLToNSulxqR5rhHAO0qpNgANIlIHIyh7L4vlIyIiIkqSF92RSqnjMAKsGQAghitiT68AMCn2eBmM7smPgygnERERUVxOBmEi8h8ANgCoEpFGEfk2gG8C+LaIfABgG4Dpsd1fBfCpiGwHsBbAQ0qpT4MoNxEREVFcTqaoICIiIsp1OdkSRkRERJTrGIQRERERBSDnZkeWlZWpQYMGBV0MIiIiog5t3LjxsFKqT7rnci4IGzRoEGpqaoIuBhEREVGHRGS31XPsjiQiIiIKAIMwIiIiogAwCCMiIiIKAIMwIiIiogAwCCMiIiIKAIMwIiIiogD4FoSJyFIROSgiWy2eFxH5FxHZKSKbReQLfpWFiIiIKGz8bAn7JYAbbZ6/CcCw2L97Afzcx7IQERERhYpvQZhS6i0AR2x2mQ5gmTK8A6BURC72qzxEREREYRLkmLD+APZoPzfGHiMiopiGwycxb+VWNBw+GXRRiMhjQQZhkuYxlXZHkXtFpEZEag4dOuRzsYiIwuPp9Q1YtmE3nl7fEHRRiMhjQQZhjQAGaD9XANiXbkel1FNKqWqlVHWfPmnXwMxJ/AuXKL+tqz+EGx5/C+vq3f/xeNf4SswcNxB3ja/0sGREFAZBBmEvAZgZmyX5JQDHlFL7AyxP1vEvXKL8tmBVLeqaWrBgVa3rY1SW9cD86aNRWdbDw5IRURh08evAIvIfACYCKBORRgCPAigCAKXUkwBeBvAXAHYCOAXgLr/KElbxv2z5Fy5Rfpo7bQQWrKrF3Gkjgi4KEYWQKJV2GFZoVVdXq5qamqCLQURERNQhEdmolKpO9xwz5hMRRRTHpRIFi0EYEVFEcVwqUbB8GxNGREThxnGpRMFiEEZEFFHxmZdEFAx2RxIREREFgEEYERERUQAYhBEREREFgEEYERERUQAYhBERhYh5vUkv1p8konBiEEZEFCLm9Sa9WH+SiMKJQRgRUYjMnTYCVeUlifUmzT8TUf7g2pFEREREPuHakUREREQhwyCMiIiIKAAMwoiIiIgCwCCMiChCGg6fxLyVW9Fw+GTQRSGKPAZhREQR8vT6BizbsBtPr28IuihEkdcl6AIQEVH23DW+MmlLRMFhEEZEFCGVZT0wf/rooItBRGB3JBEREVEgGIQRUahxIDkR5SsGYUQUahxITkT5ikEYEYXaXeMrMXPcQA4kR260CmarjLlQF0QdYRBGRKEWH0heWdYj6KIELhdaBbNVxlyoC6KOcHYkEVGOyIX0EtkqYy7UBVFHRCkVdBkyUl1drWpqaoIuBhFlScPhk3h6fQPuGl
/J1jAiyjkislEpVZ3uOXZHElGosduJiPIVuyOJKNTY7URE+YpBGBGFGjO8E1G+YnckERERUQAYhBEREREFgEEYERERUQAYhBEREREFgEEYERERUQAYhBFRxqK4bp/X5xzFOiSiZAzCiCjBaWAQxQSqXp9zFOuQiJIxTxgRJcQDAwC2ubmimEDV63OOYh0SUTKuHUlECVynkYjIW3ZrR7IljIgSmJ2eiCh7OCaMiIiIKAAMwoiIiIgCwCCMiChDTC9BRF5gEEZElCGmlyAiL3BgPhFRhphegoi8wJYwIqIMxWeRMo0HOcUu7PAJwzVhEEZEROQzdmGHTxiuia/dkSJyI4AnABQCWKKU+onp+fMBPAvgklhZfqqUetrPMhEREWUbu7DDJwzXxLeM+SJSCGAHgCkAGgG8B+DrSqnt2j5/B+B8pdQPRKQPgDoAfZVSZ6yOy4z5RERElCvsMub72R15FYCdSqmPY0HVcwCmm/ZRAEpERAD0BHAEwOc+lomIiIgoFPwMwvoD2KP93Bh7TPdvAEYA2AdgC4DZSql2H8tEnRCGQYyUPbzeyXKpPnKprERR5mcQJmkeM/d93gBgE4B+AMYA+DcR6ZVyIJF7RaRGRGoOHTrkdTnJoTAMYqTs4fVOlkv1kUtlJYoyPwfmNwIYoP1cAaPFS3cXgJ8oY2DaThFpAHApgD/pOymlngLwFGCMCfOtxGQrDIMYKXt4vZPlUn3kUlmJoszPgfldYAzM/yqAvTAG5n9DKbVN2+fnAJqUUj8UkXIAfwZwhVLqsNVxOTCfiIiIcoXdwHzfWsKUUp+LyN8AeBVGioqlSqltInJf7PknAfwDgF+KyBYY3Zc/sAvAiIiIiPKFr3nClFIvA3jZ9NiT2v/3AbjezzIQERERhREz5nuEs5GIchN/d/3DuiWyxyDMI5yNRJSb+LvrH9YtkT1fuyOjhLORKMzW1R/CglW1mDttBCYM6xN0cUKFv7v+Yd0Gr+HwSTy9vgF3ja/kgvMh5NvsSL9wdiRR5m54/C3UNbWgqrwErz5wbdDFIaIsmbdyK5Zt2I2Z4wZi/vTRQRcnkgKZHUlE4TF32ohESxgRRQdbI8ONLWFEREREPglqAW8iorzBmX5E5DUGYRR6vPlRGHCmHxF5jUEYhR5vfhQGd42vxMxxA3N+bA3/qAkG653SYRBGoRfGmx+/UMPNj+tTWdYD86ePzvlp/vyjJhisd0qHsyMp9OI3vzCJf6ECCF3ZiNfHDmfLBYP1TukwCCNygV+o4cbrYy2Mf9REAeud0mGKCiIiIiKfMEUFERERUcgwCAsQB3cTERFFF4OwAHG2DBERUXRxYH6AOHiYKHc0HD6Jp9c34K7xlTmfpoKIwoEtYQHKl7xDRFHQ2ZZrDj8gIjMGYUSUd/wIeDqbNNjv4Qfr6g/hhsffwrr6Q74cn4i8xyCMiPKOHwGP3nLtJsjze+WHBatqUdfUggWran05PhF5j0EY5Q22BFCc3wGPmyDP7+EHc6eNQFV5CeZOG+HL8YnIe0zWSnnjhsffQl1TC6rKS/DqA9cGXRzKYxykT0ROMVkrRUIutwQEOWibA8Yz19muSSIigEEY5ZEJw/rg1QeuxYRhfYIuSsaCzBln9d4MLpxhvj8icotBWIC8uMnxRpkfzGOYsnldrcZPMbhwxu/xZ9nE7xOi7GKy1gDFb3IAMH/66MCOQcGLd2/FZfO6mt87jsmEnbGqv1zE7xOi7GIQFiAvbnK8UeanMFzXfAouyJkwfO6IooSzI4mIiIh8wtmRRERERCHDIIyIiIgoAAzCiChncPYeEeUTBmFElDOYNoOI8glnRxJRzuDsPSLKJ2wJI6Kc4fci2PmC3ba5hdcruhiEEYXQuvpDuOHxt7Cu/lDQRaEcxG7b3MLrFV0Mwog05uAnqL9QF6yqRV1TCxasqs3q+9I5YWydcFqmfFpKKQp4vaKLQRiRxhz8BPUX6txpI1BVXoK500Zk9X3DKKhgKIytE0
7L5HW3bRgD0nzCbvbo4sB8Is3caSOwYFVtIvgJaiB4Re/uuHrwBajo3T2r7xtGQa1nGMZJAEGViWtKEvmDyxZRJDQ
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_train, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pytanie - jaki jest baseline naszego systemu?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Czym jest regresja liniowa? - przypadek jednowymiarowy\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 1](obrazki/1.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 2](obrazki/2.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 3](obrazki/3.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 4](obrazki/4.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## wzór na regresję w przypadku jednowymiarowym?\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$Y = a*X_1 + b$\n",
"\n",
"$Y = w_1 * X_1 + w_0$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - napisać funkcję predict_price(sqr_metres), która zwraca cenę mieszkania zgodnie z modelem regresji liniowej (5 minut)\n",
"\n",
"Należy samemu wymyślić współczynniki modelu"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def predict_price(sqr_metres):\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"predict_price(20)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"predict_price(40)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"predict_price(55)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"predict_price(0)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"Y_train_predicted = predict_price(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mierzenie błędu"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![RMSE 2](obrazki/6.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![RMSE 2](obrazki/5.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - napisać funkcję, która liczy błąd średniokwadratowy na całym zbiorze (7 minut)\n",
"\n",
"rmse(Y_true, Y_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def rmse(Y_true, Y_predicted):\n",
" pass "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"rmse(np.array([300_000, 250_000]), np.array([300_000, 250_000]))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"rmse(np.array([305_000, 250_000]) ,np.array([300_000, 350_000]) )"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"rmse(np.array([300_000, 250_000]), np.array([330_000, 360_000]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - za pomocą rmse policzyć błąd dla baseline (3 minuty)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - za pomocą rmse policzyć błąd dla predykcji (2 minuty)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Na jakim zbiorze najlepiej sprawdzać wyniki?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![a](obrazki/7.png)\n",
"\n",
"![a](obrazki/8.png)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_dev = pd.read_csv(DATA_DIR/'dev-0'/'in.tsv', sep = '\\t', names = names)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR/'dev-0'/'expected.tsv','r') as dev_exp_f:\n",
" Y_dev = np.array([float(x.rstrip('\\n')) for x in dev_exp_f.readlines()])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_dev['price'] = Y_dev"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"X_dev = mieszkania_dev['sqrMetres'].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='sqrMetres', ylabel='price'>"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAncAAAE9CAYAAABp4UT1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAp+UlEQVR4nO3df5Sc1Xng+e8TiWAhA5aQ0BIJW+2xTPgx8Q96BQ6BeIxBSsJYTAIbzZkMWqIJG8Im2DvZBGZYswF81uzkhJg5CxnGGAT2GBTiGC0bgtsijhmGCBr/CD8UWUo6gIwimjTGCvYQhJ/9o26b6qZVKpp6u6re+n7OqVNVt9576763m9bDve9z38hMJEmSVA8/0u0OSJIkqXMM7iRJkmrE4E6SJKlGDO4kSZJqxOBOkiSpRgzuJEmSamR+tzvQK5YsWZIrV67sdjckSZIO6tFHH30+M5fO9JnBXbFy5UpGR0e73Q1JkqSDioinDvSZy7KSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mS1CFjz7/Ex+9+nLHnX+paHwzuJEmSOuSWB8e47aGnuOXBsa71wXvLSpIkdciFpw1Nee4GgztJkqQOGVqykKvWndTVPrgsK0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjlQZ3EXFpRDweEU9ExEdL2eKIGImIneV5UdPxl0fErojYERFrmspPjojHymfXR0SU8kMj4s5Svi0iVjbV2VC+Y2dEbKjyPCVJknpFZcFdRJwE/AqwGngPcE5ErAIuA7Zm5ipga3lPRJwArAdOBNYCN0TEvNLcjcBFwKryWFvKNwIvZOa7gOuAa0tbi4ErgVPK91/ZHERKkiTVVZUzd8cDf5GZ38vM/cCfA/8CWAdsKsdsAs4tr9cBd2Tmy5k5BuwCVkfEMcARmflQZiZw27Q6k23dBZxZZvXWACOZOZGZLwAjvBYQSpIk1VaVwd3jwBkRcVREHAb8LHAssCwz9wCU56PL8cuBZ5rq7y5ly8vr6eVT6pQA8kXgqBZtSZIk1dr8qhrOzO0RcS2NWbN/AL4J7G9RJWZqpkX5bOu89oURF9FY7uXtb397i65JkiT1h0oTKjLz5sx8f2aeAUwAO4G9ZamV8vxcOXw3jZm9SSuAZ0v5ihnKp9SJiPnAkeV7DtTW9P7dlJnDmTm8dOnSN3OqkiRJPaHqbNmjy/PbgZ8HPg9sASazVzcAd5fXW4D1JQN2iEbixMNl6XZfRJxarqe7YFqdybbOA+4v1+XdB5wdEYtKIsXZpUySJKnWKluWLf4oIo4CXgEuycwXIuKTwOaI2Ag8DZwPkJlPRMRm4Ekay7eXZOarpZ2LgVuBBcC95QFwM3B7ROyiMWO3vrQ1ERFXA4+U467KzIlqT1WSJKn7ojHRpeHh4RwdHe12NyRJkg4qIh7NzOGZPvMOFZIkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUI5UGdxHxsYh4IiIej4jPR8RbImJxRIxExM7yvKjp+MsjYldE7IiINU3lJ0fEY+Wz6yMiSvmhEXFnKd8WESub6mwo37EzIjZUeZ6SJEm9orLgLiKWA78BDGfmScA8YD1wGbA1M1cBW8t7IuKE8vmJwFrghoiYV5q7EbgIWFUea0v5RuCFzHwXcB1wbWlrMXAlcAqwGriyOYiUJEmqq6qXZecDCyJiPnAY8CywDthUPt8EnFterwPuyMyXM3MM2AWsjohjgCMy86HMTOC2aX
Um27oLOLPM6q0BRjJzIjNfAEZ4LSCUJEmqrcqCu8z8NvC7wNPAHuDFzPwSsCwz95Rj9gBHlyrLgWeamthdypaX19PLp9TJzP3Ai8BRLdqSJEmqtSqXZRfRmFkbAn4MWBgRv9Sqygxl2aJ8tnWa+3hRRIxGxOj4+HiLrkmSJPWHKpdlPwyMZeZ4Zr4CfAH4SWBvWWqlPD9Xjt8NHNtUfwWNZdzd5fX08il1ytLvkcBEi7amyMybMnM4M4eXLl36Jk5VkiSpN1QZ3D0NnBoRh5Xr4M4EtgNbgMns1Q3A3eX1FmB9yYAdopE48XBZut0XEaeWdi6YVmeyrfOA+8t1efcBZ0fEojKDeHYpkyRJqrX5VTWcmdsi4i7ga8B+4OvATcBbgc0RsZFGAHh+Of6JiNgMPFmOvyQzXy3NXQzcCiwA7i0PgJuB2yNiF40Zu/WlrYmIuBp4pBx3VWZOVHWukiRJvSIaE10aHh7O0dHRbndDkiTpoCLi0cwcnukz71AhSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0kaGGPPv8TH736csedf6nZXpMoY3EmSBsYtD45x20NPccuDY93uilSZyu4tK0lSr7nwtKEpz1IdGdxJkgbG0JKFXLXupG53Q6qUy7KSJEk1YnAnSZJUIwZ3kjrOjERJ6h6DO0kdZ0aiJHWPCRWSOs6MREnqHoM7SR1nRqIkdY/LspIkSTVicCdJklQjBneSJEk1YnAnSZJUI5UFdxFxXER8o+nx3Yj4aEQsjoiRiNhZnhc11bk8InZFxI6IWNNUfnJEPFY+uz4iopQfGhF3lvJtEbGyqc6G8h07I2JDVecpSZLUSyoL7jJzR2a+NzPfC5wMfA/4Y+AyYGtmrgK2lvdExAnAeuBEYC1wQ0TMK83dCFwErCqPtaV8I/BCZr4LuA64trS1GLgSOAVYDVzZHERKkiTV1Vwty54J/HVmPgWsAzaV8k3AueX1OuCOzHw5M8eAXcDqiDgGOCIzH8rMBG6bVmeyrbuAM8us3hpgJDMnMvMFYITXAkJJkqTamqvgbj3w+fJ6WWbuASjPR5fy5cAzTXV2l7Ll5fX08il1MnM/8CJwVIu2JEmSaq3y4C4ifhT4CPCHBzt0hrJsUT7bOs19uygiRiNidHx8/CDdkyRJ6n1zMXP3M8DXMnNveb+3LLVSnp8r5buBY5vqrQCeLeUrZiifUici5gNHAhMt2poiM2/KzOHMHF66dOmsT1CSJKlXzEVw9y95bUkWYAswmb26Abi7qXx9yYAdopE48XBZut0XEaeW6+kumFZnsq3zgPvLdXn3AWdHxKKSSHF2KZMkSaq1Su8tGxGHAWcB/0tT8SeBzRGxEXgaOB8gM5+IiM3Ak8B+4JLMfLXUuRi4FVgA3FseADcDt0fELhozdutLWxMRcTXwSDnuqsycqOQkJUmSekg0Jro0PDyco6Oj3e6GJEnSQUXEo5k5PNNn3qFCkiSpRgzuJEmSasTgTpIkqUYM7iRJkmrE4E6SJKlGDO4kSZJqxOBOkiSpRgzupBbGnn+Jj9/9OGPPv9TtrkiS1BaDO6mFWx4c47aHnuKWB8e63RVJktpS6e3HpH534WlDU54lSep1BndSC0NLFnLVupO63Q1JktrmsqwkSVKNGNxJkiTViMGdJElSjRjcSZIk1YjBnSRJUo0Y3EmSJNWIwZ0kSVKNGNxJkiTViMGdJElSjRjcSZIk1YjBnSRJUo1UGtxFxNsi4q6I+KuI2B4RH4iIxRExEhE7y/OipuMvj4hdEbEjItY0lZ8cEY+Vz66PiCjlh0bEnaV8W0SsbKqzoXzHzojYUOV5SpIk9YqqZ+4+BfxpZv448B5gO3AZsDUzVwFby3si4gRgPXAisBa4ISLmlXZuBC4CVpXH2lK+EXghM98FXAdcW9paDFwJnAKsBq5sDiIlSZLqqrLgLiKOAM4AbgbIzH/MzO
8A64BN5bBNwLnl9Trgjsx8OTPHgF3A6og4BjgiMx/KzARum1Znsq27gDPLrN4aYCQzJzLzBWCE1wJCdcnY8y/x8bs
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_dev, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Zadanie - policzyć rmse dla predykcji ze zbioru deweloperskiego modelu baseline i naszego modelu regresji liniowej"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Przypadek wielowymiarowy"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='floor', ylabel='price'>"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAFICAYAAAAYvikoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAyt0lEQVR4nO3de3zV9Z3v+/cnIcEI4aJCVKKAAhEExZJeqOOtBS8bptSxnoedTvHYdnuU6XjpTE87G8Z22LCnPbN31Tmt+vC4tXpmtraMrbSgVehoZSNtDQUrkAYQpEQkRLkFCCQk3/PHWunOjy4E57i+nwXf1/Px8PFJ1lrm+/2tFbI+63v5fC2EIAAAAMRV5t0BAACAFJGEAQAAOCAJAwAAcEASBgAA4IAkDAAAwAFJGAAAgIMTMgkzs0fNbIeZrTnOx/9vZrbOzNaa2f8odv8AAACOxU7EOmFmdrmkfZKeCCGMP8ZjR0v6oaRPhBB2mdnQEMKOGP0EAAA4mhNyJCyE8LKknb1vM7PzzexnZrbSzJaZ2QX5u/6jpO+FEHbl/18SMAAA4O6ETMKO4mFJfxVCmCTpbyQ9kL99jKQxZrbczH5pZte69RAAACCvj3cHPghm1l/SxyUtMLOem/vmYx9JoyVdKalW0jIzGx9C2B25mwAAAH9wUiRhyo3o7Q4hTCxwX7OkX4YQOiVtNrMm5ZKyVyP2DwAAIOOkmI4MIexVLsG6UZIs5+L83c9Iuip/+xnKTU9u8ugnAABAjxMyCTOzJyWtkFRnZs1m9kVJn5P0RTN7TdJaSTPyD39e0rtmtk7Si5K+GkJ416PfAAAAPU7IEhUAAAAnuhNyJAwAAOBERxIGAADg4ITbHXnGGWeEESNGeHcDAADgmFauXPlOCGFIoftOuCRsxIgRamho8O4GAADAMZnZlqPdx3QkAACAA5IwAAAAByRhAAAADkjCAAAAHJCEAQAAOCAJAwAAcFC0JMzMHjWzHWa25ij3m5n9k5ltNLPfmtmHitUXAACAUlPMkbDvS7r2Pe6/TtLo/H+3SnqwiH0BAAAoKUVLwkIIL0va+R4PmSHpiZDzS0mDzOysYvUHAACglHiuCRsmaWuv75vztwEuNr+zX/csXKPN7+z37goAIAGeSZgVuC0UfKDZrWbWYGYNra2tRe4WUvXY8s16YsUWPbZ8s3dXAAAJ8EzCmiWd0+v7WknbCj0whPBwCKE+hFA/ZEjBMzBPGgsatmri37+gBQ1bj/1gfKBuuXSkZk4erlsuHendFQBAAjyTsJ9ImpnfJfkxSXtCCG879qckzF/cqN3tnZq/uNG7K8kZeUY/zZ0xXiPP6OfdFQBAAopZouJJSSsk1ZlZs5l90cxuM7Pb8g95VtImSRsl/T+SZhWrLyeS2dPGalBVhWZPG+vdFQAAUEQWQsFlWCWrvr4+NDQ0eHcDAADgmMxsZQihvtB9VMwHSgS7MwEgLSRhQIlgdyYApKWPdwcA5PTsymR3JgCkgSQMKBE9uzMBAGlgOhIAAMABSRgAAIADkjAAAAAHJGEAAAAOSMKAvAde3Kgxs5/TAy9u9O4KACABJGFA3n1LN6ijq1v3Ld3g3RUAQAJIwoC8u6aMVmV5me6aMtq7KwCABFAnDMibddUozbpqlHc3AACJYCQMAADAAUkYAACAA5IwAAAAByRhJWbzO/t1z8I12vzOfu+uAACAIiIJKzGPLd+sJ1Zs0WPLN3t3BQAAFBG7I0vMLZeOzEQAAHByIgkrMSPP6Ke5M8Z7dwMAABQZ05EAAAAOSMIKYHE8AAAoNpKwAlgcDwAAio0krICp42pUV1OtqeNqvLsSnfcooHf7AADEQhJWwJJ1LWpqadOSdS3eXYnOexTQu30AAGJhd2QBKZeJ8L527/YBAIjFQgjefXhf6uvrQ0NDg3c3gA/c5nf267Hlm3XLpSM18ox+3t0BAHwAzGxlCKG+0H1MRwIlgqlYAEgL05FAiWAqFg
DSQhIGlAhOSwCAtDAdCQAA4IAkDAAAwAFJGAAAgAOSMAAAAAckYQAAAA5IwpDhfXajd/ueUr52AEgRSRgyvAuGerfvKeVrJwEFkCLqhCHDu2Cod/ueUr72ngRUErXSACSDsyMBuOPcTAAnq/c6O5KRMADuOC0AQIpYEwYAAOCAJAwAAMABSViJYZcYAABpIAkrMSmXKQAAICUszC8xKZcpAAAgJYyEFcCUIAAAKDaSsAI8pwSZjgQAIA1FnY40s2sl3S+pXNIjIYRvHXH/QEn/LOncfF/+awjhsWL26Xh4TgkyHQkAQBqKVjHfzMolrZc0VVKzpFclfTaEsK7XY/6TpIEhhK+Z2RBJTZLODCF0HO3nUjEfAACcKN6rYn4xpyM/ImljCGFTPql6StKMIx4TJFWbmUnqL2mnpMNF7BMAAEBJKGYSNkzS1l7fN+dv6+27ksZK2ibpdUl3hhC6i9inkpf6poBlG1p1zb0va9mGVu+uRJf6aw8AqSlmEmYFbjty7vMaSaslnS1poqTvmtmAP/pBZreaWYOZNbS2ntxvzqkvzJ+3qFFNLW2at6jRuyvRpf7aA0Bqirkwv1nSOb2+r1VuxKu3WyR9K+QWpm00s82SLpD0694PCiE8LOlhKbcmrGg9LgGpL8yfM32s5i1q1JzpY727El3qrz0ApKaYC/P7KLcw/5OS3lJuYf6fhxDW9nrMg5JaQgjfNLMaSb+RdHEI4Z2j/VwW5gMAgBPFey3ML9pIWAjhsJl9WdLzypWoeDSEsNbMbsvf/5Ck/yzp+2b2unLTl197rwQMAADgZFHUOmEhhGclPXvEbQ/1+nqbpKuL2QcAAIBSRMX8AlLepeZ97d7tAwAQC0lYASnvUvO+du/2AQCIpajTkScqz11qm9/Zr8eWb9Ytl47UyDP6RW/fe4eeZ/vLNrT+YWfmZaOHRG8fAJCWou2OLJaTfXfkPQvX6IkVWzRz8nDNnTHeuztJuebel9XU0qa6mmo9f/fl3t0BAJwEXHZH4t/HeyQqZSnXKAMAxMdIGAAAQJF4HeCNExC7EwEAiIMkrICUExF2JwIAEAdrwgqYv3idljbu0Lbd7Xrk5g97dycq1qQBABAHI2EFvLFjfyamZOQZ/TR3xniX8hhI17INrbrm3pe1bEOrd1cAIBqSsALmfvpC1dVUa+6nL/TuCpCEeYsa1dTSpnmLGr27AgDRkIQVcNnoIXr+7ssp2AlEMmf6WNXVVFMeBEBSWBMGwF3ugw8fegCkhZEwAAAAByRhBaRcogIAAMRBElYAtbIAAECxsSasAGplAXFtfme/Hlu+WbdcOpLyKACSwUhYAZ61spgKRYoYfQaQIpKwEuP9ZuRdNJMkNE23XDpSMycPZ/QZQFJIwkqM95uRd9FM7yQUAIBYSMJKjPexQTMmnq3K8jLNmHi2S/veSSh8kHwDSBEL85GxcPU2dXR1a+HqbZp11ajo7fckoUgLm2EApIiRsBLjvSbqS5eN1KCqCn3psvTeDL2f+5R5jwADgAeSsBJz/9L1emLFFt2/dL1L+6+/tUe72zv1+lt7XNr3xJQYACAmkrACPEdE9h06nImxpbwmy/vavXemAgDiIgkrwHNEpH/fPpkYW8rTQt7X7r0zFQAQFwvzC/BcJHznlDEaUFWR5EhU6uZMH6t5ixo1Z/pY764AACKwEIJ3H96X+vr60NDQ4N0NAACAYzKzlSGE+kL3MR0JAADggCQMAADAAUkYSgq1ugAAqSAJQ0mhVhcAIBXsjkRJ4fgaAEAqGAkrgCkxAABQbCRhBTAl5sfzuadiPQAgJpKwAryPr0nZ1HE1qqup1tRxNdHb/tunX1dTS5v+9unXo7ctMQLriecegAeSMJSUJeta1NTSpiXrWqK3fbi7OxNjYwTWD889AA8szC/g/qXr9czqbdrb3qn7brrEuztJ8VyYP37YQG3fu0Pjhw2M3rbEpgRPPPcAPJCEIWPzO/v12PLNuuXSkckd4j172j
idPajK7Y245wBxxMdzD8ADSVgBKR+i3TMtI8nlTWn+4nVa2rhD23a365GbPxy1bd6I/aSc/ANIF0lYASm/GXtPy7z
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='floor',y='price', data = mieszkania_train, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$Y = w_1 * X_1 + w_2 * X_2 + w_3 * X_3 + w_0$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie. Napisać analogiczną funkcję predict_price(sqr_metres, floor), policzyć rmse dla takiego modelu ( 7 minut)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## jak dobrać najlepsze parametry?"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7fbaa0c46760>"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAFuCAYAAAChovKPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAB7FklEQVR4nO29eZwkV3Xn+70RkVvtS1f1LnW3pO4CLYCQhIQFtBrZFsZPDDMau2WPPczAINtjS8YPBnkGMzyYBT3zQUifsY30ZA+DF8lYA0ZjQNhCNEK2BBINElpK3a1qtXqvpWvJyj0i7vvjRmTlvnVmVWbW/X4Q1RmZGXkjMuPEueee8ztCSolGo9FoOgdjrQeg0Wg0mvrQhluj0Wg6DG24NRqNpsPQhluj0Wg6DG24NRqNpsPQhluj0Wg6jI403EKIPxNCTAshXqjx9b8khHhJCPGiEOKvWj0+jUajaSWiE/O4hRDvBJaBL0spL6vy2kuArwD7pJTzQohxKeX0aoxTo9FoWkFHetxSyieAc7nbhBAXCSEeFUL8SAjxfSHEhPfUvwP+SEo5771XG22NRtPRdKThLsP9wO9IKd8KfBT4Y2/7bmC3EOIfhRBPCyFuWrMRajQaTROw1noAzUAI0Qe8HfgbIYS/OeT9tYBLgL3ANuD7QojLpJQLqzxMjUajaQpdYbhRM4cFKeWbSzx3AnhaSpkBjgohXkEZ8mdWcXwajUbTNLoiVCKlXEIZ5X8JIBRv8p7+W+AGb/sGVOhkai3GqdFoNM2gIw23EOJB4ClgjxDihBDig8CvAh8UQjwHvAi8z3v5t4E5IcRLwHeBj0kp59Zi3BqNRtMMOjIdUKPRaNYzHelxazQazXqm4xYnb7rpJvnoo4+u9TA0Go2mFYjqL+lAj3t2dnath6DRaDRrSscZbo1Go1nvaMOt0Wg0HYY23BqNRtNhaMOt0Wg0HYY23BqNRtNhaMOt0Wg0HYY23BqNRtNhaMOt0Wg0HYY23BqNRtNhtMxwV2vo60mv3iuEOCKEeF4IcWWrxqLRaDTdRCu1Sr4E/A/gy2Wefw+qocElwNuAP/H+ajRdz4HJae57Yorj83G2D/dw2zt3sXdifK2HpekQWuZxl2roW8D7UF3apZTyaWBICLG5VePRaNqFA5PTfPKRF5mOJhmKBJiOJvnkIy9yYFL3sdbUxlqqA24Fjuc8PuFtO134QiHEh4EPA1xwwQUNf6D2cjSt5t7HDvHAk0eJpR16gyYfun4nt9+4O+819z0xRcAU9ATV5dcTtIinbe57Ykr/HjU1sZaLk6XkC0t2dZBS3i+lvEpKedXY2FhDH6a9HE2rufexQ9zz+BESGQfLgETG4Z7Hj3DvY4fyXnd8Pk4kYOZtiwRMTszHV3O4mg5mLQ33CWB7zuNtwKlWfViulyOE+hswBfc9odtPaprDA08exRBgGQaGMLy/ansu24d7SGScvG2JjMO24Z7VHK6mg1lLw/0I8Otedsm1wKKUsihM0iy0l6NpNbG0g1EwjzSE2p7Lbe/cRcaRxNM2Uqq/GUdy2zt3reJoNZ1My2LcXkPfvcAGIcQJ4D8DAQAp5ReBbwK/ABwB4sC/adVYQHk509FkNq4I2svRNJfeoEkik2+8Xam257J3YpxPo2aBJ+bjbKtjvUWv02ighYZbSnlrlecl8O9b9fmF3PbOXXzykReJp20iAXWBaS9H00w+dP1O7nn8CLbrYghltF2ptheyd2K8boPrr9METJG3TvNpb3+a9cO6qZzcOzHOp2++lPH+MIuJDOP9YT5986X6B69pGrffuJs79l1MJGBiuyoUd8e+i4uyShpFr9NofDquWfD50IiXo9HUw+037m6aoS7k+HycoUggb5tep1mfrBuPW6PpdHQ2isZHG26NpkPQ2SgaH224NZoOQa/TaHzWVYxbo+l09DqNBrTHrdFoNB2H9rg1mlXEF6FaStqAEuzpD1slxag0mnJow63RrB
K+CJXjrmipSWA5ZXPP40cAtPHW1IQOlWg0q4QvQlWIKykpRqXRlEMbbo1mlSglQuVTSoxKoymHNtwazSrRGzRxSyrOlxaj0mjKoQ23RrNKfOj6nSUNty9IVUqMSqMphV6c1GhWCX/hsTCrpC+ks0o09SGUumrncNVVV8lnn312rYeh0Wg0raDMKkg+2uPWaDoA3UBBk4uOcWs0bY5udK0pRBtujabN0Q0UNIVow63RtDm60bWmEG24NZo2RzdQ0BSiDbdG0+boBgqaQnRWiaZr6ZZMjL0T43waFes+MR9nWwcfi6Y56DxuTVfiZ2IETEEkYJLIOGQcqTvGaNodncetWb/kZmIA9AQt4mmb+56YaivDvRqzgvP9jG6ZuXQTOsat6Uo6IRNjNfKzz/czdA55e6INt6Yr6YRMjNXIzz7fz9A55O2JNtyarqQTMjFWY1Zwvp/RCTOX9Yg23JquZO/EOLdcuZWZaIqXz0SZiaa45cqtbRWbXY1Zwfl+RifMXNYj2nBrupIDk9M8fPAkY/0h3rCpn7H+EA8fPNlWsdnVmBWc72d0wsxlPaINt6Yr6YTY7N6JcT5986WM94dZTGQY7w83PV3xfD9jNcaoqR+dx63pSq6/63GGIgGEWEmLlVKymMjw/Y/vW8ORaTQVqSmPW3vcmq5Ex2Y13Yw23JquRMdmNd2MNtyarkTHZjXdjC5513QteyfGtaHWdCXa49ZoNJoOQxtujUaj6TB0qETTlrSjIl0jY2rH49B0PjqPW7OmlDJsQNtpaTei7601wTUNoPO4Ne1NOcnQux6dbLuqx0YqMTuhelPTmWjDrVkzyhm2qdlY2ynSNaKSp5X1NK1CG27NmlHOsAFtV/XYSCWmrt7UtAptuDVrRjnDtnO0p+2qHhupxNTVm5pWoQ23Zs0oZ9jufM8b2q7qsZFKTF29qWkVOqtEs6b4WSUn5uNs0+lyGo3u8q5pfzqxLF3nZmvWGh0q0WjqQHc917QD2nBrNHWgc7M17YAOlWg0dXB8Ps5QJJC3TedmN4/1Gobyj/upqbmp1z773qppR9rj1mjqQOdmt471GobKPW7gXC3v0YZbo6kDnZvdOtZrGCr3uGtFG26Npg50bnbrWK8SAaWOuxo6xq3R1EknpjB2AtuHe5iOJvM8z/UQhip13NXQHrdGo2kL1msYKve4a6WlHrcQ4ibgHsAEHpBSfrbg+UHgL4ALvLF8Tkr5P1s5ptViva6ON0onnK/VHGMnnI9ms3dinE/DuqukzT3uV2diI7W8p2Ul70IIEzgE/CxwAngGuFVK+VLOa/4jMCil/LgQYgx4BdgkpUyX228nlLxrAf366ITztZpj7ITzoWkZa95I4RrgiJRyyjPEDwHvK3iNBPqFEALoQ6XC1D5faFPW6+p4o3TC+VrNMXbC+dCsLa003FuB4zmPT3jbcvkfwBuAU8BPgTuklG7hjoQQHxZCPCuEeHZmZqZV420a63V1vFE64Xyt5hg74Xxo1pZWGu5SLn9hXObngZ8AW4A3A/9DCDFQ9CYp75dSXiWlvGpsbKzZ42w6ukijPjrhfK3mGDvhfGjWllYa7hPA9pzH21CedS7/BviqVBwBjgITLRzTqrBeV8cbpRPO12qOsRPOh2ZtaWVWyTPAJUKIncBJYD/wKwWveR14N/B9IcRGYA/Q8YG89bo63ijtdr7KZXSs1hjb7Xxo2o+WNlIQQvwC8AVUOuCfSSn/qxDiNwCklF8UQmwBvgRsRoVWPiul/ItK+yyXVbIe06c0zaebMzr0NdIR1JRV0hUdcLr5YtOsLrfe/3RRFVs8bTPeH+bBD1+7hiM7P/Q10jGsnw44hSItPUGLeNrmviem9I+yTbj3sUM88ORRYmmH3qDJh67fye037l7rYRXRrbKt6/Ea6eYZRleUvOv0qfbm3scOcc/jR0hkHCxDZUjc8/gR7n3s0FoPrYhuzehYb9dIt0vEdoXh7taLrVt44MmjGAIsw8AQhvdXbW
83ujWjY71dI91exNQVhrtbL7ZuIZZ2MAoid4ZQ29uNbpVtXW/XSLfPMLoixq3Tp9qb3qBaDMs13q5U29uRbpRtXW/
"text/plain": [
"<Figure size 360x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.lmplot(x='sqrMetres',y='price', data = mieszkania_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"lm_model = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.fit(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']], Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"Y_train_predicted = lm_model.predict(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"rmse(Y_train, Y_train_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"Y_dev_predicted = lm_model.predict(mieszkania_dev[['isNew','rooms', 'floor', 'sqrMetres']])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([469449.27836213])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.predict(np.array(([[0, 4, 3, 70]])))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([455982.54297977])"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.predict(np.array(([[0, 4, 3, 60]])))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4522.65059749, 73763.4125433 , -78.83243119, 1346.67353824])"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.coef_"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"80364.97780599026"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.intercept_"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"455982.5429800203"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"0 * 4522.65059749 + 4* 73763.4125433 + 3 * (-78.83243119) + 60 * 1346.67353824 + 80364.97780599032"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR/'dev-0'/'out.tsv','w') as f_out_file:\n",
" for line in Y_dev_predicted:\n",
" f_out_file.write(str(line))\n",
" f_out_file.write('\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Uwaga - regresja liniowa działa dobrze tylko dla danych, gdzie występuje korelacja liniowa"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![RMSE 3](obrazki/9.png)\n",
"\n",
"![RMSE 4](obrazki/10.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie domowe\n",
"\n",
"\n",
"- https://gonito.net/challenge/retroc2\n",
"- termin 17.05\n",
"- należy użyć wektoryzacji (np. tf-idf)\n",
"- wynik zaliczający to max 50 RMSE dla dev-0 \n",
"- punkty: 60, dla 3 najlepszych wyników na test-A: 80,\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"subtitle": "7.Regresja liniowa[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}