aitech-eks-pub-22/cw/07_regresja_liniowa_ODPOWIEDZI.ipynb

1417 lines
123 KiB
Plaintext
Raw Normal View History

2022-04-26 14:03:39 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
"<div class=\"alert alert-block alert-info\">\n",
"<h1> Ekstrakcja informacji </h1>\n",
"<h2> 7. <i>Regresja liniowa</i> [ćwiczenia]</h2> \n",
"<h3> Jakub Pokrywka (2021)</h3>\n",
"</div>\n",
"\n",
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regresja liniowa"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## import bibliotek"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from pathlib import Path\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"plt.rcParams['figure.figsize'] = [10, 5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zbiór \n",
"\n",
"https://git.wmi.amu.edu.pl/kubapok/mieszkania2-below1m-public"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ładowanie zbioru train"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = Path('/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/mieszkania2')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR / 'names') as f_names:\n",
" names = f_names.read().rstrip('\\n').split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_train = pd.read_csv(DATA_DIR/'train/in.tsv', sep ='\\t', names=names)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>isNew</th>\n",
" <th>rooms</th>\n",
" <th>floor</th>\n",
" <th>location</th>\n",
" <th>sqrMetres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Centrum</td>\n",
" <td>78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sołacz</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" isNew rooms floor location sqrMetres\n",
"0 False 3 1 Centrum 78\n",
"1 False 3 2 Sołacz 62\n",
"2 False 3 0 Sołacz 15\n",
"3 False 4 0 Sołacz 14\n",
"4 False 3 0 Sołacz 15"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mieszkania_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR/'train'/'expected.tsv','r') as train_exp_f:\n",
" Y_train = np.array([float(x.rstrip('\\n')) for x in train_exp_f.readlines()])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([476118., 459531., 411557., ..., 320000., 364000., 209000.])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_train"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_train['price'] = Y_train"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"X_train = mieszkania_train['sqrMetres'].to_numpy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wizualizacja danych"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>isNew</th>\n",
" <th>rooms</th>\n",
" <th>floor</th>\n",
" <th>location</th>\n",
" <th>sqrMetres</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Centrum</td>\n",
" <td>78</td>\n",
" <td>476118.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sołacz</td>\n",
" <td>62</td>\n",
" <td>459531.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" <td>411557.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>14</td>\n",
" <td>496416.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" <td>406032.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1652</th>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>Grunwald</td>\n",
" <td>51</td>\n",
" <td>299000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1653</th>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Centrum</td>\n",
" <td>53</td>\n",
" <td>339000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1654</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>Stare</td>\n",
" <td>65</td>\n",
" <td>320000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1655</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Nowe</td>\n",
" <td>67</td>\n",
" <td>364000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1656</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>Grunwald</td>\n",
" <td>50</td>\n",
" <td>209000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1657 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" isNew rooms floor location sqrMetres price\n",
"0 False 3 1 Centrum 78 476118.0\n",
"1 False 3 2 Sołacz 62 459531.0\n",
"2 False 3 0 Sołacz 15 411557.0\n",
"3 False 4 0 Sołacz 14 496416.0\n",
"4 False 3 0 Sołacz 15 406032.0\n",
"... ... ... ... ... ... ...\n",
"1652 True 2 0 Grunwald 51 299000.0\n",
"1653 True 2 2 Centrum 53 339000.0\n",
"1654 True 3 4 Stare 65 320000.0\n",
"1655 True 3 1 Nowe 67 364000.0\n",
"1656 True 3 3 Grunwald 50 209000.0\n",
"\n",
"[1657 rows x 6 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mieszkania_train"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='sqrMetres', ylabel='price'>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAFICAYAAAAYvikoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABISklEQVR4nO3dfXwV1b0v/s834UEeAkGDQQlCgIBEVJTUSilWW3w6cMQe5dxaK9aHek89pxe19R49cKmHC6ft9fyq9vyK/rxUK31SsRUsaK148EApWkNFnlJIJFDCQwAhECBIJOv3x8zerD3JTNaezOyZvffn/XrxGpK99+y110wy36z1ne8SpRSIiIiIKLMKom4AERERUT5iEEZEREQUAQZhRERERBFgEEZEREQUAQZhRERERBFgEEZEREQUgawMwkTkORHZLyKbDJ//9yKyRUQ2i8gvw24fERERUWckG+uEichVAI4BWKSUGtvJcysAvAzgi0qpwyJyrlJqfybaSUREROQmK0fClFKrABzSvyciI0TkdyKyTkRWi8iF9kPfAPBjpdRh+7UMwIiIiChyWRmEuXgWwLeUUuMBfAfAAvv7owCMEpE1IvKuiNwQWQuJiIiIbN2ibkAQRKQvgM8BWCwiiW/3tLfdAFQAuBpAGYBVInKxUqopw80kIiIiSsqJIAzWiF6TUmpcB481AHhPKdUKoF5EtsEKyt7PYPuIiIiIUuTEdKRS6iisAGs6AIjlUvvhJbBGwSAiJbCmJ7dH0EwiIiKipKwMwkTkVwDWAhgtIg0icg+A2wHcIyIfAtgMYJr99DcBfCwiWwCsBPCwUurjKNpNRERElJCVJSqIiIiIsl1WjoQRERERZTsGYUREREQRyLq7I0tKStSwYcOibgYRERFRp9atW3dQKTWwo8eyLggbNmwYqquro24GERERUadEZKfbY5yOJCIiIooAgzAiIiKiCDAIIyIiIooAgzAiIiKiCDAIIyIiIooAgzAiIiKiCIQWhInIcyKyX0Q2uTwuIvIjEakTkQ0icnlYbSEiIiKKmzBHwn4K4AaPx28EUGH/uw/A0yG2hYiIiChWQgvClFKrABzyeMo0AIuU5V0AxSJyXljtISIiIoqTKHPCBgPYpX3dYH+PiIhs9QePY87STag/eDzqphBRwLIiMV9E7hORahGpPnDgQNTNISLKmOfX1GPR2p14fk191E0hooBFGYTtBjBE+7rM/l47SqlnlVJVSqmqgQM7XAMzK/EvXKLctrr2AK5/YhVW1/r/4/GuieWYMWEo7ppYHmDLiCgOogzCXgMww75L8koAR5RSeyNsT8bxL1yi3DZvWQ22NjZj3rIa3/soL+mDudPGorykT4AtI6I46BbWjkXkVwCuBlAiIg0AvgugOwAopZ4B8DqAvwFQB+AEgLvCaktcJf6y5V+4RLlp9tQxmLesBrOnjom6KUQUQ6KUiroNaamqqlLV1dVRN4OIiIioUyKyTilV1dFjWZGYT0REwWNeKlG0GIQREeUp5qUSRSu0nDAiIoo35qUSRYtBGBFRnkrceUlE0eB0JBEREVEEGIQRERERRYBBGBEREVEEGIQRERERRYBBGBFRjDjXmwxi/UkiiicGYUREMeJcbzKI9SeJKJ4YhBERxcjsqWMwurQoud6k82siyh1cO5KIiIgoJFw7koiIiChmGIQRERERRYBBGBEREVEEGIQREeWR+oPHMWfpJtQfPB51U4jyHoMwIqI88vyaeixauxPPr6mPuilEea9b1A0gIqLMuWtiecqWiKLDIIyIKI+Ul/TB3Gljo24GEYHTkURERESRYBBGRLHGRHIiylUMwogo1phITkS5ikEYEcXaXRPLMWPCUCaSI/6jgplsX9z7gsgEgzAiirVEInl5SZ+omxK5uI8KZrJ9ce8LIhO8O5KIKEvEvbxEJtsX974gMiFKqajbkJaqqipVXV0ddTOIKEPqDx7H82vqcdfEco6GEV
HWEZF1Sqmqjh7jdCQRxRqnnYgoV3E6kohijdNORJSrGIQRUayxwjsR5SpORxIRERFFgEEYERERUQQYhBERERFFgEEYERERUQQYhBERERFFgEEYEaUt39btC+Pz5lsfElF7DMKIKMk0MMi3AqphfN5860Miao91wogoKREYAPCszZVvBVTD+Lz51odE1B7XjiSiJK7TSEQULK+1IzkSRkRJrE5PRJQ5zAkjIiIiigCDMCIiIqIIMAgjIkoTy0sQURAYhBERpYnlJYgoCEzMJyJKE8tLEFEQOBJGRJSmxF2kLONBpjiFHT9xOCYMwoiIiELGKez4icMxCXU6UkRuAPAUgEIAC5VS33c8fgGAFwAU2895RCn1ephtIiIiyjROYcdPHI5JaBXzRaQQwDYA1wJoAPA+gNuUUlu05zwL4AOl1NMiUgngdaXUMK/9smI+ERERZQuvivlhTkdeAaBOKbVdKXUKwIsApjmeowD0s//fH8CeENtDREREFBthBmGDAezSvm6wv6d7DMDXRKQBwOsAvhVie6iL4pDESJnD431GtvVFtrWXKF9FnZh/G4CfKqXKAPwNgJ+JSLs2ich9IlItItUHDhzIeCPJEockRsocHu8zsq0vsq29RPkqzMT83QCGaF+X2d/T3QPgBgBQSq0VkbMAlADYrz9JKfUsgGcBKycsrAaTtzgkMVLm8HifkW19kW3tJcpXYSbmd4OVmP8lWMHX+wC+qpTarD3nDQAvKaV+KiJjALwNYLDyaBQT84mIiChbRJKYr5T6FMA/AXgTQA2Al5VSm0VkrojcZD/t2wC+ISIfAvgVgK97BWBEREREuSLUOmF2za/XHd+bo/1/C4CJYbaBiIiIKI6iTszPGbwbiSj78Oc2XOxfIm8MwgLCu5GIsg9/bsPF/iXyFup0ZD7h3UgUZ6trD2DeshrMnjoGkyoGRt2c2ODPbbjYv9GrP3gcz6+px10Ty7ngfAyFdndkWHh3JFH6rn9iFbY2NmN0aRHefPCqqJtDRBkyZ+kmLFq7EzMmDMXcaWOjbk5e8ro7kiNhRHlg9tQxyZEwIsofHI2MN46EEREREYUkqgW8iYhyAu/yI6IwMAij2OMFkKLGu/yIKAwMwij2eAGkqN01sRwzJgzNibwa/lETDfY7dYRBGMVeHC+A/IUab0Efn/KSPpg7bWxO3OLPP2qiwX6njvDuSIq9xAUwThK/UAHErm3E4+OFd8tFg/1OHWEQRuQDf6HGG4+Puzj+UZMP2O/UEZaoICIiIgoJS1QQERERxQyDsAgxuZuIiCh/MQiLEO+WISIiyl9MzI8Qk4eJskP9weN4fk097ppYnhNlKogoHjgSFqFcqj1ElMuCGLVm+gEROTEII6KcE3TAE0TB4LDTD1bXHsD1T6zC6toDoeyfiILHIIyIck7QAY9z1NpPkBf2yg/zltVga2Mz5i2rCWX/RBQ8BmGUMzgSQAlhBzx+gryw0w9mTx2D0aVFmD11TCj7J6LgsVgr5Yzrn1iFrY3NGF1ahDcfvCrq5lAOY6I+EZlisVbKC9k8EhBl0jYTxtOnj2qx/4jILwZhlDMmVQzEmw9ehUkVA6NuStqirBnn9t4MLsyw3h8R+cUgLEJBXOR4ocwNzhymTB5Xt/wpBhdmws4/yyT+PiHKLBZrjVDiIgcAc6eNjWwfFL3E9FZCJo+r870TWEzYjFv/ZSP+PiHKLAZhEQriIscLZW6Kw3HNpeCCzMThvCPKJ7w7koiIiCgkvDuSiIiIKGYYhBERERFFgEEYEWUN3r1HRLmEQRgRZQ2WzSCiXMK7I4koa/DuPSLKJRwJI6KsEfYi2LmC07bZhccrfzEII4qh1bUHcP0Tq7C69kDUTaEsxGnb7MLjlb8YhBFp9OAnyr9O5y2rwdbGZsxbVpPx9yZLHEcnTNuUS0sp5QMer/zFIIxIowc/Uf51OnvqGIwuLcLsqWMy/t5xE1UwFMfRCdM2BT1tG8eANJdwmj1/MTGfSD
N76hjMW1aD2VPHoGxAbwDRJIGXDeiNzw4/O9mGfBbVeoZxvAkgqjZxTUmicHDZIsoL9QeP4/k19bhrYnlW/LU5Z+k
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_train, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Pytanie- Jaki jest baseline naszego systemu?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Czym jest regresja liniowa?- przypadek jednowymiarowy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 1](obrazki/1.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 2](obrazki/2.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 3](obrazki/3.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![regresja liniowa 4](obrazki/4.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## wzór na regresję w przypadku jednowymiarowym?\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$Y = a*X_1 + b$\n",
"\n",
"$Y = w_1 * X_1 + w_0$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - napisać funkcję predict_price(sqr_metres), która zwraca cenę mieszkania zgodnie z modelem regresji liniowej (5 minut)\n",
"\n",
"Należy samemu wymyślić współczynniki modelu"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def predict_price(sqr_metres):\n",
" return 2000* sqr_metres + 200000"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"240000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(20)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"280000"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(40)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"310000"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(55)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200000"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(0)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"Y_train_predicted = predict_price(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mierzenie błędu"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![RMSE 1](obrazki/6.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![a](obrazki/5.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - napisać funkcję, która liczy błąd średniokwadratowy na całym zbiorze (7 minut)\n",
"\n",
"rmse(Y_true, Y_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def rmse(Y_true, Y_predicted):\n",
" return np.sqrt(np.sum((Y_true - Y_predicted)**2)/ len(Y_true)) "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(np.array([300_000, 250_000]), np.array([300_000, 250_000]))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"70799.01129253148"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(np.array([305_000, 250_000]) ,np.array([300_000, 350_000]) )"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"80622.57748298549"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(np.array([300_000, 250_000]), np.array([330_000, 360_000]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - za pomocą rmse policzyć błąd dla baseline (3 minuty)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([357461.18628244, 357461.18628244, 357461.18628244, ...,\n",
" 357461.18628244, 357461.18628244, 357461.18628244])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ones_like(Y_train) * Y_train.mean()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"125698.71268014389"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_train, np.ones_like(Y_train) * Y_train.mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - za pomocą rmse policzyć błąd dla predykcji (2 minuty)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"123420.02227684396"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_train, Y_train_predicted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Na jakim zbiorze najlepiej sprawdzać wyniki?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![RMSE 2](obrazki/7.png)\n",
"\n",
"![RMSE 3](obrazki/8.png)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_dev = pd.read_csv(DATA_DIR/'dev-0'/'in.tsv', sep = '\\t', names = names)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR/'dev-0'/'expected.tsv','r') as dev_exp_f:\n",
" Y_dev = np.array([float(x.rstrip('\\n')) for x in dev_exp_f.readlines()])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"mieszkania_dev['price'] = Y_dev"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"X_dev = mieszkania_dev['sqrMetres'].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='sqrMetres', ylabel='price'>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAncAAAE9CAYAAABp4UT1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAoFklEQVR4nO3df7RdZXng8e8jAUQE+ZE0pQTlWiMSaUW4g2EQxoJAUMawWqu4XCWl1EwrrYjtVGxZMkW6lk47RZiltFSEZHREpFYyFE0DUqUMIBe0/EpDIjGSFMJFfhYcBPvMH/u9eO7l3pObm7PvOWef72ets84+7/7xvmffnXufvO9+3h2ZiSRJkprhZd1ugCRJkjrH4E6SJKlBDO4kSZIaxOBOkiSpQQzuJEmSGsTgTpIkqUHmdLsBvWLu3Ll54IEHdrsZkiRJ23THHXc8mpnzJltncFcceOCBjIyMdLsZkiRJ2xQRm6Za57CsJElSgxjcSZIkNYjBnSRJUoMY3EmSJDWIwZ0kSVKDGNxJkiQ1iMGdJElSgxjcSZIkNYjBnSRJUoMY3EmSJHXAxkef4ePX3MPGR5/pajsM7iRJkjrg8ps3svKWTVx+88autsNny0qSJHXA6UcNjXvvFoM7SZKkDhiauzvnLz2k281wWFaSJKlJDO4kSZIaxOBOkiSpQQzuJEmSGsTgTpIkqUEM7iRJkhrE4E6SJKlBDO4kSZIaxOBOkiSpQWoN7iLirIi4JyLujYgPl7J9ImJNRKwv73uX8oiIiyNiQ0TcFRGHtRxnWdl+fUQsayk/PCLuLvtcHBHRrg5JkqSmqy24i4hDgA8ARwBvAk6OiNcB5wA3ZOZC4IbyGeAkYGF5LQcuKcfZBzgPeEs51nktwdolpY6x/ZaU8qnqkCRJarQ6e+4OBm7LzGcz8wXgW8CvAkuBFWWbFcApZXkpsDIrtwJ7RcR+wInAmsx8LDMfB9YAS8q6PTPz1sxMYOWEY01WhyRJUqPVGdzdAxwdEftGxCuAdwAHAPMz86GyzcPA/LK8P/Bgy/6bS1m78s2TlNOmDkmSpEabU9eBM3NtRHwK+AfgGeB7wE8nbJMRkXW1YVt1RMRyqiFgXv3qV9fZDEmSpFlRa0JFZl6WmYdn5jHA48D9wNYypEp5f6RsvoWqZ2/MglLWrnzBJOW0qWNi+y7NzOHMHJ43b97Mv6gkSVKPqDtb9ufK+6up7rf738AqYCzjdRlwTVleBZxWsmYXA0+WodXVwAkRsXdJpDgBWF3WPRURi0uW7GkTjjVZHZIkSY1W27Bs8bcRsS/wPHBmZj4REZ8EroqIM4BNwHvKttdR3Ze3AXgWOB0gMx+LiE8At5ftzs/Mx8ryB4ErgN2Ar5cXwFR1SJIkNVpUiaYaHh7OkZGRbjdDkiRpmyLijswcnmydT6iQJElqEIM7SZKkBjG4kyRJahCDO0mSpAYxuJMkSWoQgztJkqQGMbiTJElqEIM7SZKkBjG4kyRJahCDO0mSpAYxuJMkSWoQgztJkqQGMbiTJElqEIM7SZKkBjG4kyRJahCDO0mSpAYxuJMkSWoQgztJkqQGMbiTJElqEIM7SZKkBjG4kyRJahCDO0mSpAapNbiLiLMj4t6IuCcivhQRL4+IoYi4LSI2RMSXI2KXsu2u5fOGsv7AluN8rJSvi4gTW8qXlLINEXFOS/mkdUiSJDVdbcFdROwPfAgYzsxDgJ2AU4FPARdm5uuAx4Ezyi5nAI+X8gvLdkTEorLfG4ElwGcjYqeI2An4DHASsAh4X9mWNnVIkiQ1Wt3DsnOA3SJiDvAK4CHgWODqsn4FcEpZXlo+U9YfFxFRyq/MzOcycyOwATiivDZk5gOZ+RPgSmBp2WeqOiRJkhqttuAuM7cAfwH8kCqoexK4A3giM18om20G9i/L+wMPln1fKNvv21o+YZ+pyvdtU4ckSVKj1TksuzdVr9sQ8AvA7lTDqj0jIpZHxEhEjIyOjn
a7OZIkSTuszmHZtwMbM3M0M58HvgocBexVhmkBFgBbyvIW4ACAsv5VwI9ayyfsM1X5j9rUMU5mXpqZw5k5PG/evB35rpIkST2hzuDuh8DiiHhFuQ/uOOA+4Ebg3WWbZcA1ZXlV+UxZ/83MzFJ+asmmHQIWAt8BbgcWlszYXaiSLlaVfaaqQ5IkqdHqvOfuNqqkhjuBu0tdlwIfBT4SERuo7o+7rOxyGbBvKf8IcE45zr3AVVSB4TeAMzPzp+Weut8DVgNrgavKtrSpQ5IkqdGi6ujS8PBwjoyMdLsZkiRJ2xQRd2Tm8GTrfEKFJElSgxjcSZIkNYjBnSRJUoMY3EmSJDWIwZ0kSVKDGNxJkiQ1iMGdJElSgxjcSZIkNYjBnSRpIGx89Bk+fs09bHz0mW43RaqVwZ0kaSBcfvNGVt6yictv3tjtpki1mtPtBkiSNBtOP2po3LvUVAZ3kqSBMDR3d85feki3myHVzmFZSZKkBjG4kyRJahCDO0kdZUaiJHWXwZ2kjjIjUZK6y4QKSR1lRqIkdZfBnaSOMiNRkrrLYVlJkqQGMbiTJElqEIM7SZKkBjG4kyRJapDagruIOCgivtfyeioiPhwR+0TEmohYX973LttHRFwcERsi4q6IOKzlWMvK9usjYllL+eERcXfZ5+KIiFI+aR2SJElNV1twl5nrMvPQzDwUOBx4Fvg74BzghsxcCNxQPgOcBCwsr+XAJVAFasB5wFuAI4DzWoK1S4APtOy3pJRPVYckSVKjzdaw7HHA9zNzE7AUWFHKVwCnlOWlwMqs3ArsFRH7AScCazLzscx8HFgDLCnr9szMWzMzgZUTjjVZHZIkSY02W8HdqcCXyvL8zHyoLD8MzC/L+wMPtuyzuZS1K988SXm7OiRJkhqt9uAuInYB3gV8ZeK60uOWddbfro6IWB4RIxExMjo6WmczJEmSZsVs9NydBNyZmVvL561lSJXy/kgp3wIc0LLfglLWrnzBJOXt6hgnMy/NzOHMHJ43b94Mv54kSVLvmI3g7n38bEgWYBUwlvG6DLimpfy0kjW7GHiyDK2uBk6IiL1LIsUJwOqy7qmIWFyyZE+bcKzJ6pAkSWq0Wp8tGxG7A8cD/6Wl+JPAVRFxBrAJeE8pvw54B7CBKrP2dIDMfCwiPgHcXrY7PzMfK8sfBK4AdgO+Xl7t6pAkSWq0qG5J0/DwcI6MjHS7GZIkSdsUEXdk5vBk63xChSRJUoMY3EmSJDWIwZ0kSVKDGNxJkiQ1iMGdJElSgxjcSZIkNYjBnSRJUoMY3ElT2PjoM3z8mnvY+Ogz3W6KJEnTZnAnTeHymzey8pZNXH7zxm43RZKkaav18WNSPzv9qKFx75Ik9QODO2kKQ3N35/ylh3S7GZIkbReHZSVJkhrE4E6SJKlBDO4kSZIaxOBOkiSpQQzuJEmSGsTgTpIkqUEM7iRJkhrE4E6SJKlBDO4kSZIaxOBOkiSpQQzuJEmSGqTW4C4i9oqIqyPiXyJibUQcGRH7RMSaiFhf3vcu20ZEXBwRGyLirog4rOU4y8r26yNiWUv54RFxd9nn4oiIUj5pHZIkSU1Xd8/dRcA3MvMNwJuAtcA5wA2ZuRC4oXwGOAlYWF7LgUugCtSA84C3AEcA57UEa5cAH2jZb0kpn6oOSZKkRqstuIuIVwHHAJcBZOZPMvMJYCmwomy2AjilLC8FVmblVmCviNgPOBFYk5mPZebjwBpgSVm3Z2bempkJrJxwrMnqUBdtfPQZPn7NPWx89JluN0WSpMaqs+duCBgFLo+I70bE5yJid2B+Zj5UtnkYmF+W9wcebNl/cylrV755knLa1KEuuvzmjay8ZROX37yx202RJKmx6gzu5gCHAZdk5puBZ5gwPFp63LLGNrStIyKWR8RIRIyMjo7W2QwBpx81xGlHvobTjxrqdlOkxrKHXFKdwd1mYHNm3lY+X00V7G0tQ6qU90fK+i3AAS37Lyhl7coXTFJOmzrGycxLM3
M4M4fnzZs3oy+p6RuauzvnLz2Eobm7d7sp6mMGL+3ZQy6ptuAuMx8GHoyIg0rRccB9wCpgLON1GXBNWV4FnFayZhc
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_dev, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie - policzyć rmse dla predykcji ze zbioru deweloperskiego modelu baseline i naszego modelu regresji liniowej"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"Y_dev_predicted = predict_price(X_dev)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"117309.3154367544"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, np.ones_like(Y_dev) * Y_dev.mean())"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"104227.56492755697"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Przypadek wielowymiarowy"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='floor', ylabel='price'>"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAFICAYAAAAYvikoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAyWUlEQVR4nO3dfXzV5Znn8e+VkGCEACoQlSigkAiC1ZJpSxmttmBxYaSO466dtrq2HXd02vrQ7b6chaUdFnbamZ2qM1vtum592J3RyjgVCzoqHa0somMoWIE0gCAlKCEqDwECCcm9f5yTTn4HFLrrua8D9+f9evG6knMO3NfvnJBznfvRQggCAABAXGXeCQAAAKSIIgwAAMABRRgAAIADijAAAAAHFGEAAAAOKMIAAAAcHJdFmJn9yMx2mNmaY3z8vzazdWa21sz+rtj5AQAAHI0dj/uEmdklkvZKejiEMOEojx0r6TFJnw4h7DSz4SGEHTHyBAAAeD/HZU9YCOFFSe/1vc3MzjWzfzSzlWa2zMzOy9/1R5J+EELYmf+7FGAAAMDdcVmEvY/7JH09hDBJ0r+XdE/+9jpJdWa23MxeNrPpbhkCAADk9fNO4MNgZgMlfVLSQjPrvbl/PvaTNFbSpZJqJb1oZhNDCLsipwkAAPAbJ0QRplyP3q4QwoVHuK9F0ishhC5Jm81svXJF2asR8wMAAMg4IYYjQwh7lCuwrpEky/lI/u4nlOsFk5kNVW54cpNDmgAAAL9xXBZhZvaIpBWS6s2sxcy+IukLkr5iZq9JWitpVv7hz0h618zWSXpe0rdCCO965A0AANDruNyiAgAA4Hh3XPaEAQAAHO8owgAAABwcd6sjhw4dGkaNGuWdBgAAwFGtXLnynRDCsCPdd9wVYaNGjVJjY6N3GgAAAEdlZlve7z6GIwEAABxQhAEAADigCAMAAHBAEQYAAOCAIgwAAMABRRgAAICDohVhZvYjM9thZmve534zs782s41m9ksz+2ixcgEAACg1xewJe1DS9A+4/wpJY/N/bpR0bxFzAQAAKClFK8JCCC9Keu8DHjJL0sMh52VJQ8zsjGLlAwAAUEo854SNkLS1z/ct+dsAF5vf2ae5i9Zo8zv7vFMBACTguJiYb2Y3mlmjmTW2tbV5p4MT1APLN+vhFVv0wPLN3qkAABLgWYRtk3RWn+9r87cdJoRwXwihIYTQMGzYEc/APGEsbNyqC//sWS1s3Hr0B+NDdcOU0bpu8kjdMGW0dyoAgAR4FmFPSrouv0ryE5J2hxDedsynJCxY0qRdHV1asKTJO5XkjB46QPNmTdDooQO8UwEAJKCYW1Q8ImmFpHozazGzr5jZH5vZH+cf8pSkTZI2Svofkm4uVi7Hk9kzxmlIVYVmzxjnnQoAACgiCyF45/BbaWhoCI2Njd5pAAAAHJWZrQwhNBzpvuNiYj6QAlZnAkBaKMKAEsHqTABISz/vBADk9K7KZHUmAKSBIgwoEb2rMwEAaWA4EgAAwAFFGAAAgAOKMAAAAAcUYQAAAA4owoC8e57fqLrZT+ue5zd6pwIASABFGJB319IN6uzu0V1LN3inAgBIAEUYkHfr1LGqLC/TrVPHeqcCAEgA+4QBeTdfNkY3XzbGOw0AQCLoCQMAAHBAEQYAAOCAIgwAAMABRViJ2fzOPs1dtEab39nnnQoAACgiirAS88DyzXp4xRY9sHyzdyoAAKCIWB1ZYm6YMjoTAQDAiYkirMSMHjpA82ZN8E4DAAAUGcORAAAADijCCjAxHgAAxEARVoCJ8QAAIAaKsALTxteovqZa08bXeKfiwrsn0Lt9AABioQgr8Ny6VjW3tuu5da3eqbjw7gn0bh8AgFhYHVkg9S0ivK/fu30AAGKxEIJ3Dr+VhoaG0NjY6J0G8KHb/M4+PbB8s26YMlqjhw7wTgcA8CEws5UhhIYj3cdwJFAiGIoFgLQwHAmUCIZiASAtFGFAieC0BABIC8
ORAAAADijCAAAAHFCEAQAAOKAIAwAAcEARBgAA4IAiDBneZzd6t+8p5WsHgBRRhCHDe8NQ7/Y9pXztFKAAUsQ+Ycjw3jDUu31PKV97bwEqib3SACSDsyMBuOPcTAAnqg86O5KeMADuOC0AQIqYEwYAAOCAIgwAAMABRViJYZUYAABpoAgrMSlvUwAAQEqYmF9iUt6mAACAlNATVoDhQAAAEANFWAHv4UDv9gEAQBxFHY40s+mS7pZULun+EMJ3C+4/W9JDkobkH3NHCOGpYuZ0NN7Dgd7tAwCAOIq2Y76ZlUtaL2mapBZJr0r6fAhhXZ/H3CdpVQjhXjMbL+mpEMKoD/p32TEfAAAcLz5ox/xiDkd+TNLGEMKmEEKnpEclzSp4TJA0KP/1YElvFTEfAACAklHMImyEpK19vm/J39bXdyR90cxaJD0l6etFzOe4kPrCgGUb2vTZO1/Usg1t3qlEl/prDwCp8Z6Y/3lJD4YQaiX9K0n/y8wOy8nMbjSzRjNrbGs7sd+cU5+YP39xk5pb2zV/cZN3KtGl/toDQGqKOTF/m6Sz+nxfm7+tr69Imi5JIYQVZnaSpKGSdvR9UAjhPkn3Sbk5YcVKuBSkPjF/zsxxmr+4SXNmjvNOJbrUX3sASE0xJ+b3U25i/meUK75elfSHIYS1fR7ztKQfhxAeNLNxkn4maUT4gKSYmA8AAI4XLhPzQwiHJH1N0jOSmiQ9FkJYa2bzzOzK/MO+KemPzOw1SY9I+rcfVIABAACcKIq6T1h+z6+nCm6b2+frdZKmFDMHAACAUuQ9Mb/kpL5Czfv6vdsHACAWirACqa9Q875+7/YBAIilqMORxyPvFWqb39mnB5Zv1g1TRmv00AHR2/e+fs/2l21o+83KzIvHDovePgAgLUVbHVksJ/rqyLmL1ujhFVt03eSRmjdrgnc6SfnsnS+qubVd9TXVeua2S7zTAQCcAD5odSQ9YSXGuycqZSnvUQYAiI+eMAAAgCLxOsAbxyFWJwIAEAdFWIHUixBWJwIAEAdzwgosWLJOS5t26K1dHbr/+t/xTic65qQBABAHPWEF3tixLxNTM3roAM2bNcFlewyka9mGNn32zhe1bEObdyoAEA1FWIF5nztf9TXVmve5871TAZIxf3GTmlvbNX9xk3cqABANRViBi8cO0zO3XcJmnUBEc2aOU31NNduDAEgKc8IAuMt9+OGDD4C00BMGAADggCKsQOpbVAAAgDgowgqwTxYAAIiBOWEF2CcLiG/zO/v0wPLNumHKaLZHAZAMesIKeO+TxXAoUkQPNIAUUYSVGO83I+9NMylC03TDlNG6bvJIeqABJIUirMR4vxl5b5rpXYQCABALRViJ8R4OnXXhmaosL9OsC890ad+7CIUPim8AKWJiPjIWrX5Lnd09WrT6Ld182Zjo7fcWoUgLC2IApIiesBLjPSfqqxeP1pCqCn314vTeDL2f+5R59wADgAeKsBJz99L1enjFFt29dL1L+69v261dHV16fdtul/Y9MSQGAIiJIqyAd2/I3oOHMjG2lOdkeV+798pUAEBcFGEFvHtDBvbvl4mxpTws5H3t3itTAQBxMTG/gPcE4Vum1mlQVUWSPVGpmzNznOYvbtKcmeO8UwEARGAhBO8cfisNDQ2hsbHROw0AAICjMrOVIYSGI93HcCQAAIADijAAAAAHFGEoKd6rUwEAiIUiDCXFe3UqAACxsDoSJcV7dSoAALHQE1aA4TAAABADRVgBhsN8eT7/7FgPAIiJIqyA99E1qZs2vkb1NdWaNr4mett/+vjram5t158+/nr0tiV6YT3x3APwQBGGkvLculY1t7bruXWt0ds+1NOTibHRC+uH5x6ABybmF7h76Xo9sfot7eno0l3XXuSdTnI8J+ZPGDFY2/fs0IQRg6O3LbEowRPPPQAPFGHI2PzOPj2wfLNumDI6uUO8Z88YrzOHVLm9EfceII74eO4BeKAIK5D6Adq9wzKSXN6UFi
xZp6VNO/TWrg7df/3vRG2bN2I/KRf/ANJFEVYg9Tdi72GZN9r2ZiLS4F38A4AHijBkeBeh82ZN0PzFTZozc5xbDoj
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='floor',y='price', data = mieszkania_train, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$Y = w_1 * X_1 + w_2 * X_2 + w_3 * X_3 + w_0$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie. Napisać analogiczną funkcję predict_price(sqr_metres, floor), policzyć rmse dla takiego modelu (7 minut)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def predict_price(sqr_metres, floor):\n",
" return 4000* sqr_metres + (-1000)* floor + 100000"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"298000"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(50, 2)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"295000"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(50, 5)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"Y_dev_predicted = predict_price(mieszkania_dev['sqrMetres'], mieszkania_dev['floor'])"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"117436.43511182851"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, np.mean(Y_train))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100227.89896326358"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## jak dobrać najlepsze parametry?"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7fd259e2c190>"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAByTUlEQVR4nO29eZwc9X3n/f5VVV/Tc2hmNDMSkkASCISwwcb4kEOwjDHGdh682YckKMlmvQkxOSHOY69J4nizZPdZs8ljB3a9MSyb9SZ2TBw2NmxsA8ayLJOADRaHkRgkMQLdmnumZ/qsqt/zR1X1VN/dM33N9O/9MtZMH9W/run+1vf3PT5fIaVEoVAoFM1Ha/UCFAqFolNRBlihUChahDLACoVC0SKUAVYoFIoWoQywQqFQtAhlgBUKhaJFrEoDLIT4KyHEuBDi5Sof//NCiMNCiENCiL9t9PoUCoWiGsRqrAMWQlwHLAB/LaV8U4XH7gC+BlwvpZwRQgxLKcebsU6FQqEox6r0gKWUB4Bp/21CiIuFEI8JIX4shPiBEGKne9evA1+QUs64z1XGV6FQtAWr0gCX4AHgd6WUbwM+Afw39/ZLgUuFEP8khHhGCHFTy1aoUCgUPoxWL6AeCCG6gXcDfy+E8G4Ouf8awA5gD7AZOCCEeLOUcrbJy1QoFIoc1oQBxvHkZ6WUbyly3yngh1LKDHBcCHEExyA/28T1KRQKRQFrIgQhpZzHMa4/ByAcrnLv/gaO94sQYj1OSGKsBctUKBSKHFalARZCfBV4GrhMCHFKCPFrwC8BvyaEeBE4BHzEffjjwJQQ4jDwPeCTUsqpVqxboVAo/KzKMjSFQqFYC6xKD1ihUCjWAqsuCXfTTTfJxx57rNXLUCgUiloQxW5cdR7w5ORkq5egUCgUdWHVGWCFQqFYKygDrFAoFC1CGWCFQqFoEcoAKxQKRYtQBlihUChahDLACoVC0SKUAVYoFIoWoQywQqFQtIiGGeBKc9tcxbL7hBDHhBAvCSGubtRaFAqFoh1pZCvyl4D/Cvx1ifs/iKPLuwN4J/CX7r8Kxapn/+g49x8Y4+RMnC39Xdx+3Xb27Bxu9bIUbUbDPOBic9vy+AjOUE0ppXwGWCeE2Nio9SgUzWL/6DifefQQ47Ek6yIBxmNJPvPoIfaPqnGEilxaKcazCTjp+/2Ue9vZ1iwnF+XBKEpx35NHePCp4yymLaJBnduu3cYdN1yavf/+A2MEdEFX0Pl6dQUN4mmT+w+Mqc+QIodVkYQTQnxMCPGcEOK5iYmJhr+e8mAUpbjvySPcu+8YiYyFoUEiY3HvvmPc9+SR7GNOzsSJBPSc50UCOqdm4s1erqLNaaUBPg1s8f2+2b2tACnlA1LKa6SU1wwNDTV8YX4PRgjn34AuuP+AmmTU6Tz41HE0AYamoQnN/de53WNLfxeJjJXzvETGYnN/V7OXq2hzWmmAHwV+xa2GeBcwJ6Vsi/CD8mAUpVhMW2h5yq6acG73uP267WQsSTxtIqXzb8aS3H7d9iavVtHuNCwG7M5t2wOsF0KcAv4dEACQUn4R+BbwIeAYEAf+TaPWUitb+rsYjyWzMTxQHozCIRrUSWRyjbAtnds99uwc5m6cndSpmTibq8ghqJxDZ9IwAyyl3Fvhfgn8dqNefyXcft12PvPoIeJpk0jA+cIpD0YBcNu127h33zFM20YTjvG1pXO7nz07h6s2oF7OIaCLnJzD3e5xFGuXVZGEazZ7dg5z981XMNwTZi6RYbgnzN03X6G+DAruuOFS7rz+EiIBHdN2QlN3Xn9JThVEraicQ+ey6mbCNYtaPBhFZ3HHDZeuyODmc3ImzrpIIOc2lXPoDJQHrFC0GFU10bkoA6xQtBhVNdG5KAOsULQYlXPoXFQMWKFoA1TOoTNRHrBCoVC0COUBKxR1xBPqmU+aAAigJ2wUCPYoFKAMsEJRNzyhHsuW2dsksJAyuXffMQBlhBU5qB
CEQlEnPKGefGxJgWCPQgHKACsUdaOYUI9HvmCPQgHKACsUdSMa1PFFH3LIF+xRKEAZYIWibtx27baiBtgT7ckX7FEoVBJOoagTXoItvwqiO6SqIBTFEY4q5Orhmmuukc8991yrl6FQ1A2lBdwRFM0OqBCEQtFC1PzBzkYZYIWihSgt4M5GGWCFooWo+YOdjTLACkULUVrAnY0ywApFC1FawJ2NKkNTrBlWYzXBciYoK9YOqgxNsSbwTxb2T7JWwuaKNqFoGZrygBVrAn81AUBX0CCeNrn/wFhLDXAjvPKVHHM17hLWMioGrFgTtGM1QSNqfFdyTFVz3H4oA6xYE7RjNUEjanxXckxVc9x+KAOsWBPcft125hMZjp6P8crZOY6ejzGfyLS0mqARXvlKjtmOu4RORxlgxZpBAggQQoBwf28hjfDKV3LMdtwldDrKACvWBPcfGKMvEmDHcA87N/SyY7iHvkigpdvrRtT4ruSYqua4/VAGWLEmaMft9Z6dw9x98xUM94SZS2QY7gmvuCxuJcdsxHoUK0PVASvWBHsfeIbxWDJbhgYQT5sM94T56sfe1cKVKRSAkqNUrGXU9lqxGlEGWLEmUNtrxWpEdcIp1gx7dg4rg6tYVSgPWKFQKFqEMsAKhULRIlQIQtF2tFowppbXb/VaFasb5QEr2opWC8bU8vqtXqti9aMMsKJl7B8dZ+8Dz3DtPfvY+8AzWW+ylYIxtbx+q9eqWP2oEISiJfgF1P3eYzxtsqE3nPPYZna0nZyJsy4SqOr1a3msQlEM5QErWkIp7zFt2i0VjKlFsEaJ2yhWijLAipZQSrshqIuWdrTV0lGnuu8UK0UZYEVLKOU97hjpbWlHWy0ddar7TrFSlBiPoiWoIZqKDkMN5VS0D6thHLuq8VU0GuUBKxRFUB66os4oOUqFolpUja+iGagQhEJRBFXjWztrOWTTqPemPGCFogiqxrc21nJbdiPfmzLACkURVI1vbazlkE0j35sywApFEVSNb22041DUetHI96ZiwApFCdSEjerZ0t9VMBR1rYRsGvneGuoBCyFuEkK8KoQ4JoS4q8j9FwohvieEeF4I8ZIQ4kONXI9CoWgMazlk08j31rA6YCGEDhwB3g+cAp4F9kopD/se8wDwvJTyL4UQu4BvSSm3ljvuWqkDXssZ41ppt3PR6PW02/utF977atfGmpVQh/dWtA64kQZ4N/AnUsoPuL//AYCU8j/5HnM/MCalvMd9/P8npXx3ueOuBQOsivyXaLdz0ej1tNv7VTSNpjdibAJO+n4/5d7m50+AXxZCnAK+BfxuA9fTNqzljHGttNu5aPR62u39KlpLq6sg9gJfklJuBj4E/I0QomBNQoiPCSGeE0I8NzEx0fRF1pu1nDGulXY7F41eT7u9X0VraaQBPg1s8f2+2b3Nz68BXwOQUj4NhIH1+QeSUj4gpbxGSnnN0NBQg5bbPFSR/xLtdi4avZ52e7+K1tJIA/wssEMIsU0IEQRuBR7Ne8wJ4H0AQojLcQzw6ndxK7CWM8a10m7notHrabf3q2gtDVVDc8vK/gLQgb+SUv5HIcTdwHNSykfdyof/DnQDEvi3Usonyh2znkm4Vmaj13LGuFZaeS6KfQagsTKZ9Xq/a7WaYo3S3CqIRlEvA6yy0YrV/BlYzWvvUJQcpR+VjW4u9z15hCv/5HEu/sNvceWfPM59Tx5p9ZJW9WdgNa9dsUTHtiIrucHmcd+TR7h33zE0AYbmJJ3u3XcMgDtuuLRl61rNn4HVvPZydFpYpWM9YJWNbh4PPnXcNb4amtDcf53bW8lq/gys5rWXYi1LWpaiYw2wykY3j8W0hZYXAdOEc3srWc2fgdW89lJ0YlilYw2wkhtsHtGgjp2X67Wlc3srWc2fgdW89lJ0YpNKx8aAYXXLDa6mWNlt127j3n3HMG0bTTjG15bO7a2m2GdgtZzb1fz5LcZalrQsRcd6wKuZ1RYru+OGS7nz+kuIBH
RM2/Fq7rz+kpYm4Eqx2s7tWmIthlUq0bF1wKuZvQ88U+ApxNMmwz1hvvqxdy3rmKvF61su1b6/Rpzbeq2t3anH+1j
"text/plain": [
"<Figure size 360x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.lmplot(x='sqrMetres',y='price', data = mieszkania_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"lm_model = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.fit(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']], Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"Y_train_predicted = lm_model.predict(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']])"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"103308.92502763818"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_train, Y_train_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"Y_dev_predicted = lm_model.predict(mieszkania_dev[['isNew','rooms', 'floor', 'sqrMetres']])"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"84157.87889057388"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"array([469449.27836213])"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.predict(np.array(([[0, 4, 3, 70]])))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"array([455982.54297977])"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.predict(np.array(([[0, 4, 3, 60]])))"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4522.65059749, 73763.4125433 , -78.83243119, 1346.67353824])"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.coef_"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"80364.9778059895"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.intercept_"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"455982.5429800203"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"0 * 4522.65059749 + 4* 73763.4125433 + 3 * (-78.83243119) + 60 * 1346.67353824 + 80364.97780599032"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"with open(DATA_DIR/'dev-0'/'out.tsv','w') as f_out_file:\n",
" for line in Y_dev_predicted:\n",
" f_out_file.write(str(line))\n",
" f_out_file.write('\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Uwaga - regresja liniowa działa dobrze tylko dla danych, gdzie występuje korelacja liniowa"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![RMSE 5](obrazki/9.png)\n",
"\n",
"![6](obrazki/10.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Zadanie domowe\n",
"\n",
"\n",
"- https://gonito.net/challenge/retroc2\n",
"- termin 17.05\n",
"- należy użyć wektoryzacji (np. tf-idf)\n",
"- wynik zaliczający to max 50 RMSE dla dev-0 \n",
"- punkty: 60, dla 3 najlepszych wyników na test-A: 80\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"subtitle": "7.Regresja liniowa[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}