retroc2/train.ipynb

1729 lines
133 KiB
Plaintext
Raw Permalink Normal View History

2022-05-18 02:04:53 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
"<div class=\"alert alert-block alert-info\">\n",
"<h1> Ekstrakcja informacji </h1>\n",
"<h2> 7. <i>Regresja liniowa</i> [ćwiczenia]</h2> \n",
"<h3> Jakub Pokrywka (2021)</h3>\n",
"</div>\n",
"\n",
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Regresja liniowa"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## import bibliotek"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"plt.rcParams['figure.figsize'] = [10, 5]"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zbiór \n",
"\n",
"https://git.wmi.amu.edu.pl/kubapok/mieszkania2-below1m-public"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## ładowanie zbioru train"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Dataset location. Set the MIESZKANIA_DATA_DIR environment variable to point at a\n",
"# local checkout of mieszkania2-below1m-public; when it is unset, the path that was\n",
"# originally hard-coded here is used as the fallback.\n",
"DATA_DIR = Path(os.environ.get('MIESZKANIA_DATA_DIR', '/Users/adamwojdyla/Documents/Studia/Magisterskie/1_sem/EKS/aitech-eks/cw/mieszkania2-below1m-public'))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"with open(DATA_DIR / 'names') as f_names:\n",
" names = f_names.read().rstrip('\\n').split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"mieszkania_train = pd.read_csv(DATA_DIR/'train/in.tsv', sep ='\\t', names=names)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>isNew</th>\n",
" <th>rooms</th>\n",
" <th>floor</th>\n",
" <th>location</th>\n",
" <th>sqrMetres</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Centrum</td>\n",
" <td>78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sołacz</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" isNew rooms floor location sqrMetres\n",
"0 False 3 1 Centrum 78\n",
"1 False 3 2 Sołacz 62\n",
"2 False 3 0 Sołacz 15\n",
"3 False 4 0 Sołacz 14\n",
"4 False 3 0 Sołacz 15"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mieszkania_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"with open(DATA_DIR/'train'/'expected.tsv','r') as train_exp_f:\n",
" Y_train = np.array([float(x.rstrip('\\n')) for x in train_exp_f.readlines()])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([476118., 459531., 411557., ..., 320000., 364000., 209000.])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Y_train"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"mieszkania_train['price'] = Y_train"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_train = mieszkania_train['sqrMetres'].to_numpy()"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Wizualizacja danych"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>isNew</th>\n",
" <th>rooms</th>\n",
" <th>floor</th>\n",
" <th>location</th>\n",
" <th>sqrMetres</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Centrum</td>\n",
" <td>78</td>\n",
" <td>476118.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sołacz</td>\n",
" <td>62</td>\n",
" <td>459531.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" <td>411557.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>False</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>14</td>\n",
" <td>496416.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>False</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>Sołacz</td>\n",
" <td>15</td>\n",
" <td>406032.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1652</th>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>Grunwald</td>\n",
" <td>51</td>\n",
" <td>299000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1653</th>\n",
" <td>True</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Centrum</td>\n",
" <td>53</td>\n",
" <td>339000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1654</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>Stare</td>\n",
" <td>65</td>\n",
" <td>320000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1655</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Nowe</td>\n",
" <td>67</td>\n",
" <td>364000.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1656</th>\n",
" <td>True</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>Grunwald</td>\n",
" <td>50</td>\n",
" <td>209000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1657 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" isNew rooms floor location sqrMetres price\n",
"0 False 3 1 Centrum 78 476118.0\n",
"1 False 3 2 Sołacz 62 459531.0\n",
"2 False 3 0 Sołacz 15 411557.0\n",
"3 False 4 0 Sołacz 14 496416.0\n",
"4 False 3 0 Sołacz 15 406032.0\n",
"... ... ... ... ... ... ...\n",
"1652 True 2 0 Grunwald 51 299000.0\n",
"1653 True 2 2 Centrum 53 339000.0\n",
"1654 True 3 4 Stare 65 320000.0\n",
"1655 True 3 1 Nowe 67 364000.0\n",
"1656 True 3 3 Grunwald 50 209000.0\n",
"\n",
"[1657 rows x 6 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mieszkania_train"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='sqrMetres', ylabel='price'>"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAl4AAAFFCAYAAAA92ONDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABSwUlEQVR4nO3deXgUVbo/8G9n34AEAsEASaAZxKBMjKJhRjaVAYXxDioSGREdRb06OKDXcRtBva74E0HcRnwuOqKimdEZxBUBWcQggkHZpYEEEgkEEkL2rX5/JN05XUlXKpXauvv7eR6fMqS7+tSp7q4357z1HockSRKIiIiIyHAhVjeAiIiIKFgw8CIiIiIyCQMvIiIiIpMw8CIiIiIyCQMvIiIiIpMw8CIiIiIyiV8FXjt27MCMGTMUH/Phhx9i6tSpuPrqq/Hyyy+b1DIiIiKijoVZ3QC1li5dipUrVyI6OtrnYwoKCvDee+/h7bffRkREBF588UXU19cjPDzcxJYSERERtc9vRrxSUlKwZMkSz8/79u3DjBkzMGPGDMyePRtnzpzB5s2bce655+L+++/HDTfcgMzMTAZdREREZBt+M+I1YcIEHD161PPzI488gqeeegqDBw9GTk4O3njjDURFReH777/He++9h9raWkyfPh0ZGRno3r27hS0nIiIiauY3gZecy+XCY489BgCor69HWloafv3rX+Oiiy5CXFwc4uLiMGjQIBw+fBjDhw+3uLVEREREfhx4DRw4EM8++yySk5Oxbds2nDhxAgMHDsS7776L2tpaNDY2wuVyISUlxeqmEhEREQHw48Dr0Ucfxf3334+GhgY4HA48+eSTGDhwIK655hpcf/31kCQJd955J+Lj461uKhEREREAwCFJkmR1I4iIiIiCgd/c1UhERETk7xh4EREREZnEL3K8mpqa0NgYODOioaGOgDqermJ/tGJftGJftGJftGJftGJfeLNTf4SHh/r8nV8EXo2NEsrKqqxuhm7i42MC6ni6iv3Rin3Rin3Rin3Rin3Rin3hzU790bt3N5+/41QjERERkUkYeBERERGZhIEXERERkUkYeBERERGZhIEXERERkUkYeBERERGZhIEXERERkUkMC7x27NiBGTNmtPn3tWvX4pprrsG0adPwwQcfGPXyRERERLZjSAHVpUuXYuXKlYiOjvb69/r6ejz99NP45z//iejoaFx//fW49NJLkZiYaEQziIiIiGzFkBGvlJQULFmypM2/u1wupKSkoEePHoiIiMAFF1yArVu3GtEEIiK/VVBajQVrDqCgtNrqphCRzgwZ8ZowYQKOHj3a5t8rKirQrVtrGf3Y2FhUVFR0uL/QUAfi42N0baOVQkNDAup4uor90Yp90SqY+2LxpsPIyStCZGQY5k9OD+q+kGNftGJfePOX/jB1rca4uDhUVlZ6fq6srPQKxHwJpLUaC0qr8dGuYkwZloSUhOiOnxAE7LS+ltXYF638tS9yD5di0XoX5oxxIistQdM+pgxLQm1tA6YMS0JZWZXf9oUR2Bet2Bfe7NQftlmr0el0Ij8/H2VlZairq8P333+P888/38wmWG7F9kIs31KAFdsLrW4KERlg0XoXXCVVWLTepXkfKQnR+Otlg/nHGVEAMmXE6+OPP0ZVVRWmTZuGBx54ALfccgskScI111yDpKQkM5pgG9mZ/RAZGYYpw4LruImCxZwxTs+IFxGRnEOSJMnqRnSkvr7RNsOHerDTcKgdsD9asS9asS9asS9asS9asS+82ak/bDPVSERE1uIdk0TWYuBFRBREVmwvRE5eEfNMiSxi6l2NRERkrezMfl5bIjIXAy8ioiDivmOSiKzBqUYiIiIikzDwIiIiIjIJAy8iIiIikzDwIiIiIjIJAy8iIovlHi5F9lvfI/dwabs/E1HgYOBFRGQx+fqOeqz3SET2xMCLiMhic8Y44UyM8azvKP+ZiAIH63gREVksKy0BK9
Iu9PkzEQUOjngRERERmYSBFxEREZFJGHgRERERmYSBFxFRgCsorcaCNQdQUFptdVOIgh4DLyKiALdieyFy8oqwYnuh1U0hCnq8q5GIKMBlZ/bz2hKRdRh4EREFuJSEaPz1ssFWN4OIwKlGIiIiItMw8CIi2ykorcZjq3YzGZyIAg4DLyKynRXbC7F8SwGTwYko4DDwIiLbyc7shxsuTmEyOOw/+mdmqQq79wWRGgy8iMh2UhKiMX9yOlISoq1uiuXsPvpnZqkKu/cFkRq8q5GIyMayM/shMjIMU4YlWd2UdplZqsLufUGkhkOSJMnqRnSkvr4RZWVVVjdDN/HxMQF1PF3F/mjFvmhWUFqNj3YVY8qwJI56ge8LEfuiFfvCm536o3fvbj5/x6lGIrIdTikRUaDiVCMR2Q6nlIgoUDHwIiLbcSfX22XagIhIL5xqJCIiIjIJAy8iIiIikzDwIiIiIjIJAy8iIiIikzDwIiIiIjIJAy8iUsXMNfnswIjjDbY+JKK2GHgRBTm1wYCZa/LZgRHHG2x9SERtsY4XUZBzBwMA8NfLBvt8nJlr8tmBEccbbH1IRG0x8CIKcmqDgZSEaMXALNAYcbzB1odE1BYDL6Igx2CAiMg8zPEiIiIiMgkDLyIiIiKTMPAiIlKBpSCISA8MvIiIVGApCCLSA5PriYhUYCkIItIDR7yIiFRw3/2ZkhBtdVPIT3B62n7scE4YeBERERmA09P2Y4dzwqlGIiIiA3B62n7scE4MCbyamprw6KOPYt++fYiIiMATTzyB1NRUz+//7//+D6tWrYLD4cAdd9yB8ePHG9EMIiIiy7A4sf3Y4ZwYEnh99dVXqKurw/vvv4+8vDw888wzePXVVwEA5eXl+Mc//oEvv/wS1dXV+MMf/sDAi4iIiIKCITle27Ztw6hRowAAGRkZ2Llzp+d30dHRSE5ORnV1Naqrq+FwOIxoAunEDomIZB6e71b+1hf+1l6iYGXIiFdFRQXi4uI8P4eGhqKhoQFhYc0vd9ZZZ2HSpElobGzE7bff3uH+QkMdiI+PMaKplggNDfGb41m86TBy8ooQGRmG+ZPTDXkNf+oPo1ndF2acb7XYF63U9IWd2mskq98XdsK+8OYv/WFI4BUXF4fKykrPz01NTZ6ga8OGDTh+/DjWrFkDALjllluQmZmJ4cOH+9xfY6OEsrIqI5pqifj4GL85ninDklBb24Apw5IMa7M/9YfRrO4LM863WuyLVmr6wk7tNZLV7ws7YV94s1N/9O7dzefvDAm8MjMzsW7dOlx55ZXIy8vDkCFDPL/r0aMHoqKiEBERAYfDgW7duqG8vNyIZpAO7JCISObh+W7lb33hb+0lClaGBF7jx4/HN998g+zsbEiShKeeegrLli1DSkoKLrvsMmzevBnXXXcdQkJCkJmZid/+9rdGNIOIiIjIVhySJElWN6Ij9fWNthk+1IOdhkPtgP3Rin3Rin3Rin3Rin3Rin3hzU79oTTVyMr1XcC7iIj8Dz+3xmL/Eilj4NUFdlh6gIg6h59bY7F/iZRxyaAusMPSA0S+5B4uxaL1LswZ40RWWoLVzbENfm6Nxf61XkFpNVZsL0R2Zj8u6m5DHPHqAvddRHxjkx0tWu+Cq6QKi9a7rG6KrfBzayz2r/U46mhvHPEiClBzxjg9I15EFDw46mhvDLyIAlRWWgJWpF1odTOIyGSs6WZvnGokImoH784jIiMw8CJb4kWPrMY8GSIyAgMvsiVe9Mhq2Zn9MDUjOSDyZPiHjDXY79QeBl5kS3a86PFL1N70Pj+BdHce/5CxBvud2sPkerIlOyaHur9EAdiubcTzo4R3uVmD/U7tYeBFpBK/RO2N58c3O/4hEwzY79QeBl5EKvFL1N54fojIHzDHi4iIiMgkDLxMVlBajcdW7WaCNhERURBi4GWyFdsLsXxLAe9yISIiCkLM8TJZdmY/REaGYcqwJKubQkQKCkqrsWJ7IbIz+wVESQkisg
eOeJksJSEa8yen84ucyOb0qMHE2m9EJMfAi4gCgt5Bjh5FfI0uoJl7uBTZb32P3MOlhuyfiPTHwIuIAoLeQY68cr2
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_train, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"### Pytanie- Jaki jest baseline naszego systemu?"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Czym jest regresja liniowa?- przypadek jednowymiarowy"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![regresja liniowa 1](obrazki/1.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![regresja liniowa 2](obrazki/2.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![regresja liniowa 3](obrazki/3.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![regresja liniowa 4](obrazki/4.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## wzór na regresję w przypadku jednowymiarowym?\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"$Y = a*X_1 + b$\n",
"\n",
"$Y = w_1 * X_1 + w_0$"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zadanie - napisać funkcję predict_price(sqr_metres), która zwraca cenę mieszkania zgodnie z modelem regresji liniowej (5 minut)\n",
"\n",
"Należy samemu wymyślić współczynniki modelu"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def predict_price(sqr_metres):\n",
"    \"\"\"Predict a flat's price from its area with a hand-picked linear model.\n",
"\n",
"    Evaluates price = 2000 * sqr_metres + 200000. The arithmetic is applied\n",
"    element-wise when sqr_metres is a NumPy array.\n",
"    \"\"\"\n",
"    price_per_sqr_metre = 2000\n",
"    base_price = 200000\n",
"    return price_per_sqr_metre * sqr_metres + base_price"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"240000"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(20)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"280000"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(40)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"310000"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(55)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"200000"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(0)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"Y_train_predicted = predict_price(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Mierzenie błędu"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![RMSE 1](obrazki/6.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![a](obrazki/5.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zadanie - napisać funkcję, która liczy pierwiastek błędu średniokwadratowego (RMSE) na całym zbiorze (7 minut)\n",
"\n",
"rmse(Y_true, Y_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def rmse(Y_true, Y_predicted):\n",
"    \"\"\"Return the root-mean-square error between true and predicted values.\n",
"\n",
"    Both inputs are converted to float arrays before subtracting, so lists and\n",
"    pandas Series are accepted and squaring large integer differences cannot\n",
"    overflow an integer dtype. A scalar Y_predicted is broadcast against Y_true.\n",
"    \"\"\"\n",
"    errors = np.asarray(Y_true, dtype=float) - np.asarray(Y_predicted, dtype=float)\n",
"    return np.sqrt(np.mean(errors ** 2))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(np.array([300_000, 250_000]), np.array([300_000, 250_000]))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"70799.01129253148"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(np.array([305_000, 250_000]) ,np.array([300_000, 350_000]) )"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"80622.57748298549"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(np.array([300_000, 250_000]), np.array([330_000, 360_000]))"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zadanie - za pomocą rmse policzyć błąd dla baseline (3 minuty)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([357461.18628244, 357461.18628244, 357461.18628244, ...,\n",
" 357461.18628244, 357461.18628244, 357461.18628244])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.ones_like(Y_train) * Y_train.mean()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"125698.71268014389"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_train, np.ones_like(Y_train) * Y_train.mean())"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zadanie - za pomocą rmse policzyć błąd dla predykcji (2 minuty)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"123420.02227684396"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_train, Y_train_predicted)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Na jakim zbiorze najlepiej sprawdzać wyniki?\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![RMSE 2](obrazki/7.png)\n",
"\n",
"![RMSE 3](obrazki/8.png)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"mieszkania_dev = pd.read_csv(DATA_DIR/'dev-0'/'in.tsv', sep = '\\t', names = names)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"with open(DATA_DIR/'dev-0'/'expected.tsv','r') as dev_exp_f:\n",
" Y_dev = np.array([float(x.rstrip('\\n')) for x in dev_exp_f.readlines()])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"mieszkania_dev['price'] = Y_dev"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"X_dev = mieszkania_dev['sqrMetres'].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f25f7e3efd0>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAncAAAE9CAYAAABp4UT1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3df5Sc1Xng+e8TiWAhA5aQ0BIJW+2xTPgx8Q96BQ6BeIxBSsJYTAIbzZkMWqIJG8Im2DvZBGZYswF81uzkhJg5CxnGGAT2GBTiGC0bgtsijhmGCBr/CD8UWUo6gIwimjTGCvYQhJ/9o26b6qZVKpp6u6re+n7OqVNVt9576763m9bDve9z38hMJEmSVA8/0u0OSJIkqXMM7iRJkmrE4E6SJKlGDO4kSZJqxOBOkiSpRgzuJEmSamR+tzvQK5YsWZIrV67sdjckSZIO6tFHH30+M5fO9JnBXbFy5UpGR0e73Q1JkqSDioinDvSZy7KSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mS1CFjz7/Ex+9+nLHnX+paHwzuJEmSOuSWB8e47aGnuOXBsa71wXvLSpIkdciFpw1Nee4GgztJkqQOGVqykKvWndTVPrgsK0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjlQZ3EXFpRDweEU9ExEdL2eKIGImIneV5UdPxl0fErojYERFrmspPjojHymfXR0SU8kMj4s5Svi0iVjbV2VC+Y2dEbKjyPCVJknpFZcFdRJwE/AqwGngPcE5ErAIuA7Zm5ipga3lPRJwArAdOBNYCN0TEvNLcjcBFwKryWFvKNwIvZOa7gOuAa0tbi4ErgVPK91/ZHERKkiTVVZUzd8cDf5GZ38vM/cCfA/8CWAdsKsdsAs4tr9cBd2Tmy5k5BuwCVkfEMcARmflQZiZw27Q6k23dBZxZZvXWACOZOZGZLwAjvBYQSpIk1VaVwd3jwBkRcVREHAb8LHAssCwz9wCU56PL8cuBZ5rq7y5ly8vr6eVT6pQA8kXgqBZtSZIk1dr8qhrOzO0RcS2NWbN/AL4J7G9RJWZqpkX5bOu89oURF9FY7uXtb397i65JkiT1h0oTKjLz5sx8f2aeAUwAO4G9ZamV8vxcOXw3jZm9SSuAZ0v5ihnKp9SJiPnAkeV7DtTW9P7dlJnDmTm8dOnSN3OqkiRJPaHqbNmjy/PbgZ8HPg9sASazVzcAd5fXW4D1JQN2iEbixMNl6XZfRJxarqe7YFqdybbOA+4v1+XdB5wdEYtKIsXZpUySJKnWKluWLf4oIo4CXgEuycwXIuKTwOaI2Ag8DZwPkJlPRMRm4Ekay7eXZOarpZ2LgVuBBcC95QFwM3B7ROyiMWO3vrQ1ERFXA4+U467KzIlqT1WSJKn7ojHRpeHh4RwdHe12NyRJkg4qIh7NzOGZPvMOFZIkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUI5UGdxHxsYh4IiIej4jPR8RbImJxRIxExM7yvKjp+MsjYldE7IiINU3lJ0fEY+Wz6yMiSvmhEXFnKd8WESub6mwo37EzIjZUeZ6SJEm9orLgLiKWA78BDGfmScA8YD1wGbA1M1cBW8t7IuKE8vmJwFrghoiYV5q7EbgIWFUea0v5RuCFzHwXcB1wbWlrMXAlcAqwGriyOYiUJEmqq6qXZecDCyJiPnAY8CywDthUPt8EnFterwPuyMyXM3MM2A
WsjohjgCMy86HMTOC2aXUm27oLOLPM6q0BRjJzIjNfAEZ4LSCUJEmqrcqCu8z8NvC7wNPAHuDFzPwSsCwz95Rj9gBHlyrLgWeamthdypaX19PLp9TJzP3Ai8BRLdqSJEmqtSqXZRfRmFkbAn4MWBgRv9Sqygxl2aJ8tnWa+3hRRIxGxOj4+HiLrkmSJPWHKpdlPwyMZeZ4Zr4CfAH4SWBvWWqlPD9Xjt8NHNtUfwWNZdzd5fX08il1ytLvkcBEi7amyMybMnM4M4eXLl36Jk5VkiSpN1QZ3D0NnBoRh5Xr4M4EtgNbgMns1Q3A3eX1FmB9yYAdopE48XBZut0XEaeWdi6YVmeyrfOA+8t1efcBZ0fEojKDeHYpkyRJqrX5VTWcmdsi4i7ga8B+4OvATcBbgc0RsZFGAHh+Of6JiNgMPFmOvyQzXy3NXQzcCiwA7i0PgJuB2yNiF40Zu/WlrYmIuBp4pBx3VWZOVHWukiRJvSIaE10aHh7O0dHRbndDkiTpoCLi0cwcnukz71AhSZJUIwZ3kiRJNWJwJ0mSVCMGd5IkSTVicCdJklQjBneSJEk1YnAnSZJUIwZ3kiRJNWJwJ0kaGGPPv8TH736csedf6nZXpMoY3EmSBsYtD45x20NPccuDY93uilSZyu4tK0lSr7nwtKEpz1IdGdxJkgbG0JKFXLXupG53Q6qUy7KSJEk1YnAnSZJUIwZ3kjrOjERJ6h6DO0kdZ0aiJHWPCRWSOs6MREnqHoM7SR1nRqIkdY/LspIkSTVicCdJklQjBneSJEk1YnAnSZJUI5UFdxFxXER8o+nx3Yj4aEQsjoiRiNhZnhc11bk8InZFxI6IWNNUfnJEPFY+uz4iopQfGhF3lvJtEbGyqc6G8h07I2JDVecpSZLUSyoL7jJzR2a+NzPfC5wMfA/4Y+AyYGtmrgK2lvdExAnAeuBEYC1wQ0TMK83dCFwErCqPtaV8I/BCZr4LuA64trS1GLgSOAVYDVzZHERKkiTV1Vwty54J/HVmPgWsAzaV8k3AueX1OuCOzHw5M8eAXcDqiDgGOCIzH8rMBG6bVmeyrbuAM8us3hpgJDMnMvMFYITXAkJJkqTamqvgbj3w+fJ6WWbuASjPR5fy5cAzTXV2l7Ll5fX08il1MnM/8CJwVIu2JEmSaq3y4C4ifhT4CPCHBzt0hrJsUT7bOs19uygiRiNidHx8/CDdkyRJ6n1zMXP3M8DXMnNveb+3LLVSnp8r5buBY5vqrQCeLeUrZiifUici5gNHAhMt2poiM2/KzOHMHF66dOmsT1CSJKlXzEVw9y95bUkWYAswmb26Abi7qXx9yYAdopE48XBZut0XEaeW6+kumFZnsq3zgPvLdXn3AWdHxKKSSHF2KZMkSaq1Su8tGxGHAWcB/0tT8SeBzRGxEXgaOB8gM5+IiM3Ak8B+4JLMfLXUuRi4FVgA3FseADcDt0fELhozdutLWxMRcTXwSDnuqsycqOQkJUmSekg0Jro0PDyco6Oj3e6GJEnSQUXEo5k5PNNn3qFCkiSpRgzuJEmSasTgTpIkqUYM7iRJkmrE4E6SJKlGDO4kSZJqxOBOkiSpRgzupBbGnn+Jj9/9OGPPv9TtrkiS1BaDO6mFWx4c47aHnuKWB8e63RVJktpS6e3HpH534WlDU54lSep1BndSC0NLFnLVupO63Q1JktrmsqwkSVKNGNxJkiTViMGdJElSjRjcSZIk1YjBnSRJUo0Y3EmSJNWIwZ0kSVKNGNxJkiTViMGdJElSjRjcSZIk1YjBnSRJUo1UGtxFxNsi4q6I+KuI2B4RH4iIxRExEhE7y/OipuMvj4hdEbEjItY0lZ8cEY+Vz66PiCjlh0bEnaV8W0SsbKqzoXzHzojYUOV5SpIk9YqqZ+4+BfxpZv448B5gO3AZsDUzVwFby3si4gRgPXAisBa4ISLmlXZuBC4CVpXH2lK+EXghM98FXAdcW9paDFwJnAKsBq5sDiIlSZLqqr
LgLiKOAM4AbgbIzH/MzO8A64BN5bBNwLnl9Trgjsx8OTPHgF3A6og4BjgiMx/KzARum1Znsq27gDPLrN4aYCQzJzL
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_dev, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Zadanie - policzyć rmse dla predykcji ze zbioru deweloperskiego modelu baseline i naszego modelu regresji liniowej"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"Y_dev_predicted = predict_price(X_dev)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"117309.3154367544"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, np.ones_like(Y_dev) * Y_dev.mean())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"104227.56492755697"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Przypadek wielowymiarowy"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f25f7d67e20>"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAmEAAAFICAYAAAAYvikoAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dfXzV5Z3n//cnIcEI4UaFqEQBBSIIiiW9oY53LXizMKWO9few0ymubddVpuNNZ/prZ2Fsh4Wddme36vxa9eHP1epvZrVlbKUFrUJHK4u0NRSsQBpAkBKREOUuQCAhuX5/nJPZfOlBcNZzfQ5cr+fj4eOTnHPMdX3PCTmfc918LgshCAAAAHGVeXcAAAAgRSRhAAAADkjCAAAAHJCEAQAAOCAJAwAAcEASBgAA4OCETMLM7FEz22Fma47z8f+Xma0zs7Vm9j+L3T8AAIBjsROxTpiZXS5pn6QnQgjjj/HY0ZJ+KOkTIYRdZjY0hLAjRj8BAACO5oQcCQshvCxpZ+/bzOx8M/uZma00s2VmdkH+rv8g6XshhF35/5cEDAAAuDshk7CjeFjSX4QQJkn6K0kP5G8fI2mMmS03s1+a2bVuPQQAAMjr492BD4KZ9Zf0cUkLzKzn5r752EfSaElXSqqVtMzMxocQdsfuJwAAQI+TIglTbkRvdwhhYoH7miX9MoTQKWmzmTUpl5S9GrODAAAAvZ0U05EhhL3KJVg3SpLlXJy/+xlJV+VvP0O56clNLh0FAADIOyGTMDN7UtIKSXVm1mxmX5T0OUlfNLPXJK2VNCP/8OclvWtm6yS9KOmrIYR3PfoNAADQ44QsUQEAAHCiOyFHwgAAAE50JGEAAAAOTrjdkWeccUYYMWKEdzcAAACOaeXKle+EEIYUuu+ES8JGjBihhoYG724AAAAck5ltOdp9TEcCAAA4IAkDAABwQBIGAADggCQMAADAAUkYAACAA5IwAAAAB0VLwszsUTPbYWZrjnK/mdk/mNlGM/utmX2oWH0BAAAoNcUcCfu+pGvf4/7rJI3O/3erpAeL2BcAAICSUrQkLITwsqSd7/GQGZKeCDm/lDTIzM4qVn8AAABKieeasGGStvb6vjl/G+Bi8zv7dc/CNdr8zn7vrgAAEuCZhFmB20LBB5rdamYNZtbQ2tpa5G4hVY8t36wnVmzRY8s3e3cFAJAAzySsWdI5vb6vlbSt0ANDCA+HEOpDCPVDhhQ8A/OksaBhqyb+7Qta0LD12A/GB+qWS0dq5uThuuXSkd5dAQAkwDMJ+4mkmfldkh+TtCeE8LZjf0rC/MWN2t3eqfmLG727kpyRZ/TT3BnjNfKMft5dAQAkoJglKp6UtEJSnZk1m9kXzew2M7st/5BnJW2StFHS/ytpVrH6ciKZPW2sBlVVaPa0sd5dAQAARWQhFFyGVbLq6+tDQ0ODdzcAAACOycxWhhDqC91HxXygRLA7EwDSQhIGlAh2ZwJAWvp4dwBATs+uTHZnAkAaSMKAEtGzOxMAkAamIwEAAByQhAEAADggCQMAAHBAEgYAAOCAJAzIe+DFjRoz+zk98OJG764AABJAEgbk3bd0gzq6unXf0g3eXQEAJIAkDMi7a8poVZaX6a4po727AgBIAHXCgLxZV43SrKtGeXcDAJAIRsIAAAAckIQBAAA4IAkDAABwQBJWYja/s1/3LFyjze/s9+4KAAAoIpKwEvPY8s16YsUWPbZ8s3dXAABAEbE7ssTccunITAQAACcnkrASM/KMfpo7Y7x3NwAAQJExHQkAAOCAJKwAFscDAIBiIwkrgMXxAACg2EjCCpg6rkZ1NdWaOq7GuyvReY8CercPAEAsJGEFLFnXoqaWNi1Z1+Ldlei8RwG92wcAIBZ2RxaQcpkI72v3bh8AgFgshODdh/elvr4+NDQ0eHcD+MBtfme/Hlu+WbdcOlIjz+jn3R0AwAfAzFaGEOoL3cd0JF
AimIoFgLQwHQmUCKZiASAtJGFAieC0BABIC9ORAAAADkjCAAAAHJCEAQAAOCAJAwAAcEASBgAA4IAkDBneZzd6t+8p5WsHgBSRhCHDu2Cod/ueUr52ElAAKaJOGDK8C4Z6t+8p5WvvSUAlUSsNQDI4OxKAO87NBHCyeq+zIxkJA+CO0wIApIg1YQAAAA5IwgAAAByQhJUYdokBAJAGkrASk3KZAgAAUsLC/BKTcpkCAABSwkhYAUwJAgCAYiMJK8BzSpDpSAAA0lDU6Ugzu1bS/ZLKJT0SQvjWEfcPlPSPks7N9+W/hRAeK2afjofnlCDTkQAApKFoFfPNrFzSeklTJTVLelXSZ0MI63o95j9JGhhC+JqZDZHUJOnMEELH0X4uFfMBAMCJ4r0q5hdzOvIjkjaGEDblk6qnJM044jFBUrWZmaT+knZKOlzEPgEAAJSEYiZhwyRt7fV9c/623r4raaykbZJel3RnCKG7iH0qealvCli2oVXX3Puylm1o9e5KdKm/9gCQmmImYVbgtiPnPq+RtFrS2ZImSvqumQ34gx9kdquZNZhZQ2vryf3mnPrC/HmLGtXU0qZ5ixq9uxJd6q89AKSmmAvzmyWd0+v7WuVGvHq7RdK3Qm5h2kYz2yzpAkm/7v2gEMLDkh6WcmvCitbjEpD6wvw508dq3qJGzZk+1rsr0aX+2gNAaoq5ML+PcgvzPynpLeUW5v9pCGFtr8c8KKklhPBNM6uR9BtJF4cQ3jnaz2VhPgAAOFG818L8oo2EhRAOm9mXJT2vXImKR0MIa83stvz9D0n6z5K+b2avKzd9+bX3SsAAAABOFkWtExZCeFbSs0fc9lCvr7dJurqYfQAAAChFVMwvIOVdat7X7t0+AACxkIQVkPIuNe9r924fAIBYijodeaLy3KW2+Z39emz5Zt1y6UiNPKNf9Pa9d+h5tr9sQ+u/7sy8bPSQ6O0DANJStN2RxXKy7468Z+EaPbFii2ZOHq65M8Z7dycp19z7sppa2lRXU63n777cuzsAgJOAy+5I/Nt4j0SlLOUaZQCA+BgJAwAAKBKvA7xxAmJ3IgAAcZCEFZByIsLuRAAA4mBNWAHzF6/T0sYd2ra7XY/c/GHv7kTFmjQAAOJgJKyAN3bsz8SUjDyjn+bOGO9SHgPpWrahVdfc+7KWbWj17goAREMSVsDcT1+ouppqzf30hd5dAZIwb1GjmlraNG9Ro3dXACAakrACLhs9RM/ffTkFO4FI5kwfq7qaasqDAEgKa8IAuMt98OFDD4C0MBIGAADggCSsgJRLVAAAgDhIwgqgVhYAACg21oQVQK0sIK7N7+zXY8s365ZLR1IeBUAyGAkrwLNWFlOhSBGjzwBSRBJWYrzfjLyLZpKEpumWS0dq5uThjD4DSApJWInxfjPyLprpnYQCABALSViJ8T42aMbEs1VZXqYZE892ad87CYUPkm8AKWJhPjIWrt6mjq5uLVy9TbOuGhW9/Z4kFGlhMwyAFDESVmK810R96bKRGlRVoS9dlt6bofdznzLvEWAA8EASVmLuX7peT6zYovuXrndp//W39mh3e6def2uPS/uemBIDAMREElaA54jIvkOHMzG2lNdkeV+7985UAEBcJGEFeI6I9O/bJxNjS3layPvavXemAgDiYmF+AZ6LhO+cMkYDqiqSHIlK3ZzpYzVvUaPmTB/r3RUAQAQWQvDuw/tSX18fGhoavLsBAABwTGa2MoRQX+g+piMBAAAckIQBAAA4IAlDSaFWFwAgFSRhKCnU6gIApILdkSgpHF8DAEgFI2EFMCUGAACKjSSsAKbE/Hg+91SsBwDERBJWgPfxNSmbOq5GdTXVmjquJnrbf/3062pqadNfP/169LYlRmA98dwD8EAShpKyZF2LmlratGRdS/S2D3d3Z2JsjMD64bkH4IGF+QXcv3S9nlm9TXvbO3XfTZd4dycpngvzxw8bqO17d2j8sIHR25bYlOCJ5x6AB5IwZGx+Z7
8eW75Zt1w6MrlDvGdPG6ezB1W5vRH3HCCO+HjuAXggCSsg5UO0e6ZlJLm8Kc1fvE5LG3do2+52PXLzh6O2zRuxn5S
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.scatterplot(x='floor',y='price', data = mieszkania_train, linewidth = 0, s = 5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"$Y = w_1 * X_1 + w_2 * X_2 + w_3 * X_3 + w_0$"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zadanie. Napisać analogiczną funkcję predict_price(sqr_metres, floor), policzyć rmse dla takiego modelu ( 7 minut)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def predict_price(sqr_metres, floor):\n",
"    \"\"\"Predict a flat's price from its area and floor with hand-picked weights.\n",
"\n",
"    Evaluates price = 4000 * sqr_metres + (-1000) * floor + 100000.\n",
"    \"\"\"\n",
"    area_part = 4000 * sqr_metres\n",
"    floor_part = (-1000) * floor\n",
"    return area_part + floor_part + 100000"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"298000"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(50, 2)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"295000"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price(50, 5)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"Y_dev_predicted = predict_price(mieszkania_dev['sqrMetres'], mieszkania_dev['floor'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Baseline: RMSE on dev when the mean of Y_train is used as the prediction for every row.\n",
"rmse(Y_dev, np.mean(Y_train))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"100227.89896326358"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## jak dobrać najlepsze parametry?"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x7f25f7cdd8e0>"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nOy9eZQk1X3n+72x5FaZWXt19Qpd9FIIBFILSbSmjYsWGiF5Bj2/w5PBfvaTLAZ07HFj2dITnsPIftieESMdIdqWRfN4HrzMgOUeWzAjhOxWUzRtdUugRiAaqheqt+qt9qzcY7vvjxsRFRkZmRmZlVmVmXU/R5xW5RIZERn5i9/9Ld8foZSCw+FwOMuPsNI7wOFwOKsVboA5HA5nheAGmMPhcFYIboA5HA5nheAGmMPhcFYIboA5HA5nhWhJA0wI+UtCyCQh5C2fr/80IeRtQshxQsh/b/T+cTgcjh9IK9YBE0JuA5AC8NeU0hsrvHYrgO8A2E0pnSOEDFBKJ5djPzkcDqccLekBU0oPAZh1PkYIuY4Q8iIh5KeEkFcIIcPmU/8OwLcopXPme7nx5XA4TUFLGuASPAngdyilHwDwRQB/YT6+DcA2Qsi/EEKOEkLuXLE95HA4HAfSSu9APSCERAF8BMDfE0Ksh4PmvxKArQBGAGwA8Aoh5EZK6fxy7yeHw+E4aQsDDObJz1NK3+fx3ASAo5RSFcAZQsgJMIP86nLuIIfD4bhpixAEpXQBzLj+HwBAGDebT38XwO3m431gIYnxFdlRDofDcdCSBpgQ8gyAIwC2E0ImCCGfA/BrAD5HCHkDwHEAnzJf/gMAM4SQtwG8BOBLlNKZldhvDofDcdKSZWgcDofTDrSkB8zhcDjtQMsl4e6880764osvrvRucDgcTjlI5Ze0oAc8PT290rvA4XA4daHlDDCHw+G0C9wAczgczgrBDTCHw+GsENwAczgczgrBDTCHw+GsENwAczgczgrBDTCHw+GsENwAczgczgrRMANcaW6bqVi2lxBymhDyJiFkR6P2hcPhcJqRRrYiPw3gzwH8dYnnPwGmy7sVwIcBfNv8l8NpeUbHJrHv0DguzGWwsTuCB24bwsjwwErvFqfJaJgH7DW3zcWnwIZqUkrpUQBdhJC1jdofDme5GB2bxFeeP47JZA5dYRmTyRy+8vxxjI7xcYScQlZSjGc9gAuOvyfMxy434sO4R8KpB3sPnMRTh88grejoCIi4b9dm7LljW8Fr9h0ahywSRALs5xUJSMgoGvYdGufXHKeAlUzCeakFeYoTE0LuJ4S8Rgh5bWpqquoP4h4Jpx7sPXASjx88jayqQxKArKrj8YOnsffAyYLXXZjLICyLBY+FZRETc5nl3F1OC7CSBngCwEbH3xsAXPJ6IaX0SUrpLZTSW/r7+6v+IKdHQgj7VxYJ9h3ik4k4/nnq8BkIBJAEAQIRzH/Z4042dkeQVfWCx7Kqjg3dkeXcXU4LsJIG+HkAv2FWQ9wKIEEpbUj4gXsknHqQVnQIrnWbQNjjTh64bQiqTpFRNFDK/lV1igduG1rGveW0Ag2LAZtz20YA9BFCJgD8IQAZACilTwB4AcAnAZwGkAHw2Ubty8buCCaTOTsmB3CPhFM9HQERWbXQCBuUPe5kZHgAj4CtvCbmMthQRc6B5ypWFw0zwJTSeys8TwH8dqM+38kDtw3hK88fR0bREJbZj4h7JJxquW/XZjx+8DQ0w4BAmPE1KHvczcjwQNWG08pVyCIpyFU8Ym6P036sik64keEBPHLXDRiIhZDIqhiIhfDIXTfwi5pTFXvu2IYHd29BWBahGSyM9eDuLUVVELXCcxWrj5abCVcrtXgkHI6bPXdsq5vBdXNhLoOusFzwGM9VtDerwgPmcFoBXj2x+uAGmMNpEnj1xOqDG2AOp0nguYrVx6qJAXM4rQDPVawuuAfM4XA4KwT3gDmcOmKJ9S
zkNABM8CQWkjxFezgcboA5nDphifXoxqKmFAWQymt4/OBpAOBGmFMAD0FwOHXCEutxY1B4ivZwONwAczh1wkusx8JLtIfD4QaYw6kTHQERhqeitbdoD4fDDTCHUyfu27XZ0wBbwj1eoj2c1Q1PwnE4dcJKsLmrIKJBXgXB8YYwVcjW4ZZbbqGvvfbaSu8Gh1NXuA5w21EiG1AID0FwOCsMn1m4euEGmMNZYbgO8OqFG2AOZ4XhMwtXL9wAczgrDNcBXr1wA8zhrDBcB3j1wsvQOC1NO1QPLGWKMqe14WVonJbFOUXYOe2ai5hzmgBfZWjcA+a0LM7qAQCIBCRkFA37Do03jQFutIe+lO23w+qh1eExYE7L0uzVA42u713K9nntcXPADTCnZWn26oFG1/cuZfu89rg54AaY07I8cNsQFrIqTl1N4p3LCZy6msRCVm2a6oFGe+hL2X6zrx5WC9wAc1oaCgAEIIQAxPy7SWi0h76U7Tf76mG1wA0wp2XZd2gcnWEZWwdiGB6MY+tADJ1huWmW0Y2u713K9nntcXPADTCnZWn2ZfTI8AAeuesGDMRCSGRVDMRCdS2RW8r2G71vHH/wOmBOy3Lvk0cxmczZZWgAkFE0DMRCeOb+W1dwzzgcLkfJaXP4MprT6nADzGlZ+DKa0+rwTjhOSzMyPMANLqdl4R4wh8PhrBDcAHM4HM4KwUMQnBWnmURhqt2XZtp3TuvBPWDOitJMojDV7ksz7TunNeEGmLNsjI5N4t4nj2LXowdx75NHbe+xWURhqt2XZtp3TmvCQxCcZcEpnu70FjOKhsF4qOC1K9XNdmEug66w7Htfqn09h+OGe8CcZaGUt6hoRtOIwlQrUMMFbThLhRtgzrJQSrchIJKm6WartrOOd+Jxlgo3wJxloZS3uHVNvGm62artrOOdeJylwsV4OMsCH6DJWWXwoZyc5qHVRq/z+l7OcsA9YA7HBffWOXWAy1FyOLXA63s5ywUPQXA4Lnh9b31ZTeEc61iPjM+Mn/3qL1Ush+EeMIfjgtf31o/V1K7tPFYAs37eww0wh+OC1/fWj9UUznEeq1+4AeZwXPD63vrR7INT64nXsVaCx4A5HA/4pI36sLE7UjQ4tV3DOV7HWomGesCEkDsJIScIIacJIQ95PN9JCPmfhJA3CCHHCSGfbeT+cDic5WU1hXOcx+qXhnnAhBARwLcAfAzABIBXCSHPU0rfdrzstwG8TSn9t4SQfgAnCCH/jVKqNGq/Gs1qyvjWSjOfo+Xat2Y+B/Wk1RpwloLzWN+dSvf4eU/DGjEIITsB/BGl9OPm338AAJTS/+x4zR8A2AhmiK8F8M8AtlFKjVLbbeZGDF7AX5lmPkfLtW/NfA44dWPFGzHWA7jg+HvCfMzJnwO4HsAlAD8H8GA549vsrKaMb6008zlarn1r5nPAWV4aaYC97gBud/vjAH4GYB2A9wH4c0JIvGhDhNxPCHmNEPLa1NRU/fe0TqymjG+tNPM5Wq59a+ZzwFleGmmAJ8DCCxYbwDxdJ58F8A+UcRrAGQDD7g1RSp+klN5CKb2lv7+/YTu8VHgBf2Wa+Rwt17418zngLC+NNMCvAthKCNlMCAkAuAfA867XnAfwUQAghKwBsB1Ay67DVlPGt1aa+Rwt17418zngLC8NVUMjhHwSwDcBiAD+klL6p4SQzwMApfQJQsg6AE8DWAsWsvgqpfRvy22zVBKuWbLK1n60e8Z3KTTDOSp1vSzXvtXzc5rl2ucU4CsJ1xZylDyrzKmGdrpe2ulY2owVr4JYNnhWubnYe+AkbvqjH+C6//ACbvqjH2DvgZMrvUsFtNP10k7Hshppi1ZkLh/YPOw9cBKPHzwNgQCSwJJLjx88DQDYc8e2Fd47RjtdL+10LJVox1BLW3jAPKvcPDx1+IxpfAUIRDD/ZY83C+10vbTTsZSjXWUt28IA86xy85BWdAiu6JdA2OPNQjtdL+10LOVo11BLWxhgLh/YPH
QERBiuvK5B2ePNQjtdL+10LOVo1+aVtogBA+0lH9jKsa77dm3G4wdPQzMMCIQZX4Oyx5uJUtdLK577drr2S9GuspZ
"text/plain": [
"<Figure size 360x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"sns.lmplot(x='sqrMetres',y='price', data = mieszkania_train)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"lm_model = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.fit(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']], Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"Y_train_predicted = lm_model.predict(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']])"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"103308.92502763818"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_train, Y_train_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"Y_dev_predicted = lm_model.predict(mieszkania_dev[['isNew','rooms', 'floor', 'sqrMetres']])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"84157.8788905739"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rmse(Y_dev, Y_dev_predicted)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([469449.27836213])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pass a named DataFrame (same columns, same order as in fit) instead of a bare array,\n",
"# so the feature order is explicit and matches what the model was trained on.\n",
"lm_model.predict(pd.DataFrame([[0, 4, 3, 70]], columns=['isNew', 'rooms', 'floor', 'sqrMetres']))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([455982.54297977])"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same flat as in the previous query but with 60 instead of 70 square metres; the columns are\n",
"# named so the feature order matches the one used in fit.\n",
"lm_model.predict(pd.DataFrame([[0, 4, 3, 60]], columns=['isNew', 'rooms', 'floor', 'sqrMetres']))"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4522.65059749, 73763.4125433 , -78.83243119, 1346.67353824])"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.coef_"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"80364.97780599032"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lm_model.intercept_"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"455982.5429800203"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"0 * 4522.65059749 + 4* 73763.4125433 + 3 * (-78.83243119) + 60 * 1346.67353824 + 80364.97780599032"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Save the dev-0 predictions, one value per line, in the order of Y_dev_predicted.\n",
"with open(DATA_DIR / 'dev-0' / 'out.tsv', 'w') as f_out_file:\n",
"    for prediction in Y_dev_predicted:\n",
"        f_out_file.write(str(prediction) + '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Uwaga - regresja liniowa działa dobrze tylko dla danych, gdzie występuje korelacja liniowa"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"![RMSE 5](obrazki/9.png)\n",
"\n",
"![6](obrazki/10.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Zadanie domowe\n",
"Zadanie domowe, proszę wybrać jedno z dwóch:\n",
"\n",
"\n",
"- sforkować repozytorium <https://git.wmi.amu.edu.pl/kubapok/auta-public>\n",
"- Opis zadania znajduje się w README.md\n",
"- stworzyć model regresji liniowej dla tego zbioru (można użyć gotowych bibliotek)\n",
"- dodać skrypty z rozwiązaniem oraz predykcje dla dev-0 i test-A i sprawdzić czy ewaluacja jest poprawna za pomocą geval\n",
"- wynik zaliczający to max 50_000 RMSE dla dev-0\n",
"- termin 18.05, 50 punktów. Zadanie proszę oddać w MS Teams, umieszczając link do repo (repo powinno mieć uprawnienia do odczytu dla użytkownika kubapok lub być publiczne).\n",
"- punkty: 40, dla 3 najlepszych wyników na test-A: 70\n",
"\n",
"LUB:\n",
"\n",
"analogicznie dla <https://git.wmi.amu.edu.pl/kubapok/retroc2>\n",
"- należy użyć wektoryzacji (np. tf-idf)\n",
"- wynik zaliczający to max 50 RMSE dla dev-0 \n",
"- punkty: 60, dla 3 najlepszych wyników na test-A: 80"
]
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"subtitle": "7.Regresja liniowa[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}