From aa437c1964a6d8d3e1c8598f54288fc9b1951d45 Mon Sep 17 00:00:00 2001 From: Jakub Pokrywka Date: Tue, 26 Apr 2022 14:03:39 +0200 Subject: [PATCH] add 7 --- cw/07_regresja_liniowa.ipynb | 1086 +++++++++++++++++ cw/07_regresja_liniowa_ODPOWIEDZI.ipynb | 1416 +++++++++++++++++++++++ 2 files changed, 2502 insertions(+) create mode 100644 cw/07_regresja_liniowa.ipynb create mode 100644 cw/07_regresja_liniowa_ODPOWIEDZI.ipynb diff --git a/cw/07_regresja_liniowa.ipynb b/cw/07_regresja_liniowa.ipynb new file mode 100644 index 0000000..bf86c4b --- /dev/null +++ b/cw/07_regresja_liniowa.ipynb @@ -0,0 +1,1086 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

7. Regresja liniowa [ćwiczenia]

\n", + "

Jakub Pokrywka (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regresja liniowa" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## import bibliotek" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.linear_model import LinearRegression\n", + "plt.rcParams['figure.figsize'] = [10, 5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zbiór \n", + "\n", + "https://git.wmi.amu.edu.pl/kubapok/mieszkania2-below1m-public" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ładowanie zbioru train" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = Path('/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/mieszkania2')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR / 'names') as f_names:\n", + " names = f_names.read().rstrip('\\n').split('\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_train = pd.read_csv(DATA_DIR/'train/in.tsv', sep ='\\t', names=names)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isNewroomsfloorlocationsqrMetres
0False31Centrum78
1False32Sołacz62
2False30Sołacz15
3False40Sołacz14
4False30Sołacz15
\n", + "
" + ], + "text/plain": [ + " isNew rooms floor location sqrMetres\n", + "0 False 3 1 Centrum 78\n", + "1 False 3 2 Sołacz 62\n", + "2 False 3 0 Sołacz 15\n", + "3 False 4 0 Sołacz 14\n", + "4 False 3 0 Sołacz 15" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mieszkania_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR/'train'/'expected.tsv','r') as train_exp_f:\n", + " Y_train = np.array([float(x.rstrip('\\n')) for x in train_exp_f.readlines()])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([476118., 459531., 411557., ..., 320000., 364000., 209000.])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Y_train" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_train['price'] = Y_train" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = mieszkania_train['sqrMetres'].to_numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wizualizacja danych" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isNewroomsfloorlocationsqrMetresprice
0False31Centrum78476118.0
1False32Sołacz62459531.0
2False30Sołacz15411557.0
3False40Sołacz14496416.0
4False30Sołacz15406032.0
.....................
1652True20Grunwald51299000.0
1653True22Centrum53339000.0
1654True34Stare65320000.0
1655True31Nowe67364000.0
1656True33Grunwald50209000.0
\n", + "

1657 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " isNew rooms floor location sqrMetres price\n", + "0 False 3 1 Centrum 78 476118.0\n", + "1 False 3 2 Sołacz 62 459531.0\n", + "2 False 3 0 Sołacz 15 411557.0\n", + "3 False 4 0 Sołacz 14 496416.0\n", + "4 False 3 0 Sołacz 15 406032.0\n", + "... ... ... ... ... ... ...\n", + "1652 True 2 0 Grunwald 51 299000.0\n", + "1653 True 2 2 Centrum 53 339000.0\n", + "1654 True 3 4 Stare 65 320000.0\n", + "1655 True 3 1 Nowe 67 364000.0\n", + "1656 True 3 3 Grunwald 50 209000.0\n", + "\n", + "[1657 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mieszkania_train" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_train, linewidth = 0, s = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pytanie- Jaki jest baseline naszego systemu?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Czym jest regresja liniowa?- przypadek jednowymiarowy\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 1](obrazki/1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 2](obrazki/2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 3](obrazki/3.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 4](obrazki/4.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## wzór na regresję w przypadku jednowymiarowym?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$Y = a*X_1 + b$\n", + "\n", + "$Y = w_1 * X_1 + w_0$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - napisać funkcję predict_price(sqr_metres), która zwraca cenę mieszkania zgodnie z modelem regresji liniowej ( 5 minut) \n", + "\n", + "Należy samemu wymyślić współczynniki modelu" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_price(sqr_metres):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "predict_price(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "predict_price(40)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + 
"predict_price(55)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "predict_price(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train_predicted = predict_price(X_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mierzenie błędu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![RMSE 1](obrazki/6.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![RMSE 2](obrazki/5.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - napisać funkcję, która liczy błąd średniokwadratowy na całym zbiorze (7 minut)\n", + "\n", + "rmse(Y_true, Y_predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def rmse(Y_true, Y_predicted):\n", + " pass " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "rmse(np.array([300_000, 250_000]), np.array([300_000, 250_000]))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "rmse(np.array([305_000, 250_000]) ,np.array([300_000, 350_000]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "rmse(np.array([300_000, 250_000]), np.array([330_000, 360_000]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - za pomocą rmse policzyć błąd dla baseline (3 minuty)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - za pomocą rmse policzyć błąd dla predykcji (2 minuty)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Na jakim zbiorze najlepiej sprawdzać wyniki?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"![a](obrazki/7.png)\n", + "\n", + "![a](obrazki/8.png)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_dev = pd.read_csv(DATA_DIR/'dev-0'/'in.tsv', sep = '\\t', names = names)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR/'dev-0'/'expected.tsv','r') as dev_exp_f:\n", + " Y_dev = np.array([float(x.rstrip('\\n')) for x in dev_exp_f.readlines()])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_dev['price'] = Y_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "X_dev = mieszkania_dev['sqrMetres'].to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_dev, linewidth = 0, s = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Zadanie - policzyć rmse dla predykcji ze zbioru deweloperskiego modelu baseline i naszego modelu regresji liniowej" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Przypadek wielowymiarowy" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x='floor',y='price', data = mieszkania_train, linewidth = 0, s = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$Y = w_1 * X_1 + w_2 * X_2 + w_3 * X_3 + w_0$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie. Napisać analogiczną funkcję predict_price(sqr_metres, floor), policzyć rmse dla takiego modelu ( 7 minut)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## jak dobrać najlepsze parametry?" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.lmplot(x='sqrMetres',y='price', data = mieszkania_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "lm_model = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.fit(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']], Y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train_predicted = lm_model.predict(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "rmse(Y_train, Y_train_predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "Y_dev_predicted = lm_model.predict(mieszkania_dev[['isNew','rooms', 'floor', 'sqrMetres']])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "rmse(Y_dev, Y_dev_predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([469449.27836213])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.predict(np.array(([[0, 4, 3, 70]])))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "array([455982.54297977])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.predict(np.array(([[0, 4, 3, 60]])))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 4522.65059749, 73763.4125433 , -78.83243119, 1346.67353824])" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "80364.97780599026" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.intercept_" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "455982.5429800203" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "0 * 4522.65059749 + 4* 73763.4125433 + 3 * (-78.83243119) + 60 * 1346.67353824 + 80364.97780599032" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR/'dev-0'/'out.tsv','w') as f_out_file:\n", + " for line in Y_dev_predicted:\n", + " f_out_file.write(str(line))\n", + " f_out_file.write('\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uwaga - regresja liniowa działa dobrze tylko dla danych, gdzie występuje korelacja liniowa" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![RMSE 3](obrazki/9.png)\n", + "\n", + "![RMSE 4](obrazki/10.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie domowe\n", + "\n", + "\n", + "- https://gonito.net/challenge/retroc2\n", + "- termin 17.05\n", + "- należy użyć wektoryzacji 
(np. tf-idf)\n", + "- wynik zaliczający to max 50 RMSE dla dev-0 \n", + "- punkty: 60, dla 3 najlepszych wyników na test-A: 80,\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "subtitle": "7.Regresja liniowa[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cw/07_regresja_liniowa_ODPOWIEDZI.ipynb b/cw/07_regresja_liniowa_ODPOWIEDZI.ipynb new file mode 100644 index 0000000..13217b2 --- /dev/null +++ b/cw/07_regresja_liniowa_ODPOWIEDZI.ipynb @@ -0,0 +1,1416 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

7. Regresja liniowa [ćwiczenia]

\n", + "

Jakub Pokrywka (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regresja liniowa" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## import bibliotek" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.linear_model import LinearRegression\n", + "plt.rcParams['figure.figsize'] = [10, 5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zbiór \n", + "\n", + "https://git.wmi.amu.edu.pl/kubapok/mieszkania2-below1m-public" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ładowanie zbioru train" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = Path('/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/mieszkania2')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR / 'names') as f_names:\n", + " names = f_names.read().rstrip('\\n').split('\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_train = pd.read_csv(DATA_DIR/'train/in.tsv', sep ='\\t', names=names)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isNewroomsfloorlocationsqrMetres
0False31Centrum78
1False32Sołacz62
2False30Sołacz15
3False40Sołacz14
4False30Sołacz15
\n", + "
" + ], + "text/plain": [ + " isNew rooms floor location sqrMetres\n", + "0 False 3 1 Centrum 78\n", + "1 False 3 2 Sołacz 62\n", + "2 False 3 0 Sołacz 15\n", + "3 False 4 0 Sołacz 14\n", + "4 False 3 0 Sołacz 15" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mieszkania_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR/'train'/'expected.tsv','r') as train_exp_f:\n", + " Y_train = np.array([float(x.rstrip('\\n')) for x in train_exp_f.readlines()])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([476118., 459531., 411557., ..., 320000., 364000., 209000.])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Y_train" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_train['price'] = Y_train" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = mieszkania_train['sqrMetres'].to_numpy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wizualizacja danych" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
isNewroomsfloorlocationsqrMetresprice
0False31Centrum78476118.0
1False32Sołacz62459531.0
2False30Sołacz15411557.0
3False40Sołacz14496416.0
4False30Sołacz15406032.0
.....................
1652True20Grunwald51299000.0
1653True22Centrum53339000.0
1654True34Stare65320000.0
1655True31Nowe67364000.0
1656True33Grunwald50209000.0
\n", + "

1657 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " isNew rooms floor location sqrMetres price\n", + "0 False 3 1 Centrum 78 476118.0\n", + "1 False 3 2 Sołacz 62 459531.0\n", + "2 False 3 0 Sołacz 15 411557.0\n", + "3 False 4 0 Sołacz 14 496416.0\n", + "4 False 3 0 Sołacz 15 406032.0\n", + "... ... ... ... ... ... ...\n", + "1652 True 2 0 Grunwald 51 299000.0\n", + "1653 True 2 2 Centrum 53 339000.0\n", + "1654 True 3 4 Stare 65 320000.0\n", + "1655 True 3 1 Nowe 67 364000.0\n", + "1656 True 3 3 Grunwald 50 209000.0\n", + "\n", + "[1657 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mieszkania_train" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_train, linewidth = 0, s = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pytanie- Jaki jest baseline naszego systemu?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Czym jest regresja liniowa?- przypadek jednowymiarowy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 1](obrazki/1.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 2](obrazki/2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 3](obrazki/3.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![regresja liniowa 4](obrazki/4.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## wzór na regresję w przypadku jednowymiarowym?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$Y = a*X_1 + b$\n", + "\n", + "$Y = w_1 * X_1 + w_0$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - napisać funkcję predict_price(sqr_metres), która zwraca cenę mieszkania zgodnie z modelem regresji liniowej ( 5 minut) \n", + "\n", + "Należy samemu wymyślić współczynniki modelu" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_price(sqr_metres):\n", + " return 2000* sqr_metres + 200000" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "240000" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_price(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ 
+ { + "data": { + "text/plain": [ + "280000" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_price(40)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "310000" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_price(55)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200000" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_price(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train_predicted = predict_price(X_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mierzenie błędu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![RMSE 1](obrazki/6.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![a](obrazki/5.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - napisać funkcję, która liczy błąd średniokwadratowy na całym zbiorze (7 minut)\n", + "\n", + "rmse(Y_true, Y_predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def rmse(Y_true, Y_predicted):\n", + " return np.sqrt(np.sum((Y_true - Y_predicted)**2)/ len(Y_true)) " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(np.array([300_000, 250_000]), np.array([300_000, 250_000]))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/plain": [ + "70799.01129253148" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(np.array([305_000, 250_000]) ,np.array([300_000, 350_000]) )" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "80622.57748298549" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(np.array([300_000, 250_000]), np.array([330_000, 360_000]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - za pomocą rmse policzyć błąd dla baseline (3 minuty)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([357461.18628244, 357461.18628244, 357461.18628244, ...,\n", + " 357461.18628244, 357461.18628244, 357461.18628244])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.ones_like(Y_train) * Y_train.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "125698.71268014389" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_train, np.ones_like(Y_train) * Y_train.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie - za pomocą rmse policzyc błąd dla predykcji (2 minuty)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "123420.02227684396" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_train, Y_train_predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Na jakim zbiorze najlepiej sprawdzać wyniki?\n" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![RMSE 2](obrazki/7.png)\n", + "\n", + "![RMSE 3](obrazki/8.png)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_dev = pd.read_csv(DATA_DIR/'dev-0'/'in.tsv', sep = '\\t', names = names)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR/'dev-0'/'expected.tsv','r') as dev_exp_f:\n", + " Y_dev = np.array([float(x.rstrip('\\n')) for x in dev_exp_f.readlines()])" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "mieszkania_dev['price'] = Y_dev" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "X_dev = mieszkania_dev['sqrMetres'].to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x='sqrMetres',y='price', data = mieszkania_dev, linewidth = 0, s = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Zadanie - policzyć rmse dla predykcji ze zbioru deweloperskiego modelu baseline i naszego modelu regresji liniowej" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "Y_dev_predicted = predict_price(X_dev)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "117309.3154367544" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_dev, np.ones_like(Y_dev) * Y_dev.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "104227.56492755697" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_dev, Y_dev_predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Przypadek wielowymiarowy" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.scatterplot(x='floor',y='price', data = mieszkania_train, linewidth = 0, s = 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$Y = w_1 * X_1 + w_2 * X_2 + w_3 * X_3 + w_0$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie. Napisać analogiczną funkcję predict_price(sqr_metres, floor), policzyć rmse dla takiego modelu (7 minut)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_price(sqr_metres, floor):\n", + " return 4000* sqr_metres + (-1000)* floor + 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "298000" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_price(50, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "295000" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_price(50, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "Y_dev_predicted = predict_price(mieszkania_dev['sqrMetres'], mieszkania_dev['floor'])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "117436.43511182851" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_dev, np.mean(Y_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100227.89896326358" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "rmse(Y_dev, Y_dev_predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## jak dobrać najlepsze parametry?" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.lmplot(x='sqrMetres',y='price', data = mieszkania_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "lm_model = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.fit(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']], Y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train_predicted = lm_model.predict(mieszkania_train[['isNew','rooms', 'floor', 'sqrMetres']])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "103308.92502763818" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_train, Y_train_predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "Y_dev_predicted = lm_model.predict(mieszkania_dev[['isNew','rooms', 'floor', 'sqrMetres']])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "84157.87889057388" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rmse(Y_dev, Y_dev_predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "array([469449.27836213])" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.predict(np.array(([[0, 4, 3, 70]])))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "array([455982.54297977])" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.predict(np.array(([[0, 4, 3, 60]])))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 4522.65059749, 73763.4125433 , -78.83243119, 1346.67353824])" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "80364.9778059895" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm_model.intercept_" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "455982.5429800203" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "0 * 
4522.65059749 + 4* 73763.4125433 + 3 * (-78.83243119) + 60 * 1346.67353824 + 80364.97780599032" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "with open(DATA_DIR/'dev-0'/'out.tsv','w') as f_out_file:\n", + " for line in Y_dev_predicted:\n", + " f_out_file.write(str(line))\n", + " f_out_file.write('\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uwaga - regresja liniowa działa dobrze tylko dla danych, gdzie występuje korelacja liniowa" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![RMSE 5](obrazki/9.png)\n", + "\n", + "![6](obrazki/10.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zadanie domowe\n", + "\n", + "\n", + "- https://gonito.net/challenge/retroc2\n", + "- termin 17.05\n", + "- należy użyć wektoryzacji (np. tf-idf)\n", + "- wynik zaliczający to max 50 RMSE dla dev-0 \n", + "- punkty: 60, dla 3 najlepszych wyników na test-A: 80,\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "subtitle": "7.Regresja liniowa[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +}