mieszkania5/model_regresji_liniowej.ipynb
2023-10-28 14:25:18 +02:00

1609 lines
48 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 316,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from statistics import mean,median\n",
"import re\n",
"import numpy as np\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wczytanie datasetów"
]
},
{
"cell_type": "code",
"execution_count": 223,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = pd.read_csv(\"./train/train.tsv\", sep = \"\\t\", header=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data exploration "
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>16</th>\n",
" <th>17</th>\n",
" <th>18</th>\n",
" <th>19</th>\n",
" <th>20</th>\n",
" <th>21</th>\n",
" <th>22</th>\n",
" <th>23</th>\n",
" <th>24</th>\n",
" <th>25</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>309000.0</td>\n",
" <td>do zamieszkania</td>\n",
" <td>390 zł</td>\n",
" <td>spółdzielcze własnościowe</td>\n",
" <td>7113</td>\n",
" <td>https://www.otodom.pl/oferta/niezalezny-uklad-...</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>43.44</td>\n",
" <td>wtórny</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>gazowe</td>\n",
" <td>plastikowe</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>cegła</td>\n",
" <td>Polecamy na sprzedaż dwupokojowe mieszkanie p...</td>\n",
" <td>NaN</td>\n",
" <td>telewizja kablowa, internet, meble, piwnica, g...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 \\\n",
"0 309000.0 do zamieszkania 390 zł spółdzielcze własnościowe 7113 \n",
"\n",
" 5 6 7 8 9 \\\n",
"0 https://www.otodom.pl/oferta/niezalezny-uklad-... 2 NaN 43.44 wtórny \n",
"\n",
" ... 16 17 18 19 20 21 22 \\\n",
"0 ... NaN gazowe plastikowe NaN NaN NaN cegła \n",
"\n",
" 23 24 \\\n",
"0 Polecamy na sprzedaż dwupokojowe mieszkanie p... NaN \n",
"\n",
" 25 \n",
"0 telewizja kablowa, internet, meble, piwnica, g... \n",
"\n",
"[1 rows x 26 columns]"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dataset.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wczytywanie danych testowych i preprocessing jak na treningu"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"COLUMN 15:\n",
"Value counts before changes:\n",
" 15\n",
" 1 569\n",
" 2 527\n",
" 0 452\n",
" 4 357\n",
" 3 321\n",
" 5 117\n",
" 6 51\n",
" 7 42\n",
" 8 32\n",
" 10 29\n",
" 11 24\n",
" 9 21\n",
"-1 5\n",
"Name: count, dtype: int64\n",
"Value counts after changes:\n",
" 15\n",
" 1 569\n",
" 2 527\n",
" 0 452\n",
" 4 357\n",
" 3 321\n",
" 5 117\n",
" 6 51\n",
" 7 42\n",
" 8 32\n",
" 10 29\n",
" 11 24\n",
" 9 21\n",
"-1 5\n",
"Name: count, dtype: int64\n",
"COLUMN 8:\n",
"0 43.44\n",
"1 42.60\n",
"2 44.30\n",
"3 88.00\n",
"4 77.00\n",
" ... \n",
"2542 94.00\n",
"2543 53.50\n",
"2544 55.25\n",
"2545 62.00\n",
"2546 392.00\n",
"Name: 8, Length: 2547, dtype: float64\n",
"COLUMN 6:\n",
"Value counts before changes:\n",
" 6\n",
"2 1014\n",
"3 878\n",
"4 293\n",
"1 271\n",
"5 64\n",
"6 13\n",
"7 7\n",
"10 6\n",
"9 1\n",
"Name: count, dtype: int64\n",
"Value counts after changes:\n",
" 6\n",
"2 1014\n",
"3 878\n",
"4 293\n",
"1 271\n",
"5 64\n",
"6 13\n",
"7 7\n",
"10 6\n",
"9 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"\n",
"# Preprocessing column 15:\n",
"print(\"COLUMN 15:\")\n",
"# Count the occurrence of unique values in column before preprocessing:\n",
"print(\"Value counts before changes:\\n\",train_dataset[15].value_counts())\n",
"\n",
"# Replace string to int or NaN:\n",
"train_dataset[15] = train_dataset[15].replace({\"parter\": 0, \"suterena\": -1, \"> 10\": 11, \"poddasze\": np.nan})\n",
"train_dataset[15] = train_dataset[15].apply(float)\n",
"\n",
"# Fill Nans with median:\n",
"train_dataset[15].fillna(train_dataset[15].median(), inplace=True)\n",
"train_dataset[15]= train_dataset[15].apply(int)\n",
"\n",
"# Count the occurrence of unique values in column after preprocessing:\n",
"print(\"Value counts after changes:\\n\",train_dataset[15].value_counts())\n",
"\n",
"# Preprocessing column 8:\n",
"print(\"COLUMN 8:\")\n",
"# Replace strings containing space to NaN:\n",
"train_dataset[8] = train_dataset[8].replace(' ', np.nan, regex=True)\n",
"\n",
"# Fill Nans with median:\n",
"train_dataset[8] = train_dataset[8].apply(float)\n",
"train_dataset[8].fillna(train_dataset[8].median(), inplace=True)\n",
"\n",
"print(train_dataset[8])\n",
"\n",
"# Preprocessing column 6:\n",
"print(\"COLUMN 6:\")\n",
"# Count the occurrence of unique values in column before preprocessing:\n",
"print(\"Value counts before changes:\\n\",train_dataset[6].value_counts())\n",
"\n",
"# Change string to 10:\n",
"train_dataset[6] = train_dataset[6].replace({\"więcej niż 10\": 10})\n",
"train_dataset[6] = train_dataset[6].apply(int)\n",
"\n",
"# Count the occurrence of unique values in column after preprocessing:\n",
"print(\"Value counts after changes:\\n\",train_dataset[6].value_counts())\n",
"\n",
"train_dataset[10].fillna(train_dataset[10].median(), inplace=True)\n",
"train_dataset[10] = train_dataset[10].apply(float)\n",
"\n",
"train_dataset = train_dataset[[0,6,8,10,15]]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 275,
"metadata": {},
"outputs": [],
"source": [
"test_dataset = pd.read_csv(\"./dev-0/in.tsv\", sep= \"\\t\", header=None)"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"COLUMN 15:\n",
"Value counts before changes:\n",
" 14\n",
" 1 108\n",
" 2 89\n",
" 0 82\n",
" 4 65\n",
" 3 54\n",
" 5 22\n",
" 6 12\n",
" 7 9\n",
" 11 9\n",
" 10 5\n",
" 8 3\n",
"-1 2\n",
" 9 2\n",
"Name: count, dtype: int64\n",
"Value counts after changes:\n",
" 14\n",
" 1 108\n",
" 2 89\n",
" 0 82\n",
" 4 65\n",
" 3 54\n",
" 5 22\n",
" 6 12\n",
" 7 9\n",
" 11 9\n",
" 10 5\n",
" 8 3\n",
"-1 2\n",
" 9 2\n",
"Name: count, dtype: int64\n",
"COLUMN 8:\n",
"0 59.10\n",
"1 38.00\n",
"2 63.84\n",
"3 50.00\n",
"4 65.62\n",
" ... \n",
"457 72.78\n",
"458 51.23\n",
"459 54.16\n",
"460 90.10\n",
"461 71.90\n",
"Name: 7, Length: 462, dtype: float64\n",
"COLUMN 6:\n",
"Value counts before changes:\n",
" 5\n",
"2 196\n",
"3 152\n",
"1 51\n",
"4 50\n",
"5 9\n",
"6 4\n",
"Name: count, dtype: int64\n",
"Value counts after changes:\n",
" 5\n",
"2 196\n",
"3 152\n",
"1 51\n",
"4 50\n",
"5 9\n",
"6 4\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"\n",
"# Preprocessing column 15:\n",
"print(\"COLUMN 15:\")\n",
"# Count the occurrence of unique values in column before preprocessing:\n",
"print(\"Value counts before changes:\\n\",test_dataset[14].value_counts())\n",
"\n",
"# Replace string to int or NaN:\n",
"test_dataset[14] = test_dataset[14].replace({\"parter\": 0, \"suterena\": -1, \"> 10\": 11, \"poddasze\": np.nan})\n",
"test_dataset[14] = test_dataset[14].apply(float)\n",
"\n",
"# Fill Nans with median:\n",
"test_dataset[14].fillna(test_dataset[14].median(), inplace=True)\n",
"test_dataset[14]= test_dataset[14].apply(int)\n",
"\n",
"# Count the occurrence of unique values in column after preprocessing:\n",
"print(\"Value counts after changes:\\n\",test_dataset[14].value_counts())\n",
"\n",
"# Preprocessing column 8:\n",
"print(\"COLUMN 8:\")\n",
"# Replace strings containing space to NaN:\n",
"test_dataset[7] = test_dataset[7].replace(' ', np.nan, regex=True)\n",
"\n",
"# Fill Nans with median:\n",
"test_dataset[7] = test_dataset[7].apply(float)\n",
"test_dataset[7].fillna(test_dataset[7].median(), inplace=True)\n",
"\n",
"print(test_dataset[7])\n",
"\n",
"# Preprocessing column 6:\n",
"print(\"COLUMN 6:\")\n",
"# Count the occurrence of unique values in column before preprocessing:\n",
"print(\"Value counts before changes:\\n\",test_dataset[5].value_counts())\n",
"\n",
"# Change string to 10:\n",
"test_dataset[5] = test_dataset[5].replace({\"więcej niż 10\": 10})\n",
"test_dataset[5] = test_dataset[5].apply(int)\n",
"\n",
"# Count the occurrence of unique values in column after preprocessing:\n",
"print(\"Value counts after changes:\\n\",test_dataset[5].value_counts())\n",
"\n",
"test_dataset[9].fillna(test_dataset[9].median(), inplace=True)\n",
"test_dataset[9] = test_dataset[9].apply(float)\n",
"\n",
"test_dataset = test_dataset[[5,7,9,14]]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 305,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>5</th>\n",
" <th>7</th>\n",
" <th>9</th>\n",
" <th>14</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>59.1</td>\n",
" <td>4.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 5 7 9 14\n",
"0 3 59.1 4.0 2"
]
},
"execution_count": 305,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_dataset.head(1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 234,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 291,
"metadata": {},
"outputs": [],
"source": [
"X_train = train_dataset.drop(0,axis=1)\n",
"y_train = train_dataset[[0]]\n",
"\n",
"scaler = StandardScaler()\n",
"trans_data = scaler.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 292,
"metadata": {},
"outputs": [],
"source": [
"X_test = test_dataset"
]
},
{
"cell_type": "code",
"execution_count": 293,
"metadata": {},
"outputs": [],
"source": [
"reg = LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 294,
"metadata": {},
"outputs": [],
"source": [
"reg.fit(X_train, y_train)\n",
"results = reg.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 265,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"from sklearn.metrics import r2_score\n",
"# pickle.dump(reg, open(\"model.pkl\", \"wb\"))"
]
},
{
"cell_type": "code",
"execution_count": 295,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 394901.20434554],\n",
" [ 293271.41755997],\n",
" [ 432666.21541136],\n",
" [ 295330.00173591],\n",
" [ 444001.60173013],\n",
" [ 595102.64364947],\n",
" [ 306562.53056792],\n",
" [ 346367.98664224],\n",
" [ 320708.28590668],\n",
" [ 355678.48101873],\n",
" [ 272087.02326267],\n",
" [ 315111.17058773],\n",
" [ 408743.59976314],\n",
" [ 392452.80148004],\n",
" [ 351776.80580328],\n",
" [ 325340.80253875],\n",
" [ 324415.74401286],\n",
" [ 286605.64798109],\n",
" [ 354409.52248178],\n",
" [ 273703.59297418],\n",
" [ 343402.57630023],\n",
" [ 285271.79788568],\n",
" [ 370401.6837088 ],\n",
" [ 482522.18182752],\n",
" [ 394119.81457774],\n",
" [ 474686.42204592],\n",
" [ 627962.51992074],\n",
" [ 334221.5006669 ],\n",
" [ 397338.23464708],\n",
" [ 414336.53457982],\n",
" [ 335485.00759125],\n",
" [ 351363.03195993],\n",
" [ 384755.53002533],\n",
" [ 374013.34228146],\n",
" [ 384755.53002533],\n",
" [ 355882.69461662],\n",
" [ 521392.20674093],\n",
" [ 425546.58946533],\n",
" [ 294138.97474644],\n",
" [ 539668.46177031],\n",
" [ 340107.2565533 ],\n",
" [ 467566.06735236],\n",
" [ 228315.74093461],\n",
" [ 373516.14139746],\n",
" [ 572962.98245529],\n",
" [ 425442.35590324],\n",
" [ 264688.02449027],\n",
" [ 321384.60839985],\n",
" [ 313697.70958017],\n",
" [ 257339.46910406],\n",
" [ 285491.92892354],\n",
" [ 265315.27967261],\n",
" [ 269349.61545595],\n",
" [ 370207.10222578],\n",
" [ 505152.41437514],\n",
" [ 326640.0334956 ],\n",
" [ 361868.73382815],\n",
" [ 641520.72645455],\n",
" [ 513506.74409331],\n",
" [ 225524.54295198],\n",
" [ 237226.80467502],\n",
" [ 453176.39203834],\n",
" [ 261995.60845714],\n",
" [ 955187.2509814 ],\n",
" [ 492991.37526251],\n",
" [ 374938.40080734],\n",
" [ 774416.69022809],\n",
" [ 523696.94084834],\n",
" [ 434831.26310559],\n",
" [ 489623.47044873],\n",
" [ 280423.33071801],\n",
" [ 264688.02449027],\n",
" [ 280814.31555288],\n",
" [ 359688.46533354],\n",
" [ 314210.82790415],\n",
" [ 622185.54247246],\n",
" [ 448643.38610329],\n",
" [ 561068.71341085],\n",
" [ 331068.80586855],\n",
" [ 387502.56994691],\n",
" [ 251024.38630808],\n",
" [ 295949.29222845],\n",
" [ 311115.63444521],\n",
" [ 307926.65308324],\n",
" [ 268121.19700892],\n",
" [ 329516.02282157],\n",
" [ 260145.38644037],\n",
" [ 402080.9057926 ],\n",
" [ 247029.12134696],\n",
" [ 293241.9788526 ],\n",
" [ 968741.83954407],\n",
" [ 463714.83552309],\n",
" [ 458749.5202625 ],\n",
" [ 467566.06735236],\n",
" [ 251228.59990596],\n",
" [ 367839.0507926 ],\n",
" [ 178563.73689914],\n",
" [ 404748.33480203],\n",
" [ 361629.76401877],\n",
" [ 335003.99532863],\n",
" [ 468590.27220151],\n",
" [ 342562.21592532],\n",
" [ 371236.86365252],\n",
" [ 310876.80170877],\n",
" [ 276928.20517629],\n",
" [ 273106.88139315],\n",
" [ 517720.942006 ],\n",
" [ 309671.80782539],\n",
" [ 321349.71807989],\n",
" [ 346542.76153275],\n",
" [ 374434.93476654],\n",
" [ 919566.5541124 ],\n",
" [ 295586.59732644],\n",
" [ 394954.99452145],\n",
" [ 259707.15616347],\n",
" [ 296950.34369535],\n",
" [ 390519.03008681],\n",
" [ 338032.92115378],\n",
" [ 279645.24385936],\n",
" [ 394134.53393142],\n",
" [ 279107.13851167],\n",
" [ 454032.21228709],\n",
" [ 959467.06920008],\n",
" [ 683669.88249351],\n",
" [ 371046.82704552],\n",
" [ 576981.26372572],\n",
" [ 294170.56215725],\n",
" [ 443250.48438223],\n",
" [ 283176.45780734],\n",
" [ 434794.43417916],\n",
" [ 344911.47246756],\n",
" [ 272235.14370441],\n",
" [ 824501.58608444],\n",
" [ 334396.91111258],\n",
" [ 350120.43652201],\n",
" [ 253527.41306202],\n",
" [ 293854.69267028],\n",
" [ 291547.17430427],\n",
" [ 270927.11434524],\n",
" [ 383959.89024261],\n",
" [ 656199.3017448 ],\n",
" [ 553341.638875 ],\n",
" [ 432687.22009017],\n",
" [ 526139.69394388],\n",
" [ 447026.81639178],\n",
" [ 274087.19936264],\n",
" [ 397721.29867274],\n",
" [ 491976.34496209],\n",
" [ 334112.72222878],\n",
" [ 564079.20871881],\n",
" [ 262283.77968584],\n",
" [ 325888.17562753],\n",
" [ 707612.5028916 ],\n",
" [ 392111.24832978],\n",
" [ 498668.84201377],\n",
" [ 287777.33771682],\n",
" [ 269508.93047261],\n",
" [ 438284.33540911],\n",
" [ 325678.41722469],\n",
" [ 451638.79768381],\n",
" [ 313355.0076554 ],\n",
" [ 507395.87489522],\n",
" [ 319663.80512625],\n",
" [ 334586.40535677],\n",
" [ 336983.25994216],\n",
" [ 241704.55116646],\n",
" [ 551645.5514437 ],\n",
" [ 263184.95608194],\n",
" [ 353876.49260028],\n",
" [ 261995.60845714],\n",
" [ 310876.80170877],\n",
" [ 350329.83055109],\n",
" [ 322549.60455611],\n",
" [ 394105.09522405],\n",
" [ 553247.67298293],\n",
" [ 318793.17233144],\n",
" [ 467137.74037172],\n",
" [ 242785.12035314],\n",
" [ 382352.68146455],\n",
" [ 327735.05085456],\n",
" [ 517343.1516039 ],\n",
" [ 290640.3481382 ],\n",
" [ 219340.07698546],\n",
" [ 298372.97925164],\n",
" [ 194674.57986053],\n",
" [ 273246.9321801 ],\n",
" [ 332307.2208042 ],\n",
" [ 264140.01584633],\n",
" [ 457026.61216604],\n",
" [ 395940.59788713],\n",
" [ 290989.89791922],\n",
" [ 473556.78554839],\n",
" [ 318678.94228073],\n",
" [ 485526.48500272],\n",
" [ 325262.02533913],\n",
" [ 481691.81794123],\n",
" [ 339771.01794269],\n",
" [ 367305.38535593],\n",
" [ 443173.11536588],\n",
" [ 505575.09158582],\n",
" [ 318678.94228073],\n",
" [ 216558.78229908],\n",
" [ 233490.17904287],\n",
" [ 394075.65651668],\n",
" [ 295760.63169678],\n",
" [ 258795.98327875],\n",
" [ 346353.26728856],\n",
" [ 305592.11589469],\n",
" [ 443578.56014568],\n",
" [ 300577.82651562],\n",
" [ 402095.62514628],\n",
" [ 513536.18280069],\n",
" [ 757593.69577602],\n",
" [ 287777.33771682],\n",
" [ 397881.4272336 ],\n",
" [ 414604.91098864],\n",
" [ 349552.42312819],\n",
" [ 534205.58125196],\n",
" [ 378854.88856884],\n",
" [ 348203.48930532],\n",
" [ 333984.13719815],\n",
" [ 330351.10957292],\n",
" [ 182753.04098048],\n",
" [ 329142.59091077],\n",
" [ 290053.63304579],\n",
" [ 309309.5704895 ],\n",
" [ 481568.04896803],\n",
" [ 270985.99175998],\n",
" [ 327735.05085456],\n",
" [ 237495.07611884],\n",
" [ 295905.1341674 ],\n",
" [ 267893.10128127],\n",
" [ 305518.51912627],\n",
" [ 289640.4947576 ],\n",
" [ 448016.13092096],\n",
" [ 448276.23951762],\n",
" [ 306562.53056792],\n",
" [ 763649.68167691],\n",
" [ 302348.80199401],\n",
" [ 316359.12444587],\n",
" [ 290899.54999829],\n",
" [ 231743.82621443],\n",
" [ 313404.81548641],\n",
" [ 278816.39312135],\n",
" [ 383639.70614492],\n",
" [ 300989.76671752],\n",
" [ 320213.59809987],\n",
" [ 327790.31029817],\n",
" [ 365619.94174106],\n",
" [ 405226.27145632],\n",
" [ 351115.20266379],\n",
" [ 301726.6340505 ],\n",
" [ 239057.85565444],\n",
" [ 623907.54383236],\n",
" [ 325824.388963 ],\n",
" [ 305965.91217926],\n",
" [ 349984.19009093],\n",
" [ 638635.09324109],\n",
" [ 624904.92801633],\n",
" [ 421648.73037838],\n",
" [ 290610.1806833 ],\n",
" [ 288601.57055477],\n",
" [ 504376.11184616],\n",
" [ 246333.26344267],\n",
" [ 390474.87202575],\n",
" [ 676486.10550918],\n",
" [ 217145.59058386],\n",
" [ 253652.38012152],\n",
" [ 874149.26336545],\n",
" [ 241282.41631858],\n",
" [ 300762.91295673],\n",
" [ 364474.75217731],\n",
" [ 819789.82291398],\n",
" [ 227915.8527324 ],\n",
" [ 333171.00574275],\n",
" [ 406022.1093964 ],\n",
" [ 424620.79041928],\n",
" [ 433925.27065205],\n",
" [ 369693.34834663],\n",
" [ 659209.432679 ],\n",
" [ 377446.15042634],\n",
" [ 489887.75954764],\n",
" [ 215155.31778014],\n",
" [ 369953.55013564],\n",
" [ 541589.4962969 ],\n",
" [ 270941.83369892],\n",
" [ 342407.81015845],\n",
" [ 259234.77608678],\n",
" [ 433939.99000574],\n",
" [ 277310.70667081],\n",
" [ 675738.22998601],\n",
" [ 376222.07869798],\n",
" [ 381358.95913228],\n",
" [ 366112.32656763],\n",
" [ 311543.39889473],\n",
" [ 236978.24663136],\n",
" [ 321370.9939401 ],\n",
" [ 412013.97636694],\n",
" [ 321355.16969248],\n",
" [ 318664.22292705],\n",
" [ 382338.23329227],\n",
" [ 581803.00342058],\n",
" [ 513506.74409331],\n",
" [ 635007.71538581],\n",
" [ 270912.39499155],\n",
" [ 316897.22979356],\n",
" [ 352344.72600475],\n",
" [ 256796.27651754],\n",
" [ 453674.42663487],\n",
" [ 321897.63353148],\n",
" [ 307478.51951008],\n",
" [ 268956.93948377],\n",
" [ 557884.64129867],\n",
" [1047884.98951607],\n",
" [ 440857.06977894],\n",
" [ 226951.61841929],\n",
" [ 350348.53224967],\n",
" [ 443134.68009882],\n",
" [ 277414.30467774],\n",
" [ 476949.14679571],\n",
" [ 286093.7277434 ],\n",
" [ 585186.00373445],\n",
" [ 441842.20380586],\n",
" [ 265771.94046669],\n",
" [ 318649.50357336],\n",
" [ 252423.12796196],\n",
" [ 321470.14026336],\n",
" [ 316360.32253217],\n",
" [ 318678.94228073],\n",
" [ 249636.38166299],\n",
" [ 342591.65463269],\n",
" [ 255418.53954247],\n",
" [ 911134.08274977],\n",
" [ 427920.19747611],\n",
" [ 528632.81740156],\n",
" [1170092.55572165],\n",
" [ 394119.81457774],\n",
" [ 306503.65315318],\n",
" [ 312892.06153619],\n",
" [ 233843.14863766],\n",
" [ 490663.22836409],\n",
" [ 459535.36171397],\n",
" [ 333969.41784446],\n",
" [ 282679.25692334],\n",
" [ 226111.82057552],\n",
" [ 538215.56556676],\n",
" [ 226111.82057552],\n",
" [ 352457.85116153],\n",
" [-161201.02176254],\n",
" [ 302303.81022042],\n",
" [ 323007.37024413],\n",
" [ 370455.47388471],\n",
" [ 287524.71253158],\n",
" [1060592.82741372],\n",
" [ 248909.9801574 ],\n",
" [ 288108.35201566],\n",
" [ 287812.79056791],\n",
" [ 229678.75855599],\n",
" [ 249386.81191776],\n",
" [ 390314.04386082],\n",
" [ 451537.07719892],\n",
" [ 528632.81740156],\n",
" [ 811858.17040649],\n",
" [ 302786.19855838],\n",
" [ 394915.92369922],\n",
" [ 390314.04386082],\n",
" [ 405425.39781537],\n",
" [1289225.09943457],\n",
" [ 349637.21447153],\n",
" [ 309215.24022367],\n",
" [ 347266.58887672],\n",
" [ 295166.06881919],\n",
" [ 301125.47078581],\n",
" [ 738397.19827818],\n",
" [ 494697.19977366],\n",
" [ 305130.09668038],\n",
" [ 278844.04749905],\n",
" [ 302727.32114364],\n",
" [ 310444.76356463],\n",
" [ 430134.21928882],\n",
" [ 309671.80782539],\n",
" [ 721669.74395088],\n",
" [ 597337.30576723],\n",
" [ 351846.05585305],\n",
" [ 704587.65260372],\n",
" [ 584703.8865779 ],\n",
" [ 293576.41296521],\n",
" [ 252313.71396867],\n",
" [ 718524.74266092],\n",
" [ 253189.43400231],\n",
" [ 302168.8466723 ],\n",
" [ 288735.60719799],\n",
" [ 498483.16389807],\n",
" [ 457172.49071198],\n",
" [ 333178.1247804 ],\n",
" [ 704558.21389635],\n",
" [ 468988.05558085],\n",
" [ 417037.85398029],\n",
" [ 245048.7518395 ],\n",
" [ 423113.73966605],\n",
" [ 354931.43920809],\n",
" [ 720067.89359305],\n",
" [ 749936.43382107],\n",
" [ 551719.98192466],\n",
" [ 441294.46634331],\n",
" [ 556690.74879784],\n",
" [ 592416.24176006],\n",
" [ 422202.56678133],\n",
" [ 318693.39045302],\n",
" [ 403430.30895423],\n",
" [ 604406.85270084],\n",
" [ 255244.59836449],\n",
" [ 374597.01387348],\n",
" [ 308150.766466 ],\n",
" [ 604268.00000018],\n",
" [ 616438.10869654],\n",
" [ 993142.48503895],\n",
" [ 395901.52706491],\n",
" [ 434344.3298916 ],\n",
" [ 485469.25484471],\n",
" [ 627355.43570469],\n",
" [ 310876.80170877],\n",
" [ 174618.27976904],\n",
" [ 447061.07115658],\n",
" [ 335585.89436361],\n",
" [ 327421.4232634 ],\n",
" [ 373803.94825239],\n",
" [ 373789.2288987 ],\n",
" [ 348443.52012806],\n",
" [ 372500.73495064],\n",
" [ 556630.4021154 ],\n",
" [ 553112.80262718],\n",
" [ 400867.67603802],\n",
" [ 569468.03490261],\n",
" [ 444493.62218294],\n",
" [ 514644.81461941],\n",
" [ 702403.53883714],\n",
" [ 608248.81678902],\n",
" [1162131.46450679],\n",
" [ 314668.1242534 ],\n",
" [ 344572.38851392],\n",
" [ 641419.64152483],\n",
" [ 406890.71039238],\n",
" [ 330256.77930708],\n",
" [ 406762.86588192],\n",
" [ 453663.96080749],\n",
" [1656917.6302292 ],\n",
" [ 449408.85843111],\n",
" [ 258707.30278287],\n",
" [ 345745.18314357],\n",
" [ 310109.76692089],\n",
" [ 424505.72665604],\n",
" [ 310095.0475672 ],\n",
" [ 401125.77300418],\n",
" [ 457988.13524458],\n",
" [ 550575.62607344],\n",
" [ 523303.70234502],\n",
" [ 506600.1301475 ],\n",
" [ 379014.11039314],\n",
" [ 352767.4964078 ],\n",
" [ 644515.6686963 ],\n",
" [ 537987.4698391 ]])"
]
},
"execution_count": 295,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results"
]
},
{
"cell_type": "code",
"execution_count": 301,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>373000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>299000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>365000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>369000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>483791.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>457</th>\n",
" <td>655544.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>458</th>\n",
" <td>471397.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>459</th>\n",
" <td>309958.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>460</th>\n",
" <td>699000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>461</th>\n",
" <td>850000.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>462 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 373000.00\n",
"1 299000.00\n",
"2 365000.00\n",
"3 369000.00\n",
"4 483791.00\n",
".. ...\n",
"457 655544.02\n",
"458 471397.97\n",
"459 309958.00\n",
"460 699000.00\n",
"461 850000.00\n",
"\n",
"[462 rows x 1 columns]"
]
},
"execution_count": 301,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test = pd.read_csv(\"./dev-0/expected.tsv\", header=None)\n",
"y_test"
]
},
{
"cell_type": "code",
"execution_count": 302,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6393762535622007"
]
},
"execution_count": 302,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r2_score(y_test, results)"
]
},
{
"cell_type": "code",
"execution_count": 303,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"71559.96181964973"
]
},
"execution_count": 303,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(y_test, results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predykcja dla zbioru testowego"
]
},
{
"cell_type": "code",
"execution_count": 317,
"metadata": {},
"outputs": [],
"source": [
"final_test_dataset = pd.read_csv(\"./test-A/in.tsv\", sep= \"\\t\", header=None)"
]
},
{
"cell_type": "code",
"execution_count": 318,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"COLUMN 15:\n",
"Value counts before changes:\n",
" 14\n",
"1 92\n",
"parter 70\n",
"3 68\n",
"4 64\n",
"2 61\n",
"5 15\n",
"6 11\n",
"7 7\n",
"10 5\n",
"> 10 5\n",
"9 4\n",
"8 2\n",
"suterena 1\n",
"Name: count, dtype: int64\n",
"Value counts after changes:\n",
" 14\n",
" 1 92\n",
" 2 74\n",
" 0 70\n",
" 3 68\n",
" 4 64\n",
" 5 15\n",
" 6 11\n",
" 7 7\n",
" 10 5\n",
" 11 5\n",
" 9 4\n",
" 8 2\n",
"-1 1\n",
"Name: count, dtype: int64\n",
"COLUMN 8:\n",
"0 61.99\n",
"1 64.00\n",
"2 51.15\n",
"3 45.77\n",
"4 44.36\n",
" ... \n",
"413 34.97\n",
"414 49.06\n",
"415 76.71\n",
"416 72.63\n",
"417 65.84\n",
"Name: 7, Length: 418, dtype: float64\n",
"COLUMN 6:\n",
"Value counts before changes:\n",
" 5\n",
"2 175\n",
"3 143\n",
"4 50\n",
"1 40\n",
"5 6\n",
"6 2\n",
"więcej niż 10 1\n",
"8 1\n",
"Name: count, dtype: int64\n",
"Value counts after changes:\n",
" 5\n",
"2 175\n",
"3 143\n",
"4 50\n",
"1 40\n",
"5 6\n",
"6 2\n",
"10 1\n",
"8 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"\n",
"# Preprocessing column 15:\n",
"print(\"COLUMN 15:\")\n",
"# Count the occurrence of unique values in column before preprocessing:\n",
"print(\"Value counts before changes:\\n\",final_test_dataset[14].value_counts())\n",
"\n",
"# Replace string to int or NaN:\n",
"final_test_dataset[14] = final_test_dataset[14].replace({\"parter\": 0, \"suterena\": -1, \"> 10\": 11, \"poddasze\": np.nan})\n",
"final_test_dataset[14] = final_test_dataset[14].apply(float)\n",
"\n",
"# Fill Nans with median:\n",
"final_test_dataset[14].fillna(final_test_dataset[14].median(), inplace=True)\n",
"final_test_dataset[14]= final_test_dataset[14].apply(int)\n",
"\n",
"# Count the occurrence of unique values in column after preprocessing:\n",
"print(\"Value counts after changes:\\n\",final_test_dataset[14].value_counts())\n",
"\n",
"# Preprocessing column 8:\n",
"print(\"COLUMN 8:\")\n",
"# Replace strings containing space to NaN:\n",
"final_test_dataset[7] = final_test_dataset[7].replace(' ', np.nan, regex=True)\n",
"\n",
"# Fill Nans with median:\n",
"final_test_dataset[7] = final_test_dataset[7].apply(float)\n",
"final_test_dataset[7].fillna(final_test_dataset[7].median(), inplace=True)\n",
"\n",
"print(final_test_dataset[7])\n",
"\n",
"# Preprocessing column 6:\n",
"print(\"COLUMN 6:\")\n",
"# Count the occurrence of unique values in column before preprocessing:\n",
"print(\"Value counts before changes:\\n\",final_test_dataset[5].value_counts())\n",
"\n",
"# Change string to 10:\n",
"final_test_dataset[5] = final_test_dataset[5].replace({\"więcej niż 10\": 10})\n",
"final_test_dataset[5] = final_test_dataset[5].apply(int)\n",
"\n",
"# Count the occurrence of unique values in column after preprocessing:\n",
"print(\"Value counts after changes:\\n\",final_test_dataset[5].value_counts())\n",
"\n",
"final_test_dataset[9].fillna(final_test_dataset[9].median(), inplace=True)\n",
"final_test_dataset[9] = final_test_dataset[9].apply(float)\n",
"\n",
"final_test_dataset = final_test_dataset[[5,7,9,14]]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 319,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>5</th>\n",
" <th>7</th>\n",
" <th>9</th>\n",
" <th>14</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>61.99</td>\n",
" <td>7.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>64.00</td>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>51.15</td>\n",
" <td>5.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>45.77</td>\n",
" <td>7.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>44.36</td>\n",
" <td>13.0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>413</th>\n",
" <td>1</td>\n",
" <td>34.97</td>\n",
" <td>8.0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>414</th>\n",
" <td>3</td>\n",
" <td>49.06</td>\n",
" <td>3.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>3</td>\n",
" <td>76.71</td>\n",
" <td>5.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>3</td>\n",
" <td>72.63</td>\n",
" <td>5.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>417</th>\n",
" <td>2</td>\n",
" <td>65.84</td>\n",
" <td>10.0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>418 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" 5 7 9 14\n",
"0 3 61.99 7.0 2\n",
"1 4 64.00 4.0 0\n",
"2 3 51.15 5.0 0\n",
"3 2 45.77 7.0 2\n",
"4 2 44.36 13.0 5\n",
".. .. ... ... ..\n",
"413 1 34.97 8.0 4\n",
"414 3 49.06 3.0 3\n",
"415 3 76.71 5.0 3\n",
"416 3 72.63 5.0 3\n",
"417 2 65.84 10.0 3\n",
"\n",
"[418 rows x 4 columns]"
]
},
"execution_count": 319,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_test_dataset"
]
},
{
"cell_type": "code",
"execution_count": 320,
"metadata": {},
"outputs": [],
"source": [
"final_results = reg.predict(final_test_dataset)"
]
},
{
"cell_type": "code",
"execution_count": 321,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>426282.351904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>389890.897311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>334372.288463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>341143.667679</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>346709.875023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>413</th>\n",
" <td>301974.734528</td>\n",
" </tr>\n",
" <tr>\n",
" <th>414</th>\n",
" <td>312195.369919</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>537901.937976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>416</th>\n",
" <td>505420.685819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>417</th>\n",
" <td>509311.081663</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>418 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 426282.351904\n",
"1 389890.897311\n",
"2 334372.288463\n",
"3 341143.667679\n",
"4 346709.875023\n",
".. ...\n",
"413 301974.734528\n",
"414 312195.369919\n",
"415 537901.937976\n",
"416 505420.685819\n",
"417 509311.081663\n",
"\n",
"[418 rows x 1 columns]"
]
},
"execution_count": 321,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(final_results)"
]
},
{
"cell_type": "code",
"execution_count": 322,
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame(final_results).to_csv(\"./test-A/out.tsv\", sep='\\t', index=False, header=None)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
},
"vscode": {
"interpreter": {
"hash": "1b132c2ed43285dcf39f6d01712959169a14a721cf314fe69015adab49bb1fd1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}