{ "cells": [ { "cell_type": "code", "execution_count": 316, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from statistics import mean,median\n", "import re\n", "import numpy as np\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Wczytanie datasetów" ] }, { "cell_type": "code", "execution_count": 223, "metadata": {}, "outputs": [], "source": [ "train_dataset = pd.read_csv(\"./train/train.tsv\", sep = \"\\t\", header=None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data exploration " ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...16171819202122232425
0309000.0do zamieszkania390 złspółdzielcze własnościowe7113https://www.otodom.pl/oferta/niezalezny-uklad-...2NaN43.44wtórny...NaNgazoweplastikoweNaNNaNNaNcegłaPolecamy na sprzedaż dwupokojowe mieszkanie p...NaNtelewizja kablowa, internet, meble, piwnica, g...
\n", "

1 rows × 26 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 \\\n", "0 309000.0 do zamieszkania 390 zł spółdzielcze własnościowe 7113 \n", "\n", " 5 6 7 8 9 \\\n", "0 https://www.otodom.pl/oferta/niezalezny-uklad-... 2 NaN 43.44 wtórny \n", "\n", " ... 16 17 18 19 20 21 22 \\\n", "0 ... NaN gazowe plastikowe NaN NaN NaN cegła \n", "\n", " 23 24 \\\n", "0 Polecamy na sprzedaż dwupokojowe mieszkanie p... NaN \n", "\n", " 25 \n", "0 telewizja kablowa, internet, meble, piwnica, g... \n", "\n", "[1 rows x 26 columns]" ] }, "execution_count": 188, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dataset.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Wczytywanie danych testowych i preprocessing jak na treningu" ] }, { "cell_type": "code", "execution_count": 243, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "COLUMN 15:\n", "Value counts before changes:\n", " 15\n", " 1 569\n", " 2 527\n", " 0 452\n", " 4 357\n", " 3 321\n", " 5 117\n", " 6 51\n", " 7 42\n", " 8 32\n", " 10 29\n", " 11 24\n", " 9 21\n", "-1 5\n", "Name: count, dtype: int64\n", "Value counts after changes:\n", " 15\n", " 1 569\n", " 2 527\n", " 0 452\n", " 4 357\n", " 3 321\n", " 5 117\n", " 6 51\n", " 7 42\n", " 8 32\n", " 10 29\n", " 11 24\n", " 9 21\n", "-1 5\n", "Name: count, dtype: int64\n", "COLUMN 8:\n", "0 43.44\n", "1 42.60\n", "2 44.30\n", "3 88.00\n", "4 77.00\n", " ... 
\n", "2542 94.00\n", "2543 53.50\n", "2544 55.25\n", "2545 62.00\n", "2546 392.00\n", "Name: 8, Length: 2547, dtype: float64\n", "COLUMN 6:\n", "Value counts before changes:\n", " 6\n", "2 1014\n", "3 878\n", "4 293\n", "1 271\n", "5 64\n", "6 13\n", "7 7\n", "10 6\n", "9 1\n", "Name: count, dtype: int64\n", "Value counts after changes:\n", " 6\n", "2 1014\n", "3 878\n", "4 293\n", "1 271\n", "5 64\n", "6 13\n", "7 7\n", "10 6\n", "9 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "\n", "# Clean the training columns used by the model (6, 8, 10, 15) and keep them together with column 0.\n", "# Every fillna result is assigned back to its column: fillna(..., inplace=True) on train_dataset[col]\n", "# is chained assignment, which recent pandas deprecates and which stops updating the frame under Copy-on-Write.\n", "\n", "# Preprocessing column 15:\n", "print(\"COLUMN 15:\")\n", "# Count the occurrence of unique values in column before preprocessing:\n", "print(\"Value counts before changes:\\n\",train_dataset[15].value_counts())\n", "\n", "# Map the text labels to numbers; 'poddasze' becomes NaN and is imputed with the median below:\n", "train_dataset[15] = train_dataset[15].replace({\"parter\": 0, \"suterena\": -1, \"> 10\": 11, \"poddasze\": np.nan})\n", "train_dataset[15] = train_dataset[15].apply(float)\n", "\n", "# Fill NaNs with the column median (result assigned back, not modified in place):\n", "train_dataset[15] = train_dataset[15].fillna(train_dataset[15].median())\n", "train_dataset[15]= train_dataset[15].apply(int)\n", "\n", "# Count the occurrence of unique values in column after preprocessing:\n", "print(\"Value counts after changes:\\n\",train_dataset[15].value_counts())\n", "\n", "# Preprocessing column 8:\n", "print(\"COLUMN 8:\")\n", "# Replace values containing a space with NaN:\n", "train_dataset[8] = train_dataset[8].replace(' ', np.nan, regex=True)\n", "\n", "# Cast to float, then fill NaNs with the column median:\n", "train_dataset[8] = train_dataset[8].apply(float)\n", "train_dataset[8] = train_dataset[8].fillna(train_dataset[8].median())\n", "\n", "print(train_dataset[8])\n", "\n", "# Preprocessing column 6:\n", "print(\"COLUMN 6:\")\n", "# Count the occurrence of unique values in column before preprocessing:\n", "print(\"Value counts before changes:\\n\",train_dataset[6].value_counts())\n", "\n", "# Replace the text label with 10, then cast the whole column to int:\n", "train_dataset[6] = train_dataset[6].replace({\"więcej niż 10\": 10})\n", "train_dataset[6] = 
train_dataset[6].apply(int)\n", "\n", "# Count the occurrence of unique values in column after preprocessing:\n", "print(\"Value counts after changes:\\n\",train_dataset[6].value_counts())\n", "\n", "# Column 10: fill NaNs with the column median, then cast to float:\n", "train_dataset[10] = train_dataset[10].fillna(train_dataset[10].median())\n", "train_dataset[10] = train_dataset[10].apply(float)\n", "\n", "# Keep only the target (0) and the four cleaned feature columns:\n", "train_dataset = train_dataset[[0,6,8,10,15]]\n", "\n" ] }, { "cell_type": "code", "execution_count": 275, "metadata": {}, "outputs": [], "source": [ "test_dataset = pd.read_csv(\"./dev-0/in.tsv\", sep= \"\\t\", header=None)" ] }, { "cell_type": "code", "execution_count": 278, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "COLUMN 15:\n", "Value counts before changes:\n", " 14\n", " 1 108\n", " 2 89\n", " 0 82\n", " 4 65\n", " 3 54\n", " 5 22\n", " 6 12\n", " 7 9\n", " 11 9\n", " 10 5\n", " 8 3\n", "-1 2\n", " 9 2\n", "Name: count, dtype: int64\n", "Value counts after changes:\n", " 14\n", " 1 108\n", " 2 89\n", " 0 82\n", " 4 65\n", " 3 54\n", " 5 22\n", " 6 12\n", " 7 9\n", " 11 9\n", " 10 5\n", " 8 3\n", "-1 2\n", " 9 2\n", "Name: count, dtype: int64\n", "COLUMN 8:\n", "0 59.10\n", "1 38.00\n", "2 63.84\n", "3 50.00\n", "4 65.62\n", " ... 
\n", "457 72.78\n", "458 51.23\n", "459 54.16\n", "460 90.10\n", "461 71.90\n", "Name: 7, Length: 462, dtype: float64\n", "COLUMN 6:\n", "Value counts before changes:\n", " 5\n", "2 196\n", "3 152\n", "1 51\n", "4 50\n", "5 9\n", "6 4\n", "Name: count, dtype: int64\n", "Value counts after changes:\n", " 5\n", "2 196\n", "3 152\n", "1 51\n", "4 50\n", "5 9\n", "6 4\n", "Name: count, dtype: int64\n" ] } ], "source": [ "\n", "# Apply the training-style cleaning to test_dataset. The columns handled here are 14, 7, 5 and 9,\n", "# i.e. one index lower than the 15, 8, 6 and 10 cleaned in train_dataset; the printed labels keep the training numbers.\n", "# Every fillna result is assigned back to its column: fillna(..., inplace=True) on test_dataset[col]\n", "# is chained assignment, which recent pandas deprecates and which stops updating the frame under Copy-on-Write.\n", "# NOTE(review): the fill values are this frame's own medians, not the training medians.\n", "\n", "# Preprocessing column 14 (printed as column 15):\n", "print(\"COLUMN 15:\")\n", "# Count the occurrence of unique values in column before preprocessing:\n", "print(\"Value counts before changes:\\n\",test_dataset[14].value_counts())\n", "\n", "# Map the text labels to numbers; 'poddasze' becomes NaN and is imputed with the median below:\n", "test_dataset[14] = test_dataset[14].replace({\"parter\": 0, \"suterena\": -1, \"> 10\": 11, \"poddasze\": np.nan})\n", "test_dataset[14] = test_dataset[14].apply(float)\n", "\n", "# Fill NaNs with the column median (result assigned back, not modified in place):\n", "test_dataset[14] = test_dataset[14].fillna(test_dataset[14].median())\n", "test_dataset[14]= test_dataset[14].apply(int)\n", "\n", "# Count the occurrence of unique values in column after preprocessing:\n", "print(\"Value counts after changes:\\n\",test_dataset[14].value_counts())\n", "\n", "# Preprocessing column 7 (printed as column 8):\n", "print(\"COLUMN 8:\")\n", "# Replace values containing a space with NaN:\n", "test_dataset[7] = test_dataset[7].replace(' ', np.nan, regex=True)\n", "\n", "# Cast to float, then fill NaNs with the column median:\n", "test_dataset[7] = test_dataset[7].apply(float)\n", "test_dataset[7] = test_dataset[7].fillna(test_dataset[7].median())\n", "\n", "print(test_dataset[7])\n", "\n", "# Preprocessing column 5 (printed as column 6):\n", "print(\"COLUMN 6:\")\n", "# Count the occurrence of unique values in column before preprocessing:\n", "print(\"Value counts before changes:\\n\",test_dataset[5].value_counts())\n", "\n", "# Replace the text label with 10, then cast the whole column to int:\n", "test_dataset[5] = test_dataset[5].replace({\"więcej niż 10\": 10})\n", "test_dataset[5] = test_dataset[5].apply(int)\n", "\n", "# Count the occurrence of unique values in column after 
preprocessing:\n", "print(\"Value counts after changes:\\n\",test_dataset[5].value_counts())\n", "\n", "# Column 9: fill NaNs with the column median, then cast to float:\n", "test_dataset[9] = test_dataset[9].fillna(test_dataset[9].median())\n", "test_dataset[9] = test_dataset[9].apply(float)\n", "\n", "# Keep only the four cleaned feature columns:\n", "test_dataset = test_dataset[[5,7,9,14]]\n", "\n" ] }, { "cell_type": "code", "execution_count": 305, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
57914
0359.14.02
\n", "
" ], "text/plain": [ " 5 7 9 14\n", "0 3 59.1 4.0 2" ] }, "execution_count": 305, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_dataset.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 234, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "code", "execution_count": 291, "metadata": {}, "outputs": [], "source": [ "# Split the preprocessed training frame into the feature columns and the target (column 0).\n", "X_train = train_dataset.drop(0,axis=1)\n", "y_train = train_dataset[[0]]\n", "\n", "# Fit the scaler on the training features defined above.\n", "# NOTE(review): trans_data is not passed to the regression fitted below, which trains on the unscaled X_train.\n", "scaler = StandardScaler()\n", "trans_data = scaler.fit_transform(X_train)" ] }, { "cell_type": "code", "execution_count": 292, "metadata": {}, "outputs": [], "source": [ "X_test = test_dataset" ] }, { "cell_type": "code", "execution_count": 293, "metadata": {}, "outputs": [], "source": [ "reg = LinearRegression()" ] }, { "cell_type": "code", "execution_count": 294, "metadata": {}, "outputs": [], "source": [ "# Fit on the unscaled training features and predict for the dev-0 rows held in X_test.\n", "# NOTE(review): X_train is labelled 6/8/10/15 and X_test 5/7/9/14, so the features presumably line up by position only - confirm.\n", "reg.fit(X_train, y_train)\n", "results = reg.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 265, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "# mean_absolute_error is called in the evaluation cells further down, so it is imported here together with r2_score.\n", "from sklearn.metrics import mean_absolute_error, r2_score\n", "# pickle.dump(reg, open(\"model.pkl\", \"wb\"))" ] }, { "cell_type": "code", "execution_count": 295, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 394901.20434554],\n", " [ 293271.41755997],\n", " [ 432666.21541136],\n", " [ 295330.00173591],\n", " [ 444001.60173013],\n", " [ 595102.64364947],\n", " [ 306562.53056792],\n", " [ 346367.98664224],\n", " [ 320708.28590668],\n", " [ 355678.48101873],\n", " [ 272087.02326267],\n", " [ 315111.17058773],\n", " [ 408743.59976314],\n", " [ 392452.80148004],\n", " [ 351776.80580328],\n", " [ 325340.80253875],\n", " [ 324415.74401286],\n", " [ 286605.64798109],\n", " [ 354409.52248178],\n", " [ 273703.59297418],\n", " [ 343402.57630023],\n", " [ 285271.79788568],\n", " [ 
370401.6837088 ],\n", " [ 482522.18182752],\n", " [ 394119.81457774],\n", " [ 474686.42204592],\n", " [ 627962.51992074],\n", " [ 334221.5006669 ],\n", " [ 397338.23464708],\n", " [ 414336.53457982],\n", " [ 335485.00759125],\n", " [ 351363.03195993],\n", " [ 384755.53002533],\n", " [ 374013.34228146],\n", " [ 384755.53002533],\n", " [ 355882.69461662],\n", " [ 521392.20674093],\n", " [ 425546.58946533],\n", " [ 294138.97474644],\n", " [ 539668.46177031],\n", " [ 340107.2565533 ],\n", " [ 467566.06735236],\n", " [ 228315.74093461],\n", " [ 373516.14139746],\n", " [ 572962.98245529],\n", " [ 425442.35590324],\n", " [ 264688.02449027],\n", " [ 321384.60839985],\n", " [ 313697.70958017],\n", " [ 257339.46910406],\n", " [ 285491.92892354],\n", " [ 265315.27967261],\n", " [ 269349.61545595],\n", " [ 370207.10222578],\n", " [ 505152.41437514],\n", " [ 326640.0334956 ],\n", " [ 361868.73382815],\n", " [ 641520.72645455],\n", " [ 513506.74409331],\n", " [ 225524.54295198],\n", " [ 237226.80467502],\n", " [ 453176.39203834],\n", " [ 261995.60845714],\n", " [ 955187.2509814 ],\n", " [ 492991.37526251],\n", " [ 374938.40080734],\n", " [ 774416.69022809],\n", " [ 523696.94084834],\n", " [ 434831.26310559],\n", " [ 489623.47044873],\n", " [ 280423.33071801],\n", " [ 264688.02449027],\n", " [ 280814.31555288],\n", " [ 359688.46533354],\n", " [ 314210.82790415],\n", " [ 622185.54247246],\n", " [ 448643.38610329],\n", " [ 561068.71341085],\n", " [ 331068.80586855],\n", " [ 387502.56994691],\n", " [ 251024.38630808],\n", " [ 295949.29222845],\n", " [ 311115.63444521],\n", " [ 307926.65308324],\n", " [ 268121.19700892],\n", " [ 329516.02282157],\n", " [ 260145.38644037],\n", " [ 402080.9057926 ],\n", " [ 247029.12134696],\n", " [ 293241.9788526 ],\n", " [ 968741.83954407],\n", " [ 463714.83552309],\n", " [ 458749.5202625 ],\n", " [ 467566.06735236],\n", " [ 251228.59990596],\n", " [ 367839.0507926 ],\n", " [ 178563.73689914],\n", " [ 404748.33480203],\n", " [ 361629.76401877],\n", " 
[ 335003.99532863],\n", " [ 468590.27220151],\n", " [ 342562.21592532],\n", " [ 371236.86365252],\n", " [ 310876.80170877],\n", " [ 276928.20517629],\n", " [ 273106.88139315],\n", " [ 517720.942006 ],\n", " [ 309671.80782539],\n", " [ 321349.71807989],\n", " [ 346542.76153275],\n", " [ 374434.93476654],\n", " [ 919566.5541124 ],\n", " [ 295586.59732644],\n", " [ 394954.99452145],\n", " [ 259707.15616347],\n", " [ 296950.34369535],\n", " [ 390519.03008681],\n", " [ 338032.92115378],\n", " [ 279645.24385936],\n", " [ 394134.53393142],\n", " [ 279107.13851167],\n", " [ 454032.21228709],\n", " [ 959467.06920008],\n", " [ 683669.88249351],\n", " [ 371046.82704552],\n", " [ 576981.26372572],\n", " [ 294170.56215725],\n", " [ 443250.48438223],\n", " [ 283176.45780734],\n", " [ 434794.43417916],\n", " [ 344911.47246756],\n", " [ 272235.14370441],\n", " [ 824501.58608444],\n", " [ 334396.91111258],\n", " [ 350120.43652201],\n", " [ 253527.41306202],\n", " [ 293854.69267028],\n", " [ 291547.17430427],\n", " [ 270927.11434524],\n", " [ 383959.89024261],\n", " [ 656199.3017448 ],\n", " [ 553341.638875 ],\n", " [ 432687.22009017],\n", " [ 526139.69394388],\n", " [ 447026.81639178],\n", " [ 274087.19936264],\n", " [ 397721.29867274],\n", " [ 491976.34496209],\n", " [ 334112.72222878],\n", " [ 564079.20871881],\n", " [ 262283.77968584],\n", " [ 325888.17562753],\n", " [ 707612.5028916 ],\n", " [ 392111.24832978],\n", " [ 498668.84201377],\n", " [ 287777.33771682],\n", " [ 269508.93047261],\n", " [ 438284.33540911],\n", " [ 325678.41722469],\n", " [ 451638.79768381],\n", " [ 313355.0076554 ],\n", " [ 507395.87489522],\n", " [ 319663.80512625],\n", " [ 334586.40535677],\n", " [ 336983.25994216],\n", " [ 241704.55116646],\n", " [ 551645.5514437 ],\n", " [ 263184.95608194],\n", " [ 353876.49260028],\n", " [ 261995.60845714],\n", " [ 310876.80170877],\n", " [ 350329.83055109],\n", " [ 322549.60455611],\n", " [ 394105.09522405],\n", " [ 553247.67298293],\n", " [ 318793.17233144],\n", " 
[ 467137.74037172],\n", " [ 242785.12035314],\n", " [ 382352.68146455],\n", " [ 327735.05085456],\n", " [ 517343.1516039 ],\n", " [ 290640.3481382 ],\n", " [ 219340.07698546],\n", " [ 298372.97925164],\n", " [ 194674.57986053],\n", " [ 273246.9321801 ],\n", " [ 332307.2208042 ],\n", " [ 264140.01584633],\n", " [ 457026.61216604],\n", " [ 395940.59788713],\n", " [ 290989.89791922],\n", " [ 473556.78554839],\n", " [ 318678.94228073],\n", " [ 485526.48500272],\n", " [ 325262.02533913],\n", " [ 481691.81794123],\n", " [ 339771.01794269],\n", " [ 367305.38535593],\n", " [ 443173.11536588],\n", " [ 505575.09158582],\n", " [ 318678.94228073],\n", " [ 216558.78229908],\n", " [ 233490.17904287],\n", " [ 394075.65651668],\n", " [ 295760.63169678],\n", " [ 258795.98327875],\n", " [ 346353.26728856],\n", " [ 305592.11589469],\n", " [ 443578.56014568],\n", " [ 300577.82651562],\n", " [ 402095.62514628],\n", " [ 513536.18280069],\n", " [ 757593.69577602],\n", " [ 287777.33771682],\n", " [ 397881.4272336 ],\n", " [ 414604.91098864],\n", " [ 349552.42312819],\n", " [ 534205.58125196],\n", " [ 378854.88856884],\n", " [ 348203.48930532],\n", " [ 333984.13719815],\n", " [ 330351.10957292],\n", " [ 182753.04098048],\n", " [ 329142.59091077],\n", " [ 290053.63304579],\n", " [ 309309.5704895 ],\n", " [ 481568.04896803],\n", " [ 270985.99175998],\n", " [ 327735.05085456],\n", " [ 237495.07611884],\n", " [ 295905.1341674 ],\n", " [ 267893.10128127],\n", " [ 305518.51912627],\n", " [ 289640.4947576 ],\n", " [ 448016.13092096],\n", " [ 448276.23951762],\n", " [ 306562.53056792],\n", " [ 763649.68167691],\n", " [ 302348.80199401],\n", " [ 316359.12444587],\n", " [ 290899.54999829],\n", " [ 231743.82621443],\n", " [ 313404.81548641],\n", " [ 278816.39312135],\n", " [ 383639.70614492],\n", " [ 300989.76671752],\n", " [ 320213.59809987],\n", " [ 327790.31029817],\n", " [ 365619.94174106],\n", " [ 405226.27145632],\n", " [ 351115.20266379],\n", " [ 301726.6340505 ],\n", " [ 239057.85565444],\n", 
" [ 623907.54383236],\n", " [ 325824.388963 ],\n", " [ 305965.91217926],\n", " [ 349984.19009093],\n", " [ 638635.09324109],\n", " [ 624904.92801633],\n", " [ 421648.73037838],\n", " [ 290610.1806833 ],\n", " [ 288601.57055477],\n", " [ 504376.11184616],\n", " [ 246333.26344267],\n", " [ 390474.87202575],\n", " [ 676486.10550918],\n", " [ 217145.59058386],\n", " [ 253652.38012152],\n", " [ 874149.26336545],\n", " [ 241282.41631858],\n", " [ 300762.91295673],\n", " [ 364474.75217731],\n", " [ 819789.82291398],\n", " [ 227915.8527324 ],\n", " [ 333171.00574275],\n", " [ 406022.1093964 ],\n", " [ 424620.79041928],\n", " [ 433925.27065205],\n", " [ 369693.34834663],\n", " [ 659209.432679 ],\n", " [ 377446.15042634],\n", " [ 489887.75954764],\n", " [ 215155.31778014],\n", " [ 369953.55013564],\n", " [ 541589.4962969 ],\n", " [ 270941.83369892],\n", " [ 342407.81015845],\n", " [ 259234.77608678],\n", " [ 433939.99000574],\n", " [ 277310.70667081],\n", " [ 675738.22998601],\n", " [ 376222.07869798],\n", " [ 381358.95913228],\n", " [ 366112.32656763],\n", " [ 311543.39889473],\n", " [ 236978.24663136],\n", " [ 321370.9939401 ],\n", " [ 412013.97636694],\n", " [ 321355.16969248],\n", " [ 318664.22292705],\n", " [ 382338.23329227],\n", " [ 581803.00342058],\n", " [ 513506.74409331],\n", " [ 635007.71538581],\n", " [ 270912.39499155],\n", " [ 316897.22979356],\n", " [ 352344.72600475],\n", " [ 256796.27651754],\n", " [ 453674.42663487],\n", " [ 321897.63353148],\n", " [ 307478.51951008],\n", " [ 268956.93948377],\n", " [ 557884.64129867],\n", " [1047884.98951607],\n", " [ 440857.06977894],\n", " [ 226951.61841929],\n", " [ 350348.53224967],\n", " [ 443134.68009882],\n", " [ 277414.30467774],\n", " [ 476949.14679571],\n", " [ 286093.7277434 ],\n", " [ 585186.00373445],\n", " [ 441842.20380586],\n", " [ 265771.94046669],\n", " [ 318649.50357336],\n", " [ 252423.12796196],\n", " [ 321470.14026336],\n", " [ 316360.32253217],\n", " [ 318678.94228073],\n", " [ 249636.38166299],\n", 
" [ 342591.65463269],\n", " [ 255418.53954247],\n", " [ 911134.08274977],\n", " [ 427920.19747611],\n", " [ 528632.81740156],\n", " [1170092.55572165],\n", " [ 394119.81457774],\n", " [ 306503.65315318],\n", " [ 312892.06153619],\n", " [ 233843.14863766],\n", " [ 490663.22836409],\n", " [ 459535.36171397],\n", " [ 333969.41784446],\n", " [ 282679.25692334],\n", " [ 226111.82057552],\n", " [ 538215.56556676],\n", " [ 226111.82057552],\n", " [ 352457.85116153],\n", " [-161201.02176254],\n", " [ 302303.81022042],\n", " [ 323007.37024413],\n", " [ 370455.47388471],\n", " [ 287524.71253158],\n", " [1060592.82741372],\n", " [ 248909.9801574 ],\n", " [ 288108.35201566],\n", " [ 287812.79056791],\n", " [ 229678.75855599],\n", " [ 249386.81191776],\n", " [ 390314.04386082],\n", " [ 451537.07719892],\n", " [ 528632.81740156],\n", " [ 811858.17040649],\n", " [ 302786.19855838],\n", " [ 394915.92369922],\n", " [ 390314.04386082],\n", " [ 405425.39781537],\n", " [1289225.09943457],\n", " [ 349637.21447153],\n", " [ 309215.24022367],\n", " [ 347266.58887672],\n", " [ 295166.06881919],\n", " [ 301125.47078581],\n", " [ 738397.19827818],\n", " [ 494697.19977366],\n", " [ 305130.09668038],\n", " [ 278844.04749905],\n", " [ 302727.32114364],\n", " [ 310444.76356463],\n", " [ 430134.21928882],\n", " [ 309671.80782539],\n", " [ 721669.74395088],\n", " [ 597337.30576723],\n", " [ 351846.05585305],\n", " [ 704587.65260372],\n", " [ 584703.8865779 ],\n", " [ 293576.41296521],\n", " [ 252313.71396867],\n", " [ 718524.74266092],\n", " [ 253189.43400231],\n", " [ 302168.8466723 ],\n", " [ 288735.60719799],\n", " [ 498483.16389807],\n", " [ 457172.49071198],\n", " [ 333178.1247804 ],\n", " [ 704558.21389635],\n", " [ 468988.05558085],\n", " [ 417037.85398029],\n", " [ 245048.7518395 ],\n", " [ 423113.73966605],\n", " [ 354931.43920809],\n", " [ 720067.89359305],\n", " [ 749936.43382107],\n", " [ 551719.98192466],\n", " [ 441294.46634331],\n", " [ 556690.74879784],\n", " [ 
592416.24176006],\n", " [ 422202.56678133],\n", " [ 318693.39045302],\n", " [ 403430.30895423],\n", " [ 604406.85270084],\n", " [ 255244.59836449],\n", " [ 374597.01387348],\n", " [ 308150.766466 ],\n", " [ 604268.00000018],\n", " [ 616438.10869654],\n", " [ 993142.48503895],\n", " [ 395901.52706491],\n", " [ 434344.3298916 ],\n", " [ 485469.25484471],\n", " [ 627355.43570469],\n", " [ 310876.80170877],\n", " [ 174618.27976904],\n", " [ 447061.07115658],\n", " [ 335585.89436361],\n", " [ 327421.4232634 ],\n", " [ 373803.94825239],\n", " [ 373789.2288987 ],\n", " [ 348443.52012806],\n", " [ 372500.73495064],\n", " [ 556630.4021154 ],\n", " [ 553112.80262718],\n", " [ 400867.67603802],\n", " [ 569468.03490261],\n", " [ 444493.62218294],\n", " [ 514644.81461941],\n", " [ 702403.53883714],\n", " [ 608248.81678902],\n", " [1162131.46450679],\n", " [ 314668.1242534 ],\n", " [ 344572.38851392],\n", " [ 641419.64152483],\n", " [ 406890.71039238],\n", " [ 330256.77930708],\n", " [ 406762.86588192],\n", " [ 453663.96080749],\n", " [1656917.6302292 ],\n", " [ 449408.85843111],\n", " [ 258707.30278287],\n", " [ 345745.18314357],\n", " [ 310109.76692089],\n", " [ 424505.72665604],\n", " [ 310095.0475672 ],\n", " [ 401125.77300418],\n", " [ 457988.13524458],\n", " [ 550575.62607344],\n", " [ 523303.70234502],\n", " [ 506600.1301475 ],\n", " [ 379014.11039314],\n", " [ 352767.4964078 ],\n", " [ 644515.6686963 ],\n", " [ 537987.4698391 ]])" ] }, "execution_count": 295, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results" ] }, { "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0373000.00
1299000.00
2365000.00
3369000.00
4483791.00
......
457655544.02
458471397.97
459309958.00
460699000.00
461850000.00
\n", "

462 rows × 1 columns

\n", "
" ], "text/plain": [ " 0\n", "0 373000.00\n", "1 299000.00\n", "2 365000.00\n", "3 369000.00\n", "4 483791.00\n", ".. ...\n", "457 655544.02\n", "458 471397.97\n", "459 309958.00\n", "460 699000.00\n", "461 850000.00\n", "\n", "[462 rows x 1 columns]" ] }, "execution_count": 301, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test = pd.read_csv(\"./dev-0/expected.tsv\", header=None)\n", "y_test" ] }, { "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6393762535622007" ] }, "execution_count": 302, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r2_score(y_test, results)" ] }, { "cell_type": "code", "execution_count": 303, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "71559.96181964973" ] }, "execution_count": 303, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mean_absolute_error(y_test, results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predykcja dla zbioru testowego" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [], "source": [ "final_test_dataset = pd.read_csv(\"./test-A/in.tsv\", sep= \"\\t\", header=None)" ] }, { "cell_type": "code", "execution_count": 318, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "COLUMN 15:\n", "Value counts before changes:\n", " 14\n", "1 92\n", "parter 70\n", "3 68\n", "4 64\n", "2 61\n", "5 15\n", "6 11\n", "7 7\n", "10 5\n", "> 10 5\n", "9 4\n", "8 2\n", "suterena 1\n", "Name: count, dtype: int64\n", "Value counts after changes:\n", " 14\n", " 1 92\n", " 2 74\n", " 0 70\n", " 3 68\n", " 4 64\n", " 5 15\n", " 6 11\n", " 7 7\n", " 10 5\n", " 11 5\n", " 9 4\n", " 8 2\n", "-1 1\n", "Name: count, dtype: int64\n", "COLUMN 8:\n", "0 61.99\n", "1 64.00\n", "2 51.15\n", "3 45.77\n", "4 44.36\n", " ... 
\n", "413 34.97\n", "414 49.06\n", "415 76.71\n", "416 72.63\n", "417 65.84\n", "Name: 7, Length: 418, dtype: float64\n", "COLUMN 6:\n", "Value counts before changes:\n", " 5\n", "2 175\n", "3 143\n", "4 50\n", "1 40\n", "5 6\n", "6 2\n", "więcej niż 10 1\n", "8 1\n", "Name: count, dtype: int64\n", "Value counts after changes:\n", " 5\n", "2 175\n", "3 143\n", "4 50\n", "1 40\n", "5 6\n", "6 2\n", "10 1\n", "8 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "\n", "# Apply the training-style cleaning to final_test_dataset. The columns handled here are 14, 7, 5 and 9,\n", "# i.e. one index lower than the 15, 8, 6 and 10 cleaned in train_dataset; the printed labels keep the training numbers.\n", "# Every fillna result is assigned back to its column: fillna(..., inplace=True) on final_test_dataset[col]\n", "# is chained assignment, which recent pandas deprecates and which stops updating the frame under Copy-on-Write.\n", "# NOTE(review): the fill values are this frame's own medians, not the training medians.\n", "\n", "# Preprocessing column 14 (printed as column 15):\n", "print(\"COLUMN 15:\")\n", "# Count the occurrence of unique values in column before preprocessing:\n", "print(\"Value counts before changes:\\n\",final_test_dataset[14].value_counts())\n", "\n", "# Map the text labels to numbers; 'poddasze' becomes NaN and is imputed with the median below:\n", "final_test_dataset[14] = final_test_dataset[14].replace({\"parter\": 0, \"suterena\": -1, \"> 10\": 11, \"poddasze\": np.nan})\n", "final_test_dataset[14] = final_test_dataset[14].apply(float)\n", "\n", "# Fill NaNs with the column median (result assigned back, not modified in place):\n", "final_test_dataset[14] = final_test_dataset[14].fillna(final_test_dataset[14].median())\n", "final_test_dataset[14]= final_test_dataset[14].apply(int)\n", "\n", "# Count the occurrence of unique values in column after preprocessing:\n", "print(\"Value counts after changes:\\n\",final_test_dataset[14].value_counts())\n", "\n", "# Preprocessing column 7 (printed as column 8):\n", "print(\"COLUMN 8:\")\n", "# Replace values containing a space with NaN:\n", "final_test_dataset[7] = final_test_dataset[7].replace(' ', np.nan, regex=True)\n", "\n", "# Cast to float, then fill NaNs with the column median:\n", "final_test_dataset[7] = final_test_dataset[7].apply(float)\n", "final_test_dataset[7] = final_test_dataset[7].fillna(final_test_dataset[7].median())\n", "\n", "print(final_test_dataset[7])\n", "\n", "# Preprocessing column 5 (printed as column 6):\n", "print(\"COLUMN 6:\")\n", "# Count the occurrence of unique values in column before preprocessing:\n", "print(\"Value counts before changes:\\n\",final_test_dataset[5].value_counts())\n", "\n", "# Replace the text label with 10, then cast the whole column to int:\n", "final_test_dataset[5] = 
final_test_dataset[5].replace({\"więcej niż 10\": 10})\n", "final_test_dataset[5] = final_test_dataset[5].apply(int)\n", "\n", "# Count the occurrence of unique values in column after preprocessing:\n", "print(\"Value counts after changes:\\n\",final_test_dataset[5].value_counts())\n", "\n", "# Column 9: fill NaNs with the column median, then cast to float:\n", "final_test_dataset[9] = final_test_dataset[9].fillna(final_test_dataset[9].median())\n", "final_test_dataset[9] = final_test_dataset[9].apply(float)\n", "\n", "# Keep only the four cleaned feature columns:\n", "final_test_dataset = final_test_dataset[[5,7,9,14]]\n", "\n" ] }, { "cell_type": "code", "execution_count": 319, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
57914
0361.997.02
1464.004.00
2351.155.00
3245.777.02
4244.3613.05
...............
413134.978.04
414349.063.03
415376.715.03
416372.635.03
417265.8410.03
\n", "

418 rows × 4 columns

\n", "
" ], "text/plain": [ " 5 7 9 14\n", "0 3 61.99 7.0 2\n", "1 4 64.00 4.0 0\n", "2 3 51.15 5.0 0\n", "3 2 45.77 7.0 2\n", "4 2 44.36 13.0 5\n", ".. .. ... ... ..\n", "413 1 34.97 8.0 4\n", "414 3 49.06 3.0 3\n", "415 3 76.71 5.0 3\n", "416 3 72.63 5.0 3\n", "417 2 65.84 10.0 3\n", "\n", "[418 rows x 4 columns]" ] }, "execution_count": 319, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_test_dataset" ] }, { "cell_type": "code", "execution_count": 320, "metadata": {}, "outputs": [], "source": [ "final_results = reg.predict(final_test_dataset)" ] }, { "cell_type": "code", "execution_count": 321, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0426282.351904
1389890.897311
2334372.288463
3341143.667679
4346709.875023
......
413301974.734528
414312195.369919
415537901.937976
416505420.685819
417509311.081663
\n", "

418 rows × 1 columns

\n", "
" ], "text/plain": [ " 0\n", "0 426282.351904\n", "1 389890.897311\n", "2 334372.288463\n", "3 341143.667679\n", "4 346709.875023\n", ".. ...\n", "413 301974.734528\n", "414 312195.369919\n", "415 537901.937976\n", "416 505420.685819\n", "417 509311.081663\n", "\n", "[418 rows x 1 columns]" ] }, "execution_count": 321, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(final_results)" ] }, { "cell_type": "code", "execution_count": 322, "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(final_results).to_csv(\"./test-A/out.tsv\", sep='\\t', index=False, header=None)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" }, "vscode": { "interpreter": { "hash": "1b132c2ed43285dcf39f6d01712959169a14a721cf314fe69015adab49bb1fd1" } } }, "nbformat": 4, "nbformat_minor": 2 }