mieszkania5/model_regresji_liniowej.ipynb
2023-10-28 14:34:26 +02:00

48 KiB
Raw Blame History

import pandas as pd
from statistics import mean,median
import re
import numpy as np
from sklearn.metrics import mean_absolute_error
 

Wczytanie datasetów

# Load the training split. The file is tab-separated and has no header row,
# so pandas labels the columns with integers.
train_dataset = pd.read_csv(
    "./train/train.tsv",
    sep="\t",
    header=None,
)

Data exploration

# Show the first row to inspect the raw column layout.
train_dataset.head(1)
0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24 25
0 309000.0 do zamieszkania 390 zł spółdzielcze własnościowe 7113 https://www.otodom.pl/oferta/niezalezny-uklad-... 2 NaN 43.44 wtórny ... NaN gazowe plastikowe NaN NaN NaN cegła Polecamy na sprzedaż dwupokojowe mieszkanie p... NaN telewizja kablowa, internet, meble, piwnica, g...

1 rows × 26 columns

Wczytywanie danych testowych i preprocessing jak na treningu


# Clean the four feature columns used by the model, then keep only the target
# (column 0) and those features.
#
# Filled values are assigned back to the frame explicitly. The chained form
# `df[col].fillna(..., inplace=True)` raises a FutureWarning on pandas 2.x and
# has no effect on the frame under Copy-on-Write.

# Preprocessing column 15:
print("COLUMN 15:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n", train_dataset[15].value_counts())

# Map the textual levels to numbers; "poddasze" becomes NaN and is imputed below:
train_dataset[15] = train_dataset[15].replace({"parter": 0, "suterena": -1, "> 10": 11, "poddasze": np.nan})
train_dataset[15] = train_dataset[15].apply(float)

# Fill NaNs with the column median, then cast back to int:
train_dataset[15] = train_dataset[15].fillna(train_dataset[15].median())
train_dataset[15] = train_dataset[15].apply(int)

# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n", train_dataset[15].value_counts())

# Preprocessing column 8:
print("COLUMN 8:")
# Any string value that contains a space is treated as missing:
train_dataset[8] = train_dataset[8].replace(' ', np.nan, regex=True)

# Convert to float and fill NaNs with the column median:
train_dataset[8] = train_dataset[8].apply(float)
train_dataset[8] = train_dataset[8].fillna(train_dataset[8].median())

print(train_dataset[8])

# Preprocessing column 6:
print("COLUMN 6:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n", train_dataset[6].value_counts())

# Cap the open-ended category ("więcej niż 10") at 10:
train_dataset[6] = train_dataset[6].replace({"więcej niż 10": 10})
train_dataset[6] = train_dataset[6].apply(int)

# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n", train_dataset[6].value_counts())

# Preprocessing column 10:
# Convert first so the median is always computed on numeric values.
train_dataset[10] = train_dataset[10].apply(float)
train_dataset[10] = train_dataset[10].fillna(train_dataset[10].median())

# Keep the target (0) and the four model features only.
train_dataset = train_dataset[[0, 6, 8, 10, 15]]

COLUMN 15:
Value counts before changes:
 15
 1     569
 2     527
 0     452
 4     357
 3     321
 5     117
 6      51
 7      42
 8      32
 10     29
 11     24
 9      21
-1       5
Name: count, dtype: int64
Value counts after changes:
 15
 1     569
 2     527
 0     452
 4     357
 3     321
 5     117
 6      51
 7      42
 8      32
 10     29
 11     24
 9      21
-1       5
Name: count, dtype: int64
COLUMN 8:
0        43.44
1        42.60
2        44.30
3        88.00
4        77.00
         ...  
2542     94.00
2543     53.50
2544     55.25
2545     62.00
2546    392.00
Name: 8, Length: 2547, dtype: float64
COLUMN 6:
Value counts before changes:
 6
2     1014
3      878
4      293
1      271
5       64
6       13
7        7
10       6
9        1
Name: count, dtype: int64
Value counts after changes:
 6
2     1014
3      878
4      293
1      271
5       64
6       13
7        7
10       6
9        1
Name: count, dtype: int64
# Load the dev-0 inputs and apply the same cleaning steps as for training.
# The columns used here are 14, 7, 5 and 9, one lower than the training
# indices 15, 8, 6 and 10. The printed "COLUMN n:" labels keep the training
# numbering.
#
# Filled values are assigned back to the frame explicitly. The chained form
# `df[col].fillna(..., inplace=True)` raises a FutureWarning on pandas 2.x and
# has no effect on the frame under Copy-on-Write.
#
# NOTE(review): missing values are imputed with this split's own medians, not
# the training medians. Confirm that is intended.
test_dataset = pd.read_csv("./dev-0/in.tsv", sep="\t", header=None)

# Preprocessing column 14 (train column 15):
print("COLUMN 15:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n", test_dataset[14].value_counts())

# Map the textual levels to numbers; "poddasze" becomes NaN and is imputed below:
test_dataset[14] = test_dataset[14].replace({"parter": 0, "suterena": -1, "> 10": 11, "poddasze": np.nan})
test_dataset[14] = test_dataset[14].apply(float)

# Fill NaNs with the column median, then cast back to int:
test_dataset[14] = test_dataset[14].fillna(test_dataset[14].median())
test_dataset[14] = test_dataset[14].apply(int)

# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n", test_dataset[14].value_counts())

# Preprocessing column 7 (train column 8):
print("COLUMN 8:")
# Any string value that contains a space is treated as missing:
test_dataset[7] = test_dataset[7].replace(' ', np.nan, regex=True)

# Convert to float and fill NaNs with the column median:
test_dataset[7] = test_dataset[7].apply(float)
test_dataset[7] = test_dataset[7].fillna(test_dataset[7].median())

print(test_dataset[7])

# Preprocessing column 5 (train column 6):
print("COLUMN 6:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n", test_dataset[5].value_counts())

# Cap the open-ended category ("więcej niż 10") at 10:
test_dataset[5] = test_dataset[5].replace({"więcej niż 10": 10})
test_dataset[5] = test_dataset[5].apply(int)

# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n", test_dataset[5].value_counts())

# Preprocessing column 9 (train column 10):
# Convert first so the median is always computed on numeric values.
test_dataset[9] = test_dataset[9].apply(float)
test_dataset[9] = test_dataset[9].fillna(test_dataset[9].median())

# Keep the four model features, in the same order as the training features.
test_dataset = test_dataset[[5, 7, 9, 14]]

COLUMN 15:
Value counts before changes:
 14
 1     108
 2      89
 0      82
 4      65
 3      54
 5      22
 6      12
 7       9
 11      9
 10      5
 8       3
-1       2
 9       2
Name: count, dtype: int64
Value counts after changes:
 14
 1     108
 2      89
 0      82
 4      65
 3      54
 5      22
 6      12
 7       9
 11      9
 10      5
 8       3
-1       2
 9       2
Name: count, dtype: int64
COLUMN 8:
0      59.10
1      38.00
2      63.84
3      50.00
4      65.62
       ...  
457    72.78
458    51.23
459    54.16
460    90.10
461    71.90
Name: 7, Length: 462, dtype: float64
COLUMN 6:
Value counts before changes:
 5
2    196
3    152
1     51
4     50
5      9
6      4
Name: count, dtype: int64
Value counts after changes:
 5
2    196
3    152
1     51
4     50
5      9
6      4
Name: count, dtype: int64
# Show the first preprocessed dev-0 row to check the selected feature columns.
test_dataset.head(1)
5 7 9 14
0 3 59.1 4.0 2

Model

import pickle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Split the preprocessed training frame into features and target (column 0).
X_train = train_dataset.drop(0, axis=1)
# Double brackets keep the target two-dimensional, so predictions come back
# as a single-column 2-D array.
y_train = train_dataset[[0]]

# Fit the scaler on X_train. The visible cells define no plain `X`, so
# `fit_transform(X)` raised NameError.
# NOTE: the scaled matrix is not passed to the model below; the regression is
# fit on the raw, unscaled features.
scaler = StandardScaler()
trans_data = scaler.fit_transform(X_train)

# The dev-0 frame holds columns (5, 7, 9, 14), which are matched by position
# to the training features (6, 8, 10, 15).
X_test = test_dataset

reg = LinearRegression()
reg.fit(X_train, y_train)
results = reg.predict(X_test)

# Uncomment to persist the fitted model:
# pickle.dump(reg, open("model.pkl", "wb"))
results
array([[ 394901.20434554],
       [ 293271.41755997],
       [ 432666.21541136],
       [ 295330.00173591],
       [ 444001.60173013],
       [ 595102.64364947],
       [ 306562.53056792],
       [ 346367.98664224],
       [ 320708.28590668],
       [ 355678.48101873],
       [ 272087.02326267],
       [ 315111.17058773],
       [ 408743.59976314],
       [ 392452.80148004],
       [ 351776.80580328],
       [ 325340.80253875],
       [ 324415.74401286],
       [ 286605.64798109],
       [ 354409.52248178],
       [ 273703.59297418],
       [ 343402.57630023],
       [ 285271.79788568],
       [ 370401.6837088 ],
       [ 482522.18182752],
       [ 394119.81457774],
       [ 474686.42204592],
       [ 627962.51992074],
       [ 334221.5006669 ],
       [ 397338.23464708],
       [ 414336.53457982],
       [ 335485.00759125],
       [ 351363.03195993],
       [ 384755.53002533],
       [ 374013.34228146],
       [ 384755.53002533],
       [ 355882.69461662],
       [ 521392.20674093],
       [ 425546.58946533],
       [ 294138.97474644],
       [ 539668.46177031],
       [ 340107.2565533 ],
       [ 467566.06735236],
       [ 228315.74093461],
       [ 373516.14139746],
       [ 572962.98245529],
       [ 425442.35590324],
       [ 264688.02449027],
       [ 321384.60839985],
       [ 313697.70958017],
       [ 257339.46910406],
       [ 285491.92892354],
       [ 265315.27967261],
       [ 269349.61545595],
       [ 370207.10222578],
       [ 505152.41437514],
       [ 326640.0334956 ],
       [ 361868.73382815],
       [ 641520.72645455],
       [ 513506.74409331],
       [ 225524.54295198],
       [ 237226.80467502],
       [ 453176.39203834],
       [ 261995.60845714],
       [ 955187.2509814 ],
       [ 492991.37526251],
       [ 374938.40080734],
       [ 774416.69022809],
       [ 523696.94084834],
       [ 434831.26310559],
       [ 489623.47044873],
       [ 280423.33071801],
       [ 264688.02449027],
       [ 280814.31555288],
       [ 359688.46533354],
       [ 314210.82790415],
       [ 622185.54247246],
       [ 448643.38610329],
       [ 561068.71341085],
       [ 331068.80586855],
       [ 387502.56994691],
       [ 251024.38630808],
       [ 295949.29222845],
       [ 311115.63444521],
       [ 307926.65308324],
       [ 268121.19700892],
       [ 329516.02282157],
       [ 260145.38644037],
       [ 402080.9057926 ],
       [ 247029.12134696],
       [ 293241.9788526 ],
       [ 968741.83954407],
       [ 463714.83552309],
       [ 458749.5202625 ],
       [ 467566.06735236],
       [ 251228.59990596],
       [ 367839.0507926 ],
       [ 178563.73689914],
       [ 404748.33480203],
       [ 361629.76401877],
       [ 335003.99532863],
       [ 468590.27220151],
       [ 342562.21592532],
       [ 371236.86365252],
       [ 310876.80170877],
       [ 276928.20517629],
       [ 273106.88139315],
       [ 517720.942006  ],
       [ 309671.80782539],
       [ 321349.71807989],
       [ 346542.76153275],
       [ 374434.93476654],
       [ 919566.5541124 ],
       [ 295586.59732644],
       [ 394954.99452145],
       [ 259707.15616347],
       [ 296950.34369535],
       [ 390519.03008681],
       [ 338032.92115378],
       [ 279645.24385936],
       [ 394134.53393142],
       [ 279107.13851167],
       [ 454032.21228709],
       [ 959467.06920008],
       [ 683669.88249351],
       [ 371046.82704552],
       [ 576981.26372572],
       [ 294170.56215725],
       [ 443250.48438223],
       [ 283176.45780734],
       [ 434794.43417916],
       [ 344911.47246756],
       [ 272235.14370441],
       [ 824501.58608444],
       [ 334396.91111258],
       [ 350120.43652201],
       [ 253527.41306202],
       [ 293854.69267028],
       [ 291547.17430427],
       [ 270927.11434524],
       [ 383959.89024261],
       [ 656199.3017448 ],
       [ 553341.638875  ],
       [ 432687.22009017],
       [ 526139.69394388],
       [ 447026.81639178],
       [ 274087.19936264],
       [ 397721.29867274],
       [ 491976.34496209],
       [ 334112.72222878],
       [ 564079.20871881],
       [ 262283.77968584],
       [ 325888.17562753],
       [ 707612.5028916 ],
       [ 392111.24832978],
       [ 498668.84201377],
       [ 287777.33771682],
       [ 269508.93047261],
       [ 438284.33540911],
       [ 325678.41722469],
       [ 451638.79768381],
       [ 313355.0076554 ],
       [ 507395.87489522],
       [ 319663.80512625],
       [ 334586.40535677],
       [ 336983.25994216],
       [ 241704.55116646],
       [ 551645.5514437 ],
       [ 263184.95608194],
       [ 353876.49260028],
       [ 261995.60845714],
       [ 310876.80170877],
       [ 350329.83055109],
       [ 322549.60455611],
       [ 394105.09522405],
       [ 553247.67298293],
       [ 318793.17233144],
       [ 467137.74037172],
       [ 242785.12035314],
       [ 382352.68146455],
       [ 327735.05085456],
       [ 517343.1516039 ],
       [ 290640.3481382 ],
       [ 219340.07698546],
       [ 298372.97925164],
       [ 194674.57986053],
       [ 273246.9321801 ],
       [ 332307.2208042 ],
       [ 264140.01584633],
       [ 457026.61216604],
       [ 395940.59788713],
       [ 290989.89791922],
       [ 473556.78554839],
       [ 318678.94228073],
       [ 485526.48500272],
       [ 325262.02533913],
       [ 481691.81794123],
       [ 339771.01794269],
       [ 367305.38535593],
       [ 443173.11536588],
       [ 505575.09158582],
       [ 318678.94228073],
       [ 216558.78229908],
       [ 233490.17904287],
       [ 394075.65651668],
       [ 295760.63169678],
       [ 258795.98327875],
       [ 346353.26728856],
       [ 305592.11589469],
       [ 443578.56014568],
       [ 300577.82651562],
       [ 402095.62514628],
       [ 513536.18280069],
       [ 757593.69577602],
       [ 287777.33771682],
       [ 397881.4272336 ],
       [ 414604.91098864],
       [ 349552.42312819],
       [ 534205.58125196],
       [ 378854.88856884],
       [ 348203.48930532],
       [ 333984.13719815],
       [ 330351.10957292],
       [ 182753.04098048],
       [ 329142.59091077],
       [ 290053.63304579],
       [ 309309.5704895 ],
       [ 481568.04896803],
       [ 270985.99175998],
       [ 327735.05085456],
       [ 237495.07611884],
       [ 295905.1341674 ],
       [ 267893.10128127],
       [ 305518.51912627],
       [ 289640.4947576 ],
       [ 448016.13092096],
       [ 448276.23951762],
       [ 306562.53056792],
       [ 763649.68167691],
       [ 302348.80199401],
       [ 316359.12444587],
       [ 290899.54999829],
       [ 231743.82621443],
       [ 313404.81548641],
       [ 278816.39312135],
       [ 383639.70614492],
       [ 300989.76671752],
       [ 320213.59809987],
       [ 327790.31029817],
       [ 365619.94174106],
       [ 405226.27145632],
       [ 351115.20266379],
       [ 301726.6340505 ],
       [ 239057.85565444],
       [ 623907.54383236],
       [ 325824.388963  ],
       [ 305965.91217926],
       [ 349984.19009093],
       [ 638635.09324109],
       [ 624904.92801633],
       [ 421648.73037838],
       [ 290610.1806833 ],
       [ 288601.57055477],
       [ 504376.11184616],
       [ 246333.26344267],
       [ 390474.87202575],
       [ 676486.10550918],
       [ 217145.59058386],
       [ 253652.38012152],
       [ 874149.26336545],
       [ 241282.41631858],
       [ 300762.91295673],
       [ 364474.75217731],
       [ 819789.82291398],
       [ 227915.8527324 ],
       [ 333171.00574275],
       [ 406022.1093964 ],
       [ 424620.79041928],
       [ 433925.27065205],
       [ 369693.34834663],
       [ 659209.432679  ],
       [ 377446.15042634],
       [ 489887.75954764],
       [ 215155.31778014],
       [ 369953.55013564],
       [ 541589.4962969 ],
       [ 270941.83369892],
       [ 342407.81015845],
       [ 259234.77608678],
       [ 433939.99000574],
       [ 277310.70667081],
       [ 675738.22998601],
       [ 376222.07869798],
       [ 381358.95913228],
       [ 366112.32656763],
       [ 311543.39889473],
       [ 236978.24663136],
       [ 321370.9939401 ],
       [ 412013.97636694],
       [ 321355.16969248],
       [ 318664.22292705],
       [ 382338.23329227],
       [ 581803.00342058],
       [ 513506.74409331],
       [ 635007.71538581],
       [ 270912.39499155],
       [ 316897.22979356],
       [ 352344.72600475],
       [ 256796.27651754],
       [ 453674.42663487],
       [ 321897.63353148],
       [ 307478.51951008],
       [ 268956.93948377],
       [ 557884.64129867],
       [1047884.98951607],
       [ 440857.06977894],
       [ 226951.61841929],
       [ 350348.53224967],
       [ 443134.68009882],
       [ 277414.30467774],
       [ 476949.14679571],
       [ 286093.7277434 ],
       [ 585186.00373445],
       [ 441842.20380586],
       [ 265771.94046669],
       [ 318649.50357336],
       [ 252423.12796196],
       [ 321470.14026336],
       [ 316360.32253217],
       [ 318678.94228073],
       [ 249636.38166299],
       [ 342591.65463269],
       [ 255418.53954247],
       [ 911134.08274977],
       [ 427920.19747611],
       [ 528632.81740156],
       [1170092.55572165],
       [ 394119.81457774],
       [ 306503.65315318],
       [ 312892.06153619],
       [ 233843.14863766],
       [ 490663.22836409],
       [ 459535.36171397],
       [ 333969.41784446],
       [ 282679.25692334],
       [ 226111.82057552],
       [ 538215.56556676],
       [ 226111.82057552],
       [ 352457.85116153],
       [-161201.02176254],
       [ 302303.81022042],
       [ 323007.37024413],
       [ 370455.47388471],
       [ 287524.71253158],
       [1060592.82741372],
       [ 248909.9801574 ],
       [ 288108.35201566],
       [ 287812.79056791],
       [ 229678.75855599],
       [ 249386.81191776],
       [ 390314.04386082],
       [ 451537.07719892],
       [ 528632.81740156],
       [ 811858.17040649],
       [ 302786.19855838],
       [ 394915.92369922],
       [ 390314.04386082],
       [ 405425.39781537],
       [1289225.09943457],
       [ 349637.21447153],
       [ 309215.24022367],
       [ 347266.58887672],
       [ 295166.06881919],
       [ 301125.47078581],
       [ 738397.19827818],
       [ 494697.19977366],
       [ 305130.09668038],
       [ 278844.04749905],
       [ 302727.32114364],
       [ 310444.76356463],
       [ 430134.21928882],
       [ 309671.80782539],
       [ 721669.74395088],
       [ 597337.30576723],
       [ 351846.05585305],
       [ 704587.65260372],
       [ 584703.8865779 ],
       [ 293576.41296521],
       [ 252313.71396867],
       [ 718524.74266092],
       [ 253189.43400231],
       [ 302168.8466723 ],
       [ 288735.60719799],
       [ 498483.16389807],
       [ 457172.49071198],
       [ 333178.1247804 ],
       [ 704558.21389635],
       [ 468988.05558085],
       [ 417037.85398029],
       [ 245048.7518395 ],
       [ 423113.73966605],
       [ 354931.43920809],
       [ 720067.89359305],
       [ 749936.43382107],
       [ 551719.98192466],
       [ 441294.46634331],
       [ 556690.74879784],
       [ 592416.24176006],
       [ 422202.56678133],
       [ 318693.39045302],
       [ 403430.30895423],
       [ 604406.85270084],
       [ 255244.59836449],
       [ 374597.01387348],
       [ 308150.766466  ],
       [ 604268.00000018],
       [ 616438.10869654],
       [ 993142.48503895],
       [ 395901.52706491],
       [ 434344.3298916 ],
       [ 485469.25484471],
       [ 627355.43570469],
       [ 310876.80170877],
       [ 174618.27976904],
       [ 447061.07115658],
       [ 335585.89436361],
       [ 327421.4232634 ],
       [ 373803.94825239],
       [ 373789.2288987 ],
       [ 348443.52012806],
       [ 372500.73495064],
       [ 556630.4021154 ],
       [ 553112.80262718],
       [ 400867.67603802],
       [ 569468.03490261],
       [ 444493.62218294],
       [ 514644.81461941],
       [ 702403.53883714],
       [ 608248.81678902],
       [1162131.46450679],
       [ 314668.1242534 ],
       [ 344572.38851392],
       [ 641419.64152483],
       [ 406890.71039238],
       [ 330256.77930708],
       [ 406762.86588192],
       [ 453663.96080749],
       [1656917.6302292 ],
       [ 449408.85843111],
       [ 258707.30278287],
       [ 345745.18314357],
       [ 310109.76692089],
       [ 424505.72665604],
       [ 310095.0475672 ],
       [ 401125.77300418],
       [ 457988.13524458],
       [ 550575.62607344],
       [ 523303.70234502],
       [ 506600.1301475 ],
       [ 379014.11039314],
       [ 352767.4964078 ],
       [ 644515.6686963 ],
       [ 537987.4698391 ]])
# Load the expected dev-0 values; the file has no header row.
y_test = pd.read_csv(
    "./dev-0/expected.tsv",
    header=None,
)
y_test
0
0 373000.00
1 299000.00
2 365000.00
3 369000.00
4 483791.00
... ...
457 655544.02
458 471397.97
459 309958.00
460 699000.00
461 850000.00

462 rows × 1 columns

# R^2 score of the dev-0 predictions against the expected values.
r2_score(y_test, results)
0.6393762535622007
# Mean absolute error of the dev-0 predictions against the expected values.
mean_absolute_error(y_test, results)
71559.96181964973

Predykcja dla zbioru testowego

# Load the test-A inputs and apply the same cleaning steps as for training.
# The columns used here are 14, 7, 5 and 9, one lower than the training
# indices 15, 8, 6 and 10. The printed "COLUMN n:" labels keep the training
# numbering.
#
# Filled values are assigned back to the frame explicitly. The chained form
# `df[col].fillna(..., inplace=True)` raises a FutureWarning on pandas 2.x and
# has no effect on the frame under Copy-on-Write.
#
# NOTE(review): missing values are imputed with this split's own medians, not
# the training medians. Confirm that is intended.
final_test_dataset = pd.read_csv("./test-A/in.tsv", sep="\t", header=None)

# Preprocessing column 14 (train column 15):
print("COLUMN 15:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n", final_test_dataset[14].value_counts())

# Map the textual levels to numbers; "poddasze" becomes NaN and is imputed below:
final_test_dataset[14] = final_test_dataset[14].replace({"parter": 0, "suterena": -1, "> 10": 11, "poddasze": np.nan})
final_test_dataset[14] = final_test_dataset[14].apply(float)

# Fill NaNs with the column median, then cast back to int:
final_test_dataset[14] = final_test_dataset[14].fillna(final_test_dataset[14].median())
final_test_dataset[14] = final_test_dataset[14].apply(int)

# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n", final_test_dataset[14].value_counts())

# Preprocessing column 7 (train column 8):
print("COLUMN 8:")
# Any string value that contains a space is treated as missing:
final_test_dataset[7] = final_test_dataset[7].replace(' ', np.nan, regex=True)

# Convert to float and fill NaNs with the column median:
final_test_dataset[7] = final_test_dataset[7].apply(float)
final_test_dataset[7] = final_test_dataset[7].fillna(final_test_dataset[7].median())

print(final_test_dataset[7])

# Preprocessing column 5 (train column 6):
print("COLUMN 6:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n", final_test_dataset[5].value_counts())

# Cap the open-ended category ("więcej niż 10") at 10:
final_test_dataset[5] = final_test_dataset[5].replace({"więcej niż 10": 10})
final_test_dataset[5] = final_test_dataset[5].apply(int)

# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n", final_test_dataset[5].value_counts())

# Preprocessing column 9 (train column 10):
# Convert first so the median is always computed on numeric values.
final_test_dataset[9] = final_test_dataset[9].apply(float)
final_test_dataset[9] = final_test_dataset[9].fillna(final_test_dataset[9].median())

# Keep the four model features, in the same order as the training features.
final_test_dataset = final_test_dataset[[5, 7, 9, 14]]

COLUMN 15:
Value counts before changes:
 14
1           92
parter      70
3           68
4           64
2           61
5           15
6           11
7            7
10           5
> 10         5
9            4
8            2
suterena     1
Name: count, dtype: int64
Value counts after changes:
 14
 1     92
 2     74
 0     70
 3     68
 4     64
 5     15
 6     11
 7      7
 10     5
 11     5
 9      4
 8      2
-1      1
Name: count, dtype: int64
COLUMN 8:
0      61.99
1      64.00
2      51.15
3      45.77
4      44.36
       ...  
413    34.97
414    49.06
415    76.71
416    72.63
417    65.84
Name: 7, Length: 418, dtype: float64
COLUMN 6:
Value counts before changes:
 5
2                175
3                143
4                 50
1                 40
5                  6
6                  2
więcej niż 10      1
8                  1
Name: count, dtype: int64
Value counts after changes:
 5
2     175
3     143
4      50
1      40
5       6
6       2
10      1
8       1
Name: count, dtype: int64
# Display the preprocessed test-A features that are passed to the model.
final_test_dataset
5 7 9 14
0 3 61.99 7.0 2
1 4 64.00 4.0 0
2 3 51.15 5.0 0
3 2 45.77 7.0 2
4 2 44.36 13.0 5
... ... ... ... ...
413 1 34.97 8.0 4
414 3 49.06 3.0 3
415 3 76.71 5.0 3
416 3 72.63 5.0 3
417 2 65.84 10.0 3

418 rows × 4 columns

# Predict for the test-A features with the already-fitted regression.
final_results = reg.predict(
    final_test_dataset,
)
# Wrap the predictions in a DataFrame for display.
pd.DataFrame(data=final_results)
0
0 426282.351904
1 389890.897311
2 334372.288463
3 341143.667679
4 346709.875023
... ...
413 301974.734528
414 312195.369919
415 537901.937976
416 505420.685819
417 509311.081663

418 rows × 1 columns

# Write the test-A predictions as a tab-separated file with no index column
# and no header row. `header` is documented as bool or list of str, so pass
# False explicitly instead of relying on None being falsy.
pd.DataFrame(final_results).to_csv(
    "./test-A/out.tsv",
    sep="\t",
    index=False,
    header=False,
)