48 KiB
48 KiB
import pandas as pd
from statistics import mean,median
import re
import numpy as np
Wczytanie datasetów
# Load the training split: tab-separated with no header row, so the columns
# are addressed by their integer position in the cells below.
train_dataset = pd.read_csv(
    "./train/train.tsv",
    header=None,
    sep="\t",
)
Data exploration
# Peek at the first training row to see the raw column layout.
train_dataset.head(n=1)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 309000.0 | do zamieszkania | 390 zł | spółdzielcze własnościowe | 7113 | https://www.otodom.pl/oferta/niezalezny-uklad-... | 2 | NaN | 43.44 | wtórny | ... | NaN | gazowe | plastikowe | NaN | NaN | NaN | cegła | Polecamy na sprzedaż dwupokojowe mieszkanie p... | NaN | telewizja kablowa, internet, meble, piwnica, g... |
1 rows × 26 columns
Preprocessing danych treningowych (dane testowe są wczytywane i przetwarzane tak samo w kolejnej komórce)
# Preprocessing column 15:
print("COLUMN 15:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n",train_dataset[15].value_counts())
# Map the textual labels to numbers; "poddasze" becomes NaN so that it is
# imputed with the median below:
train_dataset[15] = train_dataset[15].replace({"parter": 0, "suterena": -1, "> 10": 11, "poddasze": np.nan})
train_dataset[15] = train_dataset[15].apply(float)
# Fill NaNs with the median. The result is assigned back instead of using
# `inplace=True` on the column selection: that chained form is deprecated and
# does not modify the frame when pandas Copy-on-Write is enabled.
train_dataset[15] = train_dataset[15].fillna(train_dataset[15].median())
train_dataset[15] = train_dataset[15].apply(int)
# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n",train_dataset[15].value_counts())

# Preprocessing column 8:
print("COLUMN 8:")
# Replace strings containing a space with NaN:
train_dataset[8] = train_dataset[8].replace(' ', np.nan, regex=True)
# Convert to float, then fill NaNs with the median (assigned back, see above):
train_dataset[8] = train_dataset[8].apply(float)
train_dataset[8] = train_dataset[8].fillna(train_dataset[8].median())
print(train_dataset[8])

# Preprocessing column 6:
print("COLUMN 6:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n",train_dataset[6].value_counts())
# Change the "więcej niż 10" label to 10:
train_dataset[6] = train_dataset[6].replace({"więcej niż 10": 10})
train_dataset[6] = train_dataset[6].apply(int)
# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n",train_dataset[6].value_counts())

# Preprocessing column 10: cast to float first so the median is computed on
# numeric values, then fill NaNs with that median.
train_dataset[10] = train_dataset[10].apply(float)
train_dataset[10] = train_dataset[10].fillna(train_dataset[10].median())

# Keep only the target (column 0) and the four preprocessed feature columns.
train_dataset = train_dataset[[0,6,8,10,15]]
COLUMN 15: Value counts before changes: 15 1 569 2 527 0 452 4 357 3 321 5 117 6 51 7 42 8 32 10 29 11 24 9 21 -1 5 Name: count, dtype: int64 Value counts after changes: 15 1 569 2 527 0 452 4 357 3 321 5 117 6 51 7 42 8 32 10 29 11 24 9 21 -1 5 Name: count, dtype: int64 COLUMN 8: 0 43.44 1 42.60 2 44.30 3 88.00 4 77.00 ... 2542 94.00 2543 53.50 2544 55.25 2545 62.00 2546 392.00 Name: 8, Length: 2547, dtype: float64 COLUMN 6: Value counts before changes: 6 2 1014 3 878 4 293 1 271 5 64 6 13 7 7 10 6 9 1 Name: count, dtype: int64 Value counts after changes: 6 2 1014 3 878 4 293 1 271 5 64 6 13 7 7 10 6 9 1 Name: count, dtype: int64
# Load the dev-0 input split: tab-separated, no header row.
test_dataset = pd.read_csv("./dev-0/in.tsv", sep= "\t", header=None)
# NOTE: the columns used here (5, 7, 9, 14) are the training columns
# (6, 8, 10, 15) shifted down by one -- presumably because this file has no
# target column at position 0 (TODO confirm). The printed labels keep the
# training numbering.

# Preprocessing column 14 (training column 15):
print("COLUMN 15:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n",test_dataset[14].value_counts())
# Map the textual labels to numbers; "poddasze" becomes NaN so that it is
# imputed with the median below:
test_dataset[14] = test_dataset[14].replace({"parter": 0, "suterena": -1, "> 10": 11, "poddasze": np.nan})
test_dataset[14] = test_dataset[14].apply(float)
# Fill NaNs with the median. The result is assigned back instead of using
# `inplace=True` on the column selection: that chained form is deprecated and
# does not modify the frame when pandas Copy-on-Write is enabled.
test_dataset[14] = test_dataset[14].fillna(test_dataset[14].median())
test_dataset[14] = test_dataset[14].apply(int)
# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n",test_dataset[14].value_counts())

# Preprocessing column 7 (training column 8):
print("COLUMN 8:")
# Replace strings containing a space with NaN:
test_dataset[7] = test_dataset[7].replace(' ', np.nan, regex=True)
# Convert to float, then fill NaNs with the median (assigned back, see above):
test_dataset[7] = test_dataset[7].apply(float)
test_dataset[7] = test_dataset[7].fillna(test_dataset[7].median())
print(test_dataset[7])

# Preprocessing column 5 (training column 6):
print("COLUMN 6:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n",test_dataset[5].value_counts())
# Change the "więcej niż 10" label to 10:
test_dataset[5] = test_dataset[5].replace({"więcej niż 10": 10})
test_dataset[5] = test_dataset[5].apply(int)
# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n",test_dataset[5].value_counts())

# Preprocessing column 9 (training column 10): cast to float first so the
# median is computed on numeric values, then fill NaNs with that median.
test_dataset[9] = test_dataset[9].apply(float)
test_dataset[9] = test_dataset[9].fillna(test_dataset[9].median())

# Keep only the four feature columns, in the same order as the training features.
test_dataset = test_dataset[[5,7,9,14]]
COLUMN 15: Value counts before changes: 14 1 108 2 89 0 82 4 65 3 54 5 22 6 12 7 9 11 9 10 5 8 3 -1 2 9 2 Name: count, dtype: int64 Value counts after changes: 14 1 108 2 89 0 82 4 65 3 54 5 22 6 12 7 9 11 9 10 5 8 3 -1 2 9 2 Name: count, dtype: int64 COLUMN 8: 0 59.10 1 38.00 2 63.84 3 50.00 4 65.62 ... 457 72.78 458 51.23 459 54.16 460 90.10 461 71.90 Name: 7, Length: 462, dtype: float64 COLUMN 6: Value counts before changes: 5 2 196 3 152 1 51 4 50 5 9 6 4 Name: count, dtype: int64 Value counts after changes: 5 2 196 3 152 1 51 4 50 5 9 6 4 Name: count, dtype: int64
# Peek at the first preprocessed dev-0 row.
test_dataset.head(n=1)
5 | 7 | 9 | 14 | |
---|---|---|---|---|
0 | 3 | 59.1 | 4.0 | 2 |
Model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Column 0 is the regression target; every other retained column is a feature.
X_train = train_dataset.drop(0,axis=1)
y_train = train_dataset[[0]]
scaler = StandardScaler()
# Fixed: this call used the undefined name `X`, which raised NameError.
# NOTE: `trans_data` is not used below -- the regression is fitted on the
# unscaled features.
trans_data = scaler.fit_transform(X_train)
X_test = test_dataset
reg = LinearRegression()
reg.fit(X_train, y_train)
# y_train is a one-column DataFrame, so the predictions have shape (n_samples, 1).
results = reg.predict(X_test)
import pickle
from sklearn.metrics import r2_score
# pickle.dump(reg, open("model.pkl", "wb"))
results
array([[ 394901.20434554], [ 293271.41755997], [ 432666.21541136], [ 295330.00173591], [ 444001.60173013], [ 595102.64364947], [ 306562.53056792], [ 346367.98664224], [ 320708.28590668], [ 355678.48101873], [ 272087.02326267], [ 315111.17058773], [ 408743.59976314], [ 392452.80148004], [ 351776.80580328], [ 325340.80253875], [ 324415.74401286], [ 286605.64798109], [ 354409.52248178], [ 273703.59297418], [ 343402.57630023], [ 285271.79788568], [ 370401.6837088 ], [ 482522.18182752], [ 394119.81457774], [ 474686.42204592], [ 627962.51992074], [ 334221.5006669 ], [ 397338.23464708], [ 414336.53457982], [ 335485.00759125], [ 351363.03195993], [ 384755.53002533], [ 374013.34228146], [ 384755.53002533], [ 355882.69461662], [ 521392.20674093], [ 425546.58946533], [ 294138.97474644], [ 539668.46177031], [ 340107.2565533 ], [ 467566.06735236], [ 228315.74093461], [ 373516.14139746], [ 572962.98245529], [ 425442.35590324], [ 264688.02449027], [ 321384.60839985], [ 313697.70958017], [ 257339.46910406], [ 285491.92892354], [ 265315.27967261], [ 269349.61545595], [ 370207.10222578], [ 505152.41437514], [ 326640.0334956 ], [ 361868.73382815], [ 641520.72645455], [ 513506.74409331], [ 225524.54295198], [ 237226.80467502], [ 453176.39203834], [ 261995.60845714], [ 955187.2509814 ], [ 492991.37526251], [ 374938.40080734], [ 774416.69022809], [ 523696.94084834], [ 434831.26310559], [ 489623.47044873], [ 280423.33071801], [ 264688.02449027], [ 280814.31555288], [ 359688.46533354], [ 314210.82790415], [ 622185.54247246], [ 448643.38610329], [ 561068.71341085], [ 331068.80586855], [ 387502.56994691], [ 251024.38630808], [ 295949.29222845], [ 311115.63444521], [ 307926.65308324], [ 268121.19700892], [ 329516.02282157], [ 260145.38644037], [ 402080.9057926 ], [ 247029.12134696], [ 293241.9788526 ], [ 968741.83954407], [ 463714.83552309], [ 458749.5202625 ], [ 467566.06735236], [ 251228.59990596], [ 367839.0507926 ], [ 178563.73689914], [ 404748.33480203], [ 361629.76401877], [ 
335003.99532863], [ 468590.27220151], [ 342562.21592532], [ 371236.86365252], [ 310876.80170877], [ 276928.20517629], [ 273106.88139315], [ 517720.942006 ], [ 309671.80782539], [ 321349.71807989], [ 346542.76153275], [ 374434.93476654], [ 919566.5541124 ], [ 295586.59732644], [ 394954.99452145], [ 259707.15616347], [ 296950.34369535], [ 390519.03008681], [ 338032.92115378], [ 279645.24385936], [ 394134.53393142], [ 279107.13851167], [ 454032.21228709], [ 959467.06920008], [ 683669.88249351], [ 371046.82704552], [ 576981.26372572], [ 294170.56215725], [ 443250.48438223], [ 283176.45780734], [ 434794.43417916], [ 344911.47246756], [ 272235.14370441], [ 824501.58608444], [ 334396.91111258], [ 350120.43652201], [ 253527.41306202], [ 293854.69267028], [ 291547.17430427], [ 270927.11434524], [ 383959.89024261], [ 656199.3017448 ], [ 553341.638875 ], [ 432687.22009017], [ 526139.69394388], [ 447026.81639178], [ 274087.19936264], [ 397721.29867274], [ 491976.34496209], [ 334112.72222878], [ 564079.20871881], [ 262283.77968584], [ 325888.17562753], [ 707612.5028916 ], [ 392111.24832978], [ 498668.84201377], [ 287777.33771682], [ 269508.93047261], [ 438284.33540911], [ 325678.41722469], [ 451638.79768381], [ 313355.0076554 ], [ 507395.87489522], [ 319663.80512625], [ 334586.40535677], [ 336983.25994216], [ 241704.55116646], [ 551645.5514437 ], [ 263184.95608194], [ 353876.49260028], [ 261995.60845714], [ 310876.80170877], [ 350329.83055109], [ 322549.60455611], [ 394105.09522405], [ 553247.67298293], [ 318793.17233144], [ 467137.74037172], [ 242785.12035314], [ 382352.68146455], [ 327735.05085456], [ 517343.1516039 ], [ 290640.3481382 ], [ 219340.07698546], [ 298372.97925164], [ 194674.57986053], [ 273246.9321801 ], [ 332307.2208042 ], [ 264140.01584633], [ 457026.61216604], [ 395940.59788713], [ 290989.89791922], [ 473556.78554839], [ 318678.94228073], [ 485526.48500272], [ 325262.02533913], [ 481691.81794123], [ 339771.01794269], [ 367305.38535593], [ 443173.11536588], [ 
505575.09158582], [ 318678.94228073], [ 216558.78229908], [ 233490.17904287], [ 394075.65651668], [ 295760.63169678], [ 258795.98327875], [ 346353.26728856], [ 305592.11589469], [ 443578.56014568], [ 300577.82651562], [ 402095.62514628], [ 513536.18280069], [ 757593.69577602], [ 287777.33771682], [ 397881.4272336 ], [ 414604.91098864], [ 349552.42312819], [ 534205.58125196], [ 378854.88856884], [ 348203.48930532], [ 333984.13719815], [ 330351.10957292], [ 182753.04098048], [ 329142.59091077], [ 290053.63304579], [ 309309.5704895 ], [ 481568.04896803], [ 270985.99175998], [ 327735.05085456], [ 237495.07611884], [ 295905.1341674 ], [ 267893.10128127], [ 305518.51912627], [ 289640.4947576 ], [ 448016.13092096], [ 448276.23951762], [ 306562.53056792], [ 763649.68167691], [ 302348.80199401], [ 316359.12444587], [ 290899.54999829], [ 231743.82621443], [ 313404.81548641], [ 278816.39312135], [ 383639.70614492], [ 300989.76671752], [ 320213.59809987], [ 327790.31029817], [ 365619.94174106], [ 405226.27145632], [ 351115.20266379], [ 301726.6340505 ], [ 239057.85565444], [ 623907.54383236], [ 325824.388963 ], [ 305965.91217926], [ 349984.19009093], [ 638635.09324109], [ 624904.92801633], [ 421648.73037838], [ 290610.1806833 ], [ 288601.57055477], [ 504376.11184616], [ 246333.26344267], [ 390474.87202575], [ 676486.10550918], [ 217145.59058386], [ 253652.38012152], [ 874149.26336545], [ 241282.41631858], [ 300762.91295673], [ 364474.75217731], [ 819789.82291398], [ 227915.8527324 ], [ 333171.00574275], [ 406022.1093964 ], [ 424620.79041928], [ 433925.27065205], [ 369693.34834663], [ 659209.432679 ], [ 377446.15042634], [ 489887.75954764], [ 215155.31778014], [ 369953.55013564], [ 541589.4962969 ], [ 270941.83369892], [ 342407.81015845], [ 259234.77608678], [ 433939.99000574], [ 277310.70667081], [ 675738.22998601], [ 376222.07869798], [ 381358.95913228], [ 366112.32656763], [ 311543.39889473], [ 236978.24663136], [ 321370.9939401 ], [ 412013.97636694], [ 321355.16969248], [ 
318664.22292705], [ 382338.23329227], [ 581803.00342058], [ 513506.74409331], [ 635007.71538581], [ 270912.39499155], [ 316897.22979356], [ 352344.72600475], [ 256796.27651754], [ 453674.42663487], [ 321897.63353148], [ 307478.51951008], [ 268956.93948377], [ 557884.64129867], [1047884.98951607], [ 440857.06977894], [ 226951.61841929], [ 350348.53224967], [ 443134.68009882], [ 277414.30467774], [ 476949.14679571], [ 286093.7277434 ], [ 585186.00373445], [ 441842.20380586], [ 265771.94046669], [ 318649.50357336], [ 252423.12796196], [ 321470.14026336], [ 316360.32253217], [ 318678.94228073], [ 249636.38166299], [ 342591.65463269], [ 255418.53954247], [ 911134.08274977], [ 427920.19747611], [ 528632.81740156], [1170092.55572165], [ 394119.81457774], [ 306503.65315318], [ 312892.06153619], [ 233843.14863766], [ 490663.22836409], [ 459535.36171397], [ 333969.41784446], [ 282679.25692334], [ 226111.82057552], [ 538215.56556676], [ 226111.82057552], [ 352457.85116153], [-161201.02176254], [ 302303.81022042], [ 323007.37024413], [ 370455.47388471], [ 287524.71253158], [1060592.82741372], [ 248909.9801574 ], [ 288108.35201566], [ 287812.79056791], [ 229678.75855599], [ 249386.81191776], [ 390314.04386082], [ 451537.07719892], [ 528632.81740156], [ 811858.17040649], [ 302786.19855838], [ 394915.92369922], [ 390314.04386082], [ 405425.39781537], [1289225.09943457], [ 349637.21447153], [ 309215.24022367], [ 347266.58887672], [ 295166.06881919], [ 301125.47078581], [ 738397.19827818], [ 494697.19977366], [ 305130.09668038], [ 278844.04749905], [ 302727.32114364], [ 310444.76356463], [ 430134.21928882], [ 309671.80782539], [ 721669.74395088], [ 597337.30576723], [ 351846.05585305], [ 704587.65260372], [ 584703.8865779 ], [ 293576.41296521], [ 252313.71396867], [ 718524.74266092], [ 253189.43400231], [ 302168.8466723 ], [ 288735.60719799], [ 498483.16389807], [ 457172.49071198], [ 333178.1247804 ], [ 704558.21389635], [ 468988.05558085], [ 417037.85398029], [ 245048.7518395 ], [ 
423113.73966605], [ 354931.43920809], [ 720067.89359305], [ 749936.43382107], [ 551719.98192466], [ 441294.46634331], [ 556690.74879784], [ 592416.24176006], [ 422202.56678133], [ 318693.39045302], [ 403430.30895423], [ 604406.85270084], [ 255244.59836449], [ 374597.01387348], [ 308150.766466 ], [ 604268.00000018], [ 616438.10869654], [ 993142.48503895], [ 395901.52706491], [ 434344.3298916 ], [ 485469.25484471], [ 627355.43570469], [ 310876.80170877], [ 174618.27976904], [ 447061.07115658], [ 335585.89436361], [ 327421.4232634 ], [ 373803.94825239], [ 373789.2288987 ], [ 348443.52012806], [ 372500.73495064], [ 556630.4021154 ], [ 553112.80262718], [ 400867.67603802], [ 569468.03490261], [ 444493.62218294], [ 514644.81461941], [ 702403.53883714], [ 608248.81678902], [1162131.46450679], [ 314668.1242534 ], [ 344572.38851392], [ 641419.64152483], [ 406890.71039238], [ 330256.77930708], [ 406762.86588192], [ 453663.96080749], [1656917.6302292 ], [ 449408.85843111], [ 258707.30278287], [ 345745.18314357], [ 310109.76692089], [ 424505.72665604], [ 310095.0475672 ], [ 401125.77300418], [ 457988.13524458], [ 550575.62607344], [ 523303.70234502], [ 506600.1301475 ], [ 379014.11039314], [ 352767.4964078 ], [ 644515.6686963 ], [ 537987.4698391 ]])
# Load the expected values for the dev-0 split (no header row).
y_test = pd.read_csv(
    "./dev-0/expected.tsv",
    header=None,
)
y_test
0 | |
---|---|
0 | 373000.00 |
1 | 299000.00 |
2 | 365000.00 |
3 | 369000.00 |
4 | 483791.00 |
... | ... |
457 | 655544.02 |
458 | 471397.97 |
459 | 309958.00 |
460 | 699000.00 |
461 | 850000.00 |
462 rows × 1 columns
# Coefficient of determination of the dev-0 predictions against the expected values.
r2_score(y_true=y_test, y_pred=results)
0.6393762535622007
# Fixed: `mean_absolute_error` was never imported (only `r2_score` was), so
# this cell raised NameError.
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the dev-0 predictions against the expected values.
mean_absolute_error(y_test, results)
71559.96181964973
Predykcja dla zbioru testowego
# Load the test-A input split: tab-separated, no header row.
final_test_dataset = pd.read_csv("./test-A/in.tsv", sep= "\t", header=None)
# NOTE: the columns used here (5, 7, 9, 14) are the training columns
# (6, 8, 10, 15) shifted down by one -- presumably because this file has no
# target column at position 0 (TODO confirm). The printed labels keep the
# training numbering.

# Preprocessing column 14 (training column 15):
print("COLUMN 15:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n",final_test_dataset[14].value_counts())
# Map the textual labels to numbers; "poddasze" becomes NaN so that it is
# imputed with the median below:
final_test_dataset[14] = final_test_dataset[14].replace({"parter": 0, "suterena": -1, "> 10": 11, "poddasze": np.nan})
final_test_dataset[14] = final_test_dataset[14].apply(float)
# Fill NaNs with the median. The result is assigned back instead of using
# `inplace=True` on the column selection: that chained form is deprecated and
# does not modify the frame when pandas Copy-on-Write is enabled.
final_test_dataset[14] = final_test_dataset[14].fillna(final_test_dataset[14].median())
final_test_dataset[14] = final_test_dataset[14].apply(int)
# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n",final_test_dataset[14].value_counts())

# Preprocessing column 7 (training column 8):
print("COLUMN 8:")
# Replace strings containing a space with NaN:
final_test_dataset[7] = final_test_dataset[7].replace(' ', np.nan, regex=True)
# Convert to float, then fill NaNs with the median (assigned back, see above):
final_test_dataset[7] = final_test_dataset[7].apply(float)
final_test_dataset[7] = final_test_dataset[7].fillna(final_test_dataset[7].median())
print(final_test_dataset[7])

# Preprocessing column 5 (training column 6):
print("COLUMN 6:")
# Count the occurrence of unique values in column before preprocessing:
print("Value counts before changes:\n",final_test_dataset[5].value_counts())
# Change the "więcej niż 10" label to 10:
final_test_dataset[5] = final_test_dataset[5].replace({"więcej niż 10": 10})
final_test_dataset[5] = final_test_dataset[5].apply(int)
# Count the occurrence of unique values in column after preprocessing:
print("Value counts after changes:\n",final_test_dataset[5].value_counts())

# Preprocessing column 9 (training column 10): cast to float first so the
# median is computed on numeric values, then fill NaNs with that median.
final_test_dataset[9] = final_test_dataset[9].apply(float)
final_test_dataset[9] = final_test_dataset[9].fillna(final_test_dataset[9].median())

# Keep only the four feature columns, in the same order as the training features.
final_test_dataset = final_test_dataset[[5,7,9,14]]
COLUMN 15: Value counts before changes: 14 1 92 parter 70 3 68 4 64 2 61 5 15 6 11 7 7 10 5 > 10 5 9 4 8 2 suterena 1 Name: count, dtype: int64 Value counts after changes: 14 1 92 2 74 0 70 3 68 4 64 5 15 6 11 7 7 10 5 11 5 9 4 8 2 -1 1 Name: count, dtype: int64 COLUMN 8: 0 61.99 1 64.00 2 51.15 3 45.77 4 44.36 ... 413 34.97 414 49.06 415 76.71 416 72.63 417 65.84 Name: 7, Length: 418, dtype: float64 COLUMN 6: Value counts before changes: 5 2 175 3 143 4 50 1 40 5 6 6 2 więcej niż 10 1 8 1 Name: count, dtype: int64 Value counts after changes: 5 2 175 3 143 4 50 1 40 5 6 6 2 10 1 8 1 Name: count, dtype: int64
# Display the preprocessed test-A feature frame.
final_test_dataset
5 | 7 | 9 | 14 | |
---|---|---|---|---|
0 | 3 | 61.99 | 7.0 | 2 |
1 | 4 | 64.00 | 4.0 | 0 |
2 | 3 | 51.15 | 5.0 | 0 |
3 | 2 | 45.77 | 7.0 | 2 |
4 | 2 | 44.36 | 13.0 | 5 |
... | ... | ... | ... | ... |
413 | 1 | 34.97 | 8.0 | 4 |
414 | 3 | 49.06 | 3.0 | 3 |
415 | 3 | 76.71 | 5.0 | 3 |
416 | 3 | 72.63 | 5.0 | 3 |
417 | 2 | 65.84 | 10.0 | 3 |
418 rows × 4 columns
# Predict with the already-fitted regression on the test-A features and show
# the predictions as a DataFrame.
final_results = reg.predict(X=final_test_dataset)
pd.DataFrame(data=final_results)
0 | |
---|---|
0 | 426282.351904 |
1 | 389890.897311 |
2 | 334372.288463 |
3 | 341143.667679 |
4 | 346709.875023 |
... | ... |
413 | 301974.734528 |
414 | 312195.369919 |
415 | 537901.937976 |
416 | 505420.685819 |
417 | 509311.081663 |
418 rows × 1 columns
# Write the test-A predictions as a tab-separated file with no index column
# and no header row.
pd.DataFrame(final_results).to_csv(
    "./test-A/out.tsv",
    sep='\t',
    index=False,
    header=None,
)