import pandas as pd
import numpy as np
import joblib
# Let pandas print up to 500 columns so wide frames are not truncated.
pd.set_option("display.max_columns", 500)
# Load the tab-separated training data; the file has no header row, so the
# columns are labelled with integers.
df_train = pd.read_csv("train/train.tsv", sep="\t", header=None)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
0 309000.0 do zamieszkania 390 zł spółdzielcze własnościowe 7113 https://www.otodom.pl/oferta/niezalezny-uklad-... 2 NaN 43.44 wtórny 4.0 blok NaN NaN Niezależny Układ W Nowoczesnym Wydaniu 1 NaN gazowe plastikowe NaN NaN NaN cegła Polecamy na sprzedaż dwupokojowe mieszkanie p... NaN telewizja kablowa, internet, meble, piwnica, g...
1 314900.0 do wykończenia NaN pełna własność 7392 https://www.otodom.pl/oferta/urokliwe-mieszkan... 2 NaN 42.60 pierwotny 2.0 blok NaN NaN Urokliwe mieszkanie 2 pokojowe Strzeszyn 1 NaN gazowe plastikowe NaN NaN NaN cegła Kameralne 2 pokojowe mieszkanie z aneksem kuc... NaN telewizja kablowa, internet, telefon, drzwi / ...
2 249000.0 do remontu 300 zł pełna własność 5621 https://www.otodom.pl/oferta/mieszkanie-do-rem... 2 NaN 44.30 wtórny 4.0 blok NaN NaN Mieszkanie do remontu ul. Klonowa, blisko tramwaj 2 NaN miejskie plastikowe 1960.0 NaN 2019-06-30 cegła Oferta bezpośrednio od właściciela - bez pośr... NaN telewizja kablowa, internet, telefon, domofon ...
3 419000.0 do zamieszkania 490 zł pełna własność 4761 https://www.otodom.pl/oferta/w-szeregowcu-4-po... 4 NaN 88 wtórny 3.0 szeregowiec NaN NaN W szeregowcu 4 pokoje z garażem 1 NaN gazowe plastikowe NaN NaN NaN cegła Drodzy Państwo Zapraszam do zapoznania się z ... NaN telewizja kablowa, internet, zmywarka, lodówka...
4 499000.0 NaN 850 zł NaN 6481 https://www.otodom.pl/oferta/komfortowe-przest... 3 NaN 77 wtórny 16.0 blok NaN NaN Komfortowe,Przestronne,3Pokoje, Armii Krajowej!!! 7 NaN NaN plastikowe NaN NaN NaN NaN Biuro Immohouse ma przyjemność proponować do ... NaN balkon, piwnica, winda

Kolumny 1, 2, 6, 8 i 9 – tych będziemy używać w aplikacji (kolumna 0 to cena, czyli wartość przewidywana).

# Keep only the six columns at positions 0, 1, 2, 6, 8 and 9.
kept_positions = [0, 1, 2, 6, 8, 9]
df_train = df_train.iloc[:, kept_positions]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       2547 non-null   float64
 1   1       1519 non-null   object 
 2   2       929 non-null    object 
 3   6       2547 non-null   object 
 4   8       2547 non-null   object 
 5   9       2547 non-null   object 
dtypes: float64(1), object(5)
memory usage: 119.5+ KB
# Replace the integer column labels with descriptive names (in place).
train_column_names = {
    0: "cena m",
    1: "stan",
    2: "czynsz",
    6: "l pokoi",
    8: "metraż",
    9: "rynek",
}
df_train.rename(columns=train_column_names, inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cena m   2547 non-null   float64
 1   stan     1519 non-null   object 
 2   czynsz   929 non-null    object 
 3   l pokoi  2547 non-null   object 
 4   metraż   2547 non-null   object 
 5   rynek    2547 non-null   object 
dtypes: float64(1), object(5)
memory usage: 119.5+ KB

Usunięcie kolumn, w których jest dużo pustych wartości

# Remove the 'czynsz' column in place.
df_train.drop(columns='czynsz', inplace=True)
cena m stan l pokoi metraż rynek
0 309000.0 do zamieszkania 2 43.44 wtórny
1 314900.0 do wykończenia 2 42.60 pierwotny
2 249000.0 do remontu 2 44.30 wtórny
3 419000.0 do zamieszkania 4 88 wtórny
4 499000.0 NaN 3 77 wtórny

Uzupełnianie braków w danych

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cena m   2547 non-null   float64
 1   stan     1519 non-null   object 
 2   l pokoi  2547 non-null   object 
 3   metraż   2547 non-null   object 
 4   rynek    2547 non-null   object 
dtypes: float64(1), object(4)
memory usage: 99.6+ KB
# Fill missing 'stan' values with an explicit placeholder category.
# DataFrame.fillna with a per-column dict returns a new frame, which avoids
# the chained in-place fill on a column selection (deprecated in pandas 2.x
# and ineffective under Copy-on-Write).
df_train = df_train.fillna({'stan': 'brak info'})

# Drop the rows holding the two non-numeric raw values so that both columns
# can be parsed as numbers. The explicit copy makes the result an independent
# frame, so the column assignments below do not write into a slice.
parseable_rows = (
    (df_train['metraż'] != "6 909")
    & (df_train['l pokoi'] != "więcej niż 10")
)
df_train = df_train.loc[parseable_rows].copy()
df_train['l pokoi'] = pd.to_numeric(df_train['l pokoi'])
df_train['metraż'] = pd.to_numeric(df_train['metraż'])

# Split into features and the target column ("cena m").
X_train = df_train.drop(columns=["cena m"])
y_train = df_train["cena m"]

# Column groups by dtype: non-object columns vs. object columns.
number_cols = X_train.select_dtypes(exclude=[object]).columns
object_cols = X_train.select_dtypes(include=[object]).columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Each ColumnTransformer entry is a (name, transformer, columns) triple; the
# list and the call must both be closed for the statement to parse.
ct = ColumnTransformer([
    ('num', StandardScaler(), number_cols),
    ('cat', OneHotEncoder(), object_cols),
])
from sklearn.linear_model import Ridge
scaler = StandardScaler()
# handle_unknown='ignore' encodes a category that was not present during
# fitting as all zeros instead of raising at transform/predict time.
encoder = OneHotEncoder(handle_unknown='ignore')

# Wrap the scaler (numeric columns) and the encoder (categorical columns)
# in their own single-step pipelines.
num_transformer = make_pipeline(scaler)
cat_transformer = make_pipeline(encoder)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, number_cols),
        ('cat', cat_transformer, object_cols),
    ]
)

# Ridge regression on top of the preprocessed features.
clf = Ridge(alpha=1.0)
model1 = make_pipeline(preprocessor, clf)
stan l pokoi metraż rynek
0 do zamieszkania 2 43.44 wtórny
1 do wykończenia 2 42.60 pierwotny
2 do remontu 2 44.30 wtórny
3 do zamieszkania 4 88.00 wtórny
4 brak info 3 77.00 wtórny
0       309000.0
1       314900.0
2       249000.0
3       419000.0
4       499000.0
2541    383680.0
2542    507600.0
2543    342400.0
2544    335000.0
2545    260000.0
Name: cena m, Length: 2540, dtype: float64

Model Ridge - regresja liniowa

# Fit the whole pipeline (preprocessing + Ridge) on the training data.
model1.fit(X=X_train, y=y_train)
                                                  Index(['l pokoi', 'metraż'], dtype='object')),
                                                  Index(['stan', 'rynek'], dtype='object'))])),
                ('ridge', Ridge())])
                                                  Index(['l pokoi', 'metraż'], dtype='object')),
                                                  Index(['stan', 'rynek'], dtype='object'))])),
                ('ridge', Ridge())])
                                 Index(['l pokoi', 'metraż'], dtype='object')),
                                 Index(['stan', 'rynek'], dtype='object'))])
Index(['l pokoi', 'metraż'], dtype='object')
Index(['stan', 'rynek'], dtype='object')
# Load the dev-0 input file: tab-separated, no header row.
dev_in_path = 'dev-0/in.tsv'
df_test = pd.read_csv(dev_in_path, sep='\t', header=None)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
0 do zamieszkania 250 zł pełna własność 6311 https://www.otodom.pl/oferta/idealne-mieszkani... 3 NaN 59.10 wtórny 4.0 blok NaN NaN idealne mieszkanie do zamieszkania od zaraz 2 NaN miejskie plastikowe NaN NaN NaN wielka płyta Zamieszkaj od zaraz idealna lokalizacja OFE... NaN telewizja kablowa, internet, telefon, domofon ...
1 do zamieszkania NaN pełna własność 7868 https://www.otodom.pl/oferta/2-pokoje-38m2-po-... 2 NaN 38.00 wtórny 12.0 blok NaN NaN 2 pokoje / 38m2 / po remoncie / Winogrady 4 NaN inne plastikowe NaN NaN NaN NaN Na sprzedaż mieszkanie dwupokojowe na osiedlu... NaN balkon
2 do zamieszkania 650 zł pełna własność 5717 https://www.otodom.pl/oferta/3-pokoje-na-jezyc... 3 NaN 63.84 wtórny 4.0 blok NaN NaN 3 pokoje na Jeżycach blisko Rusałki 4 NaN miejskie plastikowe 1958.0 NaN NaN cegła !! Oferta dostępna tylko u nas !! Polecam ofe... NaN telewizja kablowa, internet, telefon, piwnica,...
3 do zamieszkania 359 zł spółdzielcze własnościowe 7380 https://www.otodom.pl/oferta/uniwersytet-przyr... 4 NaN 50.00 wtórny 10.0 blok NaN NaN Uniwersytet Przyrodniczy - 4 pokoje - ROI 8-10% 10 NaN miejskie plastikowe 1975.0 NaN 2019-06-30 wielka płyta Oferta  bezpośrednio od właściciela  - bez po... NaN telewizja kablowa, internet, telefon, drzwi / ...
4 NaN NaN NaN 7373 https://www.otodom.pl/oferta/mieszkanie-blisko... 3 NaN 65.62 pierwotny 3.0 NaN NaN NaN Mieszkanie blisko centrum w kameralnej okolicy. 1 NaN NaN NaN 2020.0 NaN 2020-01-01 NaN MIESZKANIE 3-POKOJOWE O POW.65,62M2 Mieszkani... NaN garaż/miejsce parkingowe, taras, pom. użytkowe
# Load the dev-0 expected values: tab-separated, no header row.
dev_expected_path = 'dev-0/expected.tsv'
y_test = pd.read_csv(dev_expected_path, sep='\t', header=None)
stan l pokoi metraż rynek
0 do zamieszkania 2 43.44 wtórny
1 do wykończenia 2 42.60 pierwotny
2 do remontu 2 44.30 wtórny
3 do zamieszkania 4 88.00 wtórny
4 brak info 3 77.00 wtórny
... ... ... ... ...
2541 do wykończenia 3 70.40 pierwotny
2542 do wykończenia 4 94.00 wtórny
2543 brak info 2 53.50 wtórny
2544 brak info 3 55.25 wtórny
2545 brak info 3 62.00 wtórny

2540 rows × 4 columns

# Give the columns of interest descriptive names (in place); the remaining
# columns keep their integer labels.
dev_column_names = {
    0: "stan",
    1: "czynsz",
    5: "l pokoi",
    7: "metraż",
    8: "rynek",
}
df_test.rename(columns=dev_column_names, inplace=True)
Index([   'stan',  'czynsz',         2,         3,         4, 'l pokoi',
               6,  'metraż',   'rynek',         9,        10,        11,
              12,        13,        14,        15,        16,        17,
              18,        19,        20,        21,        22,        23,
# Keep only the four feature columns, in this order.
feature_columns = ['stan', 'l pokoi', 'metraż', 'rynek']
df_test = df_test[feature_columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   stan     275 non-null    object 
 1   l pokoi  462 non-null    int64  
 2   metraż   462 non-null    float64
 3   rynek    462 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 14.6+ KB
# Fill missing 'stan' values with the placeholder category. A per-column
# dict on DataFrame.fillna returns a new frame, avoiding the chained in-place
# fill on a column selection (deprecated in pandas 2.x and ineffective under
# Copy-on-Write).
df_test = df_test.fillna({'stan': 'brak info'})
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   stan     462 non-null    object 
 1   l pokoi  462 non-null    int64  
 2   metraż   462 non-null    float64
 3   rynek    462 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 14.6+ KB
0      59.10
1      38.00
2      63.84
3      50.00
4      65.62
457    72.78
458    51.23
459    54.16
460    90.10
461    71.90
Name: metraż, Length: 462, dtype: float64
# Renumber the rows from 0 and discard the old index.
df_test = df_test.reset_index(drop=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   stan     462 non-null    object 
 1   l pokoi  462 non-null    int64  
 2   metraż   462 non-null    float64
 3   rynek    462 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 14.6+ KB
                                                  Index(['l pokoi', 'metraż'], dtype='object')),
                                                  Index(['stan', 'rynek'], dtype='object'))])),
                ('ridge', Ridge())])
                                                  Index(['l pokoi', 'metraż'], dtype='object')),
                                                  Index(['stan', 'rynek'], dtype='object'))])),
                ('ridge', Ridge())])
                                 Index(['l pokoi', 'metraż'], dtype='object')),
                                 Index(['stan', 'rynek'], dtype='object'))])
Index(['l pokoi', 'metraż'], dtype='object')
Index(['stan', 'rynek'], dtype='object')

Przykład - przewidywanie cen na danych testowych

array([ 407917.66889452,  285306.27863261,  444184.94702291,
        299459.05926666,  459271.83408962,  614055.95902743,
        311243.98561031,  361244.58944658,  329238.65906305,
        369470.84812889,  285350.13919824,  329238.65906305,
        420020.53940636,  399085.99304054,  368717.32379097,
        339834.65864991,  335512.74515277,  304524.88092914,
        367036.20209137,  286773.80808993,  354525.20168926,
        282085.35025252,  386303.76202814,  477660.29347794,
        407152.53644455,  490742.17175818,  529758.94244497,
        335044.40697838,  399501.21194489,  431641.29394143,
        331214.22563058,  334772.89329194,  404282.20491201,
        369968.98599038,  392538.22357404,  354476.13872373,
        531690.53563375,  428309.25045513,  298510.41733766,
        551966.54555785,  332974.0302655 ,  474293.99377424,
        247049.65613431,  383931.85143325,  546789.20856336,
        438766.28443053,  275556.44383868,  325403.7381088 ,
        319934.12593671,  277654.95413295,  294800.60980049,
        285306.27863261,  283776.01373268,  282811.35937324,
        511210.54963994,  327121.56879228,  335039.88788041,
        638797.47051695,  521922.40393947,  235305.67479634,
        238447.52001532,  465392.89368935,  272046.09327325,
        848434.43816552,  512546.27689188,  382253.07914128,
        781492.5027453 ,  531716.09929903,  442847.05008064,
        498968.43044048,  272108.82871587,  275556.44383868,
        307110.38376632,  373296.51037872,  320469.71865169,
        629040.94693472,  456886.14569235,  463535.64405524,
        356361.80264533,  387288.89243253,  249184.65490398,
        305443.0398691 ,  312129.77494706,  323562.90113092,
        286773.80808993,  338640.52949322,  265910.97279499,
        414803.86094421,  262352.30513363,  273562.29729465,
        959123.60494806,  476401.54226445,  463695.54142089,
        474293.99377424,  245933.92683679,  276309.90323886,
         95163.71086422,  410945.54601501,  377887.30507851,
        332477.49594204,  374627.25336918,  354168.19912956,
        369968.98599038,  311900.23521207,  275897.03611221,
        272224.40035237,  522497.33812211,  317135.7885512 ,
        317671.38126618,  357993.86137939,  352637.93422963,
        908358.15173798,  288551.8042999 ,  398779.94006055,
        279465.70834759,  310024.57586448,  367940.58322895,
        336039.07916333,  295296.8610478 ,  395408.55510658,
        289706.87506509,  461286.74212482,  959007.7502354 ,
        671918.16381992,  357151.93260828,  573837.44243868,
        286071.41108258,  453635.41762516,  294033.30766019,
        430106.50994353,  361819.52362922,  280070.72529348,
        820481.6050142 ,  349767.60269709,  364114.92097912,
        262468.15984629,  301183.86181458,  297937.65284535,
        285306.27863261,  393955.0865278 ,  666659.89980777,
        560179.02643777,  440551.36965458,  544315.22105819,
        460711.80794218,  281978.75424428,  380105.90610726,
        501187.31454538,  343001.78445802,  567830.35093743,
        262544.39001514,  332442.95664849,  703770.99332145,
        369470.84812889,  494101.3824478 ,  273562.29729465,
        180666.17730277,  452295.35099255,  331304.51667796,
        453635.41762516,  323108.3407589 ,  507194.68912278,
        326474.92353875,  334772.89329194,  335044.40697838,
        240054.29816025,  562052.51609502,  274404.22606576,
        375397.37008062,  253585.25133645,  311900.23521207,
        365811.99017143,  339834.65864991,  395408.55510658,
        567728.27402715,  336238.53613507,  490526.69290172,
        257086.66903124,  385858.46036054,  344082.22859239,
        533220.80053368,  295941.33661099,  220577.95997966,
        291229.76787478,  200262.6085879 ,  281213.62179431,
        338640.52949322,  267288.49428108,  439786.23720462,
        400076.14612753,  301183.86181458,  479730.95326697,
        319470.24429261,  470581.73347059,  336354.39084773,
        491317.10594082,  354168.19912956,  369551.88047185,
        435462.4370933 ,  514271.0794398 ,  331214.22563058,
        234999.90489251,  240207.32465024,  395408.55510658,
        305290.01337911,  265336.03861235,  349500.60810862,
        315911.57663126,  448585.54345538,  310063.91733215,
        414803.86094421,  521922.40393947,  768450.42872618,
        285306.27863261,  414228.92676157,  408188.43671135,
        262917.91567413,  530071.86630062,  284993.07170081,
        354168.19912956,  350852.56592942,  341174.72528252,
        174670.0129817 ,  346185.25798463,  308617.25469129,
        319206.16526408,  479496.61135786,  285306.27863261,
        338832.89745087,  238677.05975031,  300193.70872759,
        284350.94791532,  312129.77494706,  288864.94629397,
        447136.31089841,  468759.4764692 ,  311243.98561031,
        773841.17824564,  309069.24514719,  311818.91979295,
        303300.6690092 ,  249129.2915516 ,  317026.62262684,
        270578.56381593,  374826.77527865,  317632.03979852,
        218540.23357609,  336303.15819187,  364613.05884061,
        419438.51620964,  348842.0716669 ,  300494.95953345,
        245456.65579176,  632191.76778196,  342245.91071247,
        321241.94011573,  345941.94044726,  654441.17458326,
        594917.92628041,  403059.87960624,  295368.85519483,
        287144.48312671,  504057.36300177,  259834.05683704,
        356196.60189099,  671431.17127702,  221915.85692194,
        266766.67936848,  862139.34958591,  253061.54980967,
        278229.88831559,  360787.39659078,  819749.12524361,
        225938.68930354,  337453.4893506 ,  417419.08907647,
        433477.61182135,  428415.84646337,  369470.84812889,
        644948.32980378,  369778.78772305,  486728.1978552 ,
        220003.02579702,  372225.04187261,  533707.33028312,
        268312.96615313,  351337.20906469,  266766.67936848,
        433665.17760489,  291427.33823234,  681260.15179974,
        386155.25463611,  361819.52362922,  378283.64910587,
        312584.05224292,  262442.87925716,  347027.18675575,
        388599.15937804,  330653.06925032,  331214.22563058,
        392424.82162787,  581111.00288747,  510178.4226015 ,
        638681.61580429,  273562.29729465,  332934.68879784,
        370402.78487122,  251183.25797831,  454520.92388575,
        327814.99017136,  317446.36062915,  270745.3681083 ,
        568020.54920476, 1041637.53573378,  445984.0931255 ,
        246474.72195167,  353326.27035845,  456478.01580208,
        268245.71161255,  476014.4569415 ,  305276.23557673,
        590784.32443641,  441316.50210455,  265910.97279499,
        319470.24429261,  266451.36768408,  331214.22563058,
        327784.90740811,  319470.24429261,  244403.66193686,
        252780.99555724,  268449.97075841,  912139.95342218,
        430604.64780502,  536459.92048882, 1156407.40322869,
        395408.55510658,  221600.76337596,  327898.59243044,
        237907.12512623,  498752.66850787,  468835.9897142 ,
        350852.56592942,  300033.9934493 ,  232365.80188326,
        533897.52855045,  232365.80188326,  369329.42975094,
       -118069.19958188,  317722.33084589,  323601.95952243,
        369203.85354041,  268053.34365489, 1051697.65952951,
        254248.58995229,  303300.6690092 ,  273638.81053964,
        244461.9836068 ,  258111.42397945,  388332.16478956,
        429074.38290509,  536459.92048882,  800353.81940598,
        298918.26415178,  407917.66889452,  388332.16478956,
        414613.66267689, 1262607.50420783,  369329.42975094,
        319129.65201908,  334197.9591093 ,  286263.77904024,
        307993.25754312,  741174.54173005,  491064.17223088,
        298429.10191855,  292957.60313228,  315911.57663126,
        327478.85442812,  451544.27942111,  317135.7885512 ,
        724682.50318048,  509137.53807322,  361833.3014316 ,
        693810.21059336,  582168.1274389 ,  212609.37224369,
        260216.62306202,  710680.29626995,  264840.07044119,
        300877.52575844,  309268.70211893,  389929.9023685 ,
        464347.27192468,  347409.46990458,  693810.21059336,
        468363.13244184,  427977.91688601,  251373.45624563,
        427817.80138193,  352408.11141849,  728508.16543031,
        749166.7415794 ,  558648.76153784,  450512.15238267,
        571124.93957025,  592161.2797702 ,  418937.46278821,
        319398.53322173,  415378.79512685,  600960.30294481,
        263047.6131269 ,  364648.06092761,  325259.97032322,
        589476.22718124,  599471.54920288,  983410.6732152 ,
        405432.07327729,  442158.14779951,  491279.65108734,
        636692.27143438,  311900.23521207,  186643.53405465,
        454252.04268309,  348902.56302718,  335848.88089601,
        377701.62590915,  377701.62590915,  353370.41400023,
        376477.41398921,  560336.57202573,  556969.98924588,
        402990.3382257 ,  568449.1456857 ,  448323.35104102,
        517259.61509264,  688264.08517628,  606585.11129723,
       1137012.09739106,  327010.51625373,  361028.82751397,
        628428.55789859,  413554.08535898,  341098.21203752,
        401415.92968398,  455318.42593893, 1619425.93740436,
        451114.99963823,  267139.98688905,  342658.5597007 ,
        304976.87138504,  438573.91647287,  304976.87138504,
        392508.02366116,  458652.63911557,  547637.54304662,
        524873.76781497,  508805.98636568,  382751.50007893,
        366338.32418199,  645108.72838401,  532942.19783316])
Index(['stan', 'l pokoi', 'metraż', 'rynek'], dtype='object')
array(['wtórny', 'pierwotny'], dtype=object)

Saving the model to file

# Persist the fitted pipeline to disk with joblib.
filename = 'ridge_model.sav'
joblib.dump(value=model1, filename=filename)
# Load the test-A input file (tab-separated, no header row) and prepare it
# the same way as the training features.
df_test = pd.read_csv('test-A/in.tsv', sep='\t', header=None)
df_test.rename(columns={0: "stan", 1: "czynsz", 5: "l pokoi", 7: "metraż", 8: "rynek"}, inplace=True)
df_test = df_test[['stan', 'l pokoi', 'metraż', 'rynek']]

# Fill missing 'stan' values via a per-column dict: this returns a new frame
# instead of doing a chained in-place fill on a column selection (deprecated
# in pandas 2.x and ineffective under Copy-on-Write).
df_test = df_test.fillna({'stan': 'brak info'})

# Map the textual "more than 10" room count to 10, then parse the column as
# numbers so its dtype matches the training data, which was converted with
# pd.to_numeric as well.
df_test.loc[df_test['l pokoi'] == "więcej niż 10", 'l pokoi'] = 10
df_test['l pokoi'] = pd.to_numeric(df_test['l pokoi'])

# NOTE(review): this filter removes input rows, so the predictions written
# later would have fewer rows than test-A/in.tsv whenever the value occurs —
# confirm that dropping is acceptable for the output file.
df_test = df_test[df_test['metraż'] != "6 909"]
df_test.reset_index(drop=True, inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   stan     418 non-null    object 
 1   l pokoi  418 non-null    object 
 2   metraż   418 non-null    float64
 3   rynek    418 non-null    object 
dtypes: float64(1), object(3)
memory usage: 13.2+ KB
# Predict prices for the prepared test-A rows.
predictions = model1.predict(df_test)

# Build the output frame first and write it in a separate step: to_csv
# returns None when given a path, so chaining it into the assignment left
# `prediction` bound to None instead of the data.
prediction = pd.DataFrame(predictions, columns=['predictions'])
# NOTE(review): the file is written with a 'predictions' header row, while
# the other .tsv files here are read with header=None — confirm the expected
# output format.
prediction.to_csv('test-A/out.tsv', index=False, sep="\t")