ium_444417/src/task1.ipynb
2022-04-10 19:55:37 +02:00

22 KiB

Scripts for first task

Download data:

run commands:

  1. pip install --user kaggle  # Kaggle API, used to download the dataset
  2. pip install --user pandas
  3. kaggle datasets download -d akash14/house-price-dataset (you need a Kaggle API token; more info here: https://www.kaggle.com/docs/api)
  4. unzip -o house-price-dataset.zip, or on Windows: tar -xf .\house-price-dataset.zip

data from https://www.kaggle.com/datasets/akash14/house-price-dataset

Data preparation

import pandas as pd

# paths
# NOTE(review): the variable names look swapped relative to the files they
# point at (filePathTest -> Train.csv, filePathTrain -> Test.csv). The names
# are kept unchanged because other cells may reference them; confirm the
# intent before renaming.
filePathTest = "../Participants_Data_HPP/Train.csv"
filePathTrain = "../Participants_Data_HPP/Test.csv"

dataTest = pd.read_csv(filePathTest)
dataTrain = pd.read_csv(filePathTrain)

number_lines = len(dataTest.index)
row_size = number_lines // 2

# number of csv files to write and the base number of rows in each
k = 2
size = row_size

# Split the frame loaded from filePathTest into a dev part and a test part.
# NOTE: the "Test" part is written to Test.csv, i.e. it replaces the file
# that filePathTrain was read from above.
for i in range(k):
    # The last part runs to the end of the frame so the remainder left over
    # by the integer division (an odd row count) is not silently dropped.
    stop = size * (i + 1) if i < k - 1 else number_lines
    df = dataTest[size * i:stop]
    name = "Dev" if i == 0 else "Test"
    df.to_csv('../Participants_Data_HPP/' + name + '.csv', index=False)

# Read the written files back and report the number of rows in each set.
df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")

df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")

df_3 = pd.read_csv("../Participants_Data_HPP/Train.csv")

print(f"dev:{len(df_1)}")
print(f"test:{len(df_2)}")
print(f"train:{len(df_3)}")

dev:14725
test:14725
train:29451

Preprocessing data

import pandas as pd
import matplotlib.pyplot as plt

#https://www.kaggle.com/code/aadhavvignesh/regression-with-scikit-learn-practical-ml-1

# CSV file that is loaded into `data` for inspection
dataPath = '../Participants_Data_HPP/Train.csv'

# data information
data = pd.read_csv(dataPath)
# DataFrame.info() prints its summary and returns None, so `info` holds None
info = data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29451 entries, 0 to 29450
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   POSTED_BY              29451 non-null  object 
 1   UNDER_CONSTRUCTION     29451 non-null  int64  
 2   RERA                   29451 non-null  int64  
 3   BHK_NO.                29451 non-null  int64  
 4   BHK_OR_RK              29451 non-null  object 
 5   SQUARE_FT              29451 non-null  float64
 6   READY_TO_MOVE          29451 non-null  int64  
 7   RESALE                 29451 non-null  int64  
 8   ADDRESS                29451 non-null  object 
 9   LONGITUDE              29451 non-null  float64
 10  LATITUDE               29451 non-null  float64
 11  TARGET(PRICE_IN_LACS)  29451 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 2.7+ MB
# Summary statistics for every column; include="all" also covers the
# non-numeric (object) columns rather than only the numeric ones.
description = data.describe(include="all")
print(description)
       POSTED_BY  UNDER_CONSTRUCTION          RERA       BHK_NO. BHK_OR_RK  \
count      29451        29451.000000  29451.000000  29451.000000     29451   
unique         3                 NaN           NaN           NaN         2   
top       Dealer                 NaN           NaN           NaN       BHK   
freq       18291                 NaN           NaN           NaN     29427   
mean         NaN            0.179756      0.317918      2.392279       NaN   
std          NaN            0.383991      0.465675      0.879091       NaN   
min          NaN            0.000000      0.000000      1.000000       NaN   
25%          NaN            0.000000      0.000000      2.000000       NaN   
50%          NaN            0.000000      0.000000      2.000000       NaN   
75%          NaN            0.000000      1.000000      3.000000       NaN   
max          NaN            1.000000      1.000000     20.000000       NaN   

           SQUARE_FT  READY_TO_MOVE        RESALE              ADDRESS  \
count   2.945100e+04   29451.000000  29451.000000                29451   
unique           NaN            NaN           NaN                 6899   
top              NaN            NaN           NaN  Zirakpur,Chandigarh   
freq             NaN            NaN           NaN                  509   
mean    1.980217e+04       0.820244      0.929578                  NaN   
std     1.901335e+06       0.383991      0.255861                  NaN   
min     3.000000e+00       0.000000      0.000000                  NaN   
25%     9.000211e+02       1.000000      1.000000                  NaN   
50%     1.175057e+03       1.000000      1.000000                  NaN   
75%     1.550688e+03       1.000000      1.000000                  NaN   
max     2.545455e+08       1.000000      1.000000                  NaN   

           LONGITUDE      LATITUDE  TARGET(PRICE_IN_LACS)  
count   29451.000000  29451.000000           29451.000000  
unique           NaN           NaN                    NaN  
top              NaN           NaN                    NaN  
freq             NaN           NaN                    NaN  
mean       21.300255     76.837695             142.898746  
std         6.205306     10.557747             656.880713  
min       -37.713008   -121.761248               0.250000  
25%        18.452663     73.798100              38.000000  
50%        20.750000     77.324137              62.000000  
75%        26.900926     77.828740             100.000000  
max        59.912884    152.962676           30000.000000  
# Pairwise correlation of the numeric columns only. The frame also holds
# object (string) columns; newer pandas versions raise on them instead of
# silently skipping them, so the numeric dtypes are selected explicitly.
corr = data.select_dtypes(include="number").corr()
print(corr)
                       UNDER_CONSTRUCTION      RERA   BHK_NO.  SQUARE_FT  \
UNDER_CONSTRUCTION               1.000000  0.363826 -0.040712  -0.004204   
RERA                             0.363826  1.000000  0.009547  -0.006229   
BHK_NO.                         -0.040712  0.009547  1.000000   0.005303   
SQUARE_FT                       -0.004204 -0.006229  0.005303   1.000000   
READY_TO_MOVE                   -1.000000 -0.363826  0.040712   0.004204   
RESALE                          -0.347405 -0.270351  0.014581   0.001732   
LONGITUDE                        0.006440  0.104976  0.068730  -0.012591   
LATITUDE                        -0.000381 -0.065106  0.046930   0.000803   
TARGET(PRICE_IN_LACS)            0.055399  0.067636  0.112283   0.402685   

                       READY_TO_MOVE    RESALE  LONGITUDE  LATITUDE  \
UNDER_CONSTRUCTION         -1.000000 -0.347405   0.006440 -0.000381   
RERA                       -0.363826 -0.270351   0.104976 -0.065106   
BHK_NO.                     0.040712  0.014581   0.068730  0.046930   
SQUARE_FT                   0.004204  0.001732  -0.012591  0.000803   
READY_TO_MOVE               1.000000  0.347405  -0.006440  0.000381   
RESALE                      0.347405  1.000000   0.024038  0.014844   
LONGITUDE                  -0.006440  0.024038   1.000000 -0.155062   
LATITUDE                    0.000381  0.014844  -0.155062  1.000000   
TARGET(PRICE_IN_LACS)      -0.055399 -0.207378  -0.031112 -0.017254   

                       TARGET(PRICE_IN_LACS)  
UNDER_CONSTRUCTION                  0.055399  
RERA                                0.067636  
BHK_NO.                             0.112283  
SQUARE_FT                           0.402685  
READY_TO_MOVE                      -0.055399  
RESALE                             -0.207378  
LONGITUDE                          -0.031112  
LATITUDE                           -0.017254  
TARGET(PRICE_IN_LACS)               1.000000  
#print(data.head())
# Bar chart of how many rows there are for each distinct BHK_NO. value
bhk_counts = data["BHK_NO."].value_counts()
bhk_counts.plot.bar()
<AxesSubplot:>
# select the most significant columns
# .copy() yields an independent frame, so the column assignments that follow
# do not write into a slice of the original (pandas SettingWithCopyWarning).
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']].copy()
# Min-max scale the price column and the flat-area column
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'

# Column extremes are taken before either column is overwritten
column1Min, column1Max = data[columnName1].min(), data[columnName1].max()
column2Min, column2Max = data[columnName2].min(), data[columnName2].max()

# (value - min) / (max - min), applied to each column in turn
data[columnName1] = data[columnName1].sub(column1Min).div(column1Max - column1Min)
data[columnName2] = data[columnName2].sub(column2Min).div(column2Max - column2Min)

data.describe(include="all")
TARGET(PRICE_IN_LACS) SQUARE_FT BHK_NO. RESALE
count 29451.000000 29451.000000 29451.000000 29451.000000
mean 0.004755 0.000078 2.392279 0.929578
std 0.021896 0.007470 0.879091 0.255861
min 0.000000 0.000000 1.000000 0.000000
25% 0.001258 0.000004 2.000000 1.000000
50% 0.002058 0.000005 2.000000 1.000000
75% 0.003325 0.000006 3.000000 1.000000
max 1.000000 1.000000 20.000000 1.000000