22 KiB
22 KiB
Download data:
run commands:
- pip install --user kaggle  # Kaggle API, used to download the dataset
- pip install --user pandas
- kaggle datasets download -d akash14/house-price-dataset (you need a Kaggle API token; more info: https://www.kaggle.com/docs/api)
- unzip -o house-price-dataset.zip, or on Windows: tar -xf .\house-price-dataset.zip
data from https://www.kaggle.com/datasets/akash14/house-price-dataset
import pandas as pd
# Paths
data_dir = "../Participants_Data_HPP"
source_path = f"{data_dir}/Train.csv"

# Dev.csv and Test.csv are both cut from Train.csv. Writing Test.csv replaces
# any file already stored under that name in the data directory.
source_data = pd.read_csv(source_path)

# One output file per name; every file receives the same number of rows.
split_names = ("Dev", "Test")
k = len(split_names)
row_size = len(source_data.index) // k

# NOTE(review): rows past k * row_size are written to neither file (with
# 29451 input rows each file gets 14725 rows and the last row is left out).
# NOTE(review): Train.csv itself is not rewritten, so it still contains every
# row that goes to Dev.csv and Test.csv -- confirm this overlap is intended
# before the two files are used for evaluation.
for i, name in enumerate(split_names):
    chunk = source_data.iloc[row_size * i:row_size * (i + 1)]
    chunk.to_csv(f"{data_dir}/{name}.csv", index=False)

# Read the three files back and print their row counts as a sanity check.
for label, file_name in (("dev", "Dev"), ("test", "Test"), ("train", "Train")):
    frame = pd.read_csv(f"{data_dir}/{file_name}.csv")
    print(f"{label}:{len(frame)}")
dev:14725 test:14725 train:29451
import pandas as pd
import matplotlib.pyplot as plt
# See: https://www.kaggle.com/code/aadhavvignesh/regression-with-scikit-learn-practical-ml-1
dataPath = '../Participants_Data_HPP/Train.csv'
# Load Train.csv and show its per-column summary (non-null counts and dtypes).
data = pd.read_csv(dataPath)
# NOTE(review): per the pandas documentation DataFrame.info() prints its
# report and returns None, so `info` presumably holds None -- confirm that
# nothing later relies on it.
info = data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 29451 entries, 0 to 29450 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 POSTED_BY 29451 non-null object 1 UNDER_CONSTRUCTION 29451 non-null int64 2 RERA 29451 non-null int64 3 BHK_NO. 29451 non-null int64 4 BHK_OR_RK 29451 non-null object 5 SQUARE_FT 29451 non-null float64 6 READY_TO_MOVE 29451 non-null int64 7 RESALE 29451 non-null int64 8 ADDRESS 29451 non-null object 9 LONGITUDE 29451 non-null float64 10 LATITUDE 29451 non-null float64 11 TARGET(PRICE_IN_LACS) 29451 non-null float64 dtypes: float64(4), int64(5), object(3) memory usage: 2.7+ MB
# Summary statistics for every column; include="all" also covers the
# non-numeric columns (reported through unique / top / freq).
description = data.describe(include="all")
print(description)
POSTED_BY UNDER_CONSTRUCTION RERA BHK_NO. BHK_OR_RK \ count 29451 29451.000000 29451.000000 29451.000000 29451 unique 3 NaN NaN NaN 2 top Dealer NaN NaN NaN BHK freq 18291 NaN NaN NaN 29427 mean NaN 0.179756 0.317918 2.392279 NaN std NaN 0.383991 0.465675 0.879091 NaN min NaN 0.000000 0.000000 1.000000 NaN 25% NaN 0.000000 0.000000 2.000000 NaN 50% NaN 0.000000 0.000000 2.000000 NaN 75% NaN 0.000000 1.000000 3.000000 NaN max NaN 1.000000 1.000000 20.000000 NaN SQUARE_FT READY_TO_MOVE RESALE ADDRESS \ count 2.945100e+04 29451.000000 29451.000000 29451 unique NaN NaN NaN 6899 top NaN NaN NaN Zirakpur,Chandigarh freq NaN NaN NaN 509 mean 1.980217e+04 0.820244 0.929578 NaN std 1.901335e+06 0.383991 0.255861 NaN min 3.000000e+00 0.000000 0.000000 NaN 25% 9.000211e+02 1.000000 1.000000 NaN 50% 1.175057e+03 1.000000 1.000000 NaN 75% 1.550688e+03 1.000000 1.000000 NaN max 2.545455e+08 1.000000 1.000000 NaN LONGITUDE LATITUDE TARGET(PRICE_IN_LACS) count 29451.000000 29451.000000 29451.000000 unique NaN NaN NaN top NaN NaN NaN freq NaN NaN NaN mean 21.300255 76.837695 142.898746 std 6.205306 10.557747 656.880713 min -37.713008 -121.761248 0.250000 25% 18.452663 73.798100 38.000000 50% 20.750000 77.324137 62.000000 75% 26.900926 77.828740 100.000000 max 59.912884 152.962676 30000.000000
# Pairwise correlation between the numeric columns. The text columns
# (POSTED_BY, BHK_OR_RK, ADDRESS) are excluded explicitly: older pandas
# dropped them silently inside corr(), while pandas >= 2.0 raises on
# non-numeric data, so selecting the numeric dtypes first works on both.
corr = data.select_dtypes(include="number").corr()
print(corr)
UNDER_CONSTRUCTION RERA BHK_NO. SQUARE_FT \ UNDER_CONSTRUCTION 1.000000 0.363826 -0.040712 -0.004204 RERA 0.363826 1.000000 0.009547 -0.006229 BHK_NO. -0.040712 0.009547 1.000000 0.005303 SQUARE_FT -0.004204 -0.006229 0.005303 1.000000 READY_TO_MOVE -1.000000 -0.363826 0.040712 0.004204 RESALE -0.347405 -0.270351 0.014581 0.001732 LONGITUDE 0.006440 0.104976 0.068730 -0.012591 LATITUDE -0.000381 -0.065106 0.046930 0.000803 TARGET(PRICE_IN_LACS) 0.055399 0.067636 0.112283 0.402685 READY_TO_MOVE RESALE LONGITUDE LATITUDE \ UNDER_CONSTRUCTION -1.000000 -0.347405 0.006440 -0.000381 RERA -0.363826 -0.270351 0.104976 -0.065106 BHK_NO. 0.040712 0.014581 0.068730 0.046930 SQUARE_FT 0.004204 0.001732 -0.012591 0.000803 READY_TO_MOVE 1.000000 0.347405 -0.006440 0.000381 RESALE 0.347405 1.000000 0.024038 0.014844 LONGITUDE -0.006440 0.024038 1.000000 -0.155062 LATITUDE 0.000381 0.014844 -0.155062 1.000000 TARGET(PRICE_IN_LACS) -0.055399 -0.207378 -0.031112 -0.017254 TARGET(PRICE_IN_LACS) UNDER_CONSTRUCTION 0.055399 RERA 0.067636 BHK_NO. 0.112283 SQUARE_FT 0.402685 READY_TO_MOVE -0.055399 RESALE -0.207378 LONGITUDE -0.031112 LATITUDE -0.017254 TARGET(PRICE_IN_LACS) 1.000000
#print(data.head())
# Count how many rows carry each BHK_NO. value, then draw the counts as a
# bar chart. The plot call stays last so the notebook shows the axes.
bhk_counts = data["BHK_NO."].value_counts()
bhk_counts.plot(kind="bar")
<AxesSubplot:>
# Keep the target and the three features chosen as most significant. The
# explicit copy makes `data` an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']].copy()

# Min-max scaling of the price and the flat area: (x - min) / (max - min).
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'

# Capture the extremes before the columns are overwritten with scaled values.
column1Min = data[columnName1].min()
column1Max = data[columnName1].max()
column2Min = data[columnName2].min()
column2Max = data[columnName2].max()

data[columnName1] = (data[columnName1] - column1Min) / (column1Max - column1Min)
data[columnName2] = (data[columnName2] - column2Min) / (column2Max - column2Min)

# Summary of the reduced, scaled frame; kept as the last expression so the
# notebook renders it as the cell output.
data.describe(include="all")
TARGET(PRICE_IN_LACS) | SQUARE_FT | BHK_NO. | RESALE | |
---|---|---|---|---|
count | 29451.000000 | 29451.000000 | 29451.000000 | 29451.000000 |
mean | 0.004755 | 0.000078 | 2.392279 | 0.929578 |
std | 0.021896 | 0.007470 | 0.879091 | 0.255861 |
min | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 0.001258 | 0.000004 | 2.000000 | 1.000000 |
50% | 0.002058 | 0.000005 | 2.000000 | 1.000000 |
75% | 0.003325 | 0.000006 | 3.000000 | 1.000000 |
max | 1.000000 | 1.000000 | 20.000000 | 1.000000 |