task 1 script
This commit is contained in:
parent
a2fa75ba27
commit
561a92489e
7
.gitignore
vendored
7
.gitignore
vendored
@ -212,5 +212,8 @@ fabric.properties
|
||||
.idea/caches/build_file_checksums.ser
|
||||
|
||||
.idea
|
||||
./lianjia.zip
|
||||
./new.csv
|
||||
|
||||
/house-price-dataset.zip
|
||||
/Participants_Data_HPP/
|
||||
/Participants_Data_HPP/*
|
||||
|
||||
|
10
README.md
10
README.md
@ -1,8 +1,10 @@
|
||||
#Projekt na przedmiot inżynieria oprogramowania
|
||||
## Projekt na przedmiot inżynieria oprogramowania
|
||||
|
||||
### Starting script:
|
||||
#### Starting script:
|
||||
|
||||
1. pip install --user kaggle #API Kaggle, do pobrania zbioru
|
||||
2. pip install --user pandas
|
||||
3. kaggle datasets download -d ruiqurm/lianjia (U have to have Kaggle token, more info here: https://www.kaggle.com/docs/api)
|
||||
4. unzip -o lianjia.zip / tar -xf .\lianjia.zip (for windows windows)
|
||||
3. kaggle datasets download -d akash14/house-price-dataset (requires a Kaggle API token; more info: https://www.kaggle.com/docs/api)
|
||||
4. unzip -o house-price-dataset.zip (on Windows: tar -xf .\house-price-dataset.zip)
|
||||
|
||||
data from https://www.kaggle.com/datasets/akash14/house-price-dataset
|
||||
|
BIN
lianjia.zip
BIN
lianjia.zip
Binary file not shown.
36
src/preparation.py
Normal file
36
src/preparation.py
Normal file
@ -0,0 +1,36 @@
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
# rename files
|
||||
# os.rename('../Participants_Data_HPP/Train.csv', '../Participants_Data_HPP/Test1.csv')
|
||||
# os.rename('../Participants_Data_HPP/Test.csv', '../Participants_Data_HPP/Train1.csv')
|
||||
|
||||
# paths
# NOTE(review): the variable names are crossed relative to the file names --
# the file called Train.csv is the one loaded as the pool that is split into
# Dev and Test below. Confirm this is intended.
filePathTest = "../Participants_Data_HPP/Train.csv"
filePathTrain = "../Participants_Data_HPP/Test.csv"

# directory the Dev.csv / Test.csv splits are written to
OUTPUT_DIR = "../Participants_Data_HPP/"


def split_dev_test(frame):
    """Split *frame* into a dev part and a test part by row position.

    The first ``len(frame) // 2`` rows form the dev set and every remaining
    row forms the test set, so no row is dropped when the row count is odd.

    Args:
        frame: DataFrame to split; it is not modified.

    Returns:
        A ``(dev, test)`` tuple of DataFrames.
    """
    half = len(frame.index) // 2
    return frame.iloc[:half], frame.iloc[half:]


def main():
    """Read the CSV at ``filePathTest`` and write Dev.csv and Test.csv.

    Writing Test.csv replaces the file that ``filePathTrain`` points to,
    because both resolve to the same path.
    """
    dataTest = pd.read_csv(filePathTest)

    # split test data to test and dev
    dev, test = split_dev_test(dataTest)
    for name, part in (("Dev", dev), ("Test", test)):
        part.to_csv(OUTPUT_DIR + name + ".csv", index=False)


if __name__ == "__main__":
    main()
|
||||
|
||||
#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")
|
||||
|
||||
#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")
|
||||
|
31
src/statistics.py
Normal file
31
src/statistics.py
Normal file
@ -0,0 +1,31 @@
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
#https://www.kaggle.com/code/aadhavvignesh/regression-with-scikit-learn-practical-ml-1
|
||||
|
||||
dataPath = '../Participants_Data_HPP/Train.csv'

#normalize price column and flat area using min max technique
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'

#select the most significant
SELECTED_COLUMNS = [columnName1, columnName2, 'BHK_NO.', 'RESALE']


def summarize(frame):
    """Return descriptive statistics and the correlation matrix of *frame*.

    Args:
        frame: DataFrame to summarise; it is not modified.

    Returns:
        A ``(description, corr)`` tuple: the output of
        ``describe(include="all")`` and the pairwise correlation of the
        numeric/boolean columns.
    """
    description = frame.describe(include="all")
    # Restrict to numeric/bool columns before correlating. Presumably the CSV
    # also holds text columns (TODO confirm); DataFrame.corr() raises on those
    # with pandas >= 2.0, while older versions silently skipped them.
    corr = frame.select_dtypes(include=["number", "bool"]).corr()
    return description, corr


def min_max_normalize(frame, columns):
    """Return a copy of *frame* with *columns* rescaled to the range [0, 1].

    Each listed column is transformed as ``(x - min) / (max - min)``. A
    column whose values are all equal has a zero range; it is set to 0.0
    instead of dividing by zero.

    Args:
        frame: DataFrame holding the columns; it is not modified.
        columns: Iterable of column labels to rescale.

    Returns:
        A new DataFrame with the same columns as *frame*.
    """
    # Work on a copy so that assigning columns never writes into a slice of
    # the caller's frame.
    result = frame.copy()
    for column in columns:
        low = result[column].min()
        span = result[column].max() - low
        if span == 0:
            result[column] = 0.0
        else:
            result[column] = (result[column] - low) / span
    return result


def main():
    """Load the CSV at ``dataPath``, summarise it and print normalised data."""
    data = pd.read_csv(dataPath)

    # DataFrame.info() prints its report and returns None, so there is
    # nothing to keep from the call.
    data.info()
    # Computed but not used further by this script.
    description, corr = summarize(data)

    selected = data[SELECTED_COLUMNS]
    #print(selected.head())
    selected["BHK_NO."].value_counts().plot(kind="bar")
    #plt.show()

    normalized = min_max_normalize(selected, [columnName1, columnName2])

    print(normalized.head())
    print(normalized.describe(include="all"))


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user