add jupyter

This commit is contained in:
s444417 2022-03-18 13:08:00 +01:00
parent 561a92489e
commit 4cdb496a97
4 changed files with 415 additions and 76 deletions

View File

@ -1,10 +1,2 @@
## Projekt na przedmiot inżynieria oprogramowania
#### Starting script:
1. pip install --user kaggle #API Kaggle, do pobrania zbioru
2. pip install --user pandas
3. kaggle datasets download -d akash14/house-price-dataset (you need a Kaggle API token; more info here: https://www.kaggle.com/docs/api)
4. unzip -o house-price-dataset.zip (on Windows: tar -xf .\house-price-dataset.zip)
data from https://www.kaggle.com/datasets/akash14/house-price-dataset

View File

@ -1,36 +0,0 @@
import pandas as pd
import os
# Split the CSV loaded into `dataTest` into two parts: the first part is
# written to Dev.csv and the second to Test.csv, both next to the input files.

# rename files
# os.rename('../Participants_Data_HPP/Train.csv', '../Participants_Data_HPP/Test1.csv')
# os.rename('../Participants_Data_HPP/Test.csv', '../Participants_Data_HPP/Train1.csv')

# paths
# NOTE(review): the variable names and the file names are crossed (the "Test"
# variable points at Train.csv and vice versa) -- presumably related to the
# rename commented out above; confirm before relying on either name.
filePathTest = "../Participants_Data_HPP/Train.csv"
filePathTrain = "../Participants_Data_HPP/Test.csv"

dataTest = pd.read_csv(filePathTest)
# Not used further in this script. Note that the loop below writes a new
# Test.csv to this same path, replacing the file that was just read.
dataTrain = pd.read_csv(filePathTrain)

number_lines = len(dataTest.index)
row_size = number_lines // 2

# number of CSV files to produce, and the row count of every file but the last
k = 2
size = row_size

# split test data to test and dev
for i in range(k):
    start = size * i
    # The last chunk runs to the end of the frame so that the trailing row is
    # not silently dropped when the number of rows is odd.
    stop = size * (i + 1) if i < k - 1 else number_lines
    df = dataTest[start:stop]
    name = "Dev" if i == 0 else "Test"
    df.to_csv('../Participants_Data_HPP/' + name + '.csv', index=False)

#df_1 = pd.read_csv("../Participants_Data_HPP/Dev.csv")
#df_2 = pd.read_csv("../Participants_Data_HPP/Test.csv")

View File

@ -1,31 +0,0 @@
import pandas as pd
import matplotlib.pyplot as plt
# Exploratory pass over Train.csv: print summary statistics, compute the
# correlation matrix, keep four columns, plot the BHK_NO. value counts and
# min-max scale the price and area columns into the [0, 1] range.
#https://www.kaggle.com/code/aadhavvignesh/regression-with-scikit-learn-practical-ml-1
dataPath = '../Participants_Data_HPP/Train.csv'
data = pd.read_csv(dataPath)

# DataFrame.info() prints its summary and returns None, so the result is not
# bound to a name.
data.info()
description = data.describe(include="all")
# Restrict to numeric/bool columns before correlating: recent pandas versions
# raise on non-numeric columns instead of silently skipping them.
corr = data.select_dtypes(include=["number", "bool"]).corr()

#select the most significant
# .copy() gives an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning.
data = data[['TARGET(PRICE_IN_LACS)', 'SQUARE_FT', 'BHK_NO.', 'RESALE']].copy()
#print(data.head())
data["BHK_NO."].value_counts().plot(kind="bar")
#plt.show()

#normalize price column and flat area using min max technique
columnName1 = 'TARGET(PRICE_IN_LACS)'
columnName2 = 'SQUARE_FT'
for columnName in (columnName1, columnName2):
    columnMin = data[columnName].min()
    columnMax = data[columnName].max()
    # NOTE(review): a constant column (max == min) would divide by zero here
    # and yield NaN -- presumably not the case for this dataset; confirm.
    data[columnName] = (data[columnName] - columnMin) / (columnMax - columnMin)

print(data.head())
print(data.describe(include="all"))

414
src/task1.ipynb Normal file

File diff suppressed because one or more lines are too long