Dockerfile

This commit is contained in:
s444417 2022-04-03 12:24:05 +02:00
parent 55e68cc2cc
commit fe38ed7a4c
5 changed files with 119 additions and 17 deletions

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"python.terminal.executeInFileDir": true
}

26
Dockerfile Normal file
View File

@ -0,0 +1,26 @@
# syntax=docker/dockerfile:1
# Base image: pin an explicit Ubuntu release instead of :latest so builds are
# reproducible (the image contents cannot silently change under us).
FROM ubuntu:22.04

# Install Python and tooling in ONE layer: `apt-get update` is combined with
# `install` so the package index can never be stale, `--no-install-recommends`
# keeps the image small, and the apt lists are removed in the same layer that
# created them (cleanup in a later layer would not shrink the image).
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages system-wide. No `--user`: the build runs as root and
# ~/.local/bin is not on PATH, so --user installs would leave `kaggle`
# unreachable. `--no-cache-dir` avoids baking pip's download cache into the
# layer.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir kaggle pandas

# Create /app if it does not exist and make it the working directory for all
# subsequent RUN, CMD, ENTRYPOINT, COPY and ADD instructions.
WORKDIR /app

# Copy the startup script and the task script into the image.
COPY ./startscript1.sh ./
COPY ./src/task1python.py ./src/task1python.py

# Kaggle credentials supplied at build time by the Jenkins pipeline.
# SECURITY NOTE(review): build args are recorded in `docker history`; prefer a
# BuildKit secret mount (RUN --mount=type=secret,...) or runtime env vars.
ARG KAGGLE_USERNAME
ARG KAGGLE_KEY

# No default command: the Jenkins pipeline invokes ./startscript1.sh itself.
# RUN chmod u+x ./startscript1.sh
# RUN chmod u+x ./src/task1python.py

6
Jenkinsfile vendored
View File

@ -1,5 +1,4 @@
pipeline {
agent any
parameters{
string(
defaultValue: 'mikolaj2',
@ -17,6 +16,9 @@ pipeline {
name: 'KAGGLE_KEY'
)
}
agent {
dockerfile {additionalBuildArgs "additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t my-image"}
}
stages {
stage("Check out from version control") {
steps {
@ -26,7 +28,7 @@ pipeline {
stage("Shell Scripts") {
steps {
sh "chmod u+x ./startscript1.sh"
sh "KAGGLE_USERNAME=${KAGGLE_USERNAME} KAGGLE_KEY=${env.KAGGLE_KEY} CUTOFF=${CUTOFF} ./startscript1.sh"
sh "KAGGLE_USERNAME=${params.KAGGLE_USERNAME} KAGGLE_KEY=${params.KAGGLE_KEY} CUTOFF=${CUTOFF} ./startscript1.sh"
archiveArtifacts 'data.txt'
}
}

View File

@ -78,7 +78,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -95,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -104,15 +104,23 @@
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 29451 entries, 0 to 29450\n",
"Data columns (total 4 columns):\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 TARGET(PRICE_IN_LACS) 29451 non-null float64\n",
" 1 SQUARE_FT 29451 non-null float64\n",
" 2 BHK_NO. 29451 non-null int64 \n",
" 3 RESALE 29451 non-null int64 \n",
"dtypes: float64(2), int64(2)\n",
"memory usage: 920.5 KB\n"
" 0 POSTED_BY 29451 non-null object \n",
" 1 UNDER_CONSTRUCTION 29451 non-null int64 \n",
" 2 RERA 29451 non-null int64 \n",
" 3 BHK_NO. 29451 non-null int64 \n",
" 4 BHK_OR_RK 29451 non-null object \n",
" 5 SQUARE_FT 29451 non-null float64\n",
" 6 READY_TO_MOVE 29451 non-null int64 \n",
" 7 RESALE 29451 non-null int64 \n",
" 8 ADDRESS 29451 non-null object \n",
" 9 LONGITUDE 29451 non-null float64\n",
" 10 LATITUDE 29451 non-null float64\n",
" 11 TARGET(PRICE_IN_LACS) 29451 non-null float64\n",
"dtypes: float64(4), int64(5), object(3)\n",
"memory usage: 2.7+ MB\n"
]
}
],
@ -122,7 +130,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -177,7 +185,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -226,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -235,7 +243,7 @@
"<AxesSubplot:>"
]
},
"execution_count": 11,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
@ -259,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -362,7 +370,7 @@
"max 1.000000 1.000000 20.000000 1.000000"
]
},
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}

63
src/task1python.py Normal file
View File

@ -0,0 +1,63 @@
import pandas as pd

# --- Split the training CSV 50/50 into Dev.csv and Test.csv -----------------
# NOTE(review): in the original script the path variables were named the
# opposite of the files they point to (`filePathTest` read Train.csv). The
# data actually being split here comes from Train.csv — confirm that is the
# intended source. The original also read Test.csv into an unused variable;
# that dead read has been removed.
sourcePath = "../Participants_Data_HPP/Train.csv"
sourceData = pd.read_csv(sourcePath)

# Half the row count; with an odd number of rows the final row is dropped
# (same slicing behavior as the original loop).
halfSize = len(sourceData.index) // 2

# First half becomes the Dev set, second half overwrites the Test set.
for i, name in enumerate(("Dev", "Test")):
    chunk = sourceData[halfSize * i : halfSize * (i + 1)]
    chunk.to_csv(f"../Participants_Data_HPP/{name}.csv", index=False)

# --- Basic exploration of the training set ----------------------------------
dataPath = "../Participants_Data_HPP/Train.csv"
data = pd.read_csv(dataPath)

# Summary statistics over every column (numeric and object).
description = data.describe(include="all")

# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops object columns (POSTED_BY, BHK_OR_RK, ADDRESS); on older
# pandas it reproduces the previous default behavior.
corr = data.corr(numeric_only=True)

# Keep only the columns considered most significant for the task.
data = data[["TARGET(PRICE_IN_LACS)", "SQUARE_FT", "BHK_NO.", "RESALE"]]

# Min-max normalize price and flat area into [0, 1].
for columnName in ("TARGET(PRICE_IN_LACS)", "SQUARE_FT"):
    columnMin = data[columnName].min()
    columnMax = data[columnName].max()
    data[columnName] = (data[columnName] - columnMin) / (columnMax - columnMin)

print(description)
print(corr)
print(data.describe(include="all"))
print(data.head())