Jenkins and Docker integration. Create model for lab 5.

This commit is contained in:
Marek Moryl 2023-04-21 14:05:54 +02:00
parent 7d8d0a4dec
commit aa08c7feae
7 changed files with 134 additions and 70 deletions

View File

@@ -1,9 +0,0 @@
FROM ubuntu:latest
ADD get-data.sh /get-data.sh
ADD prepare_dataset.py /prepare_dataset.py
RUN apt-get update
RUN apt-get install -y python3 python3-pip unzip
RUN pip install pandas
RUN pip install scikit-learn

View File

@@ -1 +0,0 @@
FROM ubuntu:latest

View File

@@ -1,6 +1,10 @@
 pipeline {
-    agent any
-    // Define the parameters that can be supplied when the job is triggered
+    agent {
+        dockerfile {
+            filename 'Lab4.dockerfile'
+            reuseNode true
+        }
+    }
     parameters {
         string(
             defaultValue: '1000',
@@ -23,26 +27,23 @@ pipeline {
     stages {
         stage('Checkout') {
             steps {
-                sh 'rm -rf ium_z487183'
-                sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
+                checkout scmGit(
+                    branches: [[name: '*/master']],
+                    extensions: [cleanBeforeCheckout()],
+                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
+                )
             }
         }
         stage('Prepare data') {
             steps {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-                    sh 'ium_z487183/get-data.sh'
-                    sh 'python3 ium_z487183/prepare_dataset.py'
+                    sh 'echo "{\"username\":\"$KAGGLE_USERNAME\",\"key\":\"$KAGGLE_KEY\"}" > /.kaggle/kaggle.json'
+                    sh './get-data.sh'
+                    sh 'python3 prepare_dataset.py'
                 }
             }
         }
         stage('Archive artifacts') {
-            agent {
-                dockerfile {
-                    filename 'CreateDataset.dockerfile'
-                    dir 'ium_z487183'
-                    reuseNode true
-                }
-            }
             steps {
                 withEnv(["CUTOFF=${params.CUTOFF}"]) {
                     archiveArtifacts 'X_test.csv'
@@ -55,4 +56,4 @@ pipeline {
             }
         }
     }
 }

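Note on the 'Prepare data' stage above: the Kaggle credentials file is assembled by echoing hand-escaped JSON into /.kaggle/kaggle.json (the directory is created in Lab4.dockerfile). A minimal alternative sketch in Python, assuming the same KAGGLE_USERNAME and KAGGLE_KEY environment variables; the helper itself is hypothetical and not part of this commit:

import json
import os

# Hypothetical helper (not in this commit): write /.kaggle/kaggle.json from the
# environment variables the pipeline already exports, letting json.dump handle
# the quoting instead of a hand-escaped echo string.
credentials = {
    "username": os.environ["KAGGLE_USERNAME"],
    "key": os.environ["KAGGLE_KEY"],
}
with open("/.kaggle/kaggle.json", "w") as f:
    json.dump(credentials, f)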
View File

@@ -1,29 +1,32 @@
 pipeline {
     agent any
     parameters{
         buildSelector(
             defaultSelector: lastSuccessful(),
             description: 'Which build to use for copying artifacts',
             name: 'BUILD_SELECTOR'
         )
     }
     stages {
         stage('Checkout') {
             steps {
-                sh 'rm -rf ium_z487183'
-                sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
+                checkout scmGit(
+                    branches: [[name: '*/master']],
+                    extensions: [cleanBeforeCheckout()],
+                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
+                )
             }
         }
         stage('Prepare stats') {
             agent {
-                dockerfile {
-                    filename 'DatasetStats.dockerfile'
-                    dir 'ium_z487183'
+                docker {
+                    image 'mmoryl/ium:latest'
                     reuseNode true
                 }
             }
             steps {
-                copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z487183-create-dataset', selector: workspace()
+                copyArtifacts projectName: 'z487183-create-dataset'
                 sh './prepare-stats.sh'
                 archiveArtifacts 'stats.txt'
             }

19
Lab4.dockerfile Normal file
View File

@@ -0,0 +1,19 @@
FROM ubuntu:latest
RUN apt-get update
RUN apt-get install -y python3 python3-pip unzip
RUN pip install pandas
RUN pip install scikit-learn
RUN pip install kaggle
ARG DEBIAN_FRONTEND=noninteractive
ADD get-data.sh /get-data.sh
ADD prepare-stats.sh /prepare-stats.sh
ADD prepare_dataset.py /prepare_dataset.py
ADD property_model.py /property_model.py
ADD predict_values.py /predict_values.py
RUN mkdir /.kaggle
RUN touch /.kaggle/kaggle.json
RUN chmod 777 /.kaggle/kaggle.json

View File

@@ -1,49 +1,67 @@
 import os
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler

 # get data
 sells = pd.read_csv('data/Property Sales of Melbourne City.csv')

-# delete unnecessary columns and drop rows with NaN values
-columns_to_drop = [
-    'Lattitude',
-    'Longtitude',
-    'CouncilArea',
-    'Propertycount',
-    'Method',
-    'SellerG',
-    'Date',
-    'Postcode',
-    'Bedroom2',
-    'Bathroom',
-    'Car',
-    'BuildingArea',
-    'Address'
-]
-sells = sells.drop(columns_to_drop, axis=1).dropna()
+# prepare column, which will be predicted
+price_median = sells['Price'].median()

-# normalize values
-sells["Price"] = sells["Price"] / sells["Price"].max()
-sells["Landsize"] = sells["Landsize"] / sells["Landsize"].max()
-sells["Distance"] = sells["Distance"] / sells["Distance"].max()
+def price_above_median(price):
+    if price > price_median:
+        return 1
+    else:
+        return 0
+
+sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)
+
+# delete unnecessary columns and drop rows with NaN values
+columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
+sells = sells[columns_to_take].dropna()

 # cut off dataset to fixed number of values
-cutoff = int(os.environ['CUTOFF'])
+cutoff = 1000
 sells = sells.sample(cutoff)

-# split to train/dev/test subsets
-X = sells
-Y = sells.pop('Price')
-X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1)
-X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
+# split dataset
+train_data, test_data = train_test_split(sells, test_size=0.2, random_state=42)
+train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)
+
+# prepare dataset
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+target = 'PriceAboveMedian'
+
+X_train = train_data[features].values
+y_train = train_data[target].values
+X_dev = dev_data[features].values
+y_dev = dev_data[target].values
+X_test = test_data[features].values
+y_test = test_data[target].values
+
+# normalize values
+scaler = MinMaxScaler()
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+for feature in features:
+    X_train = scaler.fit_transform(X_train)
+    X_dev = scaler.fit_transform(X_dev)
+    X_test = scaler.fit_transform(X_test)

 # save subsets to files
+X_train = pd.DataFrame(X_train, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_train = pd.DataFrame(y_train, columns = ['PriceAboveMedian'])
+X_dev = pd.DataFrame(X_dev, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_dev = pd.DataFrame(y_dev, columns = ['PriceAboveMedian'])
+X_test = pd.DataFrame(X_test, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_test = pd.DataFrame(y_test, columns = ['PriceAboveMedian'])
 X_train.to_csv('X_train.csv', index=False)
-X_val.to_csv('X_val.csv', index=False)
+X_dev.to_csv('X_val.csv', index=False)
 X_test.to_csv('X_test.csv', index=False)
-Y_train.to_csv('Y_train.csv', index=False)
-Y_val.to_csv('Y_val.csv', index=False)
-Y_test.to_csv('Y_test.csv', index=False)
+y_train.to_csv('Y_train.csv', index=False)
+y_dev.to_csv('Y_val.csv', index=False)
+y_test.to_csv('Y_test.csv', index=False)

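Review note on the normalization block above: the loop 'for feature in features:' never uses the loop variable and calls fit_transform on every split, so the scaler is refit on the dev and test data as well. A minimal sketch of the conventional pattern, reusing the variable names from prepare_dataset.py (fit on the training split only, then reuse the fitted scaler):

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn min/max from the training split only
X_dev = scaler.transform(X_dev)          # reuse the same scaling for validation data
X_test = scaler.transform(X_test)        # and for test data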
33
property_model.py Normal file
View File

@@ -0,0 +1,33 @@
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
# prepare dataset
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target = 'PriceAboveMedian'
X_train = pd.read_csv('X_train.csv').values
y_train = pd.read_csv('Y_train.csv').values
X_dev = pd.read_csv('X_val.csv').values
y_dev = pd.read_csv('Y_val.csv').values
X_test = pd.read_csv('X_test.csv').values
y_test = pd.read_csv('Y_test.csv').values
# model definition
model = Sequential([
Dense(32, activation='relu', input_shape=(len(features),)),
Dense(32, activation='relu'),
Dense(1, activation='sigmoid'),
])
#compile and train
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, y_train,
batch_size=32, epochs=100,
validation_data=(X_dev, y_dev))
model.evaluate(X_test, y_test)[1]
model.save('model.h5')
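One remark on property_model.py: model.evaluate(X_test, y_test)[1] computes the test accuracy but the value is discarded. A minimal sketch, reusing the variables defined above, that keeps the metric and reloads the saved model; how the model is actually consumed (e.g. by predict_values.py, which Lab4.dockerfile copies into the image) is not shown in this commit, so the prediction part is an assumption:

from keras.models import load_model

# Capture and report the test metric instead of discarding it.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy:.3f}')

# Reload the archived model and turn sigmoid outputs into 0/1 labels.
restored = load_model('model.h5')
predictions = (restored.predict(X_test) > 0.5).astype(int)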