Jenkins and Docker integration. Create model for lab 5.

This commit is contained in:
Marek Moryl 2023-04-21 14:05:54 +02:00
parent 7d8d0a4dec
commit aa08c7feae
7 changed files with 134 additions and 70 deletions

View File

@ -1,9 +0,0 @@
FROM ubuntu:latest
ADD get-data.sh /get-data.sh
ADD prepare_dataset.py /prepare_dataset.py
RUN apt-get update
RUN apt-get install -y python3 python3-pip unzip
RUN pip install pandas
RUN pip install scikit-learn

View File

@ -1 +0,0 @@
FROM ubuntu:latest

View File

@ -1,6 +1,10 @@
pipeline {
agent any
// Define the parameters that can be supplied when triggering the job
agent {
dockerfile {
filename 'Lab4.dockerfile'
reuseNode true
}
}
parameters {
string(
defaultValue: '1000',
@ -23,26 +27,23 @@ pipeline {
stages {
stage('Checkout') {
steps {
sh 'rm -rf ium_z487183'
sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
checkout scmGit(
branches: [[name: '*/master']],
extensions: [cleanBeforeCheckout()],
userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
)
}
}
stage('Prepare data') {
steps {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
sh 'ium_z487183/get-data.sh'
sh 'python3 ium_z487183/prepare_dataset.py'
sh 'echo "{\"username\":\"$KAGGLE_USERNAME\",\"key\":\"$KAGGLE_KEY\"}" > /.kaggle/kaggle.json'
sh './get-data.sh'
sh 'python3 prepare_dataset.py'
}
}
}
stage('Archive artifacts') {
agent {
dockerfile {
filename 'CreateDataset.dockerfile'
dir 'ium_z487183'
reuseNode true
}
}
steps {
withEnv(["CUTOFF=${params.CUTOFF}"]) {
archiveArtifacts 'X_test.csv'
@ -55,4 +56,4 @@ pipeline {
}
}
}
}
}

View File

@ -1,29 +1,32 @@
pipeline {
agent any
parameters{
buildSelector(
defaultSelector: lastSuccessful(),
description: 'Which build to use for copying artifacts',
name: 'BUILD_SELECTOR'
)
}
stages {
agent any
parameters{
buildSelector(
defaultSelector: lastSuccessful(),
description: 'Which build to use for copying artifacts',
name: 'BUILD_SELECTOR'
)
}
stages {
stage('Checkout') {
steps {
sh 'rm -rf ium_z487183'
sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
checkout scmGit(
branches: [[name: '*/master']],
extensions: [cleanBeforeCheckout()],
userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
)
}
}
stage('Prepare stats') {
agent {
dockerfile {
filename 'DatasetStats.dockerfile'
dir 'ium_z487183'
docker {
image 'mmoryl/ium:latest'
reuseNode true
}
}
steps {
copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z487183-create-dataset', selector: workspace()
copyArtifacts projectName: 'z487183-create-dataset'
sh './prepare-stats.sh'
archiveArtifacts 'stats.txt'
}

19
Lab4.dockerfile Normal file
View File

@ -0,0 +1,19 @@
# Build image with Python 3, pandas, scikit-learn and the kaggle package,
# plus the project's data-preparation and model scripts.
FROM ubuntu:latest
# Declared before any apt-get call so package configuration never waits for
# interactive input during the build.
ARG DEBIAN_FRONTEND=noninteractive
# Update and install in a single layer so a cached, stale package index is
# never reused by a later install.
RUN apt-get update && apt-get install -y python3 python3-pip unzip
RUN pip install pandas scikit-learn kaggle
# COPY is the recommended instruction for plain local files (no archive
# extraction or URL fetching is needed here).
COPY get-data.sh /get-data.sh
COPY prepare-stats.sh /prepare-stats.sh
COPY prepare_dataset.py /prepare_dataset.py
COPY property_model.py /property_model.py
COPY predict_values.py /predict_values.py
# Empty credentials file, made world-writable; presumably so a non-root
# container user can overwrite it at run time -- confirm against the pipeline.
RUN mkdir /.kaggle
RUN touch /.kaggle/kaggle.json
RUN chmod 777 /.kaggle/kaggle.json

View File

@ -1,49 +1,67 @@
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# get data
sells = pd.read_csv('data/Property Sales of Melbourne City.csv')
# delete unnecessary columns and drop rows with NaN values
columns_to_drop = [
'Lattitude',
'Longtitude',
'CouncilArea',
'Propertycount',
'Method',
'SellerG',
'Date',
'Postcode',
'Bedroom2',
'Bathroom',
'Car',
'BuildingArea',
'Address'
]
sells = sells.drop(columns_to_drop, axis=1).dropna()
# prepare column, which will be predicted
price_median = sells['Price'].median()
# normalize values
sells["Price"] = sells["Price"] / sells["Price"].max()
sells["Landsize"] = sells["Landsize"] / sells["Landsize"].max()
sells["Distance"] = sells["Distance"] / sells["Distance"].max()
def price_above_median(price):
    """Return 1 when *price* exceeds the module-level ``price_median``, else 0."""
    return 1 if price > price_median else 0
sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)
# delete unnecessary columns and drop rows with NaN values
columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
sells = sells[columns_to_take].dropna()
# cut off dataset to fixed number of values
cutoff = int(os.environ['CUTOFF'])
cutoff = 1000
sells = sells.sample(cutoff)
# split to train/dev/test subsets
X = sells
Y = sells.pop('Price')
# split dataset
train_data, test_data = train_test_split(sells, test_size=0.2, random_state=42)
train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
# prepare dataset
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target = 'PriceAboveMedian'
X_train = train_data[features].values
y_train = train_data[target].values
X_dev = dev_data[features].values
y_dev = dev_data[target].values
X_test = test_data[features].values
y_test = test_data[target].values
# normalize values
# Fit the scaler on the training split only and reuse its min/max for the dev
# and test splits; fitting on each split separately would scale them
# inconsistently and leak held-out statistics into preprocessing.
scaler = MinMaxScaler()
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)
# save subsets to files
X_train = pd.DataFrame(X_train, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
y_train = pd.DataFrame(y_train, columns = ['PriceAboveMedian'])
X_dev = pd.DataFrame(X_dev, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
y_dev = pd.DataFrame(y_dev, columns = ['PriceAboveMedian'])
X_test = pd.DataFrame(X_test, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
y_test = pd.DataFrame(y_test, columns = ['PriceAboveMedian'])
X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_dev.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
Y_train.to_csv('Y_train.csv', index=False)
Y_val.to_csv('Y_val.csv', index=False)
Y_test.to_csv('Y_test.csv', index=False)
y_train.to_csv('Y_train.csv', index=False)
y_dev.to_csv('Y_val.csv', index=False)
y_test.to_csv('Y_test.csv', index=False)

33
property_model.py Normal file
View File

@ -0,0 +1,33 @@
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
# prepare dataset
# Feature column names; only their count is used below, to size the input layer.
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target = 'PriceAboveMedian'
# Load the train / validation / test splits from CSV files in the working directory.
X_train = pd.read_csv('X_train.csv').values
y_train = pd.read_csv('Y_train.csv').values
X_dev = pd.read_csv('X_val.csv').values
y_dev = pd.read_csv('Y_val.csv').values
X_test = pd.read_csv('X_test.csv').values
y_test = pd.read_csv('Y_test.csv').values
# model definition: two 32-unit ReLU hidden layers and one sigmoid output unit
model = Sequential([
    Dense(32, activation='relu', input_shape=(len(features),)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# compile and train
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(
    X_train, y_train,
    batch_size=32, epochs=100,
    validation_data=(X_dev, y_dev),
)
# evaluate() yields [loss, accuracy] for the metrics compiled above; report
# both instead of computing the value and discarding it.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss:.4f}, test accuracy: {test_accuracy:.4f}')
model.save('model.h5')