Jenkins and Docker integration. Create model for lab 5.
This commit is contained in:
parent
7d8d0a4dec
commit
aa08c7feae
@ -1,9 +0,0 @@
|
||||
FROM ubuntu:latest
|
||||
|
||||
ADD get-data.sh /get-data.sh
|
||||
ADD prepare_dataset.py /prepare_dataset.py
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y python3 python3-pip unzip
|
||||
RUN pip install pandas
|
||||
RUN pip install scikit-learn
|
@ -1 +0,0 @@
|
||||
FROM ubuntu:latest
|
@ -1,6 +1,10 @@
|
||||
pipeline {
|
||||
agent any
|
||||
// Define the parameters that can be supplied when the job is triggered
|
||||
agent {
|
||||
dockerfile {
|
||||
filename 'Lab4.dockerfile'
|
||||
reuseNode true
|
||||
}
|
||||
}
|
||||
parameters {
|
||||
string(
|
||||
defaultValue: '1000',
|
||||
@ -23,26 +27,23 @@ pipeline {
|
||||
stages {
|
||||
stage('Checkout') {
|
||||
steps {
|
||||
sh 'rm -rf ium_z487183'
|
||||
sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
|
||||
checkout scmGit(
|
||||
branches: [[name: '*/master']],
|
||||
extensions: [cleanBeforeCheckout()],
|
||||
userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
|
||||
)
|
||||
}
|
||||
}
|
||||
stage('Prepare data') {
|
||||
steps {
|
||||
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
|
||||
sh 'ium_z487183/get-data.sh'
|
||||
sh 'python3 ium_z487183/prepare_dataset.py'
|
||||
sh 'echo "{\"username\":\"$KAGGLE_USERNAME\",\"key\":\"$KAGGLE_KEY\"}" > /.kaggle/kaggle.json'
|
||||
sh './get-data.sh'
|
||||
sh 'python3 prepare_dataset.py'
|
||||
}
|
||||
}
|
||||
}
|
||||
stage('Archive artifacts') {
|
||||
agent {
|
||||
dockerfile {
|
||||
filename 'CreateDataset.dockerfile'
|
||||
dir 'ium_z487183'
|
||||
reuseNode true
|
||||
}
|
||||
}
|
||||
steps {
|
||||
withEnv(["CUTOFF=${params.CUTOFF}"]) {
|
||||
archiveArtifacts 'X_test.csv'
|
||||
@ -55,4 +56,4 @@ pipeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,29 +1,32 @@
|
||||
pipeline {
|
||||
agent any
|
||||
parameters{
|
||||
buildSelector(
|
||||
defaultSelector: lastSuccessful(),
|
||||
description: 'Which build to use for copying artifacts',
|
||||
name: 'BUILD_SELECTOR'
|
||||
)
|
||||
}
|
||||
stages {
|
||||
agent any
|
||||
|
||||
parameters{
|
||||
buildSelector(
|
||||
defaultSelector: lastSuccessful(),
|
||||
description: 'Which build to use for copying artifacts',
|
||||
name: 'BUILD_SELECTOR'
|
||||
)
|
||||
}
|
||||
stages {
|
||||
stage('Checkout') {
|
||||
steps {
|
||||
sh 'rm -rf ium_z487183'
|
||||
sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
|
||||
checkout scmGit(
|
||||
branches: [[name: '*/master']],
|
||||
extensions: [cleanBeforeCheckout()],
|
||||
userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
|
||||
)
|
||||
}
|
||||
}
|
||||
stage('Prepare stats') {
|
||||
agent {
|
||||
dockerfile {
|
||||
filename 'DatasetStats.dockerfile'
|
||||
dir 'ium_z487183'
|
||||
docker {
|
||||
image 'mmoryl/ium:latest'
|
||||
reuseNode true
|
||||
}
|
||||
}
|
||||
steps {
|
||||
copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z487183-create-dataset', selector: workspace()
|
||||
copyArtifacts projectName: 'z487183-create-dataset'
|
||||
sh './prepare-stats.sh'
|
||||
archiveArtifacts 'stats.txt'
|
||||
}
|
||||
|
19
Lab4.dockerfile
Normal file
19
Lab4.dockerfile
Normal file
@ -0,0 +1,19 @@
|
||||
# Image used as the Jenkins build agent for the dataset / model pipeline.
#
# The base is pinned to an explicit LTS tag instead of :latest so rebuilds are
# reproducible. 22.04 is chosen because a plain system-wide `pip install`
# (used below) is rejected on newer Ubuntu releases that mark the system
# Python as externally managed (PEP 668).
FROM ubuntu:22.04

# Build-time only: stops apt from prompting (e.g. for tzdata) during install.
# It must be declared before the apt-get step to have any effect on it.
ARG DEBIAN_FRONTEND=noninteractive

# `update` and `install` share one layer so a cached, stale package index is
# never reused; the index is deleted in the same layer so it does not end up
# in the image.
RUN apt-get update \
    && apt-get install -y \
        python3 \
        python3-pip \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies of the pipeline scripts; the pip download cache is not
# kept in the image.
RUN pip install --no-cache-dir \
        kaggle \
        pandas \
        scikit-learn

# Plain local files: COPY is used instead of ADD because none of ADD's extra
# behaviour (archive extraction, URL fetch) is wanted here.
COPY get-data.sh /get-data.sh
COPY prepare-stats.sh /prepare-stats.sh
COPY prepare_dataset.py /prepare_dataset.py
COPY property_model.py /property_model.py
COPY predict_values.py /predict_values.py

# Empty placeholder for the Kaggle credentials file. It is left world-writable
# (but not executable) so that it can be filled in at run time even when the
# container runs as a non-root user — presumably the CI agent's user; confirm
# against the pipeline that writes to it.
RUN mkdir /.kaggle \
    && touch /.kaggle/kaggle.json \
    && chmod 666 /.kaggle/kaggle.json
|
@ -1,49 +1,67 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
|
||||
# get data
|
||||
sells = pd.read_csv('data/Property Sales of Melbourne City.csv')
|
||||
|
||||
# delete unnecessary columns and drop rows with NaN values
|
||||
columns_to_drop = [
|
||||
'Lattitude',
|
||||
'Longtitude',
|
||||
'CouncilArea',
|
||||
'Propertycount',
|
||||
'Method',
|
||||
'SellerG',
|
||||
'Date',
|
||||
'Postcode',
|
||||
'Bedroom2',
|
||||
'Bathroom',
|
||||
'Car',
|
||||
'BuildingArea',
|
||||
'Address'
|
||||
]
|
||||
sells = sells.drop(columns_to_drop, axis=1).dropna()
|
||||
# prepare column, which will be predicted
|
||||
price_median = sells['Price'].median()
|
||||
|
||||
# normalize values
|
||||
sells["Price"] = sells["Price"] / sells["Price"].max()
|
||||
sells["Landsize"] = sells["Landsize"] / sells["Landsize"].max()
|
||||
sells["Distance"] = sells["Distance"] / sells["Distance"].max()
|
||||
def price_above_median(price):
|
||||
if price > price_median:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)
|
||||
|
||||
# keep only the feature and target columns, then drop rows with NaN values
|
||||
columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
|
||||
sells = sells[columns_to_take].dropna()
|
||||
|
||||
# cut off dataset to fixed number of values
|
||||
cutoff = int(os.environ['CUTOFF'])
|
||||
cutoff = 1000
|
||||
sells = sells.sample(cutoff)
|
||||
|
||||
# split to train/dev/test subsets
|
||||
X = sells
|
||||
Y = sells.pop('Price')
|
||||
# split dataset
|
||||
train_data, test_data = train_test_split(sells, test_size=0.2, random_state=42)
|
||||
train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)
|
||||
|
||||
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1)
|
||||
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
|
||||
# prepare dataset
|
||||
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
|
||||
target = 'PriceAboveMedian'
|
||||
|
||||
X_train = train_data[features].values
|
||||
y_train = train_data[target].values
|
||||
|
||||
X_dev = dev_data[features].values
|
||||
y_dev = dev_data[target].values
|
||||
|
||||
X_test = test_data[features].values
|
||||
y_test = test_data[target].values
|
||||
|
||||
# normalize values
|
||||
scaler = MinMaxScaler()
|
||||
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
|
||||
for feature in features:
|
||||
X_train = scaler.fit_transform(X_train)
|
||||
X_dev = scaler.fit_transform(X_dev)
|
||||
X_test = scaler.fit_transform(X_test)
|
||||
|
||||
# save subsets to files
|
||||
X_train = pd.DataFrame(X_train, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
|
||||
y_train = pd.DataFrame(y_train, columns = ['PriceAboveMedian'])
|
||||
X_dev = pd.DataFrame(X_dev, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
|
||||
y_dev = pd.DataFrame(y_dev, columns = ['PriceAboveMedian'])
|
||||
X_test = pd.DataFrame(X_test, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
|
||||
y_test = pd.DataFrame(y_test, columns = ['PriceAboveMedian'])
|
||||
|
||||
X_train.to_csv('X_train.csv', index=False)
|
||||
X_val.to_csv('X_val.csv', index=False)
|
||||
X_dev.to_csv('X_val.csv', index=False)
|
||||
X_test.to_csv('X_test.csv', index=False)
|
||||
|
||||
Y_train.to_csv('Y_train.csv', index=False)
|
||||
Y_val.to_csv('Y_val.csv', index=False)
|
||||
Y_test.to_csv('Y_test.csv', index=False)
|
||||
y_train.to_csv('Y_train.csv', index=False)
|
||||
y_dev.to_csv('Y_val.csv', index=False)
|
||||
y_test.to_csv('Y_test.csv', index=False)
|
||||
|
33
property_model.py
Normal file
33
property_model.py
Normal file
@ -0,0 +1,33 @@
|
||||
import pandas as pd
|
||||
from keras.models import Sequential
|
||||
from keras.layers import Dense
|
||||
|
||||
# Train a small binary classifier on the CSV splits found in the current
# working directory, report its accuracy on the test split and save the
# trained model to 'model.h5'.

# prepare dataset
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target = 'PriceAboveMedian'

# Each split is loaded as a plain NumPy array (header row consumed by pandas).
X_train = pd.read_csv('X_train.csv').values
y_train = pd.read_csv('Y_train.csv').values

X_dev = pd.read_csv('X_val.csv').values
y_dev = pd.read_csv('Y_val.csv').values

X_test = pd.read_csv('X_test.csv').values
y_test = pd.read_csv('Y_test.csv').values

# model definition: two hidden ReLU layers and a single sigmoid output unit;
# the input width is tied to the number of entries in `features`.
model = Sequential([
    Dense(32, activation='relu', input_shape=(len(features),)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# compile and train; the dev split is used only for per-epoch validation
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, y_train,
                 batch_size=32, epochs=100,
                 validation_data=(X_dev, y_dev))

# evaluate() returns [loss, accuracy] because 'accuracy' is the only metric
# passed to compile(). The original script indexed the result and discarded
# it, so the test accuracy never appeared anywhere; print it explicitly.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss:.4f}, test accuracy: {test_accuracy:.4f}')

model.save('model.h5')
|
Loading…
Reference in New Issue
Block a user