Jenkins and Docker integration. Create model for lab 5.
parent 7d8d0a4dec
commit aa08c7feae
@@ -1,9 +0,0 @@
-FROM ubuntu:latest
-
-ADD get-data.sh /get-data.sh
-ADD prepare_dataset.py /prepare_dataset.py
-
-RUN apt-get update
-RUN apt-get install -y python3 python3-pip unzip
-RUN pip install pandas
-RUN pip install scikit-learn
@@ -1 +0,0 @@
-FROM ubuntu:latest
@@ -1,6 +1,10 @@
 pipeline {
-    agent any
-    // Define the parameters that can be supplied when the job is triggered
+    agent {
+        dockerfile {
+            filename 'Lab4.dockerfile'
+            reuseNode true
+        }
+    }
     parameters {
         string(
             defaultValue: '1000',
@@ -23,26 +27,23 @@ pipeline {
     stages {
         stage('Checkout') {
             steps {
-                sh 'rm -rf ium_z487183'
-                sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
+                checkout scmGit(
+                    branches: [[name: '*/master']],
+                    extensions: [cleanBeforeCheckout()],
+                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
+                )
             }
         }
         stage('Prepare data') {
             steps {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-                    sh 'ium_z487183/get-data.sh'
-                    sh 'python3 ium_z487183/prepare_dataset.py'
+                    sh 'echo "{\"username\":\"$KAGGLE_USERNAME\",\"key\":\"$KAGGLE_KEY\"}" > /.kaggle/kaggle.json'
+                    sh './get-data.sh'
+                    sh 'python3 prepare_dataset.py'
                 }
             }
         }
         stage('Archive artifacts') {
-            agent {
-                dockerfile {
-                    filename 'CreateDataset.dockerfile'
-                    dir 'ium_z487183'
-                    reuseNode true
-                }
-            }
             steps {
                 withEnv(["CUTOFF=${params.CUTOFF}"]) {
                     archiveArtifacts 'X_test.csv'
@@ -55,4 +56,4 @@ pipeline {
             }
         }
     }
 }
@@ -1,29 +1,32 @@
 pipeline {
     agent any
+
     parameters{
         buildSelector(
             defaultSelector: lastSuccessful(),
             description: 'Which build to use for copying artifacts',
             name: 'BUILD_SELECTOR'
         )
     }
     stages {
         stage('Checkout') {
             steps {
-                sh 'rm -rf ium_z487183'
-                sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
+                checkout scmGit(
+                    branches: [[name: '*/master']],
+                    extensions: [cleanBeforeCheckout()],
+                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
+                )
             }
         }
         stage('Prepare stats') {
             agent {
-                dockerfile {
-                    filename 'DatasetStats.dockerfile'
-                    dir 'ium_z487183'
+                docker {
+                    image 'mmoryl/ium:latest'
                     reuseNode true
                 }
             }
             steps {
-                copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z487183-create-dataset', selector: workspace()
+                copyArtifacts projectName: 'z487183-create-dataset'
                 sh './prepare-stats.sh'
                 archiveArtifacts 'stats.txt'
             }
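prepare-stats.sh itself is not part of this diff, so the following is only a hypothetical Python illustration of the kind of summary the 'Prepare stats' stage could write into the stats.txt it archives; the split file names match the artifacts copied from the z487183-create-dataset job, everything else is an assumption.

import pandas as pd

# Hypothetical sketch only: prepare-stats.sh is not shown in this commit.
splits = ['X_train.csv', 'X_val.csv', 'X_test.csv']

with open('stats.txt', 'w') as out:
    for name in splits:
        df = pd.read_csv(name)
        out.write(f'{name}: {len(df)} rows\n')
        out.write(df.describe().to_string() + '\n\n')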
Lab4.dockerfile (new file, 19 lines)
@@ -0,0 +1,19 @@
+FROM ubuntu:latest
+
+RUN apt-get update
+RUN apt-get install -y python3 python3-pip unzip
+RUN pip install pandas
+RUN pip install scikit-learn
+RUN pip install kaggle
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ADD get-data.sh /get-data.sh
+ADD prepare-stats.sh /prepare-stats.sh
+ADD prepare_dataset.py /prepare_dataset.py
+ADD property_model.py /property_model.py
+ADD predict_values.py /predict_values.py
+
+RUN mkdir /.kaggle
+RUN touch /.kaggle/kaggle.json
+RUN chmod 777 /.kaggle/kaggle.json
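The Dockerfile pre-creates a world-writable /.kaggle/kaggle.json, and the 'Prepare data' stage fills it in with a shell echo of KAGGLE_USERNAME and KAGGLE_KEY. A rough Python equivalent of that step, shown only as a sketch (the helper name and the 600 permission are not from this commit), could look like this:

import json
import os

def write_kaggle_credentials(path='/.kaggle/kaggle.json'):
    # Build the credentials file from the variables the pipeline already exports.
    creds = {
        'username': os.environ['KAGGLE_USERNAME'],
        'key': os.environ['KAGGLE_KEY'],
    }
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        json.dump(creds, f)
    os.chmod(path, 0o600)  # the kaggle CLI warns about world-readable credentials

if __name__ == '__main__':
    write_kaggle_credentials()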
@@ -1,49 +1,67 @@
 import os
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
 
 # get data
 sells = pd.read_csv('data/Property Sales of Melbourne City.csv')
 
-# delete unnecessary columns and drop rows with NaN values
-columns_to_drop = [
-    'Lattitude',
-    'Longtitude',
-    'CouncilArea',
-    'Propertycount',
-    'Method',
-    'SellerG',
-    'Date',
-    'Postcode',
-    'Bedroom2',
-    'Bathroom',
-    'Car',
-    'BuildingArea',
-    'Address'
-]
-sells = sells.drop(columns_to_drop, axis=1).dropna()
-
-# normalize values
-sells["Price"] = sells["Price"] / sells["Price"].max()
-sells["Landsize"] = sells["Landsize"] / sells["Landsize"].max()
-sells["Distance"] = sells["Distance"] / sells["Distance"].max()
+# prepare column, which will be predicted
+price_median = sells['Price'].median()
+
+def price_above_median(price):
+    if price > price_median:
+        return 1
+    else:
+        return 0
+
+sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)
+
+# delete unnecessary columns and drop rows with NaN values
+columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
+sells = sells[columns_to_take].dropna()
 
 # cut off dataset to fixed number of values
-cutoff = int(os.environ['CUTOFF'])
+cutoff = 1000
 sells = sells.sample(cutoff)
 
-# split to train/dev/test subsets
-X = sells
-Y = sells.pop('Price')
-
-X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1)
-X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
+# split dataset
+train_data, test_data = train_test_split(sells, test_size=0.2, random_state=42)
+train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)
+
+# prepare dataset
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+target = 'PriceAboveMedian'
+
+X_train = train_data[features].values
+y_train = train_data[target].values
+
+X_dev = dev_data[features].values
+y_dev = dev_data[target].values
+
+X_test = test_data[features].values
+y_test = test_data[target].values
+
+# normalize values
+scaler = MinMaxScaler()
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+for feature in features:
+    X_train = scaler.fit_transform(X_train)
+    X_dev = scaler.fit_transform(X_dev)
+    X_test = scaler.fit_transform(X_test)
 
 # save subsets to files
+X_train = pd.DataFrame(X_train, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_train = pd.DataFrame(y_train, columns = ['PriceAboveMedian'])
+X_dev = pd.DataFrame(X_dev, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_dev = pd.DataFrame(y_dev, columns = ['PriceAboveMedian'])
+X_test = pd.DataFrame(X_test, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_test = pd.DataFrame(y_test, columns = ['PriceAboveMedian'])
+
 X_train.to_csv('X_train.csv', index=False)
-X_val.to_csv('X_val.csv', index=False)
+X_dev.to_csv('X_val.csv', index=False)
 X_test.to_csv('X_test.csv', index=False)
 
-Y_train.to_csv('Y_train.csv', index=False)
-Y_val.to_csv('Y_val.csv', index=False)
-Y_test.to_csv('Y_test.csv', index=False)
+y_train.to_csv('Y_train.csv', index=False)
+y_dev.to_csv('Y_val.csv', index=False)
+y_test.to_csv('Y_test.csv', index=False)
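Two details of the new prepare_dataset.py are worth flagging: the Jenkinsfile still exports CUTOFF via withEnv, but the script now hardcodes cutoff = 1000 instead of reading the variable, and MinMaxScaler.fit_transform is called separately on the train, dev and test arrays, so each split is scaled with its own min/max. A sketch of the alternative (not what this commit does), with stand-in arrays so it runs on its own:

import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read CUTOFF as before, falling back to the old default of 1000.
cutoff = int(os.environ.get('CUTOFF', '1000'))

# Stand-in arrays; in prepare_dataset.py these come from train_data/dev_data/test_data.
X_train = np.random.rand(100, 4)
X_dev = np.random.rand(20, 4)
X_test = np.random.rand(20, 4)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn min/max from the training split only
X_dev = scaler.transform(X_dev)          # apply, don't re-fit
X_test = scaler.transform(X_test)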
property_model.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+import pandas as pd
+from keras.models import Sequential
+from keras.layers import Dense
+
+# prepare dataset
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+target = 'PriceAboveMedian'
+
+X_train = pd.read_csv('X_train.csv').values
+y_train = pd.read_csv('Y_train.csv').values
+
+X_dev = pd.read_csv('X_val.csv').values
+y_dev = pd.read_csv('Y_val.csv').values
+
+X_test = pd.read_csv('X_test.csv').values
+y_test = pd.read_csv('Y_test.csv').values
+
+# model definition
+model = Sequential([
+    Dense(32, activation='relu', input_shape=(len(features),)),
+    Dense(32, activation='relu'),
+    Dense(1, activation='sigmoid'),
+])
+
+# compile and train
+model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
+hist = model.fit(X_train, y_train,
+                 batch_size=32, epochs=100,
+                 validation_data=(X_dev, y_dev))
+
+model.evaluate(X_test, y_test)[1]
+
+model.save('model.h5')
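property_model.py computes model.evaluate(X_test, y_test)[1] near the end but discards the value. A small follow-up sketch that reloads the saved model and records the test accuracy; the CSV and model.h5 names come from this commit, while the accuracy.txt name is only an illustrative assumption:

import pandas as pd
from keras.models import load_model

# Reload the artifacts produced by prepare_dataset.py and property_model.py.
model = load_model('model.h5')
X_test = pd.read_csv('X_test.csv').values
y_test = pd.read_csv('Y_test.csv').values

loss, accuracy = model.evaluate(X_test, y_test)
print(f'test accuracy: {accuracy:.4f}')

with open('accuracy.txt', 'w') as f:  # file name chosen only for illustration
    f.write(f'{accuracy}\n')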