From aa08c7feae2ab22d5d49e1ecec6816358d2970f5 Mon Sep 17 00:00:00 2001
From: Marek Moryl
Date: Fri, 21 Apr 2023 14:05:54 +0200
Subject: [PATCH] Jenkins and Docker integration. Create model for lab 5.

---
 CreateDataset.dockerfile |  9 -----
 DatasetStats.dockerfile  |  1 -
 JenkinsfileCreateDataset | 29 ++++++++-------
 JenkinsfileDatasetStats  | 33 +++++++++--------
 Lab4.dockerfile          | 19 ++++++++++
 prepare_dataset.py       | 80 ++++++++++++++++++++++++----------------
 property_model.py        | 33 +++++++++++++++++
 7 files changed, 134 insertions(+), 70 deletions(-)
 delete mode 100644 CreateDataset.dockerfile
 delete mode 100644 DatasetStats.dockerfile
 create mode 100644 Lab4.dockerfile
 create mode 100644 property_model.py

diff --git a/CreateDataset.dockerfile b/CreateDataset.dockerfile
deleted file mode 100644
index 3f303bf..0000000
--- a/CreateDataset.dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM ubuntu:latest
-
-ADD get-data.sh /get-data.sh
-ADD prepare_dataset.py /prepare_dataset.py
-
-RUN apt-get update
-RUN apt-get install -y python3 python3-pip unzip
-RUN pip install pandas
-RUN pip install scikit-learn
diff --git a/DatasetStats.dockerfile b/DatasetStats.dockerfile
deleted file mode 100644
index 08ee655..0000000
--- a/DatasetStats.dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-FROM ubuntu:latest
\ No newline at end of file
diff --git a/JenkinsfileCreateDataset b/JenkinsfileCreateDataset
index fda2d04..45296bd 100644
--- a/JenkinsfileCreateDataset
+++ b/JenkinsfileCreateDataset
@@ -1,6 +1,10 @@
 pipeline {
-    agent any
-    //Definijuemy parametry, które będzie można podać podczas wywoływania zadania
+    agent {
+        dockerfile {
+            filename 'Lab4.dockerfile'
+            reuseNode true
+        }
+    }
     parameters {
         string(
             defaultValue: '1000',
@@ -23,26 +27,23 @@
     stages {
         stage('Checkout') {
             steps {
-                sh 'rm -rf ium_z487183'
-                sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
+                checkout scmGit(
+                    branches: [[name: '*/master']],
+                    extensions: [cleanBeforeCheckout()],
+                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
+                )
             }
         }
         stage('Prepare data') {
             steps {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-                    sh 'ium_z487183/get-data.sh'
-                    sh 'python3 ium_z487183/prepare_dataset.py'
+                    sh 'echo "{\"username\":\"$KAGGLE_USERNAME\",\"key\":\"$KAGGLE_KEY\"}" > /.kaggle/kaggle.json'
+                    sh './get-data.sh'
+                    sh 'python3 prepare_dataset.py'
                 }
            }
        }
        stage('Archive artifacts') {
-            agent {
-                dockerfile {
-                    filename 'CreateDataset.dockerfile'
-                    dir 'ium_z487183'
-                    reuseNode true
-                }
-            }
            steps {
                withEnv(["CUTOFF=${params.CUTOFF}"]) {
                    archiveArtifacts 'X_test.csv'
@@ -55,4 +56,4 @@
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/JenkinsfileDatasetStats b/JenkinsfileDatasetStats
index 6bffb51..f5a440b 100644
--- a/JenkinsfileDatasetStats
+++ b/JenkinsfileDatasetStats
@@ -1,29 +1,32 @@
 pipeline {
-  agent any
-  parameters{
-    buildSelector(
-      defaultSelector: lastSuccessful(),
-      description: 'Which build to use for copying artifacts',
-      name: 'BUILD_SELECTOR'
-    )
-  }
-  stages {
+    agent any
+
+    parameters{
+        buildSelector(
+            defaultSelector: lastSuccessful(),
+            description: 'Which build to use for copying artifacts',
+            name: 'BUILD_SELECTOR'
+        )
+    }
+    stages {
         stage('Checkout') {
             steps {
-                sh 'rm -rf ium_z487183'
-                sh 'git clone https://git.wmi.amu.edu.pl/s487183/ium_z487183.git'
+                checkout scmGit(
+                    branches: [[name: '*/master']],
+                    extensions: [cleanBeforeCheckout()],
+                    userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s487183/ium_z487183.git']]
+                )
             }
         }
         stage('Prepare stats') {
             agent {
-                dockerfile {
-                    filename 'DatasetStats.dockerfile'
-                    dir 'ium_z487183'
+                docker {
+                    image 'mmoryl/ium:latest'
                     reuseNode true
                 }
             }
             steps {
-                copyArtifacts filter: 'X_test.csv,X_val.csv,X_train.csv,Y_test.csv,Y_val.csv,Y_train.csv', fingerprintArtifacts: true, projectName: 'z487183-create-dataset', selector: workspace()
+                copyArtifacts projectName: 'z487183-create-dataset'
                 sh './prepare-stats.sh'
                 archiveArtifacts 'stats.txt'
             }
diff --git a/Lab4.dockerfile b/Lab4.dockerfile
new file mode 100644
index 0000000..68da029
--- /dev/null
+++ b/Lab4.dockerfile
@@ -0,0 +1,19 @@
+FROM ubuntu:latest
+
+RUN apt-get update
+RUN apt-get install -y python3 python3-pip unzip
+RUN pip install pandas
+RUN pip install scikit-learn
+RUN pip install kaggle
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ADD get-data.sh /get-data.sh
+ADD prepare-stats.sh /prepare-stats.sh
+ADD prepare_dataset.py /prepare_dataset.py
+ADD property_model.py /property_model.py
+ADD predict_values.py /predict_values.py
+
+RUN mkdir /.kaggle
+RUN touch /.kaggle/kaggle.json
+RUN chmod 777 /.kaggle/kaggle.json
\ No newline at end of file
diff --git a/prepare_dataset.py b/prepare_dataset.py
index df9c0fd..d1d139c 100644
--- a/prepare_dataset.py
+++ b/prepare_dataset.py
@@ -1,49 +1,67 @@
 import os
 import pandas as pd
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
 
 # get data
 sells = pd.read_csv('data/Property Sales of Melbourne City.csv')
 
-# delete unnecessary columns and drop rows with NaN values
-columns_to_drop = [
-    'Lattitude',
-    'Longtitude',
-    'CouncilArea',
-    'Propertycount',
-    'Method',
-    'SellerG',
-    'Date',
-    'Postcode',
-    'Bedroom2',
-    'Bathroom',
-    'Car',
-    'BuildingArea',
-    'Address'
-    ]
-sells = sells.drop(columns_to_drop, axis=1).dropna()
+# prepare column, which will be predicted
+price_median = sells['Price'].median()
 
-# normalize values
-sells["Price"] = sells["Price"] / sells["Price"].max()
-sells["Landsize"] = sells["Landsize"] / sells["Landsize"].max()
-sells["Distance"] = sells["Distance"] / sells["Distance"].max()
+def price_above_median(price):
+    if price > price_median:
+        return 1
+    else:
+        return 0
+
+sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)
+
+# delete unnecessary columns and drop rows with NaN values
+columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
+sells = sells[columns_to_take].dropna()
 
 # cut off dataset to fixed number of values
-cutoff = int(os.environ['CUTOFF'])
+cutoff = 1000
 sells = sells.sample(cutoff)
 
-# split to train/dev/test subsets
-X = sells
-Y = sells.pop('Price')
+# split dataset
+train_data, test_data = train_test_split(sells, test_size=0.2, random_state=42)
+train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)
 
-X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=1)
-X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=1)
+# prepare dataset
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+target = 'PriceAboveMedian'
+
+X_train = train_data[features].values
+y_train = train_data[target].values
+
+X_dev = dev_data[features].values
+y_dev = dev_data[target].values
+
+X_test = test_data[features].values
+y_test = test_data[target].values
+
+# normalize values
+# (fit the scaler on the training data only and reuse the same
+# transform for the dev and test sets)
+scaler = MinMaxScaler()
+X_train = scaler.fit_transform(X_train)
+X_dev = scaler.transform(X_dev)
+X_test = scaler.transform(X_test)
 
 # save subsets to files
+X_train = pd.DataFrame(X_train, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_train = pd.DataFrame(y_train, columns = ['PriceAboveMedian'])
+X_dev = pd.DataFrame(X_dev, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_dev = pd.DataFrame(y_dev, columns = ['PriceAboveMedian'])
+X_test = pd.DataFrame(X_test, columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom'])
+y_test = pd.DataFrame(y_test, columns = ['PriceAboveMedian'])
+
 X_train.to_csv('X_train.csv', index=False)
-X_val.to_csv('X_val.csv', index=False)
+X_dev.to_csv('X_val.csv', index=False)
 X_test.to_csv('X_test.csv', index=False)
-Y_train.to_csv('Y_train.csv', index=False)
-Y_val.to_csv('Y_val.csv', index=False)
-Y_test.to_csv('Y_test.csv', index=False)
+y_train.to_csv('Y_train.csv', index=False)
+y_dev.to_csv('Y_val.csv', index=False)
+y_test.to_csv('Y_test.csv', index=False)
diff --git a/property_model.py b/property_model.py
new file mode 100644
index 0000000..7a54d44
--- /dev/null
+++ b/property_model.py
@@ -0,0 +1,33 @@
+import pandas as pd
+from keras.models import Sequential
+from keras.layers import Dense
+
+# prepare dataset
+features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
+target = 'PriceAboveMedian'
+
+X_train = pd.read_csv('X_train.csv').values
+y_train = pd.read_csv('Y_train.csv').values
+
+X_dev = pd.read_csv('X_val.csv').values
+y_dev = pd.read_csv('Y_val.csv').values
+
+X_test = pd.read_csv('X_test.csv').values
+y_test = pd.read_csv('Y_test.csv').values
+
+# model definition
+model = Sequential([
+    Dense(32, activation='relu', input_shape=(len(features),)),
+    Dense(32, activation='relu'),
+    Dense(1, activation='sigmoid'),
+])
+
+# compile and train
+model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
+hist = model.fit(X_train, y_train,
+                 batch_size=32, epochs=100,
+                 validation_data=(X_dev, y_dev))
+
+print('Test accuracy:', model.evaluate(X_test, y_test)[1])
+
+model.save('model.h5')