From 0bcff8ff051717783fed1afb5ed500a5131caa8c Mon Sep 17 00:00:00 2001
From: s434766
Date: Sat, 10 Apr 2021 13:22:11 +0200
Subject: [PATCH] docker

---
Review notes (this section is ignored by `git am` / `git apply`):
 * create.py: downloadCSV() now replaces an existing dataset file. wget.download()
   never overwrites; it would save "<name> (1).csv" and leave the stale copy in use.
 * Dockerfile: base image pinned to ubuntu:20.04 instead of ubuntu:latest.
 * The blob ids on the `index` lines of Dockerfile and create.py predate these
   edits; apply without --3way or regenerate the patch from the resulting commit.
 * Jenkinsfile indentation was reconstructed; use --ignore-whitespace if the
   context lines do not match the target tree.

 Dockerfile                  | 10 ++++++
 Jenkinsfile                 | 23 +++++++------
 copyArtiJenkins/Jenkinsfile | 16 +++++----
 create.py                   | 65 +++++++++++++++++++++++++++++++++++++
 stats.py                    | 15 +++++++++
 5 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 create.py
 create mode 100644 stats.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5fbfed1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+# Pinned for reproducible builds; newer ubuntu:latest releases reject system-wide `pip3 install` (PEP 668).
+FROM ubuntu:20.04
+
+RUN apt update && apt install -y python3-pip --no-install-recommends && pip3 install numpy && pip3 install pandas && pip3 install wget && pip3 install scikit-learn && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY ./create.py ./
+COPY ./stats.py ./
+
diff --git a/Jenkinsfile b/Jenkinsfile
index 90186c7..a767ec0 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,5 +1,7 @@
 pipeline {
-    agent any
+    agent {
+        dockerfile true
+    }
     parameters {
         string (
             defaultValue: '40',
@@ -9,23 +11,22 @@ pipeline {
         )
     }
     stages {
+        stage('Docker'){
+            steps{
+                sh 'python3 ./create.py'
+            }
+        }
         stage('checkout: Check out from version control') {
             steps {
                 git 'https://git.wmi.amu.edu.pl/s434766/ium_434766.git'
             }
         }
-        stage('sh: Shell Script') {
-            steps {
-                sh 'chmod +x script.sh'
-                sh './script.sh ${CUTOFF}'
-            }
-        }
         stage('archiveArtifacts') {
             steps {
-                archiveArtifacts 'scriptTest.csv'
-                archiveArtifacts 'scriptDev.csv'
-                archiveArtifacts 'scriptTrain.csv'
-                archiveArtifacts 'lab3.csv'
+                archiveArtifacts 'data_val.csv'
+                archiveArtifacts 'data_test.csv'
+                archiveArtifacts 'data_train.csv'
+                archiveArtifacts 'healthcare-dataset-stroke-data.csv'
             }
         }
     }
diff --git a/copyArtiJenkins/Jenkinsfile b/copyArtiJenkins/Jenkinsfile
index f321110..42b8a8f 100644
--- a/copyArtiJenkins/Jenkinsfile
+++ b/copyArtiJenkins/Jenkinsfile
@@ -9,12 +9,16 @@ pipeline {
                 copyArtifacts fingerprintArtifacts: true, projectName: 's434766-create-dataset', selector: buildParameter('BUILD_SELECTOR')
             }
         }
-        stage('sh: Shell Script') {
-            steps {
-                sh 'chmod +x copyArtiJenkins/script2.sh'
-                sh './copyArtiJenkins/script2.sh'
-            }
-        }
+        stage('Docker image'){
+            agent {
+                docker {
+                    image 'owczarczykp/ium_s434766'
+                }
+            }
+            steps {
+                sh 'python3 ./stats.py > stats.txt'
+            }
+        }
         stage('archiveArtifacts') {
             steps {
                 archiveArtifacts 'stats.txt'
diff --git a/create.py b/create.py
new file mode 100644
index 0000000..90ac579
--- /dev/null
+++ b/create.py
@@ -0,0 +1,65 @@
+import os
+import numpy as np
+import pandas as pd
+import wget
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.model_selection import train_test_split
+
+DATASET_URL = 'https://git.wmi.amu.edu.pl/s434766/ium_434766/raw/branch/master/healthcare-dataset-stroke-data.csv'
+DATASET_FILE = 'healthcare-dataset-stroke-data.csv'
+
+
+def downloadCSV():
+    """Download the raw stroke dataset to DATASET_FILE, replacing any existing copy."""
+    # wget.download() never overwrites: when the target already exists it saves
+    # the new copy as "<name> (1).csv" and returns that name, which would leave
+    # dropNaN() reading the stale file. Move the fresh download into place.
+    saved_as = wget.download(DATASET_URL, out=DATASET_FILE, bar=None)
+    if saved_as != DATASET_FILE:
+        os.replace(saved_as, DATASET_FILE)
+
+
+def dropNaN():
+    """Load DATASET_FILE and return it with every row containing a NaN removed."""
+    data = pd.read_csv(DATASET_FILE)
+    return data.dropna()
+
+
+def NormalizeData(data):
+    """Return a normalized version of `data`: text lowercased, floats scaled, labels tidied.
+
+    `age` is cast to int64 first, so it is not min-max scaled (only float64
+    columns are).
+    """
+    data = data.astype({"age": np.int64})
+    for col in data.columns:
+        if data[col].dtype == object:  # strings to lowercase
+            data[col] = data[col].str.lower()
+        if data[col].dtype == np.float64:  # floats to values in [0, 1]
+            dataReshaped = data[col].values.reshape(-1, 1)
+            scaler = MinMaxScaler(feature_range=(0, 1))
+            data[col] = scaler.fit_transform(dataReshaped)
+        if col == 'ever_married':  # yes/no to 1/0 (relies on the lowercasing above)
+            data[col] = data[col].map(dict(yes=1, no=0))
+        if col == 'smoking_status':
+            data[col] = data[col].str.replace(" ", "_")
+        if col == 'work_type':
+            data[col] = data[col].str.replace("-", "_")
+    return data
+
+
+def saveToCSV(data1, data2, data3):
+    """Write the train, test and validation frames (in that order) to CSV."""
+    data1.to_csv("data_train.csv", index=False)
+    data2.to_csv("data_test.csv", index=False)
+    data3.to_csv("data_val.csv", index=False)
+
+
+downloadCSV()
+data = dropNaN()
+data = NormalizeData(data)
+
+# Split twice to get 0.6 / 0.2 / 0.2: 20% test, then 25% of the remaining 80% as validation.
+data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
+data_train, data_val = train_test_split(data_train, test_size=0.25, random_state=1)
+saveToCSV(data_train, data_test, data_val)
diff --git a/stats.py b/stats.py
new file mode 100644
index 0000000..07f0ecd
--- /dev/null
+++ b/stats.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+def describeDataset(dt, dt2, dv):
+    data = pd.read_csv('healthcare-dataset-stroke-data.csv')
+    print("Whole dataset size: ", data.size)
+    print("Train dataset size: ", dt.size)
+    print("Test dataset size: ", dt2.size)
+    print("Validate dataset size: ", dv.size)
+    print(data.describe(include='all'))
+
+
+data_train = pd.read_csv('data_train.csv')
+data_test = pd.read_csv('data_test.csv')
+data_val = pd.read_csv('data_val.csv')
+describeDataset(data_train,data_test,data_val)
\ No newline at end of file