From 04acddb289dc24ba07df3239894c705e8eb25b5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20=C5=81=C4=85czkowski?= Date: Wed, 3 Apr 2024 19:57:37 +0200 Subject: [PATCH] IUM_04 - update Jenkinsfile, update Dockerfile, add requirements.txt file --- Dockerfile | 13 +++++++++++-- Jenkinsfile | 26 +++++++++----------------- download_dataset.py | 8 ++++++++ requirements.txt | Bin 0 -> 100 bytes 4 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile index b51b2e6..4ba99de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,15 +6,24 @@ RUN apt-get update && apt-get install -y \ python3 \ python3-pip -# Install the required Python packages -RUN pip3 install numpy pandas kaggle scikit-learn +# Copy the requirements.txt file to the working directory +COPY requirements.txt ./ + +# Install the required Python packages from requirements.txt +RUN pip3 install -r requirements.txt # Set the working directory WORKDIR /app # Copy scripts to the working directory + +# Python scripts COPY download_dataset.py ./ COPY get_stats.py ./ +# Bash scripts +COPY download_dataset.sh ./ +COPY get_stats.sh ./ + # Default command CMD bash \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 6e79bee..2da9c26 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,29 +26,21 @@ pipeline { } } - stage('Download dataset') { + stage('Build Docker image') { steps { - withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { - sh "kaggle datasets download -d uciml/breast-cancer-wisconsin-data" - sh "unzip -o breast-cancer-wisconsin-data.zip" - sh "mkdir -p datasets" - sh "mv data.csv datasets/data.csv" + script { + docker.build("create-dataset-s464863") } } } - stage('Preprocess data') { agent { dockerfile { filename 'Dockerfile' reuseNode true } } - steps { - sh "chmod +x ./download_dataset.py" - sh "python3 
./download_dataset.py ${params.CUTOFF}" - archiveArtifacts artifacts: 'datasets/data.csv,datasets/train.csv,datasets/val.csv,datasets/test.csv', onlyIfSuccessful: true + docker.image('create-dataset-s464863').withRun("-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY} -e CUTOFF=${params.CUTOFF}") { + sh "chmod +x ./download_dataset.py" + sh "python3 ./download_dataset.py ${params.CUTOFF}" + archiveArtifacts artifacts: 'datasets/*', onlyIfSuccessful: true + } } } } diff --git a/download_dataset.py b/download_dataset.py index dd6dfee..74ba05c 100644 --- a/download_dataset.py +++ b/download_dataset.py @@ -1,9 +1,14 @@ # Necessary imports import pandas as pd +import kaggle import sys from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler +# Download the dataset +kaggle.api.authenticate() +kaggle.api.dataset_download_files('uciml/breast-cancer-wisconsin-data', path='./datasets', unzip=True) + # Load the dataset df = pd.read_csv('./datasets/data.csv', index_col='id') @@ -21,6 +26,9 @@ print(df.isnull().sum()) # Print the first 5 rows of the dataset print(df.head()) +# Convert the diagnosis column to binary +df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0}) + # Normalize the dataset scaler = MinMaxScaler() df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]]) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..75f5c0aaa2bea8e01f2a3da9045b5fb0016dd291 GIT binary patch literal 100 zcmW-YOAde_3`F1B#6yZ2G!`&`3ooyZO_R?5eKHq`lbex2IFVWOo9iUj;(54 VtCAqMs{b#$^O?=np)~4Dy$=`+63zes literal 0 HcmV?d00001