From 04acddb289dc24ba07df3239894c705e8eb25b5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20=C5=81=C4=85czkowski?= Date: Wed, 3 Apr 2024 19:57:37 +0200 Subject: [PATCH] IUM_04 - update Jenkinsfile, update Dockerfile, add requirements.txt file --- Dockerfile | 13 +++++++++++-- Jenkinsfile | 26 +++++++++----------------- download_dataset.py | 8 ++++++++ requirements.txt | Bin 0 -> 100 bytes 4 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile index b51b2e6..4ba99de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,15 +6,24 @@ RUN apt-get update && apt-get install -y \ python3 \ python3-pip -# Install the required Python packages -RUN pip3 install numpy pandas kaggle scikit-learn +# Copy the requirements.txt file to the working directory +COPY requirements.txt ./ + +# Install the required Python packages from requirements.txt +RUN pip3 install -r requirements.txt # Set the working directory WORKDIR /app # Copy scripts to the working directory + +# Python scripts COPY download_dataset.py ./ COPY get_stats.py ./ +# Bash scripts +COPY download_dataset.sh ./ +COPY get_stats.sh ./ + # Default command CMD bash \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 6e79bee..2da9c26 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,29 +26,21 @@ pipeline { } } - stage('Download dataset') { + stage('Build Docker image') { steps { - withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { - sh "kaggle datasets download -d uciml/breast-cancer-wisconsin-data" - sh "unzip -o breast-cancer-wisconsin-data.zip" - sh "mkdir -p datasets" - sh "mv data.csv datasets/data.csv" + script { + docker.build("create-dataset-s464863") } } } - stage('Preprocess data') { agent { dockerfile { filename 'Dockerfile' reuseNode true } } - steps { - sh "chmod +x ./download_dataset.py" - sh "python3 
./download_dataset.py ${params.CUTOFF}" - archiveArtifacts artifacts: 'datasets/data.csv,datasets/train.csv,datasets/val.csv,datasets/test.csv', onlyIfSuccessful: true + docker.image('create-dataset-s464863').withRun("-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY} -e CUTOFF=${params.CUTOFF}") { + sh "chmod +x ./download_dataset.py" + sh "python3 ./download_dataset.py ${params.CUTOFF}" + archiveArtifacts artifacts: 'datasets/*', onlyIfSuccessful: true + } } } } diff --git a/download_dataset.py b/download_dataset.py index dd6dfee..74ba05c 100644 --- a/download_dataset.py +++ b/download_dataset.py @@ -1,9 +1,14 @@ # Necessary imports import pandas as pd +import kaggle import sys from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler +# Download the dataset +kaggle.api.authenticate() +kaggle.api.dataset_download_files('uciml/breast-cancer-wisconsin-data', path='./datasets', unzip=True) + # Load the dataset df = pd.read_csv('./datasets/data.csv', index_col='id') @@ -21,6 +26,9 @@ print(df.isnull().sum()) # Print the first 5 rows of the dataset print(df.head()) +# Convert the diagnosis column to binary +df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0}) + # Normalize the dataset scaler = MinMaxScaler() df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]]) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..75f5c0aaa2bea8e01f2a3da9045b5fb0016dd291 GIT binary patch literal 100 zcmW-YOAde_3`F1B#6yZ2G!`&`3ooyZO_R?5eKHq`lbex2IFVWOo9iUj;(54 VtCAqMs{b#$^O?=np)~4Dy$=`+63zes literal 0 HcmV?d00001