IUM_04 - update Jenkinsfile, update Dockerfile, add requirements.txt file

This commit is contained in:
Paweł Łączkowski 2024-04-03 19:57:37 +02:00
parent 399a1b173d
commit 04acddb289
4 changed files with 28 additions and 19 deletions

View File

@ -6,15 +6,24 @@ RUN apt-get update && apt-get install -y \
python3 \ python3 \
python3-pip python3-pip
# Install the required Python packages # Copy the requirements.txt file to the working directory
RUN pip3 install numpy pandas kaggle scikit-learn COPY requirements.txt ./
# Install the required Python packages from requirements.txt
RUN pip3 install -r requirements.txt
# Set the working directory # Set the working directory
WORKDIR /app WORKDIR /app
# Copy scripts to the working directory # Copy scripts to the working directory
# Python scripts
COPY download_dataset.py ./ COPY download_dataset.py ./
COPY get_stats.py ./ COPY get_stats.py ./
# Bash scripts
COPY download_dataset.sh ./
COPY get_stats.sh ./
# Default command # Default command
CMD bash CMD bash

26
Jenkinsfile vendored
View File

@ -26,29 +26,21 @@ pipeline {
} }
} }
stage('Download dataset') { stage('Build Docker image') {
steps { steps {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { script {
sh "kaggle datasets download -d uciml/breast-cancer-wisconsin-data" docker.build("create-dataset-s464863")
sh "unzip -o breast-cancer-wisconsin-data.zip"
sh "mkdir -p datasets"
sh "mv data.csv datasets/data.csv"
} }
} }
} }
stage('Preprocess data') { stage('Download dataset and preprocess data') {
agent {
dockerfile {
filename 'Dockerfile'
reuseNode true
}
}
steps { steps {
sh "chmod +x ./download_dataset.py" docker.image('create-dataset-s464863').withRun('-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY} -e CUTOFF=${params.CUTOFF}') {
sh "python3 ./download_dataset.py ${params.CUTOFF}" sh "chmod +x ./download_dataset.py"
archiveArtifacts artifacts: 'datasets/data.csv,datasets/train.csv,datasets/val.csv,datasets/test.csv', onlyIfSuccessful: true sh "python3 ./download_dataset.py ${params.CUTOFF}"
archiveArtifacts artifacts: 'datasets/*', onlyIfSuccessful: true
}
} }
} }
} }

View File

@ -1,9 +1,14 @@
# Necessary imports # Necessary imports
import pandas as pd import pandas as pd
import kaggle
import sys import sys
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import MinMaxScaler
# Download the dataset
kaggle.api.authenticate()
kaggle.api.dataset_download_files('uciml/breast-cancer-wisconsin-data', path='./datasets', unzip=True)
# Load the dataset # Load the dataset
df = pd.read_csv('./datasets/data.csv', index_col='id') df = pd.read_csv('./datasets/data.csv', index_col='id')
@ -21,6 +26,9 @@ print(df.isnull().sum())
# Print the first 5 rows of the dataset # Print the first 5 rows of the dataset
print(df.head()) print(df.head())
# Convert the diagnosis column to binary
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
# Normalize the dataset # Normalize the dataset
scaler = MinMaxScaler() scaler = MinMaxScaler()
df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]]) df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]])

BIN
requirements.txt Normal file

Binary file not shown.