diff --git a/Dockerfile b/Dockerfile index b51b2e6..4ba99de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,15 +6,24 @@ RUN apt-get update && apt-get install -y \ python3 \ python3-pip -# Install the required Python packages -RUN pip3 install numpy pandas kaggle scikit-learn +# Copy the requirements.txt file to the working directory +COPY requirements.txt ./ + +# Install the required Python packages from requirements.txt +RUN pip3 install -r requirements.txt # Set the working directory WORKDIR /app # Copy scripts to the working directory + +# Python scripts COPY download_dataset.py ./ COPY get_stats.py ./ +# Bash scripts +COPY download_dataset.sh ./ +COPY get_stats.sh ./ + # Default command CMD bash \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 6e79bee..2da9c26 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,29 +26,21 @@ pipeline { } } - stage('Download dataset') { + stage('Build Docker image') { steps { - withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { - sh "kaggle datasets download -d uciml/breast-cancer-wisconsin-data" - sh "unzip -o breast-cancer-wisconsin-data.zip" - sh "mkdir -p datasets" - sh "mv data.csv datasets/data.csv" + script { + docker.build("create-dataset-s464863") } } } - stage('Preprocess data') { - agent { - dockerfile { - filename 'Dockerfile' - reuseNode true - } - } - + stage('Download dataset and preprocess data') { steps { - sh "chmod +x ./download_dataset.py" - sh "python3 ./download_dataset.py ${params.CUTOFF}" - archiveArtifacts artifacts: 'datasets/data.csv,datasets/train.csv,datasets/val.csv,datasets/test.csv', onlyIfSuccessful: true + docker.image('create-dataset-s464863').withRun('-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY} -e CUTOFF=${params.CUTOFF}') { + sh "chmod +x ./download_dataset.py" + sh "python3 ./download_dataset.py ${params.CUTOFF}" + archiveArtifacts artifacts: 'datasets/*', onlyIfSuccessful: true + } } } } diff 
--git a/download_dataset.py b/download_dataset.py index dd6dfee..74ba05c 100644 --- a/download_dataset.py +++ b/download_dataset.py @@ -1,9 +1,14 @@ # Necessary imports import pandas as pd +import kaggle import sys from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler +# Download the dataset +kaggle.api.authenticate() +kaggle.api.dataset_download_files('uciml/breast-cancer-wisconsin-data', path='./datasets', unzip=True) + # Load the dataset df = pd.read_csv('./datasets/data.csv', index_col='id') @@ -21,6 +26,9 @@ print(df.isnull().sum()) # Print the first 5 rows of the dataset print(df.head()) +# Convert the diagnosis column to binary +df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0}) + # Normalize the dataset scaler = MinMaxScaler() df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]]) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..75f5c0a Binary files /dev/null and b/requirements.txt differ