IUM_04 - update Jenkinsfile, update Dockerfile, add requirements.txt file

This commit is contained in:
Paweł Łączkowski 2024-04-03 19:57:37 +02:00
parent 399a1b173d
commit 04acddb289
4 changed files with 28 additions and 19 deletions

View File

@ -6,15 +6,24 @@ RUN apt-get update && apt-get install -y \
python3 \ python3 \
python3-pip python3-pip
# Install the required Python packages # Copy the requirements.txt file to the working directory
RUN pip3 install numpy pandas kaggle scikit-learn COPY requirements.txt ./
# Install the required Python packages from requirements.txt
RUN pip3 install -r requirements.txt
# Set the working directory # Set the working directory
WORKDIR /app WORKDIR /app
# Copy scripts to the working directory # Copy scripts to the working directory
# Python scripts
COPY download_dataset.py ./ COPY download_dataset.py ./
COPY get_stats.py ./ COPY get_stats.py ./
# Bash scripts
COPY download_dataset.sh ./
COPY get_stats.sh ./
# Default command # Default command
CMD bash CMD bash

26
Jenkinsfile vendored
View File

@ -26,29 +26,21 @@ pipeline {
} }
} }
stage('Download dataset') { stage('Build Docker image') {
steps { steps {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { script {
sh "kaggle datasets download -d uciml/breast-cancer-wisconsin-data" docker.build("create-dataset-s464863")
sh "unzip -o breast-cancer-wisconsin-data.zip"
sh "mkdir -p datasets"
sh "mv data.csv datasets/data.csv"
} }
} }
} }
stage('Preprocess data') { stage('Download dataset and preprocess data') {
agent {
dockerfile {
filename 'Dockerfile'
reuseNode true
}
}
steps { steps {
sh "chmod +x ./download_dataset.py" docker.image('create-dataset-s464863').withRun('-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY} -e CUTOFF=${params.CUTOFF}') {
sh "python3 ./download_dataset.py ${params.CUTOFF}" sh "chmod +x ./download_dataset.py"
archiveArtifacts artifacts: 'datasets/data.csv,datasets/train.csv,datasets/val.csv,datasets/test.csv', onlyIfSuccessful: true sh "python3 ./download_dataset.py ${params.CUTOFF}"
archiveArtifacts artifacts: 'datasets/*', onlyIfSuccessful: true
}
} }
} }
} }

View File

@ -1,9 +1,14 @@
# Necessary imports # Necessary imports
import pandas as pd import pandas as pd
import kaggle
import sys import sys
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import MinMaxScaler
# Download the dataset
kaggle.api.authenticate()
kaggle.api.dataset_download_files('uciml/breast-cancer-wisconsin-data', path='./datasets', unzip=True)
# Load the dataset # Load the dataset
df = pd.read_csv('./datasets/data.csv', index_col='id') df = pd.read_csv('./datasets/data.csv', index_col='id')
@ -21,6 +26,9 @@ print(df.isnull().sum())
# Print the first 5 rows of the dataset # Print the first 5 rows of the dataset
print(df.head()) print(df.head())
# Convert the diagnosis column to binary
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
# Normalize the dataset # Normalize the dataset
scaler = MinMaxScaler() scaler = MinMaxScaler()
df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]]) df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]])

BIN
requirements.txt Normal file

Binary file not shown.