From 5f21137f2dc8ee8b861efb18876322172ca02fb8 Mon Sep 17 00:00:00 2001
From: Mateusz
Date: Mon, 1 Apr 2024 13:47:56 +0200
Subject: [PATCH] Dockerfile

---
 Dockerfile          |   9 ++++
 Jenkinsfile         |  16 +++---
 create-dataset.py   | 120 ++++++++++++++++++++++++++++++++++++++++++++
 download_dataset.sh |   2 -
 4 files changed, 135 insertions(+), 12 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 create-dataset.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..8e584b5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,9 @@
+FROM ubuntu:latest
+
+RUN apt update && apt install -y python3-pip unzip
+
+RUN pip install kaggle pandas numpy scikit-learn
+
+WORKDIR /app
+
+COPY ./create-dataset.py ./
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
index dfde3a4..2541310 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,5 +1,7 @@
 pipeline {
-    agent any
+    agent {
+        dockerfile true
+    }
 
     parameters {
         string (
@@ -16,24 +18,18 @@
     }
 
     stages {
-        stage('Clone Repository') {
-            steps {
-                git branch: 'main', url: 'https://git.wmi.amu.edu.pl/s464913/ium_464913.git'
-            }
-        }
-        stage('Download Dataset') {
+        stage('Run create-dataset script') {
             steps {
                 script {
                     withEnv (["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-                        sh 'chmod +x download_dataset.sh'
-                        sh './download_dataset.sh'
+                        sh 'python3 create-dataset.py'
                     }
                 }
             }
         }
         stage('Archive Artifacts') {
             steps {
-                archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
+                archiveArtifacts artifacts: '/data/*', onlyIfSuccessful: true
             }
         }
     }
diff --git a/create-dataset.py b/create-dataset.py
new file mode 100644
index 0000000..d9b8adb
--- /dev/null
+++ b/create-dataset.py
@@ -0,0 +1,120 @@
+import os
+import kaggle
+import pandas as pd
+import numpy as np
+
+from sklearn.preprocessing import StandardScaler
+
+from sklearn.model_selection import train_test_split
+
+
+def download_kaggle_dataset():
+    kaggle.api.authenticate()
+    kaggle.api.dataset_download_files("mlg-ulb/creditcardfraud", path="./", unzip=True)
+
+
+def load_data(name):
+    df = pd.read_csv(name)
+    return df
+
+
+def normalize_data(df):
+    scaler = StandardScaler()
+    df["Amount"] = scaler.fit_transform(df["Amount"].values.reshape(-1, 1))
+    return df
+
+
+def create_undersample_data(df):
+    # Determine the number of instances in the minority class
+    fraud_count = len(df[df.Class == 1])
+    fraud_indices = np.array(df[df.Class == 1].index)
+
+    # Select indices corresponding to majority class instances
+    normal_indices = df[df.Class == 0].index
+
+    # Randomly sample the same number of instances from the majority class
+    random_normal_indices = np.random.choice(normal_indices, fraud_count, replace=False)
+    random_normal_indices = np.array(random_normal_indices)
+
+    # Combine indices of both classes
+    undersample_indices = np.concatenate([fraud_indices, random_normal_indices])
+
+    # Undersample dataset
+    undersample_data = df.iloc[undersample_indices, :]
+
+    X_undersample = undersample_data.iloc[:, undersample_data.columns != "Class"]
+    y_undersample = undersample_data.iloc[:, undersample_data.columns == "Class"]
+
+    return undersample_data, X_undersample, y_undersample
+
+
+def split_undersample_data(X_undersample, y_undersample):
+    X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (
+        train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0)
+    )
+
+    return (
+        X_train_undersample,
+        X_test_undersample,
+        y_train_undersample,
+        y_test_undersample,
+    )
+
+
+def save_undersample_data(
+    undersample_data,
+    X_train_undersample,
+    X_test_undersample,
+    y_train_undersample,
+    y_test_undersample,
+):
+    undersample_data.to_csv("/data/undersample_data.csv", index=False)
+    X_train_undersample.to_csv("/data/X_train_undersample.csv", index=False)
+    X_test_undersample.to_csv("/data/X_test_undersample.csv", index=False)
+    y_train_undersample.to_csv("/data/y_train_undersample.csv", index=False)
+    y_test_undersample.to_csv("/data/y_test_undersample.csv", index=False)
+
+
+def split_whole_data(df):
+    X = df.iloc[:, df.columns != "Class"]
+    y = df.iloc[:, df.columns == "Class"]
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.3, random_state=0
+    )
+    return X_train, X_test, y_train, y_test
+
+
+def save_whole_data(df, X_train, X_test, y_train, y_test):
+    df.to_csv("/data/creditcard.csv", index=False)
+    X_train.to_csv("/data/X_train.csv", index=False)
+    X_test.to_csv("/data/X_test.csv", index=False)
+    y_train.to_csv("/data/y_train.csv", index=False)
+    y_test.to_csv("/data/y_test.csv", index=False)
+
+
+def main():
+    download_kaggle_dataset()
+    os.makedirs("/data", exist_ok=True)
+
+    df = load_data("creditcard.csv")
+    df = normalize_data(df)
+
+    undersample_data, X_undersample, y_undersample = create_undersample_data(df)
+    X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (
+        split_undersample_data(X_undersample, y_undersample)
+    )
+    save_undersample_data(
+        undersample_data,
+        X_train_undersample,
+        X_test_undersample,
+        y_train_undersample,
+        y_test_undersample,
+    )
+
+    X_train, X_test, y_train, y_test = split_whole_data(df)
+    save_whole_data(df, X_train, X_test, y_train, y_test)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/download_dataset.sh b/download_dataset.sh
index 3b2f183..ea9c48b 100644
--- a/download_dataset.sh
+++ b/download_dataset.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-# Install the Kaggle API
-pip install kaggle
 
 # Download the dataset from Kaggle
 kaggle datasets download -d mlg-ulb/creditcardfraud