From 28864b02c98a19431d92fb2e852cf527983a6afc Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 1 Apr 2024 17:59:41 +0200 Subject: [PATCH] Dockerfile --- Jenkinsfile | 42 ++++++++++++++++++++++++++++++------------ create-dataset.py | 18 +++++++++++++++++- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 517a38d..57f76e2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,10 +1,5 @@ pipeline { - agent { - dockerfile { - filename 'Dockerfile' - args '-u root' - } - } + agent any parameters { string ( @@ -20,16 +15,39 @@ pipeline { ) } - environment { - KAGGLE_USERNAME = "${params.KAGGLE_USERNAME}" - KAGGLE_KEY = "${params.KAGGLE_KEY}" - } - stages { + stage('Clone Repository') { + steps { + git branch: 'main', url: 'https://git.wmi.amu.edu.pl/s464913/ium_464913.git' + } + } + + stage('Download dataset') { + steps { + withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) { + sh 'kaggle datasets download -d mlg-ulb/creditcardfraud' + sh 'unzip -o creditcardfraud.zip' + sh 'rm creditcardfraud.zip' + } + } + } + stage('Run create-dataset script') { + agent { + dockerfile { + reuseNode true + } + } + steps { sh 'chmod +x create-dataset.py' - sh 'python3 ./create-dataset.py $KAGGLE_USERNAME' + sh 'python3 ./create-dataset.py' + } + } + + stage('Archive Artifacts') { + steps { + archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true } } } diff --git a/create-dataset.py b/create-dataset.py index a4e6edc..59e63d9 100644 --- a/create-dataset.py +++ b/create-dataset.py @@ -91,7 +91,23 @@ def main(): # download_kaggle_dataset() os.makedirs("data", exist_ok=True) - os.system("rm -rf data/*") + df = load_data("creditcard.csv") + df = normalize_data(df) + + undersample_data, X_undersample, y_undersample = create_undersample_data(df) + X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = ( + split_undersample_data(X_undersample, y_undersample) + ) + save_undersample_data( + undersample_data, + X_train_undersample, + X_test_undersample, + y_train_undersample, + y_test_undersample, + ) + + X_train, X_test, y_train, y_test = split_whole_data(df) + save_whole_data(df, X_train, X_test, y_train, y_test) if __name__ == "__main__":