diff --git a/Jenkinsfile b/Jenkinsfile
index ef3df31..3d8e529 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -28,8 +28,11 @@ pipeline {
     stages {
         stage('Run create-dataset script') {
             steps {
+                sh 'kaggle datasets download -d mlg-ulb/creditcardfraud'
+                sh 'unzip creditcardfraud.zip'
+                sh 'rm creditcardfraud.zip'
                 sh 'chmod +x create-dataset.py'
-                sh 'python3 ./create-dataset.py'
+                sh 'python3 ./create-dataset.py $KAGGLE_USERNAME'
             }
         }
         stage('Archive Artifacts') {
diff --git a/create-dataset.py b/create-dataset.py
index c5e7d0c..46281c3 100644
--- a/create-dataset.py
+++ b/create-dataset.py
@@ -10,9 +10,11 @@ from sklearn.model_selection import train_test_split
 
 
 def download_kaggle_dataset():
-    os.system("kaggle datasets download -d mlg-ulb/creditcardfraud")
-    os.system("unzip creditcardfraud.zip")
-    os.system("rm creditcardfraud.zip")
+    os.environ["KAGGLE_USERNAME"] = "vskyper"
+    os.environ["KAGGLE_KEY"] = sys.argv[1]
+    kaggle = KaggleApi()
+    kaggle.authenticate()
+    kaggle.dataset_download_files("mlg-ulb/creditcardfraud", path="./", unzip=True)
 
 
 def load_data(name):
@@ -96,7 +98,7 @@ def save_whole_data(df, X_train, X_test, y_train, y_test):
 
 
 def main():
-    download_kaggle_dataset()
+    # download_kaggle_dataset()
     os.makedirs("data", exist_ok=True)
 
     df = load_data("creditcard.csv")
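
For reference, the rewritten `download_kaggle_dataset()` in the patch uses the official `kaggle` Python client (`KaggleApi`). Below is a minimal standalone sketch of the same authenticate-and-download flow, assuming `KAGGLE_USERNAME` and `KAGGLE_KEY` are already exported in the environment (for example as Jenkins credentials) instead of being hardcoded or passed on the command line; the `download_creditcard_dataset` helper name is illustrative, not part of the patch.

```python
import os
from kaggle.api.kaggle_api_extended import KaggleApi


def download_creditcard_dataset(dest: str = "./") -> None:
    """Download and unzip the mlg-ulb/creditcardfraud dataset into `dest`."""
    # authenticate() picks up KAGGLE_USERNAME / KAGGLE_KEY from the
    # environment (or falls back to ~/.kaggle/kaggle.json).
    api = KaggleApi()
    api.authenticate()
    # unzip=True extracts creditcard.csv and removes the downloaded archive.
    api.dataset_download_files("mlg-ulb/creditcardfraud", path=dest, unzip=True)


if __name__ == "__main__":
    download_creditcard_dataset()
```

Keeping both credentials in the environment avoids committing the username to the repository and keeps the API key out of the process argument list, where it would otherwise be visible in `ps` output and build logs.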