From d7110d7bf9796796d4162d9b0e1bb86c136983e7 Mon Sep 17 00:00:00 2001
From: Sheaza
Date: Tue, 26 Mar 2024 18:29:52 +0100
Subject: [PATCH] add script to download and process dataset

---
 Jenkinsfile         | 39 ++++++++++++++++++++++++++++++++++++---
 download_dataset.sh | 35 +++++++++++++++++++++++++++++++++++
 get_stats.sh        |  3 +++
 3 files changed, 74 insertions(+), 3 deletions(-)
 create mode 100644 download_dataset.sh
 create mode 100644 get_stats.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index b7315fe..11eb8a1 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,14 +1,47 @@
 pipeline {
     agent any
+    parameters {
+        string (
+            name: 'KAGGLE_USERNAME',
+            defaultValue: 'ardenw',
+            description: 'Kaggle username'
+        )
+        password (
+            name: 'KAGGLE_KEY',
+            defaultValue: '',
+            description: 'Kaggle API key'
+        )
+        string (
+            name: 'DATA_TRAIN_RATIO',
+            defaultValue: '0.8',
+            description: 'Train data ratio'
+        )
+        string (
+            name: 'CUTOFF',
+            defaultValue: '500',
+            description: 'Cutoff value'
+        )
+    }
     stages {
         stage('Checkout repository') {
             steps {
-                checkout scmGit(branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's464980_token', url: 'https://git.wmi.amu.edu.pl/s464980/IUM_s464980.git']])
+                checkout scm
             }
         }
-        stage('Download and process data') {
+        stage('Download dataset') {
             steps {
-                echo "Hello"
+                // Export the credentials under the names the Kaggle CLI reads.
+                withEnv([
+                    "KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                    "KAGGLE_KEY=${params.KAGGLE_KEY}",
+                    "CUTOFF=${params.CUTOFF}",
+                    "DATA_TRAIN_RATIO=${params.DATA_TRAIN_RATIO}"
+                ]) {
+                    sh "chmod +x download_dataset.sh"
+                    // Single quotes: the shell, not Groovy, expands the variables.
+                    sh './download_dataset.sh "$CUTOFF" "$DATA_TRAIN_RATIO"'
+                    archiveArtifacts artifacts: 'data.csv,train.csv,test.csv', onlyIfSuccessful: true
+                }
             }
         }
     }
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100644
index 0000000..13c0cee
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Download the student performance dataset from Kaggle, keep its first
+# CUTOFF lines and split them into train.csv and test.csv.
+#
+# Usage: ./download_dataset.sh CUTOFF TRAIN_RATIO
+#   CUTOFF       number of leading lines of the dataset to keep (e.g. 500)
+#   TRAIN_RATIO  fraction of the kept lines written to train.csv (e.g. 0.8)
+#
+# The Kaggle CLI reads its credentials from KAGGLE_USERNAME and KAGGLE_KEY.
+
+set -euo pipefail
+
+cutoff="${1:?usage: $0 CUTOFF TRAIN_RATIO}"
+train_ratio="${2:?usage: $0 CUTOFF TRAIN_RATIO}"
+
+# download data from kaggle
+kaggle datasets download -d nikhil7280/student-performance-multiple-linear-regression --unzip
+
+# change dataset name to data.csv
+mv Student_Performance.csv data.csv
+
+# cut off rows
+head -n "$cutoff" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
+
+# get data size
+data_size=$(wc -l < data.csv)
+
+# bash arithmetic is integer-only, so compute the fractional split with awk
+train_size=$(awk -v n="$data_size" -v r="$train_ratio" 'BEGIN { printf "%d", n * r }')
+test_size=$(( data_size - train_size ))
+
+# split data to train and test and save it to csv files
+head -n "$train_size" data.csv > train.csv
+tail -n "$test_size" data.csv > test.csv
diff --git a/get_stats.sh b/get_stats.sh
new file mode 100644
index 0000000..d499f5a
--- /dev/null
+++ b/get_stats.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+wc -l < data.csv > stats.txt
\ No newline at end of file