add script to download and process dataset

This commit is contained in:
Sheaza 2024-03-26 18:29:52 +01:00
parent f529abac39
commit d7110d7bf9
3 changed files with 48 additions and 3 deletions

31
Jenkinsfile vendored
View File

@ -1,14 +1,39 @@
pipeline {
agent any
// Build parameters for this job. Each value is available to the stages
// below as params.<name>.
parameters {
// Kaggle account name (plain string parameter, default 'ardenw').
string (
name: 'KAGGLE_USERNAME',
defaultValue: 'ardenw',
description: 'Kaggle username'
)
// Kaggle API key; declared as a password parameter with an empty default.
password (
name: 'KAGGLE_KEY',
defaultValue: '',
description: 'Kaggle API key'
)
// Train data ratio, passed as a string (default '0.8').
string (
name: 'DATA_TRAIN_RATIO',
defaultValue: '0.8',
description: 'Train data ratio'
)
// Cutoff value, passed as a string (default '500').
// NOTE(review): presumably the number of dataset rows to keep - confirm against download_dataset.sh.
string (
name: 'CUTOFF',
defaultValue: '500',
description: 'Cutoff value'
)
}
stages {
// Stage that fetches the project sources into the workspace.
stage('Checkout repository') {
steps {
// Explicit checkout of the master branch from the given repository URL,
// authenticating with the 's464980_token' credentials entry.
checkout scmGit(branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's464980_token', url: 'https://git.wmi.amu.edu.pl/s464980/IUM_s464980.git']])
// Checkout via the `scm` variable.
// NOTE(review): presumably the job's own SCM configuration - confirm; this duplicates the step above.
checkout scm
}
}
stage('Download and process data') {
stage('Download dataset') {
steps {
echo "Hello"
// Expose the job parameters to the shell steps as environment variables.
// The parameters block declares KAGGLE_USERNAME and KAGGLE_KEY (there is no
// USERNAME or API_KEY parameter), and the Kaggle CLI reads its credentials
// from environment variables with exactly those two names.
withEnv([
    "KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
    "KAGGLE_KEY=${params.KAGGLE_KEY}",
    "DATA_TRAIN_RATIO=${params.DATA_TRAIN_RATIO}",
    "CUTOFF=${params.CUTOFF}"
]) {
    sh 'chmod +x download_dataset.sh'
    // Single-quoted so the shell (not Groovy) expands the variables set above.
    // The first argument is the train ratio; the cutoff is passed as well so
    // the CUTOFF job parameter is no longer silently ignored.
    sh './download_dataset.sh "$DATA_TRAIN_RATIO" "$CUTOFF"'
    // Keep the produced CSV files only when the build succeeded.
    archiveArtifacts artifacts: 'data.csv,train.csv,test.csv', onlyIfSuccessful: true
}
}
}

17
download_dataset.sh Normal file
View File

@ -0,0 +1,17 @@
#!/bin/bash
# Download the Kaggle student-performance dataset, optionally truncate it and
# split it into train and test files.
#
# Usage: ./download_dataset.sh TRAIN_RATIO [CUTOFF]
#   TRAIN_RATIO  fraction of the lines (0..1) written to train.csv, e.g. 0.8
#   CUTOFF       optional maximum number of lines to keep in data.csv; falls
#                back to the CUTOFF environment variable, and no truncation
#                happens when neither is set
#
# Outputs (current directory): data.csv, train.csv, test.csv.
# The split is line-based: every line of data.csv counts, so a header line
# (if the CSV has one) lands in train.csv only.
# Credentials are resolved by the kaggle CLI itself (KAGGLE_USERNAME /
# KAGGLE_KEY environment variables or ~/.kaggle/kaggle.json).

# Abort on the first failing command, on unset variables and on pipe errors so
# a failed download cannot produce empty or stale artifacts.
set -euo pipefail

train_ratio="${1:?usage: $0 TRAIN_RATIO [CUTOFF]}"
cutoff="${2:-${CUTOFF:-}}"

# Validate the ratio with awk because bash arithmetic is integer-only.
if ! awk -v r="$train_ratio" 'BEGIN { exit !(r ~ /^[0-9]*\.?[0-9]+$/ && r >= 0 && r <= 1) }'; then
    echo "TRAIN_RATIO must be a number between 0 and 1, got '$train_ratio'" >&2
    exit 1
fi

if [[ -n "$cutoff" && ! "$cutoff" =~ ^[0-9]+$ ]]; then
    echo "CUTOFF must be a non-negative integer, got '$cutoff'" >&2
    exit 1
fi

# download data from kaggle; the dataset is identified by its <owner>/<name>
# slug (the -p option is the download directory, not the dataset)
kaggle datasets download -d "nikhil7280/student-performance-multiple-linear-regression" --unzip

# change dataset name to data.csv
mv Student_Performance.csv data.csv

# cut off rows (only when a cutoff was requested)
if [[ -n "$cutoff" ]]; then
    head -n "$cutoff" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
fi

# get data size
data_size=$(wc -l < data.csv)

# split data to train and test and save it to csv files; the train size is
# rounded to the nearest line and the test set receives the remainder, so the
# two files always cover data.csv exactly once
train_size=$(awk -v n="$data_size" -v r="$train_ratio" 'BEGIN { printf "%d", n * r + 0.5 }')
test_size=$(( data_size - train_size ))

head -n "$train_size" data.csv > train.csv
tail -n "$test_size" data.csv > test.csv

3
get_stats.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Write the number of lines in data.csv to stats.txt.

# Count the lines arriving on standard input.
count_lines() {
    wc -l
}

count_lines < data.csv > stats.txt