add script to download and process dataset
This commit is contained in:
parent
f529abac39
commit
d7110d7bf9
31
Jenkinsfile
vendored
31
Jenkinsfile
vendored
@ -1,14 +1,39 @@
|
|||||||
pipeline {
|
pipeline {
|
||||||
agent any
|
agent any
|
||||||
|
// Build parameters of the job, one declaration per parameter.
// NOTE(review): the 'Download dataset' stage reads params.USERNAME and
// params.API_KEY, which are not declared in this block - confirm whether it
// should read params.KAGGLE_USERNAME / params.KAGGLE_KEY instead.
parameters {
    // Kaggle account name.
    string(name: 'KAGGLE_USERNAME', defaultValue: 'ardenw', description: 'Kaggle username')
    // Kaggle API key; declared as a password parameter, empty by default.
    password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
    // Train data ratio, passed to download_dataset.sh by the download stage.
    string(name: 'DATA_TRAIN_RATIO', defaultValue: '0.8', description: 'Train data ratio')
    // Cutoff value (presumably a row limit for the dataset - TODO confirm).
    string(name: 'CUTOFF', defaultValue: '500', description: 'Cutoff value')
}
|
||||||
stages {
|
stages {
|
||||||
stage('Checkout repository') {
|
stage('Checkout repository') {
|
||||||
steps {
|
steps {
|
||||||
checkout scmGit(branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's464980_token', url: 'https://git.wmi.amu.edu.pl/s464980/IUM_s464980.git']])
|
checkout scm
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage('Download and process data') {
|
stage('Download dataset') {
|
||||||
steps {
|
steps {
|
||||||
echo "Hello"
|
withEnv(["USERNAME=${params.USERNAME}", "API_KEY=${params.API_KEY}", "DATA_TRAIN_RATIO=${params.DATA_TRAIN_RATIO}"]) {
|
||||||
|
sh "chmod +x download_dataset.sh"
|
||||||
|
sh "./download_dataset.sh $DATA_TRAIN_RATIO"
|
||||||
|
archiveArtifacts artifacts: 'data.csv,train.csv,test.csv', onlyIfSuccessful: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
17
download_dataset.sh
Normal file
17
download_dataset.sh
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
#
# Download the Kaggle "Student Performance" dataset and split it into
# train and test CSV files.
#
# Usage: download_dataset.sh TRAIN_RATIO [CUTOFF]
#   TRAIN_RATIO  fraction of lines (between 0 and 1, e.g. 0.8) that goes to
#                train.csv; the remaining lines go to test.csv
#   CUTOFF       optional positive integer: keep only the first CUTOFF lines
#                of the dataset. Falls back to the CUTOFF environment
#                variable; when neither is set the whole file is kept.
#
# Requires: the kaggle CLI on PATH with working credentials.
# Outputs:  data.csv, train.csv and test.csv in the current directory.
#
# NOTE(review): the split is purely line-based, as in the previous version,
# so a CSV header line (if the file has one) ends up only in train.csv -
# confirm whether test.csv needs the header too.

set -euo pipefail

readonly DATASET='nikhil7280/student-performance-multiple-linear-regression'
readonly USAGE='usage: download_dataset.sh TRAIN_RATIO [CUTOFF]'

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

train_ratio=${1:?$USAGE}
cutoff=${2:-${CUTOFF:-}}

# Validate early so a bad parameter fails before anything is downloaded.
ratio_re='^(0|1|0?\.[0-9]+|1\.0+)$'
[[ "$train_ratio" =~ $ratio_re ]] \
  || die "TRAIN_RATIO must be a number between 0 and 1, got '$train_ratio'"

cutoff_re='^[1-9][0-9]*$'
if [[ -n "$cutoff" ]]; then
  [[ "$cutoff" =~ $cutoff_re ]] \
    || die "CUTOFF must be a positive integer, got '$cutoff'"
fi

command -v kaggle >/dev/null || die "kaggle CLI not found in PATH"

# download data from kaggle
# The dataset is addressed by its "owner/slug" identifier; -p is only the
# download directory, so the dataset's web URL must not be passed to it.
kaggle datasets download -d "$DATASET" --unzip

# change dataset name to data.csv
mv -- Student_Performance.csv data.csv

# cut off rows
if [[ -n "$cutoff" ]]; then
  head -n "$cutoff" data.csv > data.csv.tmp
  mv -- data.csv.tmp data.csv
fi

# get data size
data_size=$(wc -l < data.csv)

# Shell arithmetic is integer-only, so the fractional ratio is applied in awk
# and rounded to the nearest whole line count.
train_size=$(awk -v n="$data_size" -v r="$train_ratio" \
  'BEGIN { printf "%d", n * r + 0.5 }')

# split data to train and test and save it to csv files
# test.csv is everything after the train lines, so no line is lost or
# duplicated by rounding.
head -n "$train_size" data.csv > train.csv
tail -n "+$(( train_size + 1 ))" data.csv > test.csv
|
3
get_stats.sh
Normal file
3
get_stats.sh
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash

# Store the line count of data.csv, as printed by `wc -l`, in stats.txt.
# Input is redirected before output, so stats.txt is not created when
# data.csv cannot be opened.
{ wc -l; } < data.csv > stats.txt
|
Loading…
Reference in New Issue
Block a user