diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/ium_464979.iml b/.idea/ium_464979.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/ium_464979.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..dc9ea49 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..4f48266 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 5cafbfe..973cf0d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,29 +1,33 @@ pipeline { - agent any - //Definijuemy parametry, kt�re b�dzie mo�na poda� podczas wywo�ywania zadania - parameters { - string ( - defaultValue: 'Hello World!', - description: 'Tekst, kt�rym chcesz przywita� �wiat', - name: 'INPUT_TEXT', - trim: false - ) - } - stages { - stage('Hello') { - steps { - //Wypisz warto�� parametru w konsoli (To nie jest polecenie bash, tylko groovy!) 
- echo "INPUT_TEXT: promoscan"
- //Wywo�aj w konsoli komend� "figlet", kt�ra generuje ASCI-art
- sh "figlet \"promoscan\" | tee output.txt"
- }
- }
- stage('Goodbye!') {
- steps {
- echo 'Goodbye!'
- //Zarchiwizuj wynik
- archiveArtifacts 'output.txt'
- }
- }
- }
-}
\ No newline at end of file
+    agent any
+
+    // Build parameters. Jenkins also exports each one to `sh` steps as an
+    // environment variable of the same name.
+    parameters {
+        string(name: 'KAGGLE_DATASET_ID', defaultValue: '', description: 'Kaggle dataset')
+        string(name: 'REPO_URL', defaultValue: '', description: 'Git Url')
+        string(name: 'CUTOFF', defaultValue: '10000', description: 'Liczba wierszy do obcięcia ze zbioru danych')
+    }
+
+    stages {
+        stage('Clone Repository') {
+            steps {
+                git url: "${params.REPO_URL}"
+            }
+        }
+
+        stage('Download, Process, and Split Dataset') {
+            steps {
+                // Kaggle CLI credentials are taken from the build environment.
+                withEnv([
+                    "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
+                    "KAGGLE_KEY=${env.KAGGLE_KEY}"
+                ]) {
+                    // Single-quoted so the shell (not Groovy) expands the
+                    // parameters; this keeps their text from being run as code.
+                    sh 'bash ./kuggle_download.sh "$KAGGLE_DATASET_ID" "$CUTOFF"'
+                }
+            }
+        }
+    }
+}
diff --git a/kuggle_download.sh b/kuggle_download.sh
new file mode 100644
index 0000000..20e4416
--- /dev/null
+++ b/kuggle_download.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+#
+# Download a Kaggle dataset, shuffle its rows and split them into
+# train/dev/test CSV files, then pack the results into artifacts.tar.gz.
+#
+# Usage: kuggle_download.sh [DATASET_ID] [CUTOFF]
+#   DATASET_ID    Kaggle dataset identifier passed to `kaggle datasets download -d`;
+#                 falls back to the DATASET_ID environment variable.
+#   CUTOFF        Optional row count for train_cutoff.csv; falls back to the
+#                 CUTOFF environment variable.
+#   DATASET_FILE  (environment, optional) CSV inside the downloaded archive to
+#                 process; defaults to the first *.csv that was extracted.
+#
+# The Kaggle CLI reads its credentials from KAGGLE_USERNAME / KAGGLE_KEY.
+
+# Abort on the first failing command, unset variable, or failed pipeline stage.
+set -euo pipefail
+
+readonly TRAIN_ROWS=80000
+readonly DEV_ROWS=10000
+readonly SAMPLE_ROWS=1000
+
+dataset_id="${1:-${DATASET_ID:-}}"
+cutoff="${2:-${CUTOFF:-}}"
+
+if [ -z "$dataset_id" ]; then
+    echo "usage: $0 DATASET_ID [CUTOFF]" >&2
+    exit 1
+fi
+
+case "$cutoff" in
+    *[!0-9]*)
+        echo "error: CUTOFF must be a non-negative integer, got '$cutoff'" >&2
+        exit 1
+        ;;
+esac
+
+pip install kaggle
+
+# Extract into a scratch directory so cleanup cannot touch workspace files.
+download_dir="$(mktemp -d)"
+trap 'rm -rf "$download_dir" shuffled_dataset.csv' EXIT
+
+kaggle datasets download -d "$dataset_id" -p "$download_dir" --unzip
+
+if [ -n "${DATASET_FILE:-}" ]; then
+    dataset_file="$download_dir/$DATASET_FILE"
+else
+    shopt -s nullglob
+    csv_files=("$download_dir"/*.csv)
+    dataset_file="${csv_files[0]:-}"
+fi
+
+if [ ! -f "$dataset_file" ]; then
+    echo "error: no CSV file found for dataset '$dataset_id'" >&2
+    exit 1
+fi
+
+# NOTE(review): a header row, if the CSV has one, is shuffled with the data.
+shuf "$dataset_file" > shuffled_dataset.csv
+
+# Disjoint splits: first TRAIN_ROWS rows -> train, next DEV_ROWS -> dev, rest -> test.
+# head/sed/tail are used rather than `split -l`, which names its outputs
+# train.csvaa, train.csvab, ... and never creates train.csv itself.
+dev_start=$((TRAIN_ROWS + 1))
+dev_end=$((TRAIN_ROWS + DEV_ROWS))
+head -n "$TRAIN_ROWS" shuffled_dataset.csv > train.csv
+sed -n "${dev_start},${dev_end}p" shuffled_dataset.csv > dev.csv
+tail -n "+$((dev_end + 1))" shuffled_dataset.csv > test.csv
+
+head -n "$SAMPLE_ROWS" train.csv > train_head.csv
+tail -n "$SAMPLE_ROWS" train.csv > train_tail.csv
+
+# train_cutoff.csv is only produced, and only archived, when CUTOFF is given.
+artifacts=(train.csv dev.csv test.csv train_head.csv train_tail.csv)
+if [ -n "$cutoff" ]; then
+    head -n "$cutoff" train.csv > train_cutoff.csv
+    artifacts+=(train_cutoff.csv)
+fi
+
+tar -czf artifacts.tar.gz "${artifacts[@]}"
+echo "artifacts.tar.gz"