From 87211b61b908f8008d8e2a616423d64ee2212bab Mon Sep 17 00:00:00 2001
From: AdamOsiowy123
Date: Sun, 3 Apr 2022 23:22:54 +0200
Subject: [PATCH] download-dataset with docker

---
Review notes (text in this section is not part of the commit message):
the Dockerfile hunk was revised to pin the base image, use apt-get with
package-list cleanup, drop the self-referencing ENV lines and install the
requirements before copying the sources. The blob id on the Dockerfile
"index" line predates this revision.

 Dockerfile       | 27 ++++++++++++++++++++++
 Jenkinsfile      | 58 +++++++++++++++++++++++++-----------------------
 figlet-loop.sh   |  4 ++++
 lab2_data.py     |  6 ++---
 requirements.txt | 19 ++++++++++++++++
 5 files changed, 82 insertions(+), 32 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 figlet-loop.sh
 create mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..03288ea
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+# Base image: Ubuntu pinned to a fixed release instead of the moving
+# ":latest" tag, so rebuilds keep matching the pinned requirements.txt
+FROM ubuntu:22.04
+
+# Install the required system packages ("-y" answers yes to every prompt);
+# the apt package lists are removed in the same layer to keep the image small
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    vim \
+    && rm -rf /var/lib/apt/lists/*
+
+# CUTOFF, KAGGLE_USERNAME and KAGGLE_KEY are deliberately not set here:
+# the Jenkinsfile passes them at run time with withEnv, which keeps the
+# Kaggle credentials out of the image
+
+# Create /app in the container (if it does not exist) and switch to it; all
+# following RUN, CMD, ENTRYPOINT, COPY and ADD instructions run there
+WORKDIR /app
+
+# Copy only the dependency manifest first so the pip layer stays cached
+# until requirements.txt itself changes
+COPY requirements.txt ./
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the project into /app
+COPY . /app/
diff --git a/Jenkinsfile b/Jenkinsfile
index b204b5a..6ec9107 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,38 +1,40 @@
 node {
-    stage('Preparation') {
-        properties([
-            parameters([
-                string(
+    docker.image('s444452/ium:1.0').inside {
+        stage('Preparation') {
+            properties([
+                parameters([
+                    string(
                     defaultValue: 'adamosiowy',
                     description: 'Kaggle username',
                     name: 'KAGGLE_USERNAME',
                     trim: false
-                ),
-                password(
+                    ),
+                    password(
                     defaultValue: '',
                     description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
                     name: 'KAGGLE_KEY'
-                ),
-                string(
-                    defaultValue: "1000",
-                    description: 'Determine the size of dataset',
-                    name: 'CUTOFF'
-                )
-            ])
-        ])
-    }
-    stage('Clone repository') {
-        checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: '5e0a58a0-03ad-41dd-beff-7b8a07c7fe0c', url: 'https://git.wmi.amu.edu.pl/s444452/ium_444452.git']]])
-    }
-    stage('Run script') {
-        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-        "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
-            sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
-            sh "chmod u+x ./download_dataset.sh"
-            sh "./download_dataset.sh $CUTOFF"
+                    ),
+                    string(
+                        defaultValue: "10000",
+                        description: 'Determine the size of dataset',
+                        name: 'CUTOFF'
+                    )
+                ])
+            ])
+        }
+        stage('Clone repository') {
+            checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: '5e0a58a0-03ad-41dd-beff-7b8a07c7fe0c', url: 'https://git.wmi.amu.edu.pl/s444452/ium_444452.git']]])
+        }
+        stage('Run script') {
+            withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+            "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
+                sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
+                sh "chmod u+x ./lab2_data.py"
+                sh "./lab2_data.py"
+            }
+        }
+        stage('Archive artifacts') {
+            archiveArtifacts 'fake_job_postings.csv'
         }
     }
-    stage('Archive artifacts') {
-        archiveArtifacts 'dataset.csv'
-    }
-}
+}
\ No newline at end of file
diff --git a/figlet-loop.sh b/figlet-loop.sh
new file mode 100644
index 0000000..6c85ffc
--- /dev/null
+++ b/figlet-loop.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+while read line; do
+    figlet "$line"
+done
\ No newline at end of file
diff --git a/lab2_data.py b/lab2_data.py
index 4efbc6d..49294fc 100644
--- a/lab2_data.py
+++ b/lab2_data.py
@@ -6,9 +6,7 @@ from sklearn.model_selection import train_test_split
 
 def download_and_save_dataset():
     api.authenticate()
-    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',
-                               path='./data',
-                               unzip=True)
+    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', unzip=True)
 
 
 def split_dataset(data: DataFrame):
@@ -26,7 +24,7 @@ def split_dataset(data: DataFrame):
 
 def main():
     # download_and_save_dataset()
-    df = read_csv('./data/fake_job_postings.csv')
+    df = read_csv('./fake_job_postings.csv')
     print(df.describe(include='all'))
     print(df.shape)
     x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8e75929
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,19 @@
+certifi==2021.10.8
+charset-normalizer==2.0.12
+idna==3.3
+joblib==1.1.0
+kaggle==1.5.12
+numpy==1.22.3
+pandas==1.4.1
+python-dateutil==2.8.2
+python-slugify==6.1.1
+pytz==2022.1
+requests==2.27.1
+scikit-learn==1.0.2
+scipy==1.8.0
+six==1.16.0
+sklearn==0.0
+text-unidecode==1.3
+threadpoolctl==3.1.0
+tqdm==4.63.1
+urllib3==1.26.9