download-dataset with docker

2022-04-03 23:22:54 +02:00 · 2022-04-03 23:22:54 +02:00 · 87211b61b9
commit 87211b61b9
parent f9e12e8e3c
5 changed files with 74 additions and 32 deletions
--- a/19
+++ b/19
@ -0,0 +1,19 @@
 # Base image: a pinned Ubuntu release instead of the moving "latest" tag, so
 # the distribution Python stays compatible with the versions pinned in
 # requirements.txt and rebuilds are reproducible.
 FROM ubuntu:22.04
 # Install the required system packages. "-y" answers yes to apt prompts and
 # DEBIAN_FRONTEND=noninteractive suppresses debconf dialogs during the build.
 # The package lists are removed in the same layer to keep the image small.
 RUN apt-get update \
     && DEBIAN_FRONTEND=noninteractive apt-get install -y \
         python3 \
         python3-pip \
         vim \
     && rm -rf /var/lib/apt/lists/*
 # Declare the build arguments that the ENV lines below reference. Without
 # these ARG declarations "${NAME}" always expands to an empty string at build
 # time. When no --build-arg is supplied the result is still an empty value.
 # NOTE(review): values passed with --build-arg are recorded in the image
 # metadata. Prefer supplying the Kaggle credentials at run time
 # (e.g. "docker run -e" or the CI environment) rather than at build time.
 ARG CUTOFF
 ARG KAGGLE_USERNAME
 ARG KAGGLE_KEY
 ENV CUTOFF=${CUTOFF}
 ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
 ENV KAGGLE_KEY=${KAGGLE_KEY}
 # Create /app in the container (if it does not exist) and switch to it; all
 # following RUN, CMD, ENTRYPOINT, COPY and ADD instructions run relative to it.
 WORKDIR /app
 # Copy only the dependency list first so the pip layer below is reused from
 # the build cache whenever the sources change but the requirements do not.
 COPY requirements.txt /app/
 RUN python3 -m pip install -r requirements.txt
 # Copy the rest of the project into /app.
 COPY . /app/
--- a/10
+++ b/10
@ -1,4 +1,5 @@
 node {
    docker.image('s444452/ium:1.0').inside {
        stage('Preparation') {
            properties([
                parameters([
@ -14,7 +15,7 @@ node {
                 name: 'KAGGLE_KEY'
                ),
                string(
-                defaultValue: "1000",
+                 defaultValue: "10000",
                 description: 'Determine the size of dataset',
                 name: 'CUTOFF'
                )
@ -28,11 +29,12 @@ node {
            withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
                sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
-            sh "chmod u+x ./download_dataset.sh"
+                sh "chmod u+x ./lab2_data.py"
-            sh "./download_dataset.sh $CUTOFF"
+                sh "./lab2_data.py"
            }
        }
        stage('Archive artifacts') {
-        archiveArtifacts 'dataset.csv'
+            archiveArtifacts 'fake_job_postings.csv'
        }
    }
 }
--- a/figlet-loop.sh
+++ b/figlet-loop.sh
@ -0,0 +1,4 @@
 #!/bin/bash
 # Render every line read from standard input as a figlet banner.
 #
 # IFS= preserves leading/trailing whitespace and -r keeps backslashes
 # literal, so each line reaches figlet exactly as it was typed. The
 # `|| [ -n "$line" ]` clause also handles a final line that is not
 # terminated by a newline, which plain `read` would otherwise drop.
 while IFS= read -r line || [ -n "$line" ]; do
 	figlet "$line"
 done
--- a/lab2_data.py
+++ b/lab2_data.py
@ -6,9 +6,7 @@ from sklearn.model_selection import train_test_split
 def download_and_save_dataset():
    api.authenticate() 
-    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',
+    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', unzip=True)
                               path='./data',
                               unzip=True)
 def split_dataset(data: DataFrame):
@ -26,7 +24,7 @@ def split_dataset(data: DataFrame):
 def main():
    # download_and_save_dataset()
-    df = read_csv('./data/fake_job_postings.csv')
+    df = read_csv('./fake_job_postings.csv')
    print(df.describe(include='all'))
    print(df.shape)
    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,19 @@
 certifi==2021.10.8
 charset-normalizer==2.0.12
 idna==3.3
 joblib==1.1.0
 kaggle==1.5.12
 numpy==1.22.3
 pandas==1.4.1
 python-dateutil==2.8.2
 python-slugify==6.1.1
 pytz==2022.1
 requests==2.27.1
 scikit-learn==1.0.2
 scipy==1.8.0
 six==1.16.0
 sklearn==0.0
 text-unidecode==1.3
 threadpoolctl==3.1.0
 tqdm==4.63.1
 urllib3==1.26.9