From 87211b61b908f8008d8e2a616423d64ee2212bab Mon Sep 17 00:00:00 2001
From: AdamOsiowy123 <adaosi@st.amu.edu.pl>
Date: Sun, 3 Apr 2022 23:22:54 +0200
Subject: [PATCH] download-dataset with docker

---
 Dockerfile       | 19 ++++++++++++++++
 Jenkinsfile      | 58 +++++++++++++++++++++++++-----------------------
 figlet-loop.sh   |  4 ++++
 lab2_data.py     |  6 ++---
 requirements.txt | 19 ++++++++++++++++
 5 files changed, 74 insertions(+), 32 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 figlet-loop.sh
 create mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..03288ea
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,19 @@
+# Pin the base image so rebuilds are reproducible; "latest" moves to new Ubuntu releases whose Python may not match requirements.txt.
+FROM ubuntu:20.04
+
+# Install Python, pip and vim without prompts ("-y", noninteractive) and drop the apt lists in the same layer to keep it small.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python3 \
+    python3-pip \
+    vim \
+    && rm -rf /var/lib/apt/lists/*
+
+# CUTOFF, KAGGLE_USERNAME and KAGGLE_KEY are supplied when the container runs (e.g. "docker run -e" or the CI job environment).
+# They are deliberately not set with ENV: without a matching ARG they would expand to empty strings, and credentials must not be baked into the image.
+
+# Create /app in the image (if it does not exist) and make it the working directory for the instructions below.
+WORKDIR /app
+
+# Install the Python dependencies before copying the sources, so this layer stays cached until requirements.txt changes.
+COPY requirements.txt /app/
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+COPY . /app/
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
index b204b5a..6ec9107 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,38 +1,40 @@
 node {
-    stage('Preparation') {
-     properties([
-         parameters([
-             string(
+    docker.image('s444452/ium:1.0').inside {
+        stage('Preparation') {
+            properties([
+                parameters([
+                string(
                  defaultValue: 'adamosiowy',
                  description: 'Kaggle username',
                  name: 'KAGGLE_USERNAME',
                  trim: false
-             ),
-             password(
+                ),
+                password(
                  defaultValue: '',
                  description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
                  name: 'KAGGLE_KEY'
-             ),
-             string(
-                defaultValue: "1000",
-                description: 'Determine the size of dataset',
-                name: 'CUTOFF'
-             )
-         ])
-     ])
-    }
-    stage('Clone repository') {
-        checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: '5e0a58a0-03ad-41dd-beff-7b8a07c7fe0c', url: 'https://git.wmi.amu.edu.pl/s444452/ium_444452.git']]])
-    }
-    stage('Run script') {
-        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-              "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
-            sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
-            sh "chmod u+x ./download_dataset.sh"
-            sh "./download_dataset.sh $CUTOFF"
+                ),
+                string(
+                 defaultValue: "10000",
+                 description: 'Determine the size of dataset',
+                 name: 'CUTOFF'
+                )
+                ])
+            ])
+        }
+        stage('Clone repository') {
+            checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: '5e0a58a0-03ad-41dd-beff-7b8a07c7fe0c', url: 'https://git.wmi.amu.edu.pl/s444452/ium_444452.git']]])
+        }
+        stage('Run script') { // Export the job parameters as environment variables, then run the data script.
+            withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
+                sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
+                // Call the interpreter explicitly so the script needs neither a shebang line nor the executable bit.
+                sh "python3 ./lab2_data.py"
+            }
+        }
+        stage('Archive artifacts') {
+            archiveArtifacts 'fake_job_postings.csv'
         }
     }
-    stage('Archive artifacts') {
-        archiveArtifacts 'dataset.csv'
-    }
-}
+}
\ No newline at end of file
diff --git a/figlet-loop.sh b/figlet-loop.sh
new file mode 100644
index 0000000..6c85ffc
--- /dev/null
+++ b/figlet-loop.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+while IFS= read -r line || [ -n "$line" ]; do  # Render each stdin line with figlet; IFS=/-r keep the text verbatim, the -n test covers a last line without a trailing newline.
+	figlet "$line"
+done
\ No newline at end of file
diff --git a/lab2_data.py b/lab2_data.py
index 4efbc6d..49294fc 100644
--- a/lab2_data.py
+++ b/lab2_data.py
@@ -6,9 +6,7 @@ from sklearn.model_selection import train_test_split
 
 def download_and_save_dataset():
     api.authenticate() 
-    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',
-                               path='./data',
-                               unzip=True)
+    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', unzip=True)
 
 
 def split_dataset(data: DataFrame):
@@ -26,7 +24,7 @@ def split_dataset(data: DataFrame):
 
 def main():
     # download_and_save_dataset()
-    df = read_csv('./data/fake_job_postings.csv')
+    df = read_csv('./fake_job_postings.csv')
     print(df.describe(include='all'))
     print(df.shape)
     x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8e75929
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,19 @@
+certifi==2021.10.8
+charset-normalizer==2.0.12
+idna==3.3
+joblib==1.1.0
+kaggle==1.5.12
+numpy==1.22.3
+pandas==1.4.1
+python-dateutil==2.8.2
+python-slugify==6.1.1
+pytz==2022.1
+requests==2.27.1
+scikit-learn==1.0.2
+scipy==1.8.0
+six==1.16.0
+sklearn==0.0
+text-unidecode==1.3
+threadpoolctl==3.1.0
+tqdm==4.63.1
+urllib3==1.26.9