a

2022-05-03 15:55:42 +02:00 · 2022-05-03 15:55:42 +02:00 · 40617e67f9
commit 40617e67f9
parent ac8d2b740d
8 changed files with 4 additions and 19489 deletions
--- a/3
+++ b/3
@@ -17,4 +17,5 @@ COPY ./download_data_and_process.py .
 COPY ./stats.py .
 COPY ./real-or-fake-fake-jobposting-prediction.zip .
 RUN chmod +x process_data.sh
-CMD python3 download_data_and_process.py
+RUN ls
+# CMD python3 download_data_and_process.py
--- a/4
+++ b/4
@@ -34,8 +34,8 @@ pipeline {
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}",
                         "CUTOFF=${params.CUTOFF}"]) {
-                            sh 'python3 ./download_data_and_process.py'
-                            archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv"
+                            sh './process_data.sh'
+                            archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, fake_job_postings.csv"
                }
            }
        }
--- a/Jenkinsfile.stats
+++ b/Jenkinsfile.stats
@@ -1,30 +0,0 @@
-pipeline {
-    agent { 
-        docker { image 'mikolajpokrywka/ium:0.0.0' }
-    } 
-    parameters {
-        buildSelector(
-            defaultSelector: lastSuccessful(),
-            description: 'Which build to use for copying artifacts',
-            name: 'BUILD_SELECTOR'
-        )
-    }
-    stages {
-       stage('checkout: Check out from version control') {
-            steps { 
-                    checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444463', url: 'https://git.wmi.amu.edu.pl/s444463/ium_444463.git']]])
-                }
-        }
-        stage('Copy artifacts') {
-            steps { 
-                   copyArtifacts fingerprintArtifacts: true, projectName: 's444463-create-dataset', selector: buildParameter('BUILD_SELECTOR')
-                }
-        }
-        stage("Calcualte stats") {
-            steps {
-                sh "python3 stats.py"
-                archiveArtifacts artifacts: "stats.txt"
-            }
-        }
-    }
-}
--- a/README.md
+++ b/README.md
@@ -1,4 +0,0 @@
-# Run with docker
-`docker build  -t ium .`
-
-`docker run -e KAGGLE_USERNAME='your_kaggle_username' -e KAGGLE_KEY='<your_kaggle_key>' -e CUTOFF='1600' -it ium:latest`
--- a/download_data.ipynb
+++ b/download_data.ipynb
--- a/fake_job_postings.csv
+++ b/fake_job_postings.csv
--- a/stats.py
+++ b/stats.py
@@ -1,33 +0,0 @@
-import subprocess
-import pandas as pd
-import numpy as np
-import os
-
-
-path = ''
-
-all_files = ['column_titles.csv', 'data_train.csv', 'data_dev.csv', 'data_test.csv']
-
-data_file = open("data.csv", "w")
-for name in all_files:
-    f = open(name, "r")
-    data_file.write(f.read())
-    f.close()
-
-data_file.close()
-data=pd.read_csv('data.csv')
-data = data.replace(np.nan, '', regex=True)
-
-print("="*20)
-print('Ilość wierszy w zbiorze: ',len(data))
-
-print("="*10, ' data["department"].value_counts() ', 10*'=')
-print(data["department"].value_counts())
-
-print("="*10, ' data.median() ', 10*'=')
-print(data.median())
-
-print("="*10, ' data.describe(include="all") ', 10*'=')
-print(data.describe(include='all'))
-
-data.describe(include="all").to_csv(r'stats.txt', header=None, index=None, sep='\t', mode='a')
--- a/stats.sh
+++ b/stats.sh
@@ -1,2 +0,0 @@
-#!/bin/bash
-wc data_train.csv > stats.txt