diff --git a/Dockerfile b/Dockerfile index 7d40e45..543bb11 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,31 +1,20 @@ FROM ubuntu:latest +FROM python:3.8 RUN apt update # Stwórzmy w kontenerze (jeśli nie istnieje) katalog /app i przejdźmy do niego (wszystkie kolejne polecenia RUN, CMD, ENTRYPOINT, COPY i ADD będą w nim wykonywane) COPY ./requirements.txt . RUN apt-get update -RUN apt-get install -y python3-pip -RUN pip3 install --upgrade pip -RUN pip3 install pandas -RUN pip3 install numpy -RUN pip install kaggle -RUN apt-get install zip unzip +RUN pip3 install -r requirements.txt +RUN apt-get install zip unzip --yes -ARG CUTOFF -ARG KAGGLE_USERNAME -ARG KAGGLE_KEY -ENV CUTOFF=${CUTOFF} -ENV KAGGLE_USERNAME=${KAGGLE_USERNAME} -ENV KAGGLE_KEY=${KAGGLE_KEY} - -# Skopiujmy nasz skrypt do katalogu /app w kontenerze -RUN mkdir /data WORKDIR /app COPY ./process_data.sh . COPY ./download_data_and_process.py . COPY ./stats.py . +COPY ./real-or-fake-fake-jobposting-prediction.zip . RUN chmod +x process_data.sh -RUN ./process_data.sh +CMD python3 download_data_and_process.py diff --git a/Jenkinsfile b/Jenkinsfile index 027f3d2..7d09333 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -34,10 +34,6 @@ pipeline { withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) { - sh './process_data.sh' - sh 'ls' - sh 'wc data_train.csv' - sh 'cat column_titles.csv' sh 'python3 ./download_data_and_process.py' archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv" } diff --git a/download_data_and_process.py b/download_data_and_process.py index b4dd4f2..c46f28b 100644 --- a/download_data_and_process.py +++ b/download_data_and_process.py @@ -1,6 +1,10 @@ import subprocess import pandas as pd import numpy as np + +import subprocess + +rc = subprocess.call("./process_data.sh") # import kaggle # kaggle.api.authenticate() @@ -8,9 +12,6 @@ import numpy as np - - - data=pd.read_csv('fake_job_postings.csv') data = data.replace(np.nan, '', regex=True) diff --git a/process_data.sh b/process_data.sh index f152fcd..8895f0c 100755 --- a/process_data.sh +++ b/process_data.sh @@ -1,14 +1,8 @@ #!/bin/bash echo "Download data from kaggle" echo $KAGGLE_USERNAME -kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction +# kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction unzip -o real-or-fake-fake-jobposting-prediction.zip -ls -wc fake_job_postings.csv -cp fake_job_postings.csv /data/fake_job_postings.csv -cd data -ls -wc fake_job_postings.csv echo "Save column titles" head -n 1 fake_job_postings.csv > column_titles.csv tail -n +2 fake_job_postings.csv > data_not_shuf.csv diff --git a/real-or-fake-fake-jobposting-prediction.zip b/real-or-fake-fake-jobposting-prediction.zip new file mode 100644 index 0000000..d8b4583 Binary files /dev/null and b/real-or-fake-fake-jobposting-prediction.zip differ diff --git a/requirements.txt b/requirements.txt index 84dc8b2..c37d049 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ -pandas==1.1.5 -numpy==1.22.3 \ No newline at end of file +pandas +numpy +kaggle \ No newline at end of file