diff --git a/Dockerfile b/Dockerfile index a9b0d28..7bdc7f7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ RUN pip3 install --upgrade pip RUN pip3 install pandas RUN pip3 install numpy RUN pip3 install kaggle - +RUN apt-get install zip unzip ARG CUTOFF ARG KAGGLE_USERNAME ARG KAGGLE_KEY @@ -27,4 +27,4 @@ COPY ./process_data.sh . COPY ./download_data_and_process.py . COPY ./stats.py . -# RUN ./process_data.sh +RUN ./process_data.sh diff --git a/download_data_and_process.py b/download_data_and_process.py index a225f2d..b4dd4f2 100644 --- a/download_data_and_process.py +++ b/download_data_and_process.py @@ -1,17 +1,17 @@ import subprocess import pandas as pd import numpy as np -import kaggle +# import kaggle -kaggle.api.authenticate() -kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='fake_job_postings.csv', unzip=True) +# kaggle.api.authenticate() +# kaggle.api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', path='fake_job_postings.csv', unzip=True) -data=pd.read_csv('fake_job_postings.csv/fake_job_postings.csv') +data=pd.read_csv('fake_job_postings.csv') data = data.replace(np.nan, '', regex=True) print("="*20)