Added the dataset to the repository because of an unresolved problem with Kaggle permissions in Jenkins :(

This commit is contained in:
Mikołaj Pokrywka 2022-04-09 10:10:16 +02:00
parent a030c6fb81
commit 23acd59d7a
6 changed files with 13 additions and 32 deletions

View File

@ -1,31 +1,20 @@
FROM ubuntu:latest FROM ubuntu:latest
FROM python:3.8
RUN apt update RUN apt update
# Stwórzmy w kontenerze (jeśli nie istnieje) katalog /app i przejdźmy do niego (wszystkie kolejne polecenia RUN, CMD, ENTRYPOINT, COPY i ADD będą w nim wykonywane) # Stwórzmy w kontenerze (jeśli nie istnieje) katalog /app i przejdźmy do niego (wszystkie kolejne polecenia RUN, CMD, ENTRYPOINT, COPY i ADD będą w nim wykonywane)
COPY ./requirements.txt . COPY ./requirements.txt .
RUN apt-get update RUN apt-get update
RUN apt-get install -y python3-pip RUN pip3 install -r requirements.txt
RUN pip3 install --upgrade pip RUN apt-get install zip unzip --yes
RUN pip3 install pandas
RUN pip3 install numpy
RUN pip install kaggle
RUN apt-get install zip unzip
ARG CUTOFF
ARG KAGGLE_USERNAME
ARG KAGGLE_KEY
ENV CUTOFF=${CUTOFF}
ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
ENV KAGGLE_KEY=${KAGGLE_KEY}
# Skopiujmy nasz skrypt do katalogu /app w kontenerze
RUN mkdir /data
WORKDIR /app WORKDIR /app
COPY ./process_data.sh . COPY ./process_data.sh .
COPY ./download_data_and_process.py . COPY ./download_data_and_process.py .
COPY ./stats.py . COPY ./stats.py .
COPY ./real-or-fake-fake-jobposting-prediction.zip .
RUN chmod +x process_data.sh RUN chmod +x process_data.sh
RUN ./process_data.sh CMD python3 download_data_and_process.py

4
Jenkinsfile vendored
View File

@ -34,10 +34,6 @@ pipeline {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}", "KAGGLE_KEY=${params.KAGGLE_KEY}",
"CUTOFF=${params.CUTOFF}"]) { "CUTOFF=${params.CUTOFF}"]) {
sh './process_data.sh'
sh 'ls'
sh 'wc data_train.csv'
sh 'cat column_titles.csv'
sh 'python3 ./download_data_and_process.py' sh 'python3 ./download_data_and_process.py'
archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv" archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv, column_titles.csv, data.csv"
} }

View File

@ -1,6 +1,10 @@
import subprocess import subprocess
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import subprocess
rc = subprocess.call("./process_data.sh")
# import kaggle # import kaggle
# kaggle.api.authenticate() # kaggle.api.authenticate()
@ -8,9 +12,6 @@ import numpy as np
data=pd.read_csv('fake_job_postings.csv') data=pd.read_csv('fake_job_postings.csv')
data = data.replace(np.nan, '', regex=True) data = data.replace(np.nan, '', regex=True)

View File

@ -1,14 +1,8 @@
#!/bin/bash #!/bin/bash
echo "Download data from kaggle" echo "Download data from kaggle"
echo $KAGGLE_USERNAME echo $KAGGLE_USERNAME
kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction # kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction
unzip -o real-or-fake-fake-jobposting-prediction.zip unzip -o real-or-fake-fake-jobposting-prediction.zip
ls
wc fake_job_postings.csv
cp fake_job_postings.csv /data/fake_job_postings.csv
cd data
ls
wc fake_job_postings.csv
echo "Save column titles" echo "Save column titles"
head -n 1 fake_job_postings.csv > column_titles.csv head -n 1 fake_job_postings.csv > column_titles.csv
tail -n +2 fake_job_postings.csv > data_not_shuf.csv tail -n +2 fake_job_postings.csv > data_not_shuf.csv

Binary file not shown.

View File

@ -1,2 +1,3 @@
pandas==1.1.5 pandas
numpy==1.22.3 numpy
kaggle