download-dataset with docker

This commit is contained in:
AdamOsiowy123 2022-04-03 23:22:54 +02:00
parent f9e12e8e3c
commit 87211b61b9
5 changed files with 74 additions and 32 deletions

19
Dockerfile Normal file
View File

@ -0,0 +1,19 @@
# syntax=docker/dockerfile:1
# Pin the base image instead of :latest so builds are reproducible (DL3007).
FROM ubuntu:22.04

# Install the Python runtime. Use apt-get (stable CLI, DL3027), combine
# update+install in one layer (DL3009), skip recommended packages, and remove
# the apt lists in the same layer so they never persist in the image (DL3015).
# Note the "-y" flag (assume yes) for non-interactive installs.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        vim \
    && rm -rf /var/lib/apt/lists/*

# Declare the build arguments explicitly — previously ${CUTOFF} etc. were
# never declared as ARGs, so every ENV below always expanded to "".
ARG CUTOFF
ARG KAGGLE_USERNAME
ARG KAGGLE_KEY
# NOTE(review): credentials baked into ENV end up in the image layers and in
# `docker history`. Prefer injecting them at container start (docker run -e /
# Jenkins withEnv, which the pipeline already does) or BuildKit secret mounts.
ENV CUTOFF=${CUTOFF}
ENV KAGGLE_USERNAME=${KAGGLE_USERNAME}
ENV KAGGLE_KEY=${KAGGLE_KEY}

# Create (if missing) and switch to /app; every subsequent RUN, CMD,
# ENTRYPOINT, COPY and ADD instruction executes relative to it.
WORKDIR /app

# Copy the dependency manifest first so the pip layer stays cached until
# requirements.txt itself changes, then copy the rest of the build context.
COPY requirements.txt /app/
RUN python3 -m pip install --no-cache-dir -r requirements.txt
COPY . /app/

10
Jenkinsfile vendored
View File

@ -1,4 +1,5 @@
node {
docker.image('s444452/ium:1.0').inside {
stage('Preparation') {
properties([
parameters([
@ -14,7 +15,7 @@ node {
name: 'KAGGLE_KEY'
),
string(
defaultValue: "1000",
defaultValue: "10000",
description: 'Determine the size of dataset',
name: 'CUTOFF'
)
@ -28,11 +29,12 @@ node {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
sh "chmod u+x ./download_dataset.sh"
sh "./download_dataset.sh $CUTOFF"
sh "chmod u+x ./lab2_data.py"
sh "./lab2_data.py"
}
}
stage('Archive artifacts') {
archiveArtifacts 'dataset.csv'
archiveArtifacts 'fake_job_postings.csv'
}
}
}

4
figlet-loop.sh Normal file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Read stdin line by line and render each line as ASCII art via figlet.
# IFS= preserves leading/trailing whitespace in the line, and -r keeps
# backslashes literal — a bare `read line` (SC2162) would mangle both.
while IFS= read -r line; do
    figlet "$line"
done

View File

@ -6,9 +6,7 @@ from sklearn.model_selection import train_test_split
def download_and_save_dataset():
api.authenticate()
api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',
path='./data',
unzip=True)
api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction', unzip=True)
def split_dataset(data: DataFrame):
@ -26,7 +24,7 @@ def split_dataset(data: DataFrame):
def main():
# download_and_save_dataset()
df = read_csv('./data/fake_job_postings.csv')
df = read_csv('./fake_job_postings.csv')
print(df.describe(include='all'))
print(df.shape)
x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)

19
requirements.txt Normal file
View File

@ -0,0 +1,19 @@
certifi==2021.10.8
charset-normalizer==2.0.12
idna==3.3
joblib==1.1.0
kaggle==1.5.12
numpy==1.22.3
pandas==1.4.1
python-dateutil==2.8.2
python-slugify==6.1.1
pytz==2022.1
requests==2.27.1
scikit-learn==1.0.2
scipy==1.8.0
six==1.16.0
sklearn==0.0
text-unidecode==1.3
threadpoolctl==3.1.0
tqdm==4.63.1
urllib3==1.26.9