diff --git a/Dockerfile b/Dockerfile index 67cde30..e8d6787 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM ubuntu:latest WORKDIR /app # Install required dependencies -ADD . . +COPY ./requirements.txt . RUN apt-get update && \ apt-get install -y python3.8 python3-pip figlet unzip RUN pip3 install -r requirements.txt @@ -15,7 +15,7 @@ ARG KAGGLE_KEY # Copy scripts to the catalog COPY ./scripts/. / -# COPY ./kaggle.json /root/.kaggle/kaggle.json +COPY ./kaggle.json /root/.kaggle/kaggle.json # Run the copied script RUN chmod +x /load_data.sh && /load_data.sh diff --git a/jenkins/Jenkinsfile_docker b/jenkins/Jenkinsfile_docker index d2c3aaa..2b43030 100644 --- a/jenkins/Jenkinsfile_docker +++ b/jenkins/Jenkinsfile_docker @@ -12,7 +12,7 @@ pipeline { steps { sh 'chmod u+x ./scripts/data_stats.sh' sh './scripts/data_stats.sh' - archiveArtifacts artifacts: 'avocado.data*', followSymlinks: false + archiveArtifacts artifacts: '*/avocado.data*', followSymlinks: false } } // stage('Archive arifacts') { diff --git a/scripts/grab_avocado.py b/scripts/grab_avocado.py index b83a2fc..86d3cf9 100644 --- a/scripts/grab_avocado.py +++ b/scripts/grab_avocado.py @@ -2,9 +2,9 @@ import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, MinMaxScaler -cols = list(pd.read_csv("avocado.csv", nrows=1)) +cols = list(pd.read_csv("data/avocado.csv", nrows=1)) # print("###\n", cols, "\n###") -avocados = pd.read_csv("avocado.csv", usecols=cols[1:]) +avocados = pd.read_csv("data/avocado.csv", usecols=cols[1:]) avocados.describe(include="all") float_cols = ['AveragePrice','Total Volume','4046','4225','4770','Total Bags','Small Bags','Large Bags','XLarge Bags'] @@ -22,6 +22,6 @@ print("Train\n", avocado_train.describe(include="all"), "\n") print("Valid\n", avocado_valid.describe(include="all"), "\n") print("Test\n", avocado_test.describe(include="all")) -avocado_train.to_csv("avocado.data.train", index=False) 
-avocado_valid.to_csv("avocado.data.valid", index=False)
-avocado_test.to_csv("avocado.data.test", index=False)
+avocado_train.to_csv("data/avocado.data.train", index=False)
+avocado_valid.to_csv("data/avocado.data.valid", index=False)
+avocado_test.to_csv("data/avocado.data.test", index=False)
diff --git a/scripts/load_data.sh b/scripts/load_data.sh
index e032fb3..e1f2660 100644
--- a/scripts/load_data.sh
+++ b/scripts/load_data.sh
@@ -13,6 +13,9 @@ echo "Loading dataset..."
 kaggle datasets download -d neuromusic/avocado-prices
 echo "Extracting files from zip archive..."
 unzip -o avocado-prices.zip
+rm avocado-prices.zip
+mkdir -p data
+mv avocado.csv data/.
 echo Done
 # Dividing data
 # echo "Start the data splitting..."