# Patch summary (text before the first "diff --git" header is skipped by git apply / patch):
#   * Dockerfile: copy the contents of ./scripts/ to /, copy ./kaggle.json to
#     /root/.kaggle/kaggle.json, run /load_data.sh and then /grab_avocado.py.
#   * Delete figlet-loop.sh.
#   * Move Jenkinsfile, Jenkinsfile_docker and Jenkinstats under jenkins/, and
#     data_stats.sh and load_data.sh under scripts/; Jenkinsfile_docker now calls
#     ./scripts/data_stats.sh.
#   * Add scripts/grab_avocado.py: reads avocado.csv (dropping the first column),
#     standardizes the listed float columns, splits off 2000 test rows and 2249
#     validation rows with random_state=3337, and writes avocado.data.{train,valid,test}.
#   * scripts/load_data.sh: the shell-based split and the results.txt stats are commented out.
#
# NOTE(review): the line structure of this patch was reconstructed from the hunk counts;
#   leading indentation of context lines is a best guess -- verify against the target
#   files (or apply with --ignore-whitespace).
# NOTE(review): StandardScaler is fit on the full frame before train_test_split, so
#   validation/test statistics influence the scaling -- confirm this is intended.
# NOTE(review): kaggle.json is presumably the Kaggle API credentials file; COPYing it
#   stores it in an image layer -- confirm this is acceptable given the existing
#   KAGGLE_USERNAME / KAGGLE_KEY build args.
diff --git a/Dockerfile b/Dockerfile
index 145d616..9f44b16 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,9 +14,10 @@ ARG KAGGLE_USERNAME
 ARG KAGGLE_KEY
 
 # Copy scripts to the catalog
-COPY ./load_data.sh /
-# COPY ./kaggle.json /root/.kaggle/kaggle.json
+COPY ./scripts/. /
+COPY ./kaggle.json /root/.kaggle/kaggle.json
 
 # Run the copied script
-RUN chmod +x /load_data.sh
-RUN /load_data.sh
\ No newline at end of file
+RUN chmod +x /load_data.sh && /load_data.sh
+
+RUN chmod +x /grab_avocado.py && python3 /grab_avocado.py
\ No newline at end of file
diff --git a/figlet-loop.sh b/figlet-loop.sh
deleted file mode 100644
index 723692a..0000000
--- a/figlet-loop.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-while read line; do
-    figlet "$line"
-done
\ No newline at end of file
diff --git a/Jenkinsfile b/jenkins/Jenkinsfile
similarity index 100%
rename from Jenkinsfile
rename to jenkins/Jenkinsfile
diff --git a/Jenkinsfile_docker b/jenkins/Jenkinsfile_docker
similarity index 88%
rename from Jenkinsfile_docker
rename to jenkins/Jenkinsfile_docker
index 4f175bd..2636faf 100644
--- a/Jenkinsfile_docker
+++ b/jenkins/Jenkinsfile_docker
@@ -10,8 +10,8 @@ pipeline {
     stages {
         stage('sh: Shell script') {
             steps {
-                sh 'chmod u+x ./data_stats.sh'
-                sh './data_stats.sh'
+                sh 'chmod u+x ./scripts/data_stats.sh'
+                sh './scripts/data_stats.sh'
             }
         }
         stage('Archive arifacts') {
diff --git a/Jenkinstats b/jenkins/Jenkinstats
similarity index 100%
rename from Jenkinstats
rename to jenkins/Jenkinstats
diff --git a/data_stats.sh b/scripts/data_stats.sh
similarity index 100%
rename from data_stats.sh
rename to scripts/data_stats.sh
diff --git a/scripts/grab_avocado.py b/scripts/grab_avocado.py
new file mode 100644
index 0000000..b83a2fc
--- /dev/null
+++ b/scripts/grab_avocado.py
@@ -0,0 +1,27 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+
+cols = list(pd.read_csv("avocado.csv", nrows=1))
+# print("###\n", cols, "\n###")
+avocados = pd.read_csv("avocado.csv", usecols=cols[1:])
+avocados.describe(include="all")
+
+float_cols = ['AveragePrice','Total Volume','4046','4225','4770','Total Bags','Small Bags','Large Bags','XLarge Bags']
+
+avocados.loc[:, float_cols] = StandardScaler().fit_transform(avocados.loc[:, float_cols])
+print(avocados.head())
+
+# avocados.loc[:, float_cols] = MinMaxScaler().fit_transform(avocados.loc[:, float_cols])
+# print(avocados.head())
+
+avocado_train, avocado_test = train_test_split(avocados, test_size=2000, random_state=3337)
+avocado_train, avocado_valid = train_test_split(avocado_train, test_size=2249, random_state=3337)
+
+print("Train\n", avocado_train.describe(include="all"), "\n")
+print("Valid\n", avocado_valid.describe(include="all"), "\n")
+print("Test\n", avocado_test.describe(include="all"))
+
+avocado_train.to_csv("avocado.data.train", index=False)
+avocado_valid.to_csv("avocado.data.valid", index=False)
+avocado_test.to_csv("avocado.data.test", index=False)
diff --git a/load_data.sh b/scripts/load_data.sh
similarity index 56%
rename from load_data.sh
rename to scripts/load_data.sh
index 36d642b..e032fb3 100644
--- a/load_data.sh
+++ b/scripts/load_data.sh
@@ -13,14 +13,14 @@ echo "Loading dataset..."
 kaggle datasets download -d neuromusic/avocado-prices
 echo "Extracting files from zip archive..."
 unzip -o avocado-prices.zip
-
+echo Done
 # Dividing data
-echo "Start the data splitting..."
-tail -n +2 avocado.csv | shuf > avocado_shuf.csv
-head -n 14000 avocado_shuf.csv > avocado.data.train
-tail -n +14001 avocado_shuf.csv | head -n 2249 > avocado.data.valid
-tail -n 2000 avocado_shuf.csv > avocado.data.test
+# echo "Start the data splitting..."
+# tail -n +2 avocado.csv | shuf > avocado_shuf.csv
+# head -n 14000 avocado_shuf.csv > avocado.data.train
+# tail -n +14001 avocado_shuf.csv | head -n 2249 > avocado.data.valid
+# tail -n 2000 avocado_shuf.csv > avocado.data.test
 
 # Saving simple stats in a text file
-echo "Getting simple stats..."
-wc -l avocado.data* > results.txt
\ No newline at end of file
+# echo "Getting simple stats..."
+# wc -l avocado.data* > results.txt
\ No newline at end of file