Docker create-dataset

This commit is contained in:
MatOgr 2022-04-03 19:39:46 +02:00
parent 930c65a826
commit 3d13d4ca4a
8 changed files with 42 additions and 18 deletions

View File

@ -14,9 +14,10 @@ ARG KAGGLE_USERNAME
ARG KAGGLE_KEY ARG KAGGLE_KEY
# Copy scripts to the catalog # Copy scripts to the catalog
COPY ./load_data.sh / COPY ./scripts/. /
# COPY ./kaggle.json /root/.kaggle/kaggle.json COPY ./kaggle.json /root/.kaggle/kaggle.json
# Run the copied script # Run the copied script
RUN chmod +x /load_data.sh RUN chmod +x /load_data.sh && /load_data.sh
RUN /load_data.sh
RUN chmod +x /grab_avocado.py && python3 /grab_avocado.py

View File

@ -1,4 +0,0 @@
#!/bin/bash
while read line; do
figlet "$line"
done

View File

View File

@ -10,8 +10,8 @@ pipeline {
stages { stages {
stage('sh: Shell script') { stage('sh: Shell script') {
steps { steps {
sh 'chmod u+x ./data_stats.sh' sh 'chmod u+x ./scripts/data_stats.sh'
sh './data_stats.sh' sh './scripts/data_stats.sh'
} }
} }
stage('Archive arifacts') { stage('Archive arifacts') {

27
scripts/grab_avocado.py Normal file
View File

@ -0,0 +1,27 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Prepare the avocado dataset: load it, split it into train/valid/test subsets,
# standardize the numeric columns and write each subset to its own CSV file.

# Load the CSV once and drop its first column. The original read the header
# separately only to build a `usecols` list that skipped that column.
avocados = pd.read_csv("avocado.csv").iloc[:, 1:]

# Numeric columns that get standardized.
float_cols = ['AveragePrice','Total Volume','4046','4225','4770','Total Bags','Small Bags','Large Bags','XLarge Bags']

# Split BEFORE scaling so that statistics of the validation/test rows never
# influence the transformation applied to the training rows (data leakage).
# The fixed random_state keeps the partition reproducible; sizes and seed are
# unchanged, so every row lands in the same subset as before.
avocado_train, avocado_test = train_test_split(avocados, test_size=2000, random_state=3337)
avocado_train, avocado_valid = train_test_split(avocado_train, test_size=2249, random_state=3337)

# Work on independent copies so the in-place column assignment below cannot
# touch (or warn about) data shared with the source frame.
avocado_train = avocado_train.copy()
avocado_valid = avocado_valid.copy()
avocado_test = avocado_test.copy()

# Fit the scaler on the training subset only, then apply that same fitted
# transformation to all three subsets.
# (MinMaxScaler can be substituted here if min-max scaling is preferred.)
scaler = StandardScaler().fit(avocado_train.loc[:, float_cols])
for subset in (avocado_train, avocado_valid, avocado_test):
    subset.loc[:, float_cols] = scaler.transform(subset.loc[:, float_cols])

# Quick look at the scaled training data and per-subset summaries in the build log.
print(avocado_train.head())
print("Train\n", avocado_train.describe(include="all"), "\n")
print("Valid\n", avocado_valid.describe(include="all"), "\n")
print("Test\n", avocado_test.describe(include="all"))

# Persist the subsets; the row index is not written out.
avocado_train.to_csv("avocado.data.train", index=False)
avocado_valid.to_csv("avocado.data.valid", index=False)
avocado_test.to_csv("avocado.data.test", index=False)

View File

@ -13,14 +13,14 @@ echo "Loading dataset..."
kaggle datasets download -d neuromusic/avocado-prices kaggle datasets download -d neuromusic/avocado-prices
echo "Extracting files from zip archive..." echo "Extracting files from zip archive..."
unzip -o avocado-prices.zip unzip -o avocado-prices.zip
echo Done
# Dividing data # Dividing data
echo "Start the data splitting..." # echo "Start the data splitting..."
tail -n +2 avocado.csv | shuf > avocado_shuf.csv # tail -n +2 avocado.csv | shuf > avocado_shuf.csv
head -n 14000 avocado_shuf.csv > avocado.data.train # head -n 14000 avocado_shuf.csv > avocado.data.train
tail -n +14001 avocado_shuf.csv | head -n 2249 > avocado.data.valid # tail -n +14001 avocado_shuf.csv | head -n 2249 > avocado.data.valid
tail -n 2000 avocado_shuf.csv > avocado.data.test # tail -n 2000 avocado_shuf.csv > avocado.data.test
# Saving simple stats in a text file # Saving simple stats in a text file
echo "Getting simple stats..." # echo "Getting simple stats..."
wc -l avocado.data* > results.txt # wc -l avocado.data* > results.txt