From 924738d4b38e36b45e6d159a742fac8a46c57db3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Ga=C5=82=C4=85zkiewicz?=
Date: Mon, 4 Apr 2022 01:27:04 +0200
Subject: [PATCH] Zad 04.Jenkins - Konteneryzacja

---
Notes (this area is ignored by git am and is not part of the commit message):
 * Adds a Dockerfile, a Kaggle download script and a dataset split script.
 * No Kaggle credentials are stored in the image. Provide KAGGLE_USERNAME and
   KAGGLE_KEY at run time, or mount a kaggle.json into /root/.kaggle.
   If an API key was ever committed to this repository, revoke it.
 * The base image is pinned and scikit-learn is installed under its real
   PyPI name.
 * The optional "index" header lines are omitted; they are not needed to
   apply a patch that only creates text files.

 Dockerfile          | 29 +++++++++++++++++++++++++++++
 download_dataset.sh |  8 ++++++++
 process_dataset.py  | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 Dockerfile
 create mode 100755 download_dataset.sh
 create mode 100644 process_dataset.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+# Runs process_dataset.py against a copy of Steel_industry_data.csv.
+# Pinned release: newer Ubuntu images mark the system Python as externally
+# managed (PEP 668), which makes the pip install below fail.
+FROM ubuntu:22.04
+
+# Update the index and install in a single layer so a cached, stale index is
+# never reused by a later install step.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip unzip \
+    && rm -rf /var/lib/apt/lists/*
+# The PyPI distribution is "scikit-learn"; the "sklearn" name is deprecated.
+# Installed system-wide so the kaggle console script lands on PATH.
+RUN pip install kaggle pandas seaborn scikit-learn
+
+# Credentials are deliberately not baked into the image. Supply them at run
+# time, e.g. docker run -e KAGGLE_USERNAME=... -e KAGGLE_KEY=... <image>,
+# or mount a kaggle.json into this directory.
+RUN mkdir ~/.kaggle/
+
+WORKDIR /ium
+
+#COPY ./download_dataset.sh ./
+COPY ./Steel_industry_data.csv ./
+COPY ./process_dataset.py ./
+#COPY ./stats.sh ./
+
+#CMD ./download_dataset.sh
+CMD python3 process_dataset.py
+#CMD ./stats.sh
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100755
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Download the steel-industry dataset from Kaggle and unpack it into the
+# current directory. Requires Kaggle credentials (KAGGLE_USERNAME/KAGGLE_KEY
+# in the environment, or ~/.kaggle/kaggle.json).
+set -e  # do not try to unzip if the download failed
+
+kaggle datasets download -d csafrit2/steel-industry-energy-consumption --force
+unzip -o -j steel-industry-energy-consumption.zip
diff --git a/process_dataset.py b/process_dataset.py
new file mode 100644
--- /dev/null
+++ b/process_dataset.py
@@ -0,0 +1,45 @@
+"""Split the steel-industry dataset into train/test/dev CSV files.
+
+Reads Steel_industry_data.csv from the working directory, prints summary
+statistics and the size of each subset, then writes the three subsets back
+to the working directory.
+"""
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+INPUT_CSV = 'Steel_industry_data.csv'
+# Rows held out of training; the hold-out is then split into test and dev.
+HOLDOUT_ROWS = 7008
+DEV_ROWS = 3504
+# Fixed seed so every run produces the same split.
+RANDOM_STATE = 1
+
+
+def main():
+    """Load the dataset, split it, report sizes and save the subsets."""
+    energy_data = pd.read_csv(INPUT_CSV)
+
+    train_data, holdout_data = train_test_split(
+        energy_data, test_size=HOLDOUT_ROWS, random_state=RANDOM_STATE)
+    # The "test" side of this second split becomes the dev set.
+    test_data, dev_data = train_test_split(
+        holdout_data, test_size=DEV_ROWS, random_state=RANDOM_STATE)
+
+    # stats
+    print(energy_data.describe(include='all'))
+
+    for label, subset in (('Training', train_data),
+                          ('Testing', test_data),
+                          ('Dev', dev_data)):
+        print(f'{label} set size:')
+        print(subset.shape)
+
+    for suffix, subset in (('test', test_data),
+                           ('dev', dev_data),
+                           ('train', train_data)):
+        subset.to_csv(f'steel_industry_data_{suffix}.csv',
+                      encoding='utf-8', index=False)
+
+
+if __name__ == '__main__':
+    main()