From 924738d4b38e36b45e6d159a742fac8a46c57db3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Ga=C5=82=C4=85zkiewicz?= <cgalazkiewicz@gmail.com>
Date: Mon, 4 Apr 2022 01:27:04 +0200
Subject: [PATCH] Zad 04.Jenkins - Konteneryzacja

---
 Dockerfile          | 19 +++++++++++++++++++
 download_dataset.sh |  2 ++
 process_dataset.py  | 25 +++++++++++++++++++++++++
 3 files changed, 46 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 download_dataset.sh
 create mode 100644 process_dataset.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..45f20d3
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,19 @@
+# Image that runs process_dataset.py; the base release is pinned for reproducible builds.
+FROM ubuntu:22.04
+
+# Update and install in one layer so a stale cached package index is never reused.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# scikit-learn is the real PyPI project; the "sklearn" alias is deprecated and fails to install.
+RUN python3 -m pip install --no-cache-dir kaggle pandas seaborn scikit-learn
+
+# No Kaggle credentials are baked into the image (the previously committed key must be rotated).
+# Supply KAGGLE_USERNAME and KAGGLE_KEY at run time, e.g. `docker run -e ...` or Jenkins credentials.
+
+WORKDIR /ium
+COPY ./Steel_industry_data.csv ./process_dataset.py ./
+
+# Split the dataset and print its statistics.
+CMD ["python3", "process_dataset.py"]
\ No newline at end of file
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100644
index 0000000..a6db0e7
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,2 @@
+kaggle datasets download -d csafrit2/steel-industry-energy-consumption --force
+unzip -o -j steel-industry-energy-consumption.zip
\ No newline at end of file
diff --git a/process_dataset.py b/process_dataset.py
new file mode 100644
index 0000000..45e0340
--- /dev/null
+++ b/process_dataset.py
@@ -0,0 +1,25 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+energy_data = pd.read_csv('Steel_industry_data.csv')
+
+train_data, test_data = train_test_split(energy_data, test_size=7008, random_state=1)
+test_data, dev_data = train_test_split(test_data, test_size=3504, random_state=1)
+
+# stats
+print(energy_data.describe(include='all'))
+
+print('Training set size:')
+print(train_data.shape)
+print('Testing set size:')
+print(test_data.shape)
+print('Dev set size:')
+print(dev_data.shape)
+
+#print(train_data.describe(include='all'))
+#print(test_data.describe(include='all'))
+#print(dev_data.describe(include='all'))
+
+test_data.to_csv("steel_industry_data_test.csv", encoding="utf-8", index=False)
+dev_data.to_csv("steel_industry_data_dev.csv", encoding="utf-8", index=False)
+train_data.to_csv("steel_industry_data_train.csv", encoding="utf-8", index=False)