diff --git a/Dockerfile b/Dockerfile index 45f20d3..eb57ed5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,18 @@ FROM ubuntu:latest RUN apt update && apt install -y python3-pip -RUN apt install -y unzip -RUN pip install --user kaggle pandas seaborn sklearn +RUN apt install -y unzip python3 +RUN pip3 install kaggle pandas seaborn sklearn RUN mkdir ~/.kaggle/ RUN echo '{"username":"ikami1","key":"c70ff184133bfabb351608b128e76cd2"}' > ~/.kaggle/kaggle.json WORKDIR /ium -#COPY ./download_dataset.sh ./ -COPY ./Steel_industry_data.csv ./ -COPY ./process_dataset.py ./ -#COPY ./stats.sh ./ +COPY download_dataset.sh process_dataset.py stats.sh Steel_industry_data.csv ./ + +RUN chmod a+x download_dataset.sh process_dataset.py #CMD ./download_dataset.sh -CMD python3 process_dataset.py +#CMD python3 process_dataset.py #CMD ./stats.sh \ No newline at end of file diff --git a/IUM_dane02.ipynb b/IUM_dane02.ipynb index 462690f..afefe7d 100644 --- a/IUM_dane02.ipynb +++ b/IUM_dane02.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "id": "expected-payroll", "metadata": {}, "outputs": [ @@ -10,36 +10,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: kaggle in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (1.5.12)\n", - "Requirement already satisfied: six>=1.10 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (1.15.0)\n", - "Requirement already satisfied: certifi in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (2021.10.8)\n", - "Requirement already satisfied: python-dateutil in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (2.8.1)\n", - "Requirement already satisfied: requests in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (2.27.1)\n", - "Requirement already satisfied: tqdm in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (4.59.0)\n", - "Requirement already satisfied: python-slugify in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (6.1.1)\n", - "Requirement already satisfied: urllib3 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from kaggle) (1.26.9)\n", - "Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n", - "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->kaggle) (3.3)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->kaggle) (2.0.12)\n", - "Requirement already satisfied: pandas in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (1.4.1)\n", - "Requirement already satisfied: pytz>=2020.1 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas) (2022.1)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.18.5 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas) (1.20.1)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from python-dateutil>=2.8.1->pandas) (1.15.0)\n", - "Requirement already satisfied: seaborn in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (0.11.2)\n", - "Requirement already satisfied: pandas>=0.23 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from seaborn) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.15 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from seaborn) (1.20.1)\n", - "Requirement already satisfied: scipy>=1.0 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from seaborn) (1.6.1)\n", - "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from seaborn) (3.5.1)\n", - "Requirement already satisfied: packaging>=20.0 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (20.9)\n", - "Requirement already satisfied: pyparsing>=2.2.1 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", - "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (4.31.1)\n", - "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", - "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (9.0.1)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.4.0)\n", - "Requirement already satisfied: cycler>=0.10 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.11.0)\n", - "Requirement already satisfied: pytz>=2020.1 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas>=0.23->seaborn) (2022.1)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\cgala\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from python-dateutil>=2.7->matplotlib>=2.2->seaborn) (1.15.0)\n" + "Requirement already satisfied: kaggle in c:\\users\\cgala\\anaconda3\\lib\\site-packages (1.5.12)\n", + "Requirement already satisfied: tqdm in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (4.50.2)\n", + "Requirement already satisfied: requests in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (2.24.0)\n", + "Requirement already satisfied: certifi in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (2020.6.20)\n", + "Requirement already satisfied: six>=1.10 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (1.15.0)\n", + "Requirement already satisfied: python-slugify in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (6.1.1)\n", + "Requirement already satisfied: urllib3 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (1.25.11)\n", + "Requirement already satisfied: python-dateutil in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from kaggle) (2.8.1)\n", + "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from requests->kaggle) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from requests->kaggle) (3.0.4)\n", + "Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n", + "Requirement already satisfied: pandas in c:\\users\\cgala\\anaconda3\\lib\\site-packages (1.1.3)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: numpy>=1.15.4 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from pandas) (1.19.2)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from pandas) (2020.1)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", + "Requirement already satisfied: seaborn in c:\\users\\cgala\\anaconda3\\lib\\site-packages (0.11.0)\n", + "Requirement already satisfied: numpy>=1.15 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from seaborn) (1.19.2)\n", + "Requirement already satisfied: scipy>=1.0 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from seaborn) (1.5.2)\n", + "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from seaborn) (3.3.2)\n", + "Requirement already satisfied: pandas>=0.23 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from seaborn) (1.1.3)\n", + "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", + "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.0.1)\n", + "Requirement already satisfied: certifi>=2020.06.20 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2020.6.20)\n", + "Requirement already satisfied: pytz>=2017.2 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from pandas>=0.23->seaborn) (2020.1)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\cgala\\anaconda3\\lib\\site-packages (from python-dateutil>=2.1->matplotlib>=2.2->seaborn) (1.15.0)\n" ] } ], @@ -51,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "genetic-plaintiff", "metadata": {}, "outputs": [ @@ -69,18 +68,18 @@ "text": [ "\n", " 0%| | 0.00/484k [00:00\n", " \n", " count\n", - " 31536\n", - " 31536.000000\n", - " 31536.000000\n", - " 31536.000000\n", - " 31536.000000\n", - " 31536.000000\n", - " 31536.000000\n", - " 31536.000000\n", - " 31536\n", - " 31536\n", - " 31536\n", + " 28032\n", + " 28032.000000\n", + " 28032.000000\n", + " 28032.000000\n", + " 28032.000000\n", + " 28032.000000\n", + " 28032.000000\n", + " 28032.000000\n", + " 28032\n", + " 28032\n", + " 28032\n", " \n", " \n", " unique\n", - " 31536\n", + " 28032\n", " NaN\n", " NaN\n", " NaN\n", @@ -729,7 +728,7 @@ " \n", " \n", " top\n", - " 30/01/2018 00:15\n", + " 07/08/2018 14:15\n", " NaN\n", " NaN\n", " NaN\n", @@ -751,20 +750,20 @@ " NaN\n", " NaN\n", " NaN\n", - " 22514\n", - " 4560\n", - " 16280\n", + " 19998\n", + " 4087\n", + " 14467\n", " \n", " \n", " mean\n", " NaN\n", - " 27.369449\n", - " 13.037946\n", - " 3.866059\n", - " 0.011513\n", - " 80.525058\n", - " 84.410086\n", - " 42707.363014\n", + " 27.340174\n", + " 13.026801\n", + " 3.875001\n", + " 0.011498\n", + " 80.520145\n", + " 84.369511\n", + " 42761.429795\n", " NaN\n", " NaN\n", " NaN\n", @@ -772,13 +771,13 @@ " \n", " std\n", " NaN\n", - " 33.473304\n", - " 16.302910\n", - " 7.434250\n", - " 0.016159\n", - " 18.929571\n", - " 30.436675\n", - " 24968.193911\n", + " 33.469130\n", + " 16.289348\n", + " 7.445898\n", + " 0.016153\n", + " 18.932825\n", + " 30.462193\n", + " 24944.585138\n", " NaN\n", " NaN\n", " NaN\n", @@ -801,11 +800,11 @@ " 25%\n", " NaN\n", " 3.200000\n", - " 2.330000\n", + " 2.300000\n", " 0.000000\n", " 0.000000\n", - " 63.200000\n", - " 99.720000\n", + " 63.227500\n", + " 99.710000\n", " 20700.000000\n", " NaN\n", " NaN\n", @@ -818,9 +817,9 @@ " 5.000000\n", " 0.000000\n", " 0.000000\n", - " 87.900000\n", + " 87.870000\n", " 100.000000\n", - " 42300.000000\n", + " 43200.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -828,13 +827,13 @@ " \n", " 75%\n", " NaN\n", - " 51.230000\n", - " 22.650000\n", - " 1.980000\n", + " 51.190000\n", + " 22.750000\n", + " 2.020000\n", " 0.020000\n", - " 98.970000\n", + " 99.000000\n", " 100.000000\n", - " 63900.000000\n", + " 64800.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -859,59 +858,59 @@ ], "text/plain": [ " date Usage_kWh Lagging_Current_Reactive.Power_kVarh \\\n", - "count 31536 31536.000000 31536.000000 \n", - "unique 31536 NaN NaN \n", - "top 30/01/2018 00:15 NaN NaN \n", + "count 28032 28032.000000 28032.000000 \n", + "unique 28032 NaN NaN \n", + "top 07/08/2018 14:15 NaN NaN \n", "freq 1 NaN NaN \n", - "mean NaN 27.369449 13.037946 \n", - "std NaN 33.473304 16.302910 \n", + "mean NaN 27.340174 13.026801 \n", + "std NaN 33.469130 16.289348 \n", "min NaN 0.000000 0.000000 \n", - "25% NaN 3.200000 2.330000 \n", + "25% NaN 3.200000 2.300000 \n", "50% NaN 4.570000 5.000000 \n", - "75% NaN 51.230000 22.650000 \n", + "75% NaN 51.190000 22.750000 \n", "max NaN 157.180000 96.910000 \n", "\n", " Leading_Current_Reactive_Power_kVarh CO2(tCO2) \\\n", - "count 31536.000000 31536.000000 \n", + "count 28032.000000 28032.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 3.866059 0.011513 \n", - "std 7.434250 0.016159 \n", + "mean 3.875001 0.011498 \n", + "std 7.445898 0.016153 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", - "75% 1.980000 0.020000 \n", + "75% 2.020000 0.020000 \n", "max 27.760000 0.070000 \n", "\n", " Lagging_Current_Power_Factor Leading_Current_Power_Factor \\\n", - "count 31536.000000 31536.000000 \n", + "count 28032.000000 28032.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 80.525058 84.410086 \n", - "std 18.929571 30.436675 \n", + "mean 80.520145 84.369511 \n", + "std 18.932825 30.462193 \n", "min 0.000000 0.000000 \n", - "25% 63.200000 99.720000 \n", - "50% 87.900000 100.000000 \n", - "75% 98.970000 100.000000 \n", + "25% 63.227500 99.710000 \n", + "50% 87.870000 100.000000 \n", + "75% 99.000000 100.000000 \n", "max 100.000000 100.000000 \n", "\n", " NSM WeekStatus Day_of_week Load_Type \n", - "count 31536.000000 31536 31536 31536 \n", + "count 28032.000000 28032 28032 28032 \n", "unique NaN 2 7 3 \n", "top NaN Weekday Monday Light_Load \n", - "freq NaN 22514 4560 16280 \n", - "mean 42707.363014 NaN NaN NaN \n", - "std 24968.193911 NaN NaN NaN \n", + "freq NaN 19998 4087 14467 \n", + "mean 42761.429795 NaN NaN NaN \n", + "std 24944.585138 NaN NaN NaN \n", "min 0.000000 NaN NaN NaN \n", "25% 20700.000000 NaN NaN NaN \n", - "50% 42300.000000 NaN NaN NaN \n", - "75% 63900.000000 NaN NaN NaN \n", + "50% 43200.000000 NaN NaN NaN \n", + "75% 64800.000000 NaN NaN NaN \n", "max 85500.000000 NaN NaN NaN " ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -922,7 +921,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "radical-score", "metadata": {}, "outputs": [ @@ -963,21 +962,21 @@ " \n", " \n", " count\n", - " 1752\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752\n", - " 1752\n", - " 1752\n", + " 3504\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504\n", + " 3504\n", + " 3504\n", " \n", " \n", " unique\n", - " 1752\n", + " 3504\n", " NaN\n", " NaN\n", " NaN\n", @@ -991,7 +990,7 @@ " \n", " \n", " top\n", - " 07/05/2018 06:00\n", + " 20/06/2018 13:00\n", " NaN\n", " NaN\n", " NaN\n", @@ -1000,7 +999,7 @@ " NaN\n", " NaN\n", " Weekday\n", - " Tuesday\n", + " Wednesday\n", " Light_Load\n", " \n", " \n", @@ -1013,20 +1012,20 @@ " NaN\n", " NaN\n", " NaN\n", - " 1268\n", - " 291\n", - " 898\n", + " 2522\n", + " 527\n", + " 1837\n", " \n", " \n", " mean\n", " NaN\n", - " 27.330982\n", - " 12.649024\n", - " 3.949281\n", - " 0.011530\n", - " 81.364526\n", - " 83.630702\n", - " 43080.821918\n", + " 26.355685\n", + " 12.374717\n", + " 3.891093\n", + " 0.011050\n", + " 80.687751\n", + " 84.082794\n", + " 42594.092466\n", " NaN\n", " NaN\n", " NaN\n", @@ -1034,13 +1033,13 @@ " \n", " std\n", " NaN\n", - " 33.484216\n", - " 16.185283\n", - " 7.298637\n", - " 0.016224\n", - " 18.758338\n", - " 30.801180\n", - " 24944.325392\n", + " 32.519749\n", + " 15.830961\n", + " 7.353028\n", + " 0.015762\n", + " 19.053018\n", + " 30.614144\n", + " 25222.804637\n", " NaN\n", " NaN\n", " NaN\n", @@ -1052,7 +1051,7 @@ " 0.000000\n", " 0.000000\n", " 0.000000\n", - " 41.120000\n", + " 40.290000\n", " 12.540000\n", " 0.000000\n", " NaN\n", @@ -1062,13 +1061,13 @@ " \n", " 25%\n", " NaN\n", - " 3.200000\n", - " 1.392500\n", + " 3.192500\n", + " 2.090000\n", " 0.000000\n", " 0.000000\n", - " 64.630000\n", - " 99.180000\n", - " 21600.000000\n", + " 63.130000\n", + " 99.562500\n", + " 20700.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -1076,13 +1075,13 @@ " \n", " 50%\n", " NaN\n", - " 4.570000\n", - " 4.930000\n", + " 4.500000\n", + " 4.900000\n", " 0.000000\n", " 0.000000\n", - " 88.955000\n", + " 88.210000\n", " 100.000000\n", - " 43200.000000\n", + " 42300.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -1090,11 +1089,11 @@ " \n", " 75%\n", " NaN\n", - " 49.870000\n", - " 21.240000\n", - " 3.837500\n", + " 49.570000\n", + " 20.700000\n", + " 2.967500\n", " 0.020000\n", - " 99.852500\n", + " 99.390000\n", " 100.000000\n", " 64800.000000\n", " NaN\n", @@ -1104,9 +1103,9 @@ " \n", " max\n", " NaN\n", - " 143.930000\n", - " 87.700000\n", - " 27.540000\n", + " 153.140000\n", + " 82.940000\n", + " 27.650000\n", " 0.070000\n", " 100.000000\n", " 100.000000\n", @@ -1121,59 +1120,59 @@ ], "text/plain": [ " date Usage_kWh Lagging_Current_Reactive.Power_kVarh \\\n", - "count 1752 1752.000000 1752.000000 \n", - "unique 1752 NaN NaN \n", - "top 07/05/2018 06:00 NaN NaN \n", + "count 3504 3504.000000 3504.000000 \n", + "unique 3504 NaN NaN \n", + "top 20/06/2018 13:00 NaN NaN \n", "freq 1 NaN NaN \n", - "mean NaN 27.330982 12.649024 \n", - "std NaN 33.484216 16.185283 \n", + "mean NaN 26.355685 12.374717 \n", + "std NaN 32.519749 15.830961 \n", "min NaN 2.480000 0.000000 \n", - "25% NaN 3.200000 1.392500 \n", - "50% NaN 4.570000 4.930000 \n", - "75% NaN 49.870000 21.240000 \n", - "max NaN 143.930000 87.700000 \n", + "25% NaN 3.192500 2.090000 \n", + "50% NaN 4.500000 4.900000 \n", + "75% NaN 49.570000 20.700000 \n", + "max NaN 153.140000 82.940000 \n", "\n", " Leading_Current_Reactive_Power_kVarh CO2(tCO2) \\\n", - "count 1752.000000 1752.000000 \n", + "count 3504.000000 3504.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 3.949281 0.011530 \n", - "std 7.298637 0.016224 \n", + "mean 3.891093 0.011050 \n", + "std 7.353028 0.015762 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", - "75% 3.837500 0.020000 \n", - "max 27.540000 0.070000 \n", + "75% 2.967500 0.020000 \n", + "max 27.650000 0.070000 \n", "\n", " Lagging_Current_Power_Factor Leading_Current_Power_Factor \\\n", - "count 1752.000000 1752.000000 \n", + "count 3504.000000 3504.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 81.364526 83.630702 \n", - "std 18.758338 30.801180 \n", - "min 41.120000 12.540000 \n", - "25% 64.630000 99.180000 \n", - "50% 88.955000 100.000000 \n", - "75% 99.852500 100.000000 \n", + "mean 80.687751 84.082794 \n", + "std 19.053018 30.614144 \n", + "min 40.290000 12.540000 \n", + "25% 63.130000 99.562500 \n", + "50% 88.210000 100.000000 \n", + "75% 99.390000 100.000000 \n", "max 100.000000 100.000000 \n", "\n", " NSM WeekStatus Day_of_week Load_Type \n", - "count 1752.000000 1752 1752 1752 \n", + "count 3504.000000 3504 3504 3504 \n", "unique NaN 2 7 3 \n", - "top NaN Weekday Tuesday Light_Load \n", - "freq NaN 1268 291 898 \n", - "mean 43080.821918 NaN NaN NaN \n", - "std 24944.325392 NaN NaN NaN \n", + "top NaN Weekday Wednesday Light_Load \n", + "freq NaN 2522 527 1837 \n", + "mean 42594.092466 NaN NaN NaN \n", + "std 25222.804637 NaN NaN NaN \n", "min 0.000000 NaN NaN NaN \n", - "25% 21600.000000 NaN NaN NaN \n", - "50% 43200.000000 NaN NaN NaN \n", + "25% 20700.000000 NaN NaN NaN \n", + "50% 42300.000000 NaN NaN NaN \n", "75% 64800.000000 NaN NaN NaN \n", "max 85500.000000 NaN NaN NaN " ] }, - "execution_count": 13, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1184,7 +1183,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "id": "attempted-lafayette", "metadata": {}, "outputs": [ @@ -1225,21 +1224,21 @@ " \n", " \n", " count\n", - " 1752\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752.000000\n", - " 1752\n", - " 1752\n", - " 1752\n", + " 3504\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504.000000\n", + " 3504\n", + " 3504\n", + " 3504\n", " \n", " \n", " unique\n", - " 1752\n", + " 3504\n", " NaN\n", " NaN\n", " NaN\n", @@ -1253,7 +1252,7 @@ " \n", " \n", " top\n", - " 02/06/2018 02:00\n", + " 16/11/2018 16:45\n", " NaN\n", " NaN\n", " NaN\n", @@ -1262,7 +1261,7 @@ " NaN\n", " NaN\n", " Weekday\n", - " Monday\n", + " Tuesday\n", " Light_Load\n", " \n", " \n", @@ -1275,20 +1274,20 @@ " NaN\n", " NaN\n", " NaN\n", - " 1274\n", - " 275\n", - " 894\n", + " 2536\n", + " 543\n", + " 1768\n", " \n", " \n", " mean\n", " NaN\n", - " 27.756787\n", - " 13.375628\n", - " 3.880634\n", - " 0.011729\n", - " 80.745548\n", - " 84.345154\n", - " 43186.643836\n", + " 28.791849\n", + " 13.764709\n", + " 3.818382\n", + " 0.012212\n", + " 80.931650\n", + " 84.639817\n", + " 42814.469178\n", " NaN\n", " NaN\n", " NaN\n", @@ -1296,13 +1295,13 @@ " \n", " std\n", " NaN\n", - " 32.895802\n", - " 16.482148\n", - " 7.376468\n", - " 0.015943\n", - " 18.927378\n", - " 30.475427\n", - " 24440.888112\n", + " 34.115238\n", + " 16.872400\n", + " 7.325016\n", + " 0.016499\n", + " 18.696834\n", + " 30.258743\n", + " 24628.829557\n", " NaN\n", " NaN\n", " NaN\n", @@ -1310,12 +1309,12 @@ " \n", " min\n", " NaN\n", - " 2.520000\n", + " 2.480000\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", " 38.330000\n", - " 14.070000\n", + " 13.050000\n", " 0.000000\n", " NaN\n", " NaN\n", @@ -1324,13 +1323,13 @@ " \n", " 25%\n", " NaN\n", - " 3.200000\n", - " 2.270000\n", + " 3.240000\n", + " 2.380000\n", " 0.000000\n", " 0.000000\n", - " 63.942500\n", - " 99.690000\n", - " 22500.000000\n", + " 64.112500\n", + " 99.730000\n", + " 21600.000000\n", " NaN\n", " NaN\n", " NaN\n", @@ -1338,11 +1337,11 @@ " \n", " 50%\n", " NaN\n", - " 4.680000\n", + " 4.720000\n", " 5.110000\n", " 0.000000\n", " 0.000000\n", - " 87.940000\n", + " 88.325000\n", " 100.000000\n", " 43200.000000\n", " NaN\n", @@ -1352,11 +1351,11 @@ " \n", " 75%\n", " NaN\n", - " 52.187500\n", - " 24.050000\n", - " 2.177500\n", + " 53.227500\n", + " 24.810000\n", + " 1.917500\n", " 0.020000\n", - " 99.030000\n", + " 98.792500\n", " 100.000000\n", " 63900.000000\n", " NaN\n", @@ -1366,10 +1365,10 @@ " \n", " max\n", " NaN\n", - " 139.030000\n", - " 80.750000\n", - " 27.580000\n", - " 0.060000\n", + " 146.880000\n", + " 87.700000\n", + " 27.540000\n", + " 0.070000\n", " 100.000000\n", " 100.000000\n", " 85500.000000\n", @@ -1383,59 +1382,59 @@ ], "text/plain": [ " date Usage_kWh Lagging_Current_Reactive.Power_kVarh \\\n", - "count 1752 1752.000000 1752.000000 \n", - "unique 1752 NaN NaN \n", - "top 02/06/2018 02:00 NaN NaN \n", + "count 3504 3504.000000 3504.000000 \n", + "unique 3504 NaN NaN \n", + "top 16/11/2018 16:45 NaN NaN \n", "freq 1 NaN NaN \n", - "mean NaN 27.756787 13.375628 \n", - "std NaN 32.895802 16.482148 \n", - "min NaN 2.520000 0.000000 \n", - "25% NaN 3.200000 2.270000 \n", - "50% NaN 4.680000 5.110000 \n", - "75% NaN 52.187500 24.050000 \n", - "max NaN 139.030000 80.750000 \n", + "mean NaN 28.791849 13.764709 \n", + "std NaN 34.115238 16.872400 \n", + "min NaN 2.480000 0.000000 \n", + "25% NaN 3.240000 2.380000 \n", + "50% NaN 4.720000 5.110000 \n", + "75% NaN 53.227500 24.810000 \n", + "max NaN 146.880000 87.700000 \n", "\n", " Leading_Current_Reactive_Power_kVarh CO2(tCO2) \\\n", - "count 1752.000000 1752.000000 \n", + "count 3504.000000 3504.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 3.880634 0.011729 \n", - "std 7.376468 0.015943 \n", + "mean 3.818382 0.012212 \n", + "std 7.325016 0.016499 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", - "75% 2.177500 0.020000 \n", - "max 27.580000 0.060000 \n", + "75% 1.917500 0.020000 \n", + "max 27.540000 0.070000 \n", "\n", " Lagging_Current_Power_Factor Leading_Current_Power_Factor \\\n", - "count 1752.000000 1752.000000 \n", + "count 3504.000000 3504.000000 \n", "unique NaN NaN \n", "top NaN NaN \n", "freq NaN NaN \n", - "mean 80.745548 84.345154 \n", - "std 18.927378 30.475427 \n", - "min 38.330000 14.070000 \n", - "25% 63.942500 99.690000 \n", - "50% 87.940000 100.000000 \n", - "75% 99.030000 100.000000 \n", + "mean 80.931650 84.639817 \n", + "std 18.696834 30.258743 \n", + "min 38.330000 13.050000 \n", + "25% 64.112500 99.730000 \n", + "50% 88.325000 100.000000 \n", + "75% 98.792500 100.000000 \n", "max 100.000000 100.000000 \n", "\n", " NSM WeekStatus Day_of_week Load_Type \n", - "count 1752.000000 1752 1752 1752 \n", + "count 3504.000000 3504 3504 3504 \n", "unique NaN 2 7 3 \n", - "top NaN Weekday Monday Light_Load \n", - "freq NaN 1274 275 894 \n", - "mean 43186.643836 NaN NaN NaN \n", - "std 24440.888112 NaN NaN NaN \n", + "top NaN Weekday Tuesday Light_Load \n", + "freq NaN 2536 543 1768 \n", + "mean 42814.469178 NaN NaN NaN \n", + "std 24628.829557 NaN NaN NaN \n", "min 0.000000 NaN NaN NaN \n", - "25% 22500.000000 NaN NaN NaN \n", + "25% 21600.000000 NaN NaN NaN \n", "50% 43200.000000 NaN NaN NaN \n", "75% 63900.000000 NaN NaN NaN \n", "max 85500.000000 NaN NaN NaN " ] }, - "execution_count": 14, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1443,6 +1442,18 @@ "source": [ "dev_data.describe(include='all')" ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "banned-scottish", + "metadata": {}, + "outputs": [], + "source": [ + "test_data.to_csv(\"steel_industry_data_test.csv\", encoding=\"utf-8\", index=False)\n", + "dev_data.to_csv(\"steel_industry_data_dev.csv\", encoding=\"utf-8\", index=False)\n", + "train_data.to_csv(\"steel_industry_data_train.csv\", encoding=\"utf-8\", index=False)" + ] } ], "metadata": { @@ -1466,4 +1477,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/Jenkinsfile_stats b/Jenkinsfile_stats index b76e432..781ea90 100644 --- a/Jenkinsfile_stats +++ b/Jenkinsfile_stats @@ -1,5 +1,7 @@ pipeline { - agent any + agent { + docker { image 'ikami1/ium:v1' } + } parameters { buildSelector( defaultSelector: lastSuccessful(), diff --git a/download_dataset.sh b/download_dataset.sh index a6db0e7..dc332dc 100644 --- a/download_dataset.sh +++ b/download_dataset.sh @@ -1,2 +1,2 @@ -kaggle datasets download -d csafrit2/steel-industry-energy-consumption --force -unzip -o -j steel-industry-energy-consumption.zip \ No newline at end of file +kaggle datasets download -d csafrit2/steel-industry-energy-consumption +unzip -o steel-industry-energy-consumption.zip \ No newline at end of file diff --git a/jenkinsfile b/jenkinsfile index fcdee6e..88dada2 100644 --- a/jenkinsfile +++ b/jenkinsfile @@ -1,5 +1,7 @@ pipeline { - agent any + agent { + dockerfile true + } parameters { string( defaultValue: 'ikami1', @@ -8,7 +10,7 @@ pipeline { trim: false ) password( - defaultValue: 'c70ff184133bfabb351608b128e76cd2', + defaultValue: '', description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials', name: 'KAGGLE_KEY' ) @@ -38,7 +40,8 @@ pipeline { withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) { - sh "./download.sh" + sh "./download_dataset.sh" + sh "python3 process_dataset.py" archiveArtifacts artifacts: "steel_industry_data_test.csv, steel_industry_data_dev.csv, steel_industry_data_train.csv" } }