ium_470623/IUM_dane02.ipynb
2022-04-11 00:40:48 +02:00

57 KiB
Raw Blame History

!pip install kaggle
!pip install pandas
!pip install seaborn
Requirement already satisfied: kaggle in c:\users\cgala\anaconda3\lib\site-packages (1.5.12)
Requirement already satisfied: tqdm in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (4.50.2)
Requirement already satisfied: requests in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (2.24.0)
Requirement already satisfied: certifi in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (2020.6.20)
Requirement already satisfied: six>=1.10 in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (1.15.0)
Requirement already satisfied: python-slugify in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (6.1.1)
Requirement already satisfied: urllib3 in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (1.25.11)
Requirement already satisfied: python-dateutil in c:\users\cgala\anaconda3\lib\site-packages (from kaggle) (2.8.1)
Requirement already satisfied: idna<3,>=2.5 in c:\users\cgala\anaconda3\lib\site-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\cgala\anaconda3\lib\site-packages (from requests->kaggle) (3.0.4)
Requirement already satisfied: text-unidecode>=1.3 in c:\users\cgala\anaconda3\lib\site-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: pandas in c:\users\cgala\anaconda3\lib\site-packages (1.1.3)
Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\cgala\anaconda3\lib\site-packages (from pandas) (2.8.1)
Requirement already satisfied: numpy>=1.15.4 in c:\users\cgala\anaconda3\lib\site-packages (from pandas) (1.19.2)
Requirement already satisfied: pytz>=2017.2 in c:\users\cgala\anaconda3\lib\site-packages (from pandas) (2020.1)
Requirement already satisfied: six>=1.5 in c:\users\cgala\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)
Requirement already satisfied: seaborn in c:\users\cgala\anaconda3\lib\site-packages (0.11.0)
Requirement already satisfied: numpy>=1.15 in c:\users\cgala\anaconda3\lib\site-packages (from seaborn) (1.19.2)
Requirement already satisfied: scipy>=1.0 in c:\users\cgala\anaconda3\lib\site-packages (from seaborn) (1.5.2)
Requirement already satisfied: matplotlib>=2.2 in c:\users\cgala\anaconda3\lib\site-packages (from seaborn) (3.3.2)
Requirement already satisfied: pandas>=0.23 in c:\users\cgala\anaconda3\lib\site-packages (from seaborn) (1.1.3)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\cgala\anaconda3\lib\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\cgala\anaconda3\lib\site-packages (from matplotlib>=2.2->seaborn) (1.3.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\cgala\anaconda3\lib\site-packages (from matplotlib>=2.2->seaborn) (2.4.7)
Requirement already satisfied: cycler>=0.10 in c:\users\cgala\anaconda3\lib\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)
Requirement already satisfied: pillow>=6.2.0 in c:\users\cgala\anaconda3\lib\site-packages (from matplotlib>=2.2->seaborn) (8.0.1)
Requirement already satisfied: certifi>=2020.06.20 in c:\users\cgala\anaconda3\lib\site-packages (from matplotlib>=2.2->seaborn) (2020.6.20)
Requirement already satisfied: pytz>=2017.2 in c:\users\cgala\anaconda3\lib\site-packages (from pandas>=0.23->seaborn) (2020.1)
Requirement already satisfied: six>=1.5 in c:\users\cgala\anaconda3\lib\site-packages (from python-dateutil>=2.1->matplotlib>=2.2->seaborn) (1.15.0)
# Download the steel-industry energy consumption dataset archive from Kaggle
# into the current working directory. --force requests the download even when
# a local copy already exists.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# on this machine; confirm before running elsewhere.
!kaggle datasets download -d csafrit2/steel-industry-energy-consumption --force
Downloading steel-industry-energy-consumption.zip to D:\UAM zajecia\IUM\ium_470623

  0%|          | 0.00/484k [00:00<?, ?B/s]
100%|##########| 484k/484k [00:00<00:00, 2.36MB/s]
100%|##########| 484k/484k [00:00<00:00, 2.36MB/s]
# Extract the downloaded archive into the working directory; -o overwrites
# existing files without prompting, so the cell can be re-run safely.
!unzip -o steel-industry-energy-consumption.zip
Archive:  steel-industry-energy-consumption.zip
  inflating: Steel_industry_data.csv  
import pandas as pd

# Load the extracted CSV into a DataFrame and show it as the cell's output.
energy_data = pd.read_csv('Steel_industry_data.csv')
energy_data
date Usage_kWh Lagging_Current_Reactive.Power_kVarh Leading_Current_Reactive_Power_kVarh CO2(tCO2) Lagging_Current_Power_Factor Leading_Current_Power_Factor NSM WeekStatus Day_of_week Load_Type
0 01/01/2018 00:15 3.17 2.95 0.00 0.0 73.21 100.00 900 Weekday Monday Light_Load
1 01/01/2018 00:30 4.00 4.46 0.00 0.0 66.77 100.00 1800 Weekday Monday Light_Load
2 01/01/2018 00:45 3.24 3.28 0.00 0.0 70.28 100.00 2700 Weekday Monday Light_Load
3 01/01/2018 01:00 3.31 3.56 0.00 0.0 68.09 100.00 3600 Weekday Monday Light_Load
4 01/01/2018 01:15 3.82 4.50 0.00 0.0 64.72 100.00 4500 Weekday Monday Light_Load
... ... ... ... ... ... ... ... ... ... ... ...
35035 31/12/2018 23:00 3.85 4.86 0.00 0.0 62.10 100.00 82800 Weekday Monday Light_Load
35036 31/12/2018 23:15 3.74 3.74 0.00 0.0 70.71 100.00 83700 Weekday Monday Light_Load
35037 31/12/2018 23:30 3.78 3.17 0.07 0.0 76.62 99.98 84600 Weekday Monday Light_Load
35038 31/12/2018 23:45 3.78 3.06 0.11 0.0 77.72 99.96 85500 Weekday Monday Light_Load
35039 31/12/2018 00:00 3.67 3.02 0.07 0.0 77.22 99.98 0 Weekday Monday Light_Load

35040 rows × 11 columns

# Summary statistics for every column of the full dataset; include='all' adds
# the non-numeric columns (count/unique/top/freq) next to the numeric ones.
energy_data.describe(include='all')
date Usage_kWh Lagging_Current_Reactive.Power_kVarh Leading_Current_Reactive_Power_kVarh CO2(tCO2) Lagging_Current_Power_Factor Leading_Current_Power_Factor NSM WeekStatus Day_of_week Load_Type
count 35040 35040.000000 35040.000000 35040.000000 35040.000000 35040.000000 35040.000000 35040.000000 35040 35040 35040
unique 35040 NaN NaN NaN NaN NaN NaN NaN 2 7 3
top 01/01/2018 00:15 NaN NaN NaN NaN NaN NaN NaN Weekday Monday Light_Load
freq 1 NaN NaN NaN NaN NaN NaN NaN 25056 5088 18072
mean NaN 27.386892 13.035384 3.870949 0.011524 80.578056 84.367870 42750.000000 NaN NaN NaN
std NaN 33.444380 16.306000 7.424463 0.016151 18.921322 30.456535 24940.534317 NaN NaN NaN
min NaN 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN NaN
25% NaN 3.200000 2.300000 0.000000 0.000000 63.320000 99.700000 21375.000000 NaN NaN NaN
50% NaN 4.570000 5.000000 0.000000 0.000000 87.960000 100.000000 42750.000000 NaN NaN NaN
75% NaN 51.237500 22.640000 2.090000 0.020000 99.022500 100.000000 64125.000000 NaN NaN NaN
max NaN 157.180000 96.910000 27.760000 0.070000 100.000000 100.000000 85500.000000 NaN NaN NaN
from sklearn.model_selection import train_test_split

# First carve 7008 rows off the full dataset, then split that hold-out in half
# (3504 rows each) to obtain the test and dev sets. A fixed random_state keeps
# both splits reproducible between runs.
train_data, holdout_data = train_test_split(energy_data, test_size=7008, random_state=1)
test_data, dev_data = train_test_split(holdout_data, test_size=3504, random_state=1)

# Report the (rows, columns) shape of each split, one heading per split.
for heading, split_frame in (
    ('Training set size:', train_data),
    ('Testing set size:', test_data),
    ('Dev set size:', dev_data),
):
    print(heading)
    print(split_frame.shape)
Training set size:
(28032, 11)
Testing set size:
(3504, 11)
Dev set size:
(3504, 11)
# Summary statistics of the training split, covering numeric and non-numeric
# columns alike.
train_data.describe(include='all')
date Usage_kWh Lagging_Current_Reactive.Power_kVarh Leading_Current_Reactive_Power_kVarh CO2(tCO2) Lagging_Current_Power_Factor Leading_Current_Power_Factor NSM WeekStatus Day_of_week Load_Type
count 28032 28032.000000 28032.000000 28032.000000 28032.000000 28032.000000 28032.000000 28032.000000 28032 28032 28032
unique 28032 NaN NaN NaN NaN NaN NaN NaN 2 7 3
top 07/08/2018 14:15 NaN NaN NaN NaN NaN NaN NaN Weekday Monday Light_Load
freq 1 NaN NaN NaN NaN NaN NaN NaN 19998 4087 14467
mean NaN 27.340174 13.026801 3.875001 0.011498 80.520145 84.369511 42761.429795 NaN NaN NaN
std NaN 33.469130 16.289348 7.445898 0.016153 18.932825 30.462193 24944.585138 NaN NaN NaN
min NaN 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN NaN
25% NaN 3.200000 2.300000 0.000000 0.000000 63.227500 99.710000 20700.000000 NaN NaN NaN
50% NaN 4.570000 5.000000 0.000000 0.000000 87.870000 100.000000 43200.000000 NaN NaN NaN
75% NaN 51.190000 22.750000 2.020000 0.020000 99.000000 100.000000 64800.000000 NaN NaN NaN
max NaN 157.180000 96.910000 27.760000 0.070000 100.000000 100.000000 85500.000000 NaN NaN NaN
# Summary statistics of the test split, covering numeric and non-numeric
# columns alike.
test_data.describe(include='all')
date Usage_kWh Lagging_Current_Reactive.Power_kVarh Leading_Current_Reactive_Power_kVarh CO2(tCO2) Lagging_Current_Power_Factor Leading_Current_Power_Factor NSM WeekStatus Day_of_week Load_Type
count 3504 3504.000000 3504.000000 3504.000000 3504.000000 3504.000000 3504.000000 3504.000000 3504 3504 3504
unique 3504 NaN NaN NaN NaN NaN NaN NaN 2 7 3
top 20/06/2018 13:00 NaN NaN NaN NaN NaN NaN NaN Weekday Wednesday Light_Load
freq 1 NaN NaN NaN NaN NaN NaN NaN 2522 527 1837
mean NaN 26.355685 12.374717 3.891093 0.011050 80.687751 84.082794 42594.092466 NaN NaN NaN
std NaN 32.519749 15.830961 7.353028 0.015762 19.053018 30.614144 25222.804637 NaN NaN NaN
min NaN 2.480000 0.000000 0.000000 0.000000 40.290000 12.540000 0.000000 NaN NaN NaN
25% NaN 3.192500 2.090000 0.000000 0.000000 63.130000 99.562500 20700.000000 NaN NaN NaN
50% NaN 4.500000 4.900000 0.000000 0.000000 88.210000 100.000000 42300.000000 NaN NaN NaN
75% NaN 49.570000 20.700000 2.967500 0.020000 99.390000 100.000000 64800.000000 NaN NaN NaN
max NaN 153.140000 82.940000 27.650000 0.070000 100.000000 100.000000 85500.000000 NaN NaN NaN
# Summary statistics of the dev split, covering numeric and non-numeric
# columns alike.
dev_data.describe(include='all')
date Usage_kWh Lagging_Current_Reactive.Power_kVarh Leading_Current_Reactive_Power_kVarh CO2(tCO2) Lagging_Current_Power_Factor Leading_Current_Power_Factor NSM WeekStatus Day_of_week Load_Type
count 3504 3504.000000 3504.000000 3504.000000 3504.000000 3504.000000 3504.000000 3504.000000 3504 3504 3504
unique 3504 NaN NaN NaN NaN NaN NaN NaN 2 7 3
top 16/11/2018 16:45 NaN NaN NaN NaN NaN NaN NaN Weekday Tuesday Light_Load
freq 1 NaN NaN NaN NaN NaN NaN NaN 2536 543 1768
mean NaN 28.791849 13.764709 3.818382 0.012212 80.931650 84.639817 42814.469178 NaN NaN NaN
std NaN 34.115238 16.872400 7.325016 0.016499 18.696834 30.258743 24628.829557 NaN NaN NaN
min NaN 2.480000 0.000000 0.000000 0.000000 38.330000 13.050000 0.000000 NaN NaN NaN
25% NaN 3.240000 2.380000 0.000000 0.000000 64.112500 99.730000 21600.000000 NaN NaN NaN
50% NaN 4.720000 5.110000 0.000000 0.000000 88.325000 100.000000 43200.000000 NaN NaN NaN
75% NaN 53.227500 24.810000 1.917500 0.020000 98.792500 100.000000 63900.000000 NaN NaN NaN
max NaN 146.880000 87.700000 27.540000 0.070000 100.000000 100.000000 85500.000000 NaN NaN NaN
# Write each split to its own UTF-8 CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the files.
split_by_csv_path = {
    "steel_industry_data_test.csv": test_data,
    "steel_industry_data_dev.csv": dev_data,
    "steel_industry_data_train.csv": train_data,
}
for csv_path, split_frame in split_by_csv_path.items():
    split_frame.to_csv(csv_path, encoding="utf-8", index=False)