{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATE_TIMEPLANT_IDSOURCE_KEYDC_POWERAC_POWERDAILY_YIELDTOTAL_YIELD
count6877868778.06877868778.00000068778.00000068778.0000006.877800e+04
unique3158NaN22NaNNaNNaNNaN
top01-06-2020 12:45NaNbvBOhCH3iADSZryNaNNaNNaNNaN
freq22NaN3155NaNNaNNaNNaN
meanNaN4135001.0NaN3147.426211307.8027523295.9687376.978712e+06
stdNaN0.0NaN4036.457169394.3964393145.1783094.162720e+05
minNaN4135001.0NaN0.0000000.0000000.0000006.183645e+06
25%NaN4135001.0NaN0.0000000.0000000.0000006.512003e+06
50%NaN4135001.0NaN429.00000041.4937502658.7142867.146685e+06
75%NaN4135001.0NaN6366.964286623.6187506274.0000007.268706e+06
maxNaN4135001.0NaN14471.1250001410.9500009163.0000007.846821e+06
\n", "
" ], "text/plain": [ " DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n", "count 68778 68778.0 68778 68778.000000 \n", "unique 3158 NaN 22 NaN \n", "top 01-06-2020 12:45 NaN bvBOhCH3iADSZry NaN \n", "freq 22 NaN 3155 NaN \n", "mean NaN 4135001.0 NaN 3147.426211 \n", "std NaN 0.0 NaN 4036.457169 \n", "min NaN 4135001.0 NaN 0.000000 \n", "25% NaN 4135001.0 NaN 0.000000 \n", "50% NaN 4135001.0 NaN 429.000000 \n", "75% NaN 4135001.0 NaN 6366.964286 \n", "max NaN 4135001.0 NaN 14471.125000 \n", "\n", " AC_POWER DAILY_YIELD TOTAL_YIELD \n", "count 68778.000000 68778.000000 6.877800e+04 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 307.802752 3295.968737 6.978712e+06 \n", "std 394.396439 3145.178309 4.162720e+05 \n", "min 0.000000 0.000000 6.183645e+06 \n", "25% 0.000000 0.000000 6.512003e+06 \n", "50% 41.493750 2658.714286 7.146685e+06 \n", "75% 623.618750 6274.000000 7.268706e+06 \n", "max 1410.950000 9163.000000 7.846821e+06 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics for the full dataset.\n", "\n", "import pandas as pd\n", "\n", "plant_all = pd.read_csv(\n", "    filepath_or_buffer='data/Plant_1_Generation_Data.csv',\n", ")\n", "\n", "# include='all' adds the non-numeric columns to the summary.\n", "plant_all.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATE_TIMEPLANT_IDSOURCE_KEYDC_POWERAC_POWERDAILY_YIELDTOTAL_YIELD
count68776877.068776877.0000006877.0000006877.0000006.877000e+03
unique2833NaN22NaNNaNNaNNaN
top01-06-2020 00:00NaN1BY6WEcLGh8j5v7NaNNaNNaNNaN
freq8NaN345NaNNaNNaNNaN
meanNaN4135001.0NaN3260.482360318.8576423310.7692696.974811e+06
stdNaN0.0NaN4068.560282397.5320313139.9061754.218293e+05
minNaN4135001.0NaN0.0000000.0000000.0000006.183645e+06
25%NaN4135001.0NaN0.0000000.0000000.0000006.497496e+06
50%NaN4135001.0NaN680.28571465.9142862652.7142867.143812e+06
75%NaN4135001.0NaN6623.571429648.8428576277.0000007.266135e+06
maxNaN4135001.0NaN14418.4285701405.8000009163.0000007.846821e+06
\n", "
" ], "text/plain": [ " DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n", "count 6877 6877.0 6877 6877.000000 \n", "unique 2833 NaN 22 NaN \n", "top 01-06-2020 00:00 NaN 1BY6WEcLGh8j5v7 NaN \n", "freq 8 NaN 345 NaN \n", "mean NaN 4135001.0 NaN 3260.482360 \n", "std NaN 0.0 NaN 4068.560282 \n", "min NaN 4135001.0 NaN 0.000000 \n", "25% NaN 4135001.0 NaN 0.000000 \n", "50% NaN 4135001.0 NaN 680.285714 \n", "75% NaN 4135001.0 NaN 6623.571429 \n", "max NaN 4135001.0 NaN 14418.428570 \n", "\n", " AC_POWER DAILY_YIELD TOTAL_YIELD \n", "count 6877.000000 6877.000000 6.877000e+03 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 318.857642 3310.769269 6.974811e+06 \n", "std 397.532031 3139.906175 4.218293e+05 \n", "min 0.000000 0.000000 6.183645e+06 \n", "25% 0.000000 0.000000 6.497496e+06 \n", "50% 65.914286 2652.714286 7.143812e+06 \n", "75% 648.842857 6277.000000 7.266135e+06 \n", "max 1405.800000 9163.000000 7.846821e+06 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics for the dev split.\n", "\n", "plant_dev = pd.read_csv(\n", "    filepath_or_buffer='data/Plant_1_Generation_Data.csv.dev',\n", ")\n", "\n", "# include='all' adds the non-numeric columns to the summary.\n", "plant_dev.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATE_TIMEPLANT_IDSOURCE_KEYDC_POWERAC_POWERDAILY_YIELDTOTAL_YIELD
count68776877.068776877.0000006877.0000006877.0000006.877000e+03
unique2831NaN22NaNNaNNaNNaN
top03-06-2020 13:30NaNz9Y9gH1T5YWrNuGNaNNaNNaNNaN
freq9NaN363NaNNaNNaNNaN
meanNaN4135001.0NaN3150.807630308.1514263305.7639076.981431e+06
stdNaN0.0NaN4020.609169392.8785253142.4075104.151093e+05
minNaN4135001.0NaN0.0000000.0000000.0000006.183645e+06
25%NaN4135001.0NaN0.0000000.0000000.0000006.512002e+06
50%NaN4135001.0NaN468.28571445.4000002682.2857147.149051e+06
75%NaN4135001.0NaN6369.250000623.9750006274.0000007.271854e+06
maxNaN4135001.0NaN14466.8571401410.5285719163.0000007.846821e+06
\n", "
" ], "text/plain": [ " DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n", "count 6877 6877.0 6877 6877.000000 \n", "unique 2831 NaN 22 NaN \n", "top 03-06-2020 13:30 NaN z9Y9gH1T5YWrNuG NaN \n", "freq 9 NaN 363 NaN \n", "mean NaN 4135001.0 NaN 3150.807630 \n", "std NaN 0.0 NaN 4020.609169 \n", "min NaN 4135001.0 NaN 0.000000 \n", "25% NaN 4135001.0 NaN 0.000000 \n", "50% NaN 4135001.0 NaN 468.285714 \n", "75% NaN 4135001.0 NaN 6369.250000 \n", "max NaN 4135001.0 NaN 14466.857140 \n", "\n", " AC_POWER DAILY_YIELD TOTAL_YIELD \n", "count 6877.000000 6877.000000 6.877000e+03 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 308.151426 3305.763907 6.981431e+06 \n", "std 392.878525 3142.407510 4.151093e+05 \n", "min 0.000000 0.000000 6.183645e+06 \n", "25% 0.000000 0.000000 6.512002e+06 \n", "50% 45.400000 2682.285714 7.149051e+06 \n", "75% 623.975000 6274.000000 7.271854e+06 \n", "max 1410.528571 9163.000000 7.846821e+06 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics for the test split.\n", "\n", "plant_test = pd.read_csv(\n", "    filepath_or_buffer='data/Plant_1_Generation_Data.csv.test',\n", ")\n", "\n", "# include='all' adds the non-numeric columns to the summary.\n", "plant_test.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATE_TIMEPLANT_IDSOURCE_KEYDC_POWERAC_POWERDAILY_YIELDTOTAL_YIELD
count5502455024.05502455024.00000055024.00000055024.0000005.502400e+04
unique3158NaN22NaNNaNNaNNaN
top15-06-2020 09:30NaNiCRJl6heRkivqQ3NaNNaNNaNNaN
freq22NaN2561NaNNaNNaNNaN
meanNaN4135001.0NaN3132.873631306.3775143292.8947216.978859e+06
stdNaN0.0NaN4034.254455394.1775103146.2319204.157218e+05
minNaN4135001.0NaN0.0000000.0000000.0000006.183645e+06
25%NaN4135001.0NaN0.0000000.0000000.0000006.514911e+06
50%NaN4135001.0NaN400.50000038.7205362658.0625007.146685e+06
75%NaN4135001.0NaN6337.535714620.7281256273.6160727.268792e+06
maxNaN4135001.0NaN14471.1250001410.9500009163.0000007.846821e+06
\n", "
" ], "text/plain": [ " DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n", "count 55024 55024.0 55024 55024.000000 \n", "unique 3158 NaN 22 NaN \n", "top 15-06-2020 09:30 NaN iCRJl6heRkivqQ3 NaN \n", "freq 22 NaN 2561 NaN \n", "mean NaN 4135001.0 NaN 3132.873631 \n", "std NaN 0.0 NaN 4034.254455 \n", "min NaN 4135001.0 NaN 0.000000 \n", "25% NaN 4135001.0 NaN 0.000000 \n", "50% NaN 4135001.0 NaN 400.500000 \n", "75% NaN 4135001.0 NaN 6337.535714 \n", "max NaN 4135001.0 NaN 14471.125000 \n", "\n", " AC_POWER DAILY_YIELD TOTAL_YIELD \n", "count 55024.000000 55024.000000 5.502400e+04 \n", "unique NaN NaN NaN \n", "top NaN NaN NaN \n", "freq NaN NaN NaN \n", "mean 306.377514 3292.894721 6.978859e+06 \n", "std 394.177510 3146.231920 4.157218e+05 \n", "min 0.000000 0.000000 6.183645e+06 \n", "25% 0.000000 0.000000 6.514911e+06 \n", "50% 38.720536 2658.062500 7.146685e+06 \n", "75% 620.728125 6273.616072 7.268792e+06 \n", "max 1410.950000 9163.000000 7.846821e+06 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics for the train split.\n", "\n", "plant_train = pd.read_csv(\n", "    filepath_or_buffer='data/Plant_1_Generation_Data.csv.train',\n", ")\n", "\n", "# include='all' adds the non-numeric columns to the summary.\n", "plant_train.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DATE_TIMEPLANT_IDSOURCE_KEYDC_POWERAC_POWERDAILY_YIELDTOTAL_YIELD
010-06-2020 22:454135001rGa61gmuvPhdLxV0.0000000.0000006565.0000007310769.000
125-05-2020 07:154135001uHbuxQJl8lW7ozc0.166544236.262500121.7500007111973.750
223-05-2020 17:4541350011IF53ai7Xc0U56Y0.109156154.4857148607.0000006249141.000
315-05-2020 04:4541350013PZuoBAID5Wc2HD0.0000000.0000000.0000006987759.000
412-06-2020 16:304135001iCRJl6heRkivqQ30.191808272.1571435567.4285717391038.429
........................
687201-06-2020 10:004135001zBIq5rxdHJRwDNY0.539282763.6285711779.2857146465018.286
687327-05-2020 02:004135001VHMLBKoKgIrUVDU0.0000000.0000000.0000007297615.000
687431-05-2020 21:3041350013PZuoBAID5Wc2HD0.0000000.0000005816.0000007115304.000
687511-06-2020 18:454135001ih0vzX44oOqAx2f0.0000000.0000005521.0000006386553.000
687616-06-2020 05:4541350013PZuoBAID5Wc2HD0.0000000.0000000.0000007225042.000
\n", "

6877 rows × 7 columns

\n", "
" ], "text/plain": [ " DATE_TIME PLANT_ID SOURCE_KEY DC_POWER AC_POWER \\\n", "0 10-06-2020 22:45 4135001 rGa61gmuvPhdLxV 0.000000 0.000000 \n", "1 25-05-2020 07:15 4135001 uHbuxQJl8lW7ozc 0.166544 236.262500 \n", "2 23-05-2020 17:45 4135001 1IF53ai7Xc0U56Y 0.109156 154.485714 \n", "3 15-05-2020 04:45 4135001 3PZuoBAID5Wc2HD 0.000000 0.000000 \n", "4 12-06-2020 16:30 4135001 iCRJl6heRkivqQ3 0.191808 272.157143 \n", "... ... ... ... ... ... \n", "6872 01-06-2020 10:00 4135001 zBIq5rxdHJRwDNY 0.539282 763.628571 \n", "6873 27-05-2020 02:00 4135001 VHMLBKoKgIrUVDU 0.000000 0.000000 \n", "6874 31-05-2020 21:30 4135001 3PZuoBAID5Wc2HD 0.000000 0.000000 \n", "6875 11-06-2020 18:45 4135001 ih0vzX44oOqAx2f 0.000000 0.000000 \n", "6876 16-06-2020 05:45 4135001 3PZuoBAID5Wc2HD 0.000000 0.000000 \n", "\n", " DAILY_YIELD TOTAL_YIELD \n", "0 6565.000000 7310769.000 \n", "1 121.750000 7111973.750 \n", "2 8607.000000 6249141.000 \n", "3 0.000000 6987759.000 \n", "4 5567.428571 7391038.429 \n", "... ... ... 
\n", "6872 1779.285714 6465018.286 \n", "6873 0.000000 7297615.000 \n", "6874 5816.000000 7115304.000 \n", "6875 5521.000000 6386553.000 \n", "6876 0.000000 7225042.000 \n", "\n", "[6877 rows x 7 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Normalization: max-abs scaling of one column, applied to a copy of the\n", "# test split so that plant_test itself is not modified.\n", "# NOTE(review): the scale is taken from the test split itself; if the result\n", "# feeds a model, the train split's maximum may be intended - confirm.\n", "\n", "plant_normalized = plant_test.copy()\n", "column = 'DC_POWER'\n", "\n", "# An all-zero column has a peak of 0 (0 / 0 would turn every value into NaN)\n", "# and an empty column has a NaN peak; in both cases the column is left unscaled.\n", "peak = plant_normalized[column].abs().max()\n", "if peak > 0:\n", "    plant_normalized[column] = plant_normalized[column] / peak\n", "\n", "plant_normalized" ] } ], "metadata": { "interpreter": { "hash": "ac59ebe37160ed0dfa835113d9b8498d9f09ceb179beaac4002f036b9467c963" }, "kernelspec": { "display_name": "Python 3.9.1 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.1" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }