Added first solution

This commit is contained in:
Marcin Kostrzewski 2022-03-20 18:07:34 +01:00
parent 557d35fca0
commit d4b1d98926
3 changed files with 1012 additions and 1 deletions

View File

@ -1,3 +1,24 @@
# ium_444409
Zadania realizowane w ramach zajęć Inżynieria Uczenia Maszynowego
Zadania realizowane w ramach zajęć Inżynieria Uczenia Maszynowego.
## Zbiór
***Solar Power Generation Data***
https://www.kaggle.com/datasets/anikannal/solar-power-generation-data?select=Plant_1_Generation_Data.csv
## Wymagania
- `python3`
- `pip`
- API token z `kaggle.com`
## Uruchamianie
- Instalujemy potrzebne pakiety:
```sh
$ pip install -r requirements.txt
```
- Pobieramy zbiór danych z Kaggle. Skorzystamy ze skryptu w repo, który pobierze i podzieli dane na podzbiory:
```sh
$ ./download_dataset.sh
```

View File

@ -0,0 +1,988 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DATE_TIME</th>\n",
" <th>PLANT_ID</th>\n",
" <th>SOURCE_KEY</th>\n",
" <th>DC_POWER</th>\n",
" <th>AC_POWER</th>\n",
" <th>DAILY_YIELD</th>\n",
" <th>TOTAL_YIELD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>68778</td>\n",
" <td>68778.0</td>\n",
" <td>68778</td>\n",
" <td>68778.000000</td>\n",
" <td>68778.000000</td>\n",
" <td>68778.000000</td>\n",
" <td>6.877800e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>3158</td>\n",
" <td>NaN</td>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>01-06-2020 12:45</td>\n",
" <td>NaN</td>\n",
" <td>bvBOhCH3iADSZry</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" <td>3155</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>3147.426211</td>\n",
" <td>307.802752</td>\n",
" <td>3295.968737</td>\n",
" <td>6.978712e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>4036.457169</td>\n",
" <td>394.396439</td>\n",
" <td>3145.178309</td>\n",
" <td>4.162720e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.183645e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.512003e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>429.000000</td>\n",
" <td>41.493750</td>\n",
" <td>2658.714286</td>\n",
" <td>7.146685e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>6366.964286</td>\n",
" <td>623.618750</td>\n",
" <td>6274.000000</td>\n",
" <td>7.268706e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>14471.125000</td>\n",
" <td>1410.950000</td>\n",
" <td>9163.000000</td>\n",
" <td>7.846821e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n",
"count 68778 68778.0 68778 68778.000000 \n",
"unique 3158 NaN 22 NaN \n",
"top 01-06-2020 12:45 NaN bvBOhCH3iADSZry NaN \n",
"freq 22 NaN 3155 NaN \n",
"mean NaN 4135001.0 NaN 3147.426211 \n",
"std NaN 0.0 NaN 4036.457169 \n",
"min NaN 4135001.0 NaN 0.000000 \n",
"25% NaN 4135001.0 NaN 0.000000 \n",
"50% NaN 4135001.0 NaN 429.000000 \n",
"75% NaN 4135001.0 NaN 6366.964286 \n",
"max NaN 4135001.0 NaN 14471.125000 \n",
"\n",
" AC_POWER DAILY_YIELD TOTAL_YIELD \n",
"count 68778.000000 68778.000000 6.877800e+04 \n",
"unique NaN NaN NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
"mean 307.802752 3295.968737 6.978712e+06 \n",
"std 394.396439 3145.178309 4.162720e+05 \n",
"min 0.000000 0.000000 6.183645e+06 \n",
"25% 0.000000 0.000000 6.512003e+06 \n",
"50% 41.493750 2658.714286 7.146685e+06 \n",
"75% 623.618750 6274.000000 7.268706e+06 \n",
"max 1410.950000 9163.000000 7.846821e+06 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the full dataset.\n",
"# include='all' makes describe() report the non-numeric columns as well\n",
"# (count/unique/top/freq) next to the numeric summary.\n",
"\n",
"import pandas as pd\n",
"\n",
"plant_all = pd.read_csv(filepath_or_buffer='data/Plant_1_Generation_Data.csv')\n",
"plant_all.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DATE_TIME</th>\n",
" <th>PLANT_ID</th>\n",
" <th>SOURCE_KEY</th>\n",
" <th>DC_POWER</th>\n",
" <th>AC_POWER</th>\n",
" <th>DAILY_YIELD</th>\n",
" <th>TOTAL_YIELD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6877</td>\n",
" <td>6877.0</td>\n",
" <td>6877</td>\n",
" <td>6877.000000</td>\n",
" <td>6877.000000</td>\n",
" <td>6877.000000</td>\n",
" <td>6.877000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>2833</td>\n",
" <td>NaN</td>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>01-06-2020 00:00</td>\n",
" <td>NaN</td>\n",
" <td>1BY6WEcLGh8j5v7</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>8</td>\n",
" <td>NaN</td>\n",
" <td>345</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>3260.482360</td>\n",
" <td>318.857642</td>\n",
" <td>3310.769269</td>\n",
" <td>6.974811e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>4068.560282</td>\n",
" <td>397.532031</td>\n",
" <td>3139.906175</td>\n",
" <td>4.218293e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.183645e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.497496e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>680.285714</td>\n",
" <td>65.914286</td>\n",
" <td>2652.714286</td>\n",
" <td>7.143812e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>6623.571429</td>\n",
" <td>648.842857</td>\n",
" <td>6277.000000</td>\n",
" <td>7.266135e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>14418.428570</td>\n",
" <td>1405.800000</td>\n",
" <td>9163.000000</td>\n",
" <td>7.846821e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n",
"count 6877 6877.0 6877 6877.000000 \n",
"unique 2833 NaN 22 NaN \n",
"top 01-06-2020 00:00 NaN 1BY6WEcLGh8j5v7 NaN \n",
"freq 8 NaN 345 NaN \n",
"mean NaN 4135001.0 NaN 3260.482360 \n",
"std NaN 0.0 NaN 4068.560282 \n",
"min NaN 4135001.0 NaN 0.000000 \n",
"25% NaN 4135001.0 NaN 0.000000 \n",
"50% NaN 4135001.0 NaN 680.285714 \n",
"75% NaN 4135001.0 NaN 6623.571429 \n",
"max NaN 4135001.0 NaN 14418.428570 \n",
"\n",
" AC_POWER DAILY_YIELD TOTAL_YIELD \n",
"count 6877.000000 6877.000000 6.877000e+03 \n",
"unique NaN NaN NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
"mean 318.857642 3310.769269 6.974811e+06 \n",
"std 397.532031 3139.906175 4.218293e+05 \n",
"min 0.000000 0.000000 6.183645e+06 \n",
"25% 0.000000 0.000000 6.497496e+06 \n",
"50% 65.914286 2652.714286 7.143812e+06 \n",
"75% 648.842857 6277.000000 7.266135e+06 \n",
"max 1405.800000 9163.000000 7.846821e+06 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the dev subset.\n",
"# Relies on `pd` imported by the first cell.\n",
"\n",
"plant_dev = pd.read_csv(filepath_or_buffer='data/Plant_1_Generation_Data.csv.dev')\n",
"plant_dev.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DATE_TIME</th>\n",
" <th>PLANT_ID</th>\n",
" <th>SOURCE_KEY</th>\n",
" <th>DC_POWER</th>\n",
" <th>AC_POWER</th>\n",
" <th>DAILY_YIELD</th>\n",
" <th>TOTAL_YIELD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6877</td>\n",
" <td>6877.0</td>\n",
" <td>6877</td>\n",
" <td>6877.000000</td>\n",
" <td>6877.000000</td>\n",
" <td>6877.000000</td>\n",
" <td>6.877000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>2831</td>\n",
" <td>NaN</td>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>03-06-2020 13:30</td>\n",
" <td>NaN</td>\n",
" <td>z9Y9gH1T5YWrNuG</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>9</td>\n",
" <td>NaN</td>\n",
" <td>363</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>3150.807630</td>\n",
" <td>308.151426</td>\n",
" <td>3305.763907</td>\n",
" <td>6.981431e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>4020.609169</td>\n",
" <td>392.878525</td>\n",
" <td>3142.407510</td>\n",
" <td>4.151093e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.183645e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.512002e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>468.285714</td>\n",
" <td>45.400000</td>\n",
" <td>2682.285714</td>\n",
" <td>7.149051e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>6369.250000</td>\n",
" <td>623.975000</td>\n",
" <td>6274.000000</td>\n",
" <td>7.271854e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>14466.857140</td>\n",
" <td>1410.528571</td>\n",
" <td>9163.000000</td>\n",
" <td>7.846821e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n",
"count 6877 6877.0 6877 6877.000000 \n",
"unique 2831 NaN 22 NaN \n",
"top 03-06-2020 13:30 NaN z9Y9gH1T5YWrNuG NaN \n",
"freq 9 NaN 363 NaN \n",
"mean NaN 4135001.0 NaN 3150.807630 \n",
"std NaN 0.0 NaN 4020.609169 \n",
"min NaN 4135001.0 NaN 0.000000 \n",
"25% NaN 4135001.0 NaN 0.000000 \n",
"50% NaN 4135001.0 NaN 468.285714 \n",
"75% NaN 4135001.0 NaN 6369.250000 \n",
"max NaN 4135001.0 NaN 14466.857140 \n",
"\n",
" AC_POWER DAILY_YIELD TOTAL_YIELD \n",
"count 6877.000000 6877.000000 6.877000e+03 \n",
"unique NaN NaN NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
"mean 308.151426 3305.763907 6.981431e+06 \n",
"std 392.878525 3142.407510 4.151093e+05 \n",
"min 0.000000 0.000000 6.183645e+06 \n",
"25% 0.000000 0.000000 6.512002e+06 \n",
"50% 45.400000 2682.285714 7.149051e+06 \n",
"75% 623.975000 6274.000000 7.271854e+06 \n",
"max 1410.528571 9163.000000 7.846821e+06 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the test subset.\n",
"# Relies on `pd` imported by the first cell.\n",
"\n",
"plant_test = pd.read_csv(filepath_or_buffer='data/Plant_1_Generation_Data.csv.test')\n",
"plant_test.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DATE_TIME</th>\n",
" <th>PLANT_ID</th>\n",
" <th>SOURCE_KEY</th>\n",
" <th>DC_POWER</th>\n",
" <th>AC_POWER</th>\n",
" <th>DAILY_YIELD</th>\n",
" <th>TOTAL_YIELD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>55024</td>\n",
" <td>55024.0</td>\n",
" <td>55024</td>\n",
" <td>55024.000000</td>\n",
" <td>55024.000000</td>\n",
" <td>55024.000000</td>\n",
" <td>5.502400e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>3158</td>\n",
" <td>NaN</td>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>15-06-2020 09:30</td>\n",
" <td>NaN</td>\n",
" <td>iCRJl6heRkivqQ3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" <td>2561</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>3132.873631</td>\n",
" <td>306.377514</td>\n",
" <td>3292.894721</td>\n",
" <td>6.978859e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>4034.254455</td>\n",
" <td>394.177510</td>\n",
" <td>3146.231920</td>\n",
" <td>4.157218e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.183645e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6.514911e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>400.500000</td>\n",
" <td>38.720536</td>\n",
" <td>2658.062500</td>\n",
" <td>7.146685e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>6337.535714</td>\n",
" <td>620.728125</td>\n",
" <td>6273.616072</td>\n",
" <td>7.268792e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>4135001.0</td>\n",
" <td>NaN</td>\n",
" <td>14471.125000</td>\n",
" <td>1410.950000</td>\n",
" <td>9163.000000</td>\n",
" <td>7.846821e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" DATE_TIME PLANT_ID SOURCE_KEY DC_POWER \\\n",
"count 55024 55024.0 55024 55024.000000 \n",
"unique 3158 NaN 22 NaN \n",
"top 15-06-2020 09:30 NaN iCRJl6heRkivqQ3 NaN \n",
"freq 22 NaN 2561 NaN \n",
"mean NaN 4135001.0 NaN 3132.873631 \n",
"std NaN 0.0 NaN 4034.254455 \n",
"min NaN 4135001.0 NaN 0.000000 \n",
"25% NaN 4135001.0 NaN 0.000000 \n",
"50% NaN 4135001.0 NaN 400.500000 \n",
"75% NaN 4135001.0 NaN 6337.535714 \n",
"max NaN 4135001.0 NaN 14471.125000 \n",
"\n",
" AC_POWER DAILY_YIELD TOTAL_YIELD \n",
"count 55024.000000 55024.000000 5.502400e+04 \n",
"unique NaN NaN NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
"mean 306.377514 3292.894721 6.978859e+06 \n",
"std 394.177510 3146.231920 4.157218e+05 \n",
"min 0.000000 0.000000 6.183645e+06 \n",
"25% 0.000000 0.000000 6.514911e+06 \n",
"50% 38.720536 2658.062500 7.146685e+06 \n",
"75% 620.728125 6273.616072 7.268792e+06 \n",
"max 1410.950000 9163.000000 7.846821e+06 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the train subset.\n",
"# Relies on `pd` imported by the first cell.\n",
"\n",
"plant_train = pd.read_csv(filepath_or_buffer='data/Plant_1_Generation_Data.csv.train')\n",
"plant_train.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>DATE_TIME</th>\n",
" <th>PLANT_ID</th>\n",
" <th>SOURCE_KEY</th>\n",
" <th>DC_POWER</th>\n",
" <th>AC_POWER</th>\n",
" <th>DAILY_YIELD</th>\n",
" <th>TOTAL_YIELD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10-06-2020 22:45</td>\n",
" <td>4135001</td>\n",
" <td>rGa61gmuvPhdLxV</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6565.000000</td>\n",
" <td>7310769.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>25-05-2020 07:15</td>\n",
" <td>4135001</td>\n",
" <td>uHbuxQJl8lW7ozc</td>\n",
" <td>0.166544</td>\n",
" <td>236.262500</td>\n",
" <td>121.750000</td>\n",
" <td>7111973.750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>23-05-2020 17:45</td>\n",
" <td>4135001</td>\n",
" <td>1IF53ai7Xc0U56Y</td>\n",
" <td>0.109156</td>\n",
" <td>154.485714</td>\n",
" <td>8607.000000</td>\n",
" <td>6249141.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>15-05-2020 04:45</td>\n",
" <td>4135001</td>\n",
" <td>3PZuoBAID5Wc2HD</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>6987759.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>12-06-2020 16:30</td>\n",
" <td>4135001</td>\n",
" <td>iCRJl6heRkivqQ3</td>\n",
" <td>0.191808</td>\n",
" <td>272.157143</td>\n",
" <td>5567.428571</td>\n",
" <td>7391038.429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6872</th>\n",
" <td>01-06-2020 10:00</td>\n",
" <td>4135001</td>\n",
" <td>zBIq5rxdHJRwDNY</td>\n",
" <td>0.539282</td>\n",
" <td>763.628571</td>\n",
" <td>1779.285714</td>\n",
" <td>6465018.286</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6873</th>\n",
" <td>27-05-2020 02:00</td>\n",
" <td>4135001</td>\n",
" <td>VHMLBKoKgIrUVDU</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>7297615.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6874</th>\n",
" <td>31-05-2020 21:30</td>\n",
" <td>4135001</td>\n",
" <td>3PZuoBAID5Wc2HD</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5816.000000</td>\n",
" <td>7115304.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6875</th>\n",
" <td>11-06-2020 18:45</td>\n",
" <td>4135001</td>\n",
" <td>ih0vzX44oOqAx2f</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5521.000000</td>\n",
" <td>6386553.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6876</th>\n",
" <td>16-06-2020 05:45</td>\n",
" <td>4135001</td>\n",
" <td>3PZuoBAID5Wc2HD</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>7225042.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6877 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" DATE_TIME PLANT_ID SOURCE_KEY DC_POWER AC_POWER \\\n",
"0 10-06-2020 22:45 4135001 rGa61gmuvPhdLxV 0.000000 0.000000 \n",
"1 25-05-2020 07:15 4135001 uHbuxQJl8lW7ozc 0.166544 236.262500 \n",
"2 23-05-2020 17:45 4135001 1IF53ai7Xc0U56Y 0.109156 154.485714 \n",
"3 15-05-2020 04:45 4135001 3PZuoBAID5Wc2HD 0.000000 0.000000 \n",
"4 12-06-2020 16:30 4135001 iCRJl6heRkivqQ3 0.191808 272.157143 \n",
"... ... ... ... ... ... \n",
"6872 01-06-2020 10:00 4135001 zBIq5rxdHJRwDNY 0.539282 763.628571 \n",
"6873 27-05-2020 02:00 4135001 VHMLBKoKgIrUVDU 0.000000 0.000000 \n",
"6874 31-05-2020 21:30 4135001 3PZuoBAID5Wc2HD 0.000000 0.000000 \n",
"6875 11-06-2020 18:45 4135001 ih0vzX44oOqAx2f 0.000000 0.000000 \n",
"6876 16-06-2020 05:45 4135001 3PZuoBAID5Wc2HD 0.000000 0.000000 \n",
"\n",
" DAILY_YIELD TOTAL_YIELD \n",
"0 6565.000000 7310769.000 \n",
"1 121.750000 7111973.750 \n",
"2 8607.000000 6249141.000 \n",
"3 0.000000 6987759.000 \n",
"4 5567.428571 7391038.429 \n",
"... ... ... \n",
"6872 1779.285714 6465018.286 \n",
"6873 0.000000 7297615.000 \n",
"6874 5816.000000 7115304.000 \n",
"6875 5521.000000 6386553.000 \n",
"6876 0.000000 7225042.000 \n",
"\n",
"[6877 rows x 7 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Normalization: max-abs scaling of DC_POWER on a copy of the test subset.\n",
"# Works on a copy so that `plant_test` itself is left unmodified.\n",
"\n",
"plant_normalized = plant_test.copy()\n",
"column = 'DC_POWER'\n",
"\n",
"# Largest absolute value in the column; dividing by it scales the values\n",
"# into the range [-1, 1].\n",
"max_abs = plant_normalized[column].abs().max()\n",
"\n",
"# Scale only when the maximum is a positive number. For an all-zero column\n",
"# (max == 0) or an empty one (max is NaN) the division would replace every\n",
"# value with NaN, so the column is left as it is in that case.\n",
"if max_abs > 0:\n",
"    plant_normalized[column] = plant_normalized[column] / max_abs\n",
"\n",
"plant_normalized"
]
}
],
"metadata": {
"interpreter": {
"hash": "ac59ebe37160ed0dfa835113d9b8498d9f09ceb179beaac4002f036b9467c963"
},
"kernelspec": {
"display_name": "Python 3.9.1 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
kaggle==1.5.12
pandas==1.4.1