11 KiB
11 KiB
Notebook for the first subtask of the Inżynieria Uczenia Maszynowego class project.
This notebook downloads and normalizes the dataset I will be working on, splits it into subsets, and prints a short summary of the dataset and of each subset.
Link to the dataset at Kaggle.com: https://www.kaggle.com/datasets/pcbreviglieri/smart-grid-stability
google colab related stuff
# Mount Google Drive at the relative path 'drive'; the Kaggle API token is
# later copied from drive/MyDrive/kaggle.json.
from google.colab import drive
drive.mount('drive')
- Click through the prompt in the Colab GUI to allow Colab to access and modify Google Drive files
!mkdir ~/.kaggle
!cp drive/MyDrive/kaggle.json ~/.kaggle/.
!chmod +x ~/.kaggle/kaggle.json
!pip install -q kaggle
script
download data
!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1
!unzip smart-grid-stability.zip >>/dev/null 2>&1
read the data as pandas data frame
import pandas as pd
# Load the dataset into a DataFrame; the path is relative to the notebook's
# working directory (presumably the file extracted by the unzip step).
df = pd.read_csv('smart_grid_stability_augmented.csv')
standardize values, so that every numeric column has zero mean and unit variance (note: the resulting values are not confined to the range 0 to 1)
from sklearn import preprocessing

# Standardize every column except the last one (the 'stabf' class label).
# StandardScaler rescales each column to zero mean and unit variance.
scaler = preprocessing.StandardScaler()
df_norm_array = scaler.fit_transform(df.iloc[:, :-1])
df_norm = pd.DataFrame(df_norm_array, columns=df.columns[:-1])

# Re-attach the label column unchanged.
df_norm['stabf'] = df['stabf']
divide the data into train, test and validation subsets
from sklearn.model_selection import train_test_split


def _stratified_split(frame, test_size):
    """Split *frame* in two, preserving the 'stabf' class proportions."""
    return train_test_split(
        frame,
        test_size=test_size,
        random_state=42,
        stratify=frame['stabf'])


# 80 % of the rows go to training; the remaining 20 % is then halved into
# the test and validation subsets (10 % of the rows each).
train, testAndValid = _stratified_split(df_norm, 0.2)
test, valid = _stratified_split(testAndValid, 0.5)
print short summary of the dataset and its subsets
def namestr(obj, namespace):
    """Return every name in *namespace* that is bound to *obj* itself.

    Matching is by identity (``is``), not by equality, and the names come
    back in the mapping's iteration order.
    """
    return [name for name, value in namespace.items() if value is obj]
dataset = df_norm

# Pair every frame with an explicit label instead of recovering its variable
# name by scanning globals(): that lookup returned whichever alias happened to
# be the longest, so it broke as soon as another name was bound to the same
# frame.
labelled_subsets = [
    ('dataset', dataset),
    ('train', train),
    ('test', test),
    ('valid', valid),
]

for label, subset in labelled_subsets:
    print(label)
    print("size:", len(subset))
    # include='all' also summarizes the non-numeric 'stabf' label column.
    print(subset.describe(include='all'))
    print("class distribution", subset.value_counts('stabf'))
    print('===============================================================')
dataset size: 60000 tau1 tau2 ... stab stabf count 6.000000e+04 6.000000e+04 ... 6.000000e+04 60000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 38280 mean 1.476245e-16 -1.998105e-16 ... 3.981075e-17 NaN std 1.000008e+00 1.000008e+00 ... 1.000008e+00 NaN min -1.731763e+00 -1.731999e+00 ... -2.613709e+00 NaN 25% -8.660657e-01 -8.660215e-01 ... -8.475133e-01 NaN 50% 1.437170e-06 -7.028730e-06 ... 3.821538e-02 NaN 75% 8.659131e-01 8.659873e-01 ... 7.895385e-01 NaN max 1.731859e+00 1.731991e+00 ... 2.537363e+00 NaN [11 rows x 14 columns] class distribution stabf unstable 38280 stable 21720 dtype: int64 =============================================================== train size: 48000 tau1 tau2 ... stab stabf count 48000.000000 48000.000000 ... 48000.000000 48000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 30624 mean -0.001546 -0.001068 ... -0.000873 NaN std 1.000934 0.999107 ... 0.999578 NaN min -1.731763 -1.731999 ... -2.613709 NaN 25% -0.868796 -0.864317 ... -0.847686 NaN 50% -0.001740 -0.005136 ... 0.036743 NaN 75% 0.868335 0.861387 ... 0.788993 NaN max 1.731859 1.731991 ... 2.537363 NaN [11 rows x 14 columns] class distribution stabf unstable 30624 stable 17376 dtype: int64 =============================================================== test size: 6000 tau1 tau2 ... stab stabf count 6000.000000 6000.000000 ... 6000.000000 6000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 3828 mean 0.023917 0.012911 ... 0.003546 NaN std 0.998552 1.001761 ... 0.998815 NaN min -1.731763 -1.731184 ... -2.613709 NaN 25% -0.839910 -0.855393 ... -0.847835 NaN 50% 0.042499 0.020595 ... 0.049834 NaN 75% 0.889110 0.902355 ... 0.794568 NaN max 1.731686 1.731427 ... 2.537363 NaN [11 rows x 14 columns] class distribution stabf unstable 3828 stable 2172 dtype: int64 =============================================================== valid size: 6000 tau1 tau2 ... stab stabf count 6000.000000 6000.000000 ... 
6000.000000 6000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 3828 mean -0.011551 -0.004364 ... 0.003435 NaN std 0.993842 1.005519 ... 1.004786 NaN min -1.731763 -1.731999 ... -2.613709 NaN 25% -0.874471 -0.887753 ... -0.844789 NaN 50% -0.017244 0.017840 ... 0.039665 NaN 75% 0.825347 0.868048 ... 0.787678 NaN max 1.731859 1.731991 ... 2.537363 NaN [11 rows x 14 columns] class distribution stabf unstable 3828 stable 2172 dtype: int64 ===============================================================