381 lines
11 KiB
Plaintext
381 lines
11 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "ium01.ipynb",
|
|
"provenance": [],
|
|
"collapsed_sections": [],
|
|
"toc_visible": true,
|
|
"mount_file_id": "1Z43Re5xIaiFOO8c1uCDSbP5Xf4BxmRqM",
|
|
"authorship_tag": "ABX9TyOIuQ5zGfTk3BtU/LhkFVWV"
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "zn8GQjYWnbcX"
|
|
},
|
|
"source": [
|
|
"# Notebook for first substask of Inżynieria Uczenia Maszynowego class project.\n",
|
|
"This workbook downloads, normalizes and prints short summary of the dataset I will be working on and its subsets.\n",
|
|
"\n",
|
|
"Link to the dataset at Kaggle.com:\n",
|
|
"\n",
|
|
"https://www.kaggle.com/pcbreviglieri/smart-grid-stability"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "Omh9bzNn7s0Z"
|
|
},
|
|
"source": [
|
|
"#### google colab related stuff"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "Z14xGWuJnWwq"
|
|
},
|
|
"source": [
|
|
"from google.colab import drive\n",
|
|
"drive.mount('drive')"
|
|
],
|
|
"execution_count": null,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "mROvxIELsVv1"
|
|
},
|
|
"source": [
|
|
"* Click in Colab GUI to allow Colab access and modify Google Drive files"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "hVfCOcburj5P",
|
|
"executionInfo": {
|
|
"status": "ok",
|
|
"timestamp": 1616369081457,
|
|
"user_tz": -60,
|
|
"elapsed": 5742,
|
|
"user": {
|
|
"displayName": "jadenadjezioro",
|
|
"photoUrl": "",
|
|
"userId": "13576387580000290170"
|
|
}
|
|
}
|
|
},
|
|
"source": [
|
|
"!mkdir ~/.kaggle\n",
|
|
"!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n",
|
|
"!chmod +x ~/.kaggle/kaggle.json\n",
|
|
"!pip install -q kaggle"
|
|
],
|
|
"execution_count": 2,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "EYeZaE3Cxf5i"
|
|
},
|
|
"source": [
|
|
"# script"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "SRF-igrsma-A"
|
|
},
|
|
"source": [
|
|
"download data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "3UjQJzTawfKH",
|
|
"executionInfo": {
|
|
"status": "ok",
|
|
"timestamp": 1616369086975,
|
|
"user_tz": -60,
|
|
"elapsed": 1915,
|
|
"user": {
|
|
"displayName": "jadenadjezioro",
|
|
"photoUrl": "",
|
|
"userId": "13576387580000290170"
|
|
}
|
|
}
|
|
},
|
|
"source": [
|
|
"!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n",
|
|
"!unzip smart-grid-stability.zip >>/dev/null 2>&1"
|
|
],
|
|
"execution_count": 3,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "mkK6wZ2zmhdQ"
|
|
},
|
|
"source": [
|
|
"read the data as pandas data frame"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "JcPbvjeixwQa",
|
|
"executionInfo": {
|
|
"status": "ok",
|
|
"timestamp": 1616369395418,
|
|
"user_tz": -60,
|
|
"elapsed": 563,
|
|
"user": {
|
|
"displayName": "jadenadjezioro",
|
|
"photoUrl": "",
|
|
"userId": "13576387580000290170"
|
|
}
|
|
}
|
|
},
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"df = pd.read_csv('smart_grid_stability_augmented.csv')"
|
|
],
|
|
"execution_count": 17,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "x81Ip-6fmnfr"
|
|
},
|
|
"source": [
|
|
"normalize values, so they are all between 0 and 1 (included)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "7QZX5c2ZMpTj",
|
|
"executionInfo": {
|
|
"status": "ok",
|
|
"timestamp": 1616369401750,
|
|
"user_tz": -60,
|
|
"elapsed": 552,
|
|
"user": {
|
|
"displayName": "jadenadjezioro",
|
|
"photoUrl": "",
|
|
"userId": "13576387580000290170"
|
|
}
|
|
}
|
|
},
|
|
"source": [
|
|
"from sklearn import preprocessing\n",
|
|
"\n",
|
|
"scaler = preprocessing.StandardScaler().fit(df.iloc[:, 0:-1])\n",
|
|
"df_norm_array = scaler.transform(df.iloc[:, 0:-1])\n",
|
|
"df_norm = pd.DataFrame(data=df_norm_array,\n",
|
|
" columns=df.columns[:-1])\n",
|
|
"df_norm['stabf'] = df['stabf']"
|
|
],
|
|
"execution_count": 18,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "hjAT_K-Cmzhq"
|
|
},
|
|
"source": [
|
|
"divide the data into train, test and validation subsets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "MvI7kiL0UPc8",
|
|
"executionInfo": {
|
|
"status": "ok",
|
|
"timestamp": 1616369417725,
|
|
"user_tz": -60,
|
|
"elapsed": 562,
|
|
"user": {
|
|
"displayName": "jadenadjezioro",
|
|
"photoUrl": "",
|
|
"userId": "13576387580000290170"
|
|
}
|
|
}
|
|
},
|
|
"source": [
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"train, testAndValid = train_test_split(\n",
|
|
" df_norm,\n",
|
|
" test_size=0.2,\n",
|
|
" random_state=42,\n",
|
|
" stratify=df_norm['stabf'])\n",
|
|
"\n",
|
|
"test, valid =train_test_split(\n",
|
|
" testAndValid,\n",
|
|
" test_size=0.5,\n",
|
|
" random_state=42,\n",
|
|
" stratify=testAndValid['stabf'])"
|
|
],
|
|
"execution_count": 19,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "FdUL87MgnE2G"
|
|
},
|
|
"source": [
|
|
"print short summary of the dataset and its subsets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"metadata": {
|
|
"id": "WUrX63SGcHSB",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"executionInfo": {
|
|
"status": "ok",
|
|
"timestamp": 1616369421560,
|
|
"user_tz": -60,
|
|
"elapsed": 854,
|
|
"user": {
|
|
"displayName": "jadenadjezioro",
|
|
"photoUrl": "",
|
|
"userId": "13576387580000290170"
|
|
}
|
|
},
|
|
"outputId": "8cffba3e-8ea5-48b1-c7ce-b8ba2b7229e7"
|
|
},
|
|
"source": [
|
|
"def namestr(obj, namespace):\n",
|
|
" return [name for name in namespace if namespace[name] is obj]\n",
|
|
"\n",
|
|
"dataset = df_norm\n",
|
|
"for x in [dataset, train, test, valid]:\n",
|
|
" print([q for q in namestr(x, globals()) if len(q) == max([len(w) for w in namestr(x, globals())])][-1]) \n",
|
|
" print(\"size:\", len(x))\n",
|
|
" print(x.describe(include='all'))\n",
|
|
" print(\"class distribution\", x.value_counts('stabf'))\n",
|
|
" print('===============================================================')"
|
|
],
|
|
"execution_count": 20,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"text": [
|
|
"dataset\n",
|
|
"size: 60000\n",
|
|
" tau1 tau2 ... stab stabf\n",
|
|
"count 6.000000e+04 6.000000e+04 ... 6.000000e+04 60000\n",
|
|
"unique NaN NaN ... NaN 2\n",
|
|
"top NaN NaN ... NaN unstable\n",
|
|
"freq NaN NaN ... NaN 38280\n",
|
|
"mean 1.476245e-16 -1.998105e-16 ... 3.981075e-17 NaN\n",
|
|
"std 1.000008e+00 1.000008e+00 ... 1.000008e+00 NaN\n",
|
|
"min -1.731763e+00 -1.731999e+00 ... -2.613709e+00 NaN\n",
|
|
"25% -8.660657e-01 -8.660215e-01 ... -8.475133e-01 NaN\n",
|
|
"50% 1.437170e-06 -7.028730e-06 ... 3.821538e-02 NaN\n",
|
|
"75% 8.659131e-01 8.659873e-01 ... 7.895385e-01 NaN\n",
|
|
"max 1.731859e+00 1.731991e+00 ... 2.537363e+00 NaN\n",
|
|
"\n",
|
|
"[11 rows x 14 columns]\n",
|
|
"class distribution stabf\n",
|
|
"unstable 38280\n",
|
|
"stable 21720\n",
|
|
"dtype: int64\n",
|
|
"===============================================================\n",
|
|
"train\n",
|
|
"size: 48000\n",
|
|
" tau1 tau2 ... stab stabf\n",
|
|
"count 48000.000000 48000.000000 ... 48000.000000 48000\n",
|
|
"unique NaN NaN ... NaN 2\n",
|
|
"top NaN NaN ... NaN unstable\n",
|
|
"freq NaN NaN ... NaN 30624\n",
|
|
"mean -0.001546 -0.001068 ... -0.000873 NaN\n",
|
|
"std 1.000934 0.999107 ... 0.999578 NaN\n",
|
|
"min -1.731763 -1.731999 ... -2.613709 NaN\n",
|
|
"25% -0.868796 -0.864317 ... -0.847686 NaN\n",
|
|
"50% -0.001740 -0.005136 ... 0.036743 NaN\n",
|
|
"75% 0.868335 0.861387 ... 0.788993 NaN\n",
|
|
"max 1.731859 1.731991 ... 2.537363 NaN\n",
|
|
"\n",
|
|
"[11 rows x 14 columns]\n",
|
|
"class distribution stabf\n",
|
|
"unstable 30624\n",
|
|
"stable 17376\n",
|
|
"dtype: int64\n",
|
|
"===============================================================\n",
|
|
"test\n",
|
|
"size: 6000\n",
|
|
" tau1 tau2 ... stab stabf\n",
|
|
"count 6000.000000 6000.000000 ... 6000.000000 6000\n",
|
|
"unique NaN NaN ... NaN 2\n",
|
|
"top NaN NaN ... NaN unstable\n",
|
|
"freq NaN NaN ... NaN 3828\n",
|
|
"mean 0.023917 0.012911 ... 0.003546 NaN\n",
|
|
"std 0.998552 1.001761 ... 0.998815 NaN\n",
|
|
"min -1.731763 -1.731184 ... -2.613709 NaN\n",
|
|
"25% -0.839910 -0.855393 ... -0.847835 NaN\n",
|
|
"50% 0.042499 0.020595 ... 0.049834 NaN\n",
|
|
"75% 0.889110 0.902355 ... 0.794568 NaN\n",
|
|
"max 1.731686 1.731427 ... 2.537363 NaN\n",
|
|
"\n",
|
|
"[11 rows x 14 columns]\n",
|
|
"class distribution stabf\n",
|
|
"unstable 3828\n",
|
|
"stable 2172\n",
|
|
"dtype: int64\n",
|
|
"===============================================================\n",
|
|
"valid\n",
|
|
"size: 6000\n",
|
|
" tau1 tau2 ... stab stabf\n",
|
|
"count 6000.000000 6000.000000 ... 6000.000000 6000\n",
|
|
"unique NaN NaN ... NaN 2\n",
|
|
"top NaN NaN ... NaN unstable\n",
|
|
"freq NaN NaN ... NaN 3828\n",
|
|
"mean -0.011551 -0.004364 ... 0.003435 NaN\n",
|
|
"std 0.993842 1.005519 ... 1.004786 NaN\n",
|
|
"min -1.731763 -1.731999 ... -2.613709 NaN\n",
|
|
"25% -0.874471 -0.887753 ... -0.844789 NaN\n",
|
|
"50% -0.017244 0.017840 ... 0.039665 NaN\n",
|
|
"75% 0.825347 0.868048 ... 0.787678 NaN\n",
|
|
"max 1.731859 1.731991 ... 2.537363 NaN\n",
|
|
"\n",
|
|
"[11 rows x 14 columns]\n",
|
|
"class distribution stabf\n",
|
|
"unstable 3828\n",
|
|
"stable 2172\n",
|
|
"dtype: int64\n",
|
|
"===============================================================\n"
|
|
],
|
|
"name": "stdout"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
} |