{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "name": "ium01.ipynb",
   "provenance": [],
   "collapsed_sections": [],
   "toc_visible": true,
   "mount_file_id": "1Z43Re5xIaiFOO8c1uCDSbP5Xf4BxmRqM",
   "authorship_tag": "ABX9TyOIuQ5zGfTk3BtU/LhkFVWV"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "zn8GQjYWnbcX"
   },
   "source": [
    "# Notebook for the first subtask of the Inżynieria Uczenia Maszynowego class project.\n",
    "This notebook downloads and normalizes the dataset I will be working on, then prints a short summary of the dataset and its subsets.\n",
    "\n",
    "Link to the dataset at Kaggle.com:\n",
    "\n",
    "https://www.kaggle.com/pcbreviglieri/smart-grid-stability"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Omh9bzNn7s0Z"
   },
   "source": [
    "#### Google Colab setup"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "Z14xGWuJnWwq"
   },
   "source": [
    "from google.colab import drive\n",
    "\n",
    "# Mount Google Drive at the relative path 'drive'; the setup cell below\n",
    "# copies kaggle.json from drive/MyDrive.\n",
    "drive.mount(mountpoint='drive')"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mROvxIELsVv1"
   },
   "source": [
    "* When prompted in the Colab GUI, click to allow Colab to access and modify Google Drive files"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "hVfCOcburj5P",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1616369081457,
     "user_tz": -60,
     "elapsed": 5742,
     "user": {
      "displayName": "jadenadjezioro",
      "photoUrl": "",
      "userId": "13576387580000290170"
     }
    }
   },
   "source": [
    "# Install the Kaggle API token copied from the mounted Google Drive.\n",
    "# -p keeps this cell re-runnable when ~/.kaggle already exists.\n",
    "!mkdir -p ~/.kaggle\n",
    "!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n",
    "# The token is a secret data file, not a program: restrict it to owner\n",
    "# read/write instead of adding an execute bit.\n",
    "!chmod 600 ~/.kaggle/kaggle.json\n",
    "# %pip installs into the environment of the running kernel.\n",
    "%pip install -q kaggle"
   ],
   "execution_count": 2,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EYeZaE3Cxf5i"
   },
   "source": [
    "# script"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SRF-igrsma-A"
   },
   "source": [
    "download data"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "3UjQJzTawfKH",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1616369086975,
     "user_tz": -60,
     "elapsed": 1915,
     "user": {
      "displayName": "jadenadjezioro",
      "photoUrl": "",
      "userId": "13576387580000290170"
     }
    }
   },
   "source": [
    "# Use the tools' own quiet flags instead of discarding stdout and stderr,\n",
    "# so a failed download or extraction is visible here rather than surfacing\n",
    "# later as a missing-file error. -o overwrites previously extracted files\n",
    "# so the cell can be re-run without an interactive prompt.\n",
    "!kaggle datasets download -q -d 'pcbreviglieri/smart-grid-stability'\n",
    "!unzip -o -q smart-grid-stability.zip"
   ],
   "execution_count": 3,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mkK6wZ2zmhdQ"
   },
   "source": [
    "read the data as pandas data frame"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "JcPbvjeixwQa",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1616369395418,
     "user_tz": -60,
     "elapsed": 563,
     "user": {
      "displayName": "jadenadjezioro",
      "photoUrl": "",
      "userId": "13576387580000290170"
     }
    }
   },
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the CSV extracted from the downloaded archive into a DataFrame.\n",
    "csv_path = 'smart_grid_stability_augmented.csv'\n",
    "df = pd.read_csv(csv_path)"
   ],
   "execution_count": 17,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "x81Ip-6fmnfr"
   },
   "source": [
    "standardize the numeric columns so that each has zero mean and unit variance (note: the resulting values are not confined to the range 0 to 1)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "7QZX5c2ZMpTj",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1616369401750,
     "user_tz": -60,
     "elapsed": 552,
     "user": {
      "displayName": "jadenadjezioro",
      "photoUrl": "",
      "userId": "13576387580000290170"
     }
    }
   },
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "# Every column except the last one is scaled; the 'stabf' label column is\n",
    "# re-attached unchanged afterwards.\n",
    "# NOTE(review): the scaler is fit on the full dataset before the\n",
    "# train/test/valid split, so held-out rows influence the scaling statistics.\n",
    "# Consider fitting on the training subset only.\n",
    "feature_values = df.iloc[:, 0:-1]\n",
    "feature_names = df.columns[:-1]\n",
    "\n",
    "scaler = preprocessing.StandardScaler()\n",
    "df_norm_array = scaler.fit_transform(feature_values)\n",
    "\n",
    "df_norm = pd.DataFrame(data=df_norm_array, columns=feature_names)\n",
    "df_norm['stabf'] = df['stabf']"
   ],
   "execution_count": 18,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "hjAT_K-Cmzhq"
   },
   "source": [
    "divide the data into train, test and validation subsets"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "MvI7kiL0UPc8",
    "executionInfo": {
     "status": "ok",
     "timestamp": 1616369417725,
     "user_tz": -60,
     "elapsed": 562,
     "user": {
      "displayName": "jadenadjezioro",
      "photoUrl": "",
      "userId": "13576387580000290170"
     }
    }
   },
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def stratified_split(frame, held_out_fraction):\n",
    "    \"\"\"Split `frame` in two, stratified on 'stabf', with a fixed seed of 42.\"\"\"\n",
    "    return train_test_split(\n",
    "        frame,\n",
    "        test_size=held_out_fraction,\n",
    "        random_state=42,\n",
    "        stratify=frame['stabf'])\n",
    "\n",
    "\n",
    "# Hold out 20% of the rows, then halve the held-out part into test and valid.\n",
    "train, testAndValid = stratified_split(df_norm, 0.2)\n",
    "test, valid = stratified_split(testAndValid, 0.5)"
   ],
   "execution_count": 19,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "FdUL87MgnE2G"
   },
   "source": [
    "print short summary of the dataset and its subsets"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "id": "WUrX63SGcHSB",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "status": "ok",
     "timestamp": 1616369421560,
     "user_tz": -60,
     "elapsed": 854,
     "user": {
      "displayName": "jadenadjezioro",
      "photoUrl": "",
      "userId": "13576387580000290170"
     }
    },
    "outputId": "8cffba3e-8ea5-48b1-c7ce-b8ba2b7229e7"
   },
   "source": [
    "# Pair every frame with an explicit label. Recovering the label from\n",
    "# globals() by object identity is fragile: 'df_norm' and 'dataset' refer to\n",
    "# the same object and have equally long names, so the printed label would\n",
    "# depend on namespace insertion order and on any other alias of the frame.\n",
    "dataset = df_norm\n",
    "subsets = {'dataset': dataset, 'train': train, 'test': test, 'valid': valid}\n",
    "\n",
    "for label, frame in subsets.items():\n",
    "  print(label)\n",
    "  print(\"size:\", len(frame))\n",
    "  print(frame.describe(include='all'))\n",
    "  print(\"class distribution\", frame.value_counts('stabf'))\n",
    "  print('===============================================================')"
   ],
   "execution_count": 20,
   "outputs": [
    {
     "output_type": "stream",
     "text": [
      "dataset\n",
      "size: 60000\n",
      "                tau1          tau2  ...          stab     stabf\n",
      "count   6.000000e+04  6.000000e+04  ...  6.000000e+04     60000\n",
      "unique           NaN           NaN  ...           NaN         2\n",
      "top              NaN           NaN  ...           NaN  unstable\n",
      "freq             NaN           NaN  ...           NaN     38280\n",
      "mean    1.476245e-16 -1.998105e-16  ...  3.981075e-17       NaN\n",
      "std     1.000008e+00  1.000008e+00  ...  1.000008e+00       NaN\n",
      "min    -1.731763e+00 -1.731999e+00  ... -2.613709e+00       NaN\n",
      "25%    -8.660657e-01 -8.660215e-01  ... -8.475133e-01       NaN\n",
      "50%     1.437170e-06 -7.028730e-06  ...  3.821538e-02       NaN\n",
      "75%     8.659131e-01  8.659873e-01  ...  7.895385e-01       NaN\n",
      "max     1.731859e+00  1.731991e+00  ...  2.537363e+00       NaN\n",
      "\n",
      "[11 rows x 14 columns]\n",
      "class distribution stabf\n",
      "unstable    38280\n",
      "stable      21720\n",
      "dtype: int64\n",
      "===============================================================\n",
      "train\n",
      "size: 48000\n",
      "                tau1          tau2  ...          stab     stabf\n",
      "count   48000.000000  48000.000000  ...  48000.000000     48000\n",
      "unique           NaN           NaN  ...           NaN         2\n",
      "top              NaN           NaN  ...           NaN  unstable\n",
      "freq             NaN           NaN  ...           NaN     30624\n",
      "mean       -0.001546     -0.001068  ...     -0.000873       NaN\n",
      "std         1.000934      0.999107  ...      0.999578       NaN\n",
      "min        -1.731763     -1.731999  ...     -2.613709       NaN\n",
      "25%        -0.868796     -0.864317  ...     -0.847686       NaN\n",
      "50%        -0.001740     -0.005136  ...      0.036743       NaN\n",
      "75%         0.868335      0.861387  ...      0.788993       NaN\n",
      "max         1.731859      1.731991  ...      2.537363       NaN\n",
      "\n",
      "[11 rows x 14 columns]\n",
      "class distribution stabf\n",
      "unstable    30624\n",
      "stable      17376\n",
      "dtype: int64\n",
      "===============================================================\n",
      "test\n",
      "size: 6000\n",
      "               tau1         tau2  ...         stab     stabf\n",
      "count   6000.000000  6000.000000  ...  6000.000000      6000\n",
      "unique          NaN          NaN  ...          NaN         2\n",
      "top             NaN          NaN  ...          NaN  unstable\n",
      "freq            NaN          NaN  ...          NaN      3828\n",
      "mean       0.023917     0.012911  ...     0.003546       NaN\n",
      "std        0.998552     1.001761  ...     0.998815       NaN\n",
      "min       -1.731763    -1.731184  ...    -2.613709       NaN\n",
      "25%       -0.839910    -0.855393  ...    -0.847835       NaN\n",
      "50%        0.042499     0.020595  ...     0.049834       NaN\n",
      "75%        0.889110     0.902355  ...     0.794568       NaN\n",
      "max        1.731686     1.731427  ...     2.537363       NaN\n",
      "\n",
      "[11 rows x 14 columns]\n",
      "class distribution stabf\n",
      "unstable    3828\n",
      "stable      2172\n",
      "dtype: int64\n",
      "===============================================================\n",
      "valid\n",
      "size: 6000\n",
      "               tau1         tau2  ...         stab     stabf\n",
      "count   6000.000000  6000.000000  ...  6000.000000      6000\n",
      "unique          NaN          NaN  ...          NaN         2\n",
      "top             NaN          NaN  ...          NaN  unstable\n",
      "freq            NaN          NaN  ...          NaN      3828\n",
      "mean      -0.011551    -0.004364  ...     0.003435       NaN\n",
      "std        0.993842     1.005519  ...     1.004786       NaN\n",
      "min       -1.731763    -1.731999  ...    -2.613709       NaN\n",
      "25%       -0.874471    -0.887753  ...    -0.844789       NaN\n",
      "50%       -0.017244     0.017840  ...     0.039665       NaN\n",
      "75%        0.825347     0.868048  ...     0.787678       NaN\n",
      "max        1.731859     1.731991  ...     2.537363       NaN\n",
      "\n",
      "[11 rows x 14 columns]\n",
      "class distribution stabf\n",
      "unstable    3828\n",
      "stable      2172\n",
      "dtype: int64\n",
      "===============================================================\n"
     ],
     "name": "stdout"
    }
   ]
  }
 ]
}