ium_470607/ium01.ipynb

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ium01.ipynb",
      "provenance": [],
      "collapsed_sections": [
        "EYeZaE3Cxf5i"
      ],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zn8GQjYWnbcX"
      },
      "source": [
        "# Notebook for first substask of Inżynieria Uczenia Maszynowego class project.\n",
        "This workbook downloads, normalizes and prints short summary of the dataset I will be working on and its subsets.\n",
        "\n",
        "Link to the dataset at Kaggle.com:\n",
        "\n",
        "https://www.kaggle.com/pcbreviglieri/smart-grid-stability"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Omh9bzNn7s0Z"
      },
      "source": [
        "#### google colab related stuff"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Z14xGWuJnWwq",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "250e7323-7b2a-4553-bd10-cd8682c0743f"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('drive')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Drive already mounted at drive; to attempt to forcibly remount, call drive.mount(\"drive\", force_remount=True).\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mROvxIELsVv1"
      },
      "source": [
        "* Click in Colab GUI to allow Colab access and modify Google Drive files"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hVfCOcburj5P"
      },
      "source": [
        "!mkdir ~/.kaggle\n",
        "!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n",
        "!chmod +x ~/.kaggle/kaggle.json\n",
        "!pip install -q kaggle"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EYeZaE3Cxf5i"
      },
      "source": [
        "# script for lab IUM-01"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XspjcqV4U9tb"
      },
      "source": [
        "download data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3UjQJzTawfKH"
      },
      "source": [
        "!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n",
        "!unzip smart-grid-stability.zip >>/dev/null 2>&1"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mkK6wZ2zmhdQ"
      },
      "source": [
        "read the data as pandas data frame"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JcPbvjeixwQa"
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "df = pd.read_csv('smart_grid_stability_augmented.csv')"
      ],
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "x81Ip-6fmnfr"
      },
      "source": [
        "normalize values, so they are all between 0 and 1 (included)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7QZX5c2ZMpTj"
      },
      "source": [
        "from sklearn import preprocessing\n",
        "\n",
        "scaler = preprocessing.StandardScaler().fit(df.iloc[:, 0:-1])\n",
        "df_norm_array = scaler.transform(df.iloc[:, 0:-1])\n",
        "df_norm = pd.DataFrame(data=df_norm_array,\n",
        "                       columns=df.columns[:-1])\n",
        "df_norm['stabf'] = df['stabf']"
      ],
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hjAT_K-Cmzhq"
      },
      "source": [
        "divide the data into train, test and validation subsets"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MvI7kiL0UPc8"
      },
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "df_norm_data = df_norm.copy()\n",
        "df_norm_data = df_norm_data.drop('stab', axis=1)\n",
        "df_norm_labels = df_norm_data.pop('stabf')\n",
        "\n",
        "X_train, X_testAndValid, Y_train, Y_testAndValid = train_test_split(\n",
        "    df_norm_data,\n",
        "    df_norm_labels,\n",
        "    test_size=0.2,\n",
        "    random_state=42)\n",
        "\n",
        "X_test, X_valid, Y_test, Y_valid = train_test_split(\n",
        "    X_testAndValid,\n",
        "    Y_testAndValid,\n",
        "    test_size=0.5,\n",
        "    random_state=42)"
      ],
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bM2HBhhlyU3Q"
      },
      "source": [
        "train = pd.concat([X_train, Y_train], axis=1)\n",
        "test = pd.concat([X_test, Y_test], axis=1)\n",
        "valid = pd.concat([X_valid, Y_valid], axis=1)"
      ],
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FdUL87MgnE2G"
      },
      "source": [
        "print short summary of the dataset and its subsets"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WUrX63SGcHSB"
      },
      "source": [
        "def namestr(obj, namespace):\n",
        "  return [name for name in namespace if namespace[name] is obj]\n",
        "\n",
        "dataset = df_norm\n",
        "for x in [dataset, train, test, valid]:\n",
        "  print([q for q in namestr(x, globals()) if len(q) == max([len(w) for w in namestr(x, globals())])][-1]) \n",
        "  print(\"size:\", len(x))\n",
        "  print(x.describe(include='all'))\n",
        "  print(\"class distribution\", x.value_counts('stabf'))\n",
        "  print('===============================================================')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "66jPkCNlVGK9"
      },
      "source": [
        "# script for lab IUM-03"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SRF-igrsma-A"
      },
      "source": [
        "download data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "IkZTO5PhVB7R"
      },
      "source": [
        "!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n",
        "!unzip smart-grid-stability.zip >>/dev/null 2>&1"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ijEeDXgpLYqk"
      },
      "source": [
        "check how many data entries is in the dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "p9ONnUUIW27z",
        "outputId": "cb078dbb-daa0-4ae6-ec08-9b84141c8507"
      },
      "source": [
        "!wc -l smart_grid_stability_augmented.csv"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "60001 smart_grid_stability_augmented.csv\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jQw4a7ZjMmXr"
      },
      "source": [
        "take a look at the dataset to choose columns to keep"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 226
        },
        "id": "82msg0JoVZiK",
        "outputId": "9c0243dd-f21d-4453-dd56-7410c4391cdd"
      },
      "source": [
        "import pandas as pd\n",
        "df = pd.read_csv('smart_grid_stability_augmented.csv')\n",
        "df.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>tau1</th>\n",
              "      <th>tau2</th>\n",
              "      <th>tau3</th>\n",
              "      <th>tau4</th>\n",
              "      <th>p1</th>\n",
              "      <th>p2</th>\n",
              "      <th>p3</th>\n",
              "      <th>p4</th>\n",
              "      <th>g1</th>\n",
              "      <th>g2</th>\n",
              "      <th>g3</th>\n",
              "      <th>g4</th>\n",
              "      <th>stab</th>\n",
              "      <th>stabf</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2.959060</td>\n",
              "      <td>3.079885</td>\n",
              "      <td>8.381025</td>\n",
              "      <td>9.780754</td>\n",
              "      <td>3.763085</td>\n",
              "      <td>-0.782604</td>\n",
              "      <td>-1.257395</td>\n",
              "      <td>-1.723086</td>\n",
              "      <td>0.650456</td>\n",
              "      <td>0.859578</td>\n",
              "      <td>0.887445</td>\n",
              "      <td>0.958034</td>\n",
              "      <td>0.055347</td>\n",
              "      <td>unstable</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>9.304097</td>\n",
              "      <td>4.902524</td>\n",
              "      <td>3.047541</td>\n",
              "      <td>1.369357</td>\n",
              "      <td>5.067812</td>\n",
              "      <td>-1.940058</td>\n",
              "      <td>-1.872742</td>\n",
              "      <td>-1.255012</td>\n",
              "      <td>0.413441</td>\n",
              "      <td>0.862414</td>\n",
              "      <td>0.562139</td>\n",
              "      <td>0.781760</td>\n",
              "      <td>-0.005957</td>\n",
              "      <td>stable</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>8.971707</td>\n",
              "      <td>8.848428</td>\n",
              "      <td>3.046479</td>\n",
              "      <td>1.214518</td>\n",
              "      <td>3.405158</td>\n",
              "      <td>-1.207456</td>\n",
              "      <td>-1.277210</td>\n",
              "      <td>-0.920492</td>\n",
              "      <td>0.163041</td>\n",
              "      <td>0.766689</td>\n",
              "      <td>0.839444</td>\n",
              "      <td>0.109853</td>\n",
              "      <td>0.003471</td>\n",
              "      <td>unstable</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>0.716415</td>\n",
              "      <td>7.669600</td>\n",
              "      <td>4.486641</td>\n",
              "      <td>2.340563</td>\n",
              "      <td>3.963791</td>\n",
              "      <td>-1.027473</td>\n",
              "      <td>-1.938944</td>\n",
              "      <td>-0.997374</td>\n",
              "      <td>0.446209</td>\n",
              "      <td>0.976744</td>\n",
              "      <td>0.929381</td>\n",
              "      <td>0.362718</td>\n",
              "      <td>0.028871</td>\n",
              "      <td>unstable</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>3.134112</td>\n",
              "      <td>7.608772</td>\n",
              "      <td>4.943759</td>\n",
              "      <td>9.857573</td>\n",
              "      <td>3.525811</td>\n",
              "      <td>-1.125531</td>\n",
              "      <td>-1.845975</td>\n",
              "      <td>-0.554305</td>\n",
              "      <td>0.797110</td>\n",
              "      <td>0.455450</td>\n",
              "      <td>0.656947</td>\n",
              "      <td>0.820923</td>\n",
              "      <td>0.049860</td>\n",
              "      <td>unstable</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       tau1      tau2      tau3  ...        g4      stab     stabf\n",
              "0  2.959060  3.079885  8.381025  ...  0.958034  0.055347  unstable\n",
              "1  9.304097  4.902524  3.047541  ...  0.781760 -0.005957    stable\n",
              "2  8.971707  8.848428  3.046479  ...  0.109853  0.003471  unstable\n",
              "3  0.716415  7.669600  4.486641  ...  0.362718  0.028871  unstable\n",
              "4  3.134112  7.608772  4.943759  ...  0.820923  0.049860  unstable\n",
              "\n",
              "[5 rows x 14 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KHBmXrjALz0W"
      },
      "source": [
        "discard some of the columns; shuffle the data; divide into train, test and validations subsets and print number of rows of the subsets"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tJo2DytZVyOo",
        "outputId": "e61890d6-3f2f-4008-a83d-b7272aeecd11"
      },
      "source": [
        "!sed 1d smart_grid_stability_augmented.csv | cut -f 1,5,9,13,14 -d \",\" | shuf | split -l 48000\n",
        "!mv xaa train.csv\n",
        "!mv xab toDivide\n",
        "!split -l 6000 toDivide\n",
        "!mv xaa test.csv\n",
        "!mv xab valid.csv\n",
        "!wc -l train.csv\n",
        "!wc -l test.csv\n",
        "!wc -l valid.csv\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "48000 train.csv\n",
            "6000 test.csv\n",
            "6000 valid.csv\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HhVyLU0XIZgt"
      },
      "source": [
        "# script for lab IUM-05 - Model and training\n",
        "\n",
        "### first run lab IUM-01!!!"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dGX3WRlPIQYO"
      },
      "source": [
        "import tensorflow as tf\n",
        "from tensorflow.keras import layers\n",
        "\n",
        "model = tf.keras.Sequential([\n",
        "                          layers.Input(shape=(12,)),\n",
        "                          layers.Dense(32),\n",
        "                          layers.Dense(16),\n",
        "                          layers.Dense(2, activation='softmax')\n",
        "])\n",
        "\n",
        "model.compile(\n",
        "    loss=tf.losses.BinaryCrossentropy(),\n",
        "    optimizer=tf.optimizers.Adam(),\n",
        "    metrics=[tf.keras.metrics.BinaryAccuracy()])"
      ],
      "execution_count": 32,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "l6H4gchZqpBM"
      },
      "source": [
        "import numpy as np\n",
        "\n",
        "def onezero(label):\n",
        "  return 0 if label == 'unstable' else 1\n",
        "\n",
        "\n",
        "Y_train_one_zero = [onezero(x) for x in Y_train]\n",
        "Y_train_onehot = np.eye(2)[Y_train_one_zero]\n",
        "\n",
        "Y_test_one_zero = [onezero(x) for x in Y_test]\n",
        "Y_test_onehot = np.eye(2)[Y_test_one_zero]\n"
      ],
      "execution_count": 39,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZJnjZvMOqcxr",
        "outputId": "119a86a5-d9de-41f2-9ff7-c4576aec11ab"
      },
      "source": [
        "history = model.fit(tf.convert_to_tensor(X_train, np.float32),\n",
        "          Y_train_onehot, epochs=5)"
      ],
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Epoch 1/5\n",
            "1500/1500 [==============================] - 2s 1ms/step - loss: 0.3926 - binary_accuracy: 0.8149\n",
            "Epoch 2/5\n",
            "1500/1500 [==============================] - 2s 1ms/step - loss: 0.3925 - binary_accuracy: 0.8161\n",
            "Epoch 3/5\n",
            "1500/1500 [==============================] - 2s 1ms/step - loss: 0.3922 - binary_accuracy: 0.8150\n",
            "Epoch 4/5\n",
            "1500/1500 [==============================] - 2s 1ms/step - loss: 0.3927 - binary_accuracy: 0.8146\n",
            "Epoch 5/5\n",
            "1500/1500 [==============================] - 2s 1ms/step - loss: 0.3926 - binary_accuracy: 0.8143\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a-TyrAertSsr",
        "outputId": "20f2996b-b2c8-494d-8e1b-c612acb2ede0"
      },
      "source": [
        "model.summary()"
      ],
      "execution_count": 35,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Model: \"sequential_6\"\n",
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "dense_28 (Dense)             (None, 32)                416       \n",
            "_________________________________________________________________\n",
            "dense_29 (Dense)             (None, 16)                528       \n",
            "_________________________________________________________________\n",
            "dense_30 (Dense)             (None, 2)                 34        \n",
            "=================================================================\n",
            "Total params: 978\n",
            "Trainable params: 978\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tBzh-0yLWV1P",
        "outputId": "8886d20a-5a2f-4385-bb9d-a033d4f4d50b"
      },
      "source": [
        "results = model.evaluate(X_test, Y_test_onehot, batch_size=64)\n",
        "print('test loss: ',results[0])\n",
        "print('test acc: ', results[1])"
      ],
      "execution_count": 41,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "94/94 [==============================] - 0s 1ms/step - loss: 0.3933 - binary_accuracy: 0.8112\n",
            "test loss:  0.3933383822441101\n",
            "test acc:  0.8111666440963745\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}