{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ium01.ipynb", "provenance": [], "collapsed_sections": [ "EYeZaE3Cxf5i" ], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [
{ "cell_type": "markdown", "metadata": { "id": "zn8GQjYWnbcX" }, "source": [ "# Notebook for the first subtask of the Inżynieria Uczenia Maszynowego class project.\n", "This notebook downloads and normalizes the dataset I will be working on, then prints a short summary of the dataset and its subsets.\n", "\n", "Link to the dataset at Kaggle.com:\n", "\n", "https://www.kaggle.com/pcbreviglieri/smart-grid-stability" ] },
{ "cell_type": "markdown", "metadata": { "id": "Omh9bzNn7s0Z" }, "source": [ "#### google colab related stuff" ] },
{ "cell_type": "code", "metadata": { "id": "Z14xGWuJnWwq", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "250e7323-7b2a-4553-bd10-cd8682c0743f" }, "source": [ "from google.colab import drive\n", "drive.mount('drive')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Drive already mounted at drive; to attempt to forcibly remount, call drive.mount(\"drive\", force_remount=True).\n" ], "name": "stdout" } ] },
{ "cell_type": "markdown", "metadata": { "id": "mROvxIELsVv1" }, "source": [ "* Click through the prompt in the Colab GUI to allow Colab to access and modify Google Drive files" ] },
{ "cell_type": "code", "metadata": { "id": "hVfCOcburj5P" }, "source": [ "# -p: do not fail when the directory already exists, so the cell can be re-run\n", "!mkdir -p ~/.kaggle\n", "!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n", "# the API token is a secret: readable/writable by the owner only, never executable\n", "!chmod 600 ~/.kaggle/kaggle.json\n", "!pip install -q kaggle" ], "execution_count": 1, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "EYeZaE3Cxf5i" }, "source": [ "# script for lab IUM-01" ] },
{ "cell_type": "markdown", "metadata": { "id": "XspjcqV4U9tb" }, "source": [ "download data" ] },
{ "cell_type": "code", "metadata": { "id": "3UjQJzTawfKH" }, "source": [ "!kaggle datasets download -d 
'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n", "# -o: overwrite without prompting; the prompt would be hidden by the redirect and block a re-run\n", "!unzip -o smart-grid-stability.zip >>/dev/null 2>&1" ], "execution_count": 2, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "mkK6wZ2zmhdQ" }, "source": [ "read the data as a pandas data frame" ] },
{ "cell_type": "code", "metadata": { "id": "JcPbvjeixwQa" }, "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv('smart_grid_stability_augmented.csv')" ], "execution_count": 3, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "x81Ip-6fmnfr" }, "source": [ "standardize the numeric columns, so each has zero mean and unit variance (the values are not limited to the 0-1 range)" ] },
{ "cell_type": "code", "metadata": { "id": "7QZX5c2ZMpTj" }, "source": [ "from sklearn import preprocessing\n", "\n", "# all columns except the last one ('stabf', the text class label)\n", "features = df.iloc[:, 0:-1]\n", "\n", "# StandardScaler: per column, (value - mean) / standard deviation\n", "scaler = preprocessing.StandardScaler().fit(features)\n", "df_norm_array = scaler.transform(features)\n", "df_norm = pd.DataFrame(data=df_norm_array,\n", "                       columns=df.columns[:-1])\n", "# re-attach the unscaled class label\n", "df_norm['stabf'] = df['stabf']" ], "execution_count": 4, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "hjAT_K-Cmzhq" }, "source": [ "divide the data into train, test and validation subsets" ] },
{ "cell_type": "code", "metadata": { "id": "MvI7kiL0UPc8" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# drop() returns a new frame, so df_norm itself stays intact;\n", "# 'stab' is removed from the model inputs, 'stabf' becomes the label vector\n", "df_norm_data = df_norm.drop('stab', axis=1)\n", "df_norm_labels = df_norm_data.pop('stabf')\n", "\n", "# 80% train, then the remaining 20% is halved into test and validation\n", "X_train, X_testAndValid, Y_train, Y_testAndValid = train_test_split(\n", "    df_norm_data,\n", "    df_norm_labels,\n", "    test_size=0.2,\n", "    random_state=42)\n", "\n", "X_test, X_valid, Y_test, Y_valid = train_test_split(\n", "    X_testAndValid,\n", "    Y_testAndValid,\n", "    test_size=0.5,\n", "    random_state=42)" ], "execution_count": 5, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "bM2HBhhlyU3Q" }, "source": [ "train = pd.concat([X_train, Y_train], axis=1)\n", "test = pd.concat([X_test, Y_test], axis=1)\n", "valid = pd.concat([X_valid, Y_valid], 
axis=1)" ], "execution_count": 6, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "FdUL87MgnE2G" }, "source": [ "print a short summary of the dataset and its subsets" ] },
{ "cell_type": "code", "metadata": { "id": "WUrX63SGcHSB" }, "source": [ "# Explicit label -> frame mapping. Recovering variable names from globals()\n", "# is fragile: every alias of a frame (e.g. a loop variable) matches as well.\n", "dataset = df_norm\n", "subsets = {'dataset': dataset, 'train': train, 'test': test, 'valid': valid}\n", "\n", "for name, subset in subsets.items():\n", "    print(name)\n", "    print(\"size:\", len(subset))\n", "    print(subset.describe(include='all'))\n", "    print(\"class distribution\", subset.value_counts('stabf'))\n", "    print('===============================================================')" ], "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "66jPkCNlVGK9" }, "source": [ "# script for lab IUM-03" ] },
{ "cell_type": "markdown", "metadata": { "id": "SRF-igrsma-A" }, "source": [ "download data" ] },
{ "cell_type": "code", "metadata": { "id": "IkZTO5PhVB7R" }, "source": [ "!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n", "# -o: the CSV may already exist from the IUM-01 section; overwrite instead of\n", "# waiting on an overwrite prompt that the redirect would hide\n", "!unzip -o smart-grid-stability.zip >>/dev/null 2>&1" ], "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "metadata": { "id": "ijEeDXgpLYqk" }, "source": [ "check how many data entries are in the dataset" ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p9ONnUUIW27z", "outputId": "cb078dbb-daa0-4ae6-ec08-9b84141c8507" }, "source": [ "!wc -l smart_grid_stability_augmented.csv" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "60001 smart_grid_stability_augmented.csv\n" ], "name": "stdout" } ] },
{ "cell_type": "markdown", "metadata": { "id": "jQw4a7ZjMmXr" }, "source": [ "take a look at the dataset to choose columns to keep" ] },
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 226 
}, "id": "82msg0JoVZiK", "outputId": "9c0243dd-f21d-4453-dd56-7410c4391cdd" }, "source": [ "import pandas as pd\n", "df = pd.read_csv('smart_grid_stability_augmented.csv')\n", "df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tau1tau2tau3tau4p1p2p3p4g1g2g3g4stabstabf
02.9590603.0798858.3810259.7807543.763085-0.782604-1.257395-1.7230860.6504560.8595780.8874450.9580340.055347unstable
19.3040974.9025243.0475411.3693575.067812-1.940058-1.872742-1.2550120.4134410.8624140.5621390.781760-0.005957stable
28.9717078.8484283.0464791.2145183.405158-1.207456-1.277210-0.9204920.1630410.7666890.8394440.1098530.003471unstable
30.7164157.6696004.4866412.3405633.963791-1.027473-1.938944-0.9973740.4462090.9767440.9293810.3627180.028871unstable
43.1341127.6087724.9437599.8575733.525811-1.125531-1.845975-0.5543050.7971100.4554500.6569470.8209230.049860unstable
\n", "
" ], "text/plain": [ " tau1 tau2 tau3 ... g4 stab stabf\n", "0 2.959060 3.079885 8.381025 ... 0.958034 0.055347 unstable\n", "1 9.304097 4.902524 3.047541 ... 0.781760 -0.005957 stable\n", "2 8.971707 8.848428 3.046479 ... 0.109853 0.003471 unstable\n", "3 0.716415 7.669600 4.486641 ... 0.362718 0.028871 unstable\n", "4 3.134112 7.608772 4.943759 ... 0.820923 0.049860 unstable\n", "\n", "[5 rows x 14 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "markdown", "metadata": { "id": "KHBmXrjALz0W" }, "source": [ "discard some of the columns; shuffle the data; divide it into train, test and validation subsets and print the number of rows in each subset" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tJo2DytZVyOo", "outputId": "e61890d6-3f2f-4008-a83d-b7272aeecd11" }, "source": [ "!sed 1d smart_grid_stability_augmented.csv | cut -f 1,5,9,13,14 -d \",\" | shuf | split -l 48000\n", "!mv xaa train.csv\n", "!mv xab toDivide\n", "!split -l 6000 toDivide\n", "!mv xaa test.csv\n", "!mv xab valid.csv\n", "!wc -l train.csv\n", "!wc -l test.csv\n", "!wc -l valid.csv\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "48000 train.csv\n", "6000 test.csv\n", "6000 valid.csv\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "HhVyLU0XIZgt" }, "source": [ "# script for lab IUM-05 - Model and training\n", "\n", "### Run the IUM-01 section first: the cells below reuse the data subsets it creates (e.g. `X_train`, `Y_train`)." 
] },
{ "cell_type": "code", "metadata": { "id": "dGX3WRlPIQYO" }, "source": [ "import tensorflow as tf\n", "from tensorflow.keras import layers\n", "\n", "# seed TensorFlow's global random generator so repeated runs are comparable\n", "tf.random.set_seed(42)\n", "\n", "model = tf.keras.Sequential([\n", "    layers.Input(shape=(12,)),\n", "    # Hidden layers need a non-linear activation: Dense layers without one\n", "    # compose into a single linear map, i.e. a plain linear classifier.\n", "    layers.Dense(32, activation='relu'),\n", "    layers.Dense(16, activation='relu'),\n", "    layers.Dense(2, activation='softmax')\n", "])\n", "\n", "# The labels are one-hot over two classes and the two softmax outputs sum to 1,\n", "# so binary cross-entropy averaged over both outputs equals the categorical one.\n", "model.compile(\n", "    loss=tf.losses.BinaryCrossentropy(),\n", "    optimizer=tf.optimizers.Adam(),\n", "    metrics=[tf.keras.metrics.BinaryAccuracy()])" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "l6H4gchZqpBM" }, "source": [ "import numpy as np\n", "\n", "def onezero(label):\n", "    \"\"\"Map a class label to an integer id: 'unstable' -> 0, any other label -> 1.\"\"\"\n", "    return 0 if label == 'unstable' else 1\n", "\n", "\n", "# np.eye(2)[ids] selects row 0 or row 1 of the identity matrix for every id,\n", "# which yields one one-hot row per sample\n", "Y_train_one_zero = [onezero(x) for x in Y_train]\n", "Y_train_onehot = np.eye(2)[Y_train_one_zero]\n", "\n", "Y_valid_one_zero = [onezero(x) for x in Y_valid]\n", "Y_valid_onehot = np.eye(2)[Y_valid_one_zero]\n", "\n", "Y_test_one_zero = [onezero(x) for x in Y_test]\n", "Y_test_onehot = np.eye(2)[Y_test_one_zero]\n" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "ZJnjZvMOqcxr" }, "source": [ "# the validation subset from IUM-01 is monitored after every epoch\n", "history = model.fit(tf.convert_to_tensor(X_train, np.float32),\n", "                    Y_train_onehot,\n", "                    validation_data=(tf.convert_to_tensor(X_valid, np.float32),\n", "                                     Y_valid_onehot),\n", "                    epochs=5)" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "a-TyrAertSsr" }, "source": [ "model.summary()" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "tBzh-0yLWV1P" }, "source": [ "# feed the same float32 tensor representation that was used for training\n", "results = model.evaluate(tf.convert_to_tensor(X_test, np.float32),\n", "                         Y_test_onehot, batch_size=64)\n", "print('test loss: ',results[0])\n", "print('test acc: ', results[1])" ], "execution_count": null, "outputs": [] } ] }