{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ium01.ipynb", "provenance": [], "collapsed_sections": [ "EYeZaE3Cxf5i" ], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "zn8GQjYWnbcX" }, "source": [ "# Notebook for the first subtask of the Inżynieria Uczenia Maszynowego class project.\n", "This notebook downloads the dataset I will be working on, normalizes it and prints a short summary of the dataset and its subsets.\n", "\n", "Link to the dataset at Kaggle.com:\n", "\n", "https://www.kaggle.com/pcbreviglieri/smart-grid-stability" ] }, { "cell_type": "markdown", "metadata": { "id": "Omh9bzNn7s0Z" }, "source": [ "#### google colab related stuff" ] }, { "cell_type": "code", "metadata": { "id": "Z14xGWuJnWwq", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "250e7323-7b2a-4553-bd10-cd8682c0743f" }, "source": [ "from google.colab import drive\n", "drive.mount('drive')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Drive already mounted at drive; to attempt to forcibly remount, call drive.mount(\"drive\", force_remount=True).\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "mROvxIELsVv1" }, "source": [ "* Click through the prompt in the Colab GUI to allow Colab to access and modify Google Drive files" ] }, { "cell_type": "code", "metadata": { "id": "hVfCOcburj5P" }, "source": [ "!mkdir ~/.kaggle\n", "!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n", "!chmod +x ~/.kaggle/kaggle.json\n", "!pip install -q kaggle" ], "execution_count": 1, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "EYeZaE3Cxf5i" }, "source": [ "# script for lab IUM-01" ] }, { "cell_type": "markdown", "metadata": { "id": "XspjcqV4U9tb" }, "source": [ "download data" ] }, { "cell_type": "code", "metadata": { "id": "3UjQJzTawfKH" }, "source": [ "!kaggle datasets download -d 
'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n", "!unzip smart-grid-stability.zip >>/dev/null 2>&1" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "mkK6wZ2zmhdQ" }, "source": [ "read the data into a pandas DataFrame" ] }, { "cell_type": "code", "metadata": { "id": "JcPbvjeixwQa" }, "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv('smart_grid_stability_augmented.csv')" ], "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "x81Ip-6fmnfr" }, "source": [ "standardize values, so each column has zero mean and unit variance (StandardScaler; values are not limited to the 0-1 range)" ] }, { "cell_type": "code", "metadata": { "id": "7QZX5c2ZMpTj" }, "source": [ "from sklearn import preprocessing\n", "\n", "scaler = preprocessing.StandardScaler().fit(df.iloc[:, 0:-1])\n", "df_norm_array = scaler.transform(df.iloc[:, 0:-1])\n", "df_norm = pd.DataFrame(data=df_norm_array,\n", " columns=df.columns[:-1])\n", "df_norm['stabf'] = df['stabf']" ], "execution_count": 4, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "hjAT_K-Cmzhq" }, "source": [ "divide the data into train, test and validation subsets" ] }, { "cell_type": "code", "metadata": { "id": "MvI7kiL0UPc8" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "df_norm_data = df_norm.copy()\n", "df_norm_data = df_norm_data.drop('stab', axis=1)\n", "df_norm_labels = df_norm_data.pop('stabf')\n", "\n", "X_train, X_testAndValid, Y_train, Y_testAndValid = train_test_split(\n", " df_norm_data,\n", " df_norm_labels,\n", " test_size=0.2,\n", " random_state=42)\n", "\n", "X_test, X_valid, Y_test, Y_valid = train_test_split(\n", " X_testAndValid,\n", " Y_testAndValid,\n", " test_size=0.5,\n", " random_state=42)" ], "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "bM2HBhhlyU3Q" }, "source": [ "train = pd.concat([X_train, Y_train], axis=1)\n", "test = pd.concat([X_test, Y_test], axis=1)\n", "valid = pd.concat([X_valid, Y_valid], 
axis=1)" ], "execution_count": 6, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "FdUL87MgnE2G" }, "source": [ "print a short summary of the dataset and its subsets" ] }, { "cell_type": "code", "metadata": { "id": "WUrX63SGcHSB" }, "source": [ "def namestr(obj, namespace):\n", " return [name for name in namespace if namespace[name] is obj]\n", "\n", "dataset = df_norm\n", "for x in [dataset, train, test, valid]:\n", " print([q for q in namestr(x, globals()) if len(q) == max([len(w) for w in namestr(x, globals())])][-1]) \n", " print(\"size:\", len(x))\n", " print(x.describe(include='all'))\n", " print(\"class distribution\", x.value_counts('stabf'))\n", " print('===============================================================')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "66jPkCNlVGK9" }, "source": [ "# script for lab IUM-03" ] }, { "cell_type": "markdown", "metadata": { "id": "SRF-igrsma-A" }, "source": [ "download data" ] }, { "cell_type": "code", "metadata": { "id": "IkZTO5PhVB7R" }, "source": [ "!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n", "!unzip smart-grid-stability.zip >>/dev/null 2>&1" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ijEeDXgpLYqk" }, "source": [ "check how many data entries there are in the dataset (the line count includes the CSV header row)" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p9ONnUUIW27z", "outputId": "cb078dbb-daa0-4ae6-ec08-9b84141c8507" }, "source": [ "!wc -l smart_grid_stability_augmented.csv" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "60001 smart_grid_stability_augmented.csv\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "jQw4a7ZjMmXr" }, "source": [ "take a look at the dataset to choose columns to keep" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 226 
}, "id": "82msg0JoVZiK", "outputId": "9c0243dd-f21d-4453-dd56-7410c4391cdd" }, "source": [ "import pandas as pd\n", "df = pd.read_csv('smart_grid_stability_augmented.csv')\n", "df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", " | tau1 | \n", "tau2 | \n", "tau3 | \n", "tau4 | \n", "p1 | \n", "p2 | \n", "p3 | \n", "p4 | \n", "g1 | \n", "g2 | \n", "g3 | \n", "g4 | \n", "stab | \n", "stabf | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2.959060 | \n", "3.079885 | \n", "8.381025 | \n", "9.780754 | \n", "3.763085 | \n", "-0.782604 | \n", "-1.257395 | \n", "-1.723086 | \n", "0.650456 | \n", "0.859578 | \n", "0.887445 | \n", "0.958034 | \n", "0.055347 | \n", "unstable | \n", "
1 | \n", "9.304097 | \n", "4.902524 | \n", "3.047541 | \n", "1.369357 | \n", "5.067812 | \n", "-1.940058 | \n", "-1.872742 | \n", "-1.255012 | \n", "0.413441 | \n", "0.862414 | \n", "0.562139 | \n", "0.781760 | \n", "-0.005957 | \n", "stable | \n", "
2 | \n", "8.971707 | \n", "8.848428 | \n", "3.046479 | \n", "1.214518 | \n", "3.405158 | \n", "-1.207456 | \n", "-1.277210 | \n", "-0.920492 | \n", "0.163041 | \n", "0.766689 | \n", "0.839444 | \n", "0.109853 | \n", "0.003471 | \n", "unstable | \n", "
3 | \n", "0.716415 | \n", "7.669600 | \n", "4.486641 | \n", "2.340563 | \n", "3.963791 | \n", "-1.027473 | \n", "-1.938944 | \n", "-0.997374 | \n", "0.446209 | \n", "0.976744 | \n", "0.929381 | \n", "0.362718 | \n", "0.028871 | \n", "unstable | \n", "
4 | \n", "3.134112 | \n", "7.608772 | \n", "4.943759 | \n", "9.857573 | \n", "3.525811 | \n", "-1.125531 | \n", "-1.845975 | \n", "-0.554305 | \n", "0.797110 | \n", "0.455450 | \n", "0.656947 | \n", "0.820923 | \n", "0.049860 | \n", "unstable | \n", "