ium_470607/ium01.ipynb
2021-04-24 13:31:02 +02:00

677 lines
20 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "ium01.ipynb",
"provenance": [],
"collapsed_sections": [
"EYeZaE3Cxf5i"
],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "zn8GQjYWnbcX"
},
"source": [
"# Notebook for the first subtask of the Inżynieria Uczenia Maszynowego class project.\n",
"This notebook downloads and normalizes the dataset I will be working on, then prints a short summary of the dataset and its subsets.\n",
"\n",
"Link to the dataset at Kaggle.com:\n",
"\n",
"https://www.kaggle.com/pcbreviglieri/smart-grid-stability"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Omh9bzNn7s0Z"
},
"source": [
"#### google colab related stuff"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Z14xGWuJnWwq",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "250e7323-7b2a-4553-bd10-cd8682c0743f"
},
"source": [
"# Mount Google Drive at the relative path 'drive'; a later cell copies\n",
"# drive/MyDrive/kaggle.json from this mount.\n",
"from google.colab import drive\n",
"drive.mount('drive')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Drive already mounted at drive; to attempt to forcibly remount, call drive.mount(\"drive\", force_remount=True).\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "mROvxIELsVv1"
},
"source": [
"* Confirm the prompt in the Colab GUI to allow Colab to access and modify Google Drive files"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hVfCOcburj5P"
},
"source": [
"# Install the Kaggle API token that was copied to Google Drive.\n",
"# -p: do not fail when ~/.kaggle already exists, so the cell can be re-run.\n",
"!mkdir -p ~/.kaggle\n",
"!cp drive/MyDrive/kaggle.json ~/.kaggle/\n",
"# The token is a secret, not a program: restrict it to owner read/write\n",
"# instead of marking it executable.\n",
"!chmod 600 ~/.kaggle/kaggle.json\n",
"# %pip installs into the environment of the running kernel.\n",
"%pip install -q kaggle"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "EYeZaE3Cxf5i"
},
"source": [
"# script for lab IUM-01"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XspjcqV4U9tb"
},
"source": [
"download data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3UjQJzTawfKH"
},
"source": [
"# Errors are left visible on purpose: a failed download (for example missing\n",
"# credentials) should be noticed here rather than later at read_csv time.\n",
"# -q: suppress the progress output.\n",
"!kaggle datasets download -q -d 'pcbreviglieri/smart-grid-stability'\n",
"# -o: overwrite existing files without asking, so re-running the cell cannot\n",
"# wait on a prompt; -q: quiet.\n",
"!unzip -o -q smart-grid-stability.zip"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "mkK6wZ2zmhdQ"
},
"source": [
"read the data as pandas data frame"
]
},
{
"cell_type": "code",
"metadata": {
"id": "JcPbvjeixwQa"
},
"source": [
"import pandas as pd\n",
"\n",
"# Load the CSV extracted from the Kaggle archive into a DataFrame.\n",
"df = pd.read_csv(filepath_or_buffer='smart_grid_stability_augmented.csv')"
],
"execution_count": 3,
"outputs": []
},
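{
"cell_type": "markdown",
"metadata": {
"id": "x81Ip-6fmnfr"
},
"source": [
"standardize the numeric columns with `StandardScaler` (zero mean, unit variance, so values are not limited to the 0-1 range); the `stabf` label column is copied over unchanged"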
]
},
{
"cell_type": "code",
"metadata": {
"id": "7QZX5c2ZMpTj"
},
"source": [
"from sklearn import preprocessing\n",
"\n",
"# Every column except the last one ('stabf', the textual label) is scaled.\n",
"numeric_part = df.iloc[:, :-1]\n",
"\n",
"scaler = preprocessing.StandardScaler()\n",
"scaler.fit(numeric_part)\n",
"df_norm_array = scaler.transform(numeric_part)\n",
"\n",
"# Rebuild a DataFrame with the original column names and re-attach the label.\n",
"df_norm = pd.DataFrame(data=df_norm_array, columns=df.columns[:-1])\n",
"df_norm['stabf'] = df['stabf']"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hjAT_K-Cmzhq"
},
"source": [
"divide the data into train, test and validation subsets"
]
},
{
"cell_type": "code",
"metadata": {
"id": "MvI7kiL0UPc8"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Labels come from 'stabf'; the inputs are all remaining columns except 'stab'.\n",
"df_norm_labels = df_norm['stabf'].copy()\n",
"df_norm_data = df_norm.drop(columns=['stab', 'stabf'])\n",
"\n",
"# First split: 80% for training, 20% held out.\n",
"X_train, X_testAndValid, Y_train, Y_testAndValid = train_test_split(\n",
"    df_norm_data, df_norm_labels, test_size=0.2, random_state=42)\n",
"\n",
"# Second split: halve the held-out part into test and validation subsets.\n",
"X_test, X_valid, Y_test, Y_valid = train_test_split(\n",
"    X_testAndValid, Y_testAndValid, test_size=0.5, random_state=42)"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "bM2HBhhlyU3Q"
},
"source": [
"# Glue each feature frame back together with its label column.\n",
"train, test, valid = (\n",
"    pd.concat([features, labels], axis=1)\n",
"    for features, labels in [(X_train, Y_train),\n",
"                             (X_test, Y_test),\n",
"                             (X_valid, Y_valid)]\n",
")"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "FdUL87MgnE2G"
},
"source": [
"print short summary of the dataset and its subsets"
]
},
{
"cell_type": "code",
"metadata": {
"id": "WUrX63SGcHSB"
},
"source": [
"def namestr(obj, namespace):\n",
"    \"\"\"Return every name in `namespace` that is bound to exactly `obj`.\n",
"\n",
"    No longer used by the summary loop below; retained so existing\n",
"    references to it keep working.\n",
"    \"\"\"\n",
"    return [name for name in namespace if namespace[name] is obj]\n",
"\n",
"dataset = df_norm\n",
"\n",
"# Pair every frame with its label explicitly instead of scanning globals()\n",
"# for a variable that happens to reference it: the scan depends on dict\n",
"# order for ties and on whatever else is bound in the kernel.\n",
"summaries = {\n",
"    'dataset': dataset,\n",
"    'train': train,\n",
"    'test': test,\n",
"    'valid': valid,\n",
"}\n",
"\n",
"for name, frame in summaries.items():\n",
"    print(name)\n",
"    print(\"size:\", len(frame))\n",
"    print(frame.describe(include='all'))\n",
"    print(\"class distribution\", frame.value_counts('stabf'))\n",
"    print('===============================================================')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "66jPkCNlVGK9"
},
"source": [
"# script for lab IUM-03"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SRF-igrsma-A"
},
"source": [
"download data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "IkZTO5PhVB7R"
},
"source": [
"# Errors are left visible on purpose: a failed download (for example missing\n",
"# credentials) should be noticed here rather than in the cells that follow.\n",
"# -q: suppress the progress output.\n",
"!kaggle datasets download -q -d 'pcbreviglieri/smart-grid-stability'\n",
"# -o: overwrite existing files without asking, so re-running the cell cannot\n",
"# wait on a prompt; -q: quiet.\n",
"!unzip -o -q smart-grid-stability.zip"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ijEeDXgpLYqk"
},
"source": [
"check how many data entries are in the dataset"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "p9ONnUUIW27z",
"outputId": "cb078dbb-daa0-4ae6-ec08-9b84141c8507"
},
"source": [
"# Count the lines of the CSV file; the header line is part of the count.\n",
"!wc -l smart_grid_stability_augmented.csv"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"60001 smart_grid_stability_augmented.csv\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jQw4a7ZjMmXr"
},
"source": [
"take a look at the dataset to choose columns to keep"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 226
},
"id": "82msg0JoVZiK",
"outputId": "9c0243dd-f21d-4453-dd56-7410c4391cdd"
},
"source": [
"import pandas as pd\n",
"\n",
"# Re-read the raw CSV and preview its first five rows to pick the columns.\n",
"df = pd.read_csv(filepath_or_buffer='smart_grid_stability_augmented.csv')\n",
"df.head(5)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tau1</th>\n",
" <th>tau2</th>\n",
" <th>tau3</th>\n",
" <th>tau4</th>\n",
" <th>p1</th>\n",
" <th>p2</th>\n",
" <th>p3</th>\n",
" <th>p4</th>\n",
" <th>g1</th>\n",
" <th>g2</th>\n",
" <th>g3</th>\n",
" <th>g4</th>\n",
" <th>stab</th>\n",
" <th>stabf</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.959060</td>\n",
" <td>3.079885</td>\n",
" <td>8.381025</td>\n",
" <td>9.780754</td>\n",
" <td>3.763085</td>\n",
" <td>-0.782604</td>\n",
" <td>-1.257395</td>\n",
" <td>-1.723086</td>\n",
" <td>0.650456</td>\n",
" <td>0.859578</td>\n",
" <td>0.887445</td>\n",
" <td>0.958034</td>\n",
" <td>0.055347</td>\n",
" <td>unstable</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9.304097</td>\n",
" <td>4.902524</td>\n",
" <td>3.047541</td>\n",
" <td>1.369357</td>\n",
" <td>5.067812</td>\n",
" <td>-1.940058</td>\n",
" <td>-1.872742</td>\n",
" <td>-1.255012</td>\n",
" <td>0.413441</td>\n",
" <td>0.862414</td>\n",
" <td>0.562139</td>\n",
" <td>0.781760</td>\n",
" <td>-0.005957</td>\n",
" <td>stable</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8.971707</td>\n",
" <td>8.848428</td>\n",
" <td>3.046479</td>\n",
" <td>1.214518</td>\n",
" <td>3.405158</td>\n",
" <td>-1.207456</td>\n",
" <td>-1.277210</td>\n",
" <td>-0.920492</td>\n",
" <td>0.163041</td>\n",
" <td>0.766689</td>\n",
" <td>0.839444</td>\n",
" <td>0.109853</td>\n",
" <td>0.003471</td>\n",
" <td>unstable</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.716415</td>\n",
" <td>7.669600</td>\n",
" <td>4.486641</td>\n",
" <td>2.340563</td>\n",
" <td>3.963791</td>\n",
" <td>-1.027473</td>\n",
" <td>-1.938944</td>\n",
" <td>-0.997374</td>\n",
" <td>0.446209</td>\n",
" <td>0.976744</td>\n",
" <td>0.929381</td>\n",
" <td>0.362718</td>\n",
" <td>0.028871</td>\n",
" <td>unstable</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3.134112</td>\n",
" <td>7.608772</td>\n",
" <td>4.943759</td>\n",
" <td>9.857573</td>\n",
" <td>3.525811</td>\n",
" <td>-1.125531</td>\n",
" <td>-1.845975</td>\n",
" <td>-0.554305</td>\n",
" <td>0.797110</td>\n",
" <td>0.455450</td>\n",
" <td>0.656947</td>\n",
" <td>0.820923</td>\n",
" <td>0.049860</td>\n",
" <td>unstable</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tau1 tau2 tau3 ... g4 stab stabf\n",
"0 2.959060 3.079885 8.381025 ... 0.958034 0.055347 unstable\n",
"1 9.304097 4.902524 3.047541 ... 0.781760 -0.005957 stable\n",
"2 8.971707 8.848428 3.046479 ... 0.109853 0.003471 unstable\n",
"3 0.716415 7.669600 4.486641 ... 0.362718 0.028871 unstable\n",
"4 3.134112 7.608772 4.943759 ... 0.820923 0.049860 unstable\n",
"\n",
"[5 rows x 14 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KHBmXrjALz0W"
},
"source": [
"discard some of the columns; shuffle the data; divide it into train, test and validation subsets and print the number of rows in each subset"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tJo2DytZVyOo",
"outputId": "e61890d6-3f2f-4008-a83d-b7272aeecd11"
},
"source": [
"# Drop the header row, keep columns 1, 5, 9, 13 and 14 (tau1, p1, g1, stab,\n",
"# stabf), shuffle the rows and cut the stream into 48000-line pieces.\n",
"!tail -n +2 smart_grid_stability_augmented.csv | cut -d \",\" -f 1,5,9,13,14 | shuf | split -l 48000\n",
"# split names its pieces xaa, xab, ...: the first piece is the training set.\n",
"!mv xaa train.csv\n",
"# Cut the remaining rows into 6000-line pieces: test and validation.\n",
"!mv xab toDivide\n",
"!split -l 6000 toDivide\n",
"!mv xaa test.csv\n",
"!mv xab valid.csv\n",
"# Report the number of rows in each subset.\n",
"!wc -l train.csv\n",
"!wc -l test.csv\n",
"!wc -l valid.csv\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"48000 train.csv\n",
"6000 test.csv\n",
"6000 valid.csv\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HhVyLU0XIZgt"
},
"source": [
"# script for lab IUM-05 - Model and training\n",
"\n",
"### first run the lab IUM-01 cells above! The cells below use `X_train`, `Y_train`, `X_test` and `Y_test` created there."
]
},
{
"cell_type": "code",
"metadata": {
"id": "dGX3WRlPIQYO"
},
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras import layers\n",
"\n",
"# Hidden layers get a non-linear activation: Dense applies none by default,\n",
"# and stacked linear layers collapse into a single linear map.\n",
"model = tf.keras.Sequential([\n",
"    layers.Input(shape=(12,)),\n",
"    layers.Dense(32, activation='relu'),\n",
"    layers.Dense(16, activation='relu'),\n",
"    # Two outputs, one per class; the labels are one-hot encoded to match.\n",
"    layers.Dense(2, activation='softmax')\n",
"])\n",
"\n",
"model.compile(\n",
"    loss=tf.losses.BinaryCrossentropy(),\n",
"    optimizer=tf.optimizers.Adam(),\n",
"    metrics=[tf.keras.metrics.BinaryAccuracy()])"
],
"execution_count": 32,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "l6H4gchZqpBM"
},
"source": [
"import numpy as np\n",
"\n",
"def onezero(label):\n",
"    \"\"\"Encode a stability label: 'unstable' -> 0, anything else -> 1.\"\"\"\n",
"    if label == 'unstable':\n",
"        return 0\n",
"    return 1\n",
"\n",
"\n",
"# Integer class ids first, then one-hot rows picked from the 2x2 identity matrix.\n",
"Y_train_one_zero = list(map(onezero, Y_train))\n",
"Y_train_onehot = np.eye(2)[Y_train_one_zero]\n",
"\n",
"Y_test_one_zero = list(map(onezero, Y_test))\n",
"Y_test_onehot = np.eye(2)[Y_test_one_zero]\n"
],
"execution_count": 39,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZJnjZvMOqcxr",
"outputId": "119a86a5-d9de-41f2-9ff7-c4576aec11ab"
},
"source": [
"# Convert the training features to a float32 tensor, then train for 5 epochs.\n",
"X_train_tensor = tf.convert_to_tensor(X_train, dtype=np.float32)\n",
"history = model.fit(X_train_tensor, Y_train_onehot, epochs=5)"
],
"execution_count": 36,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/5\n",
"1500/1500 [==============================] - 2s 1ms/step - loss: 0.3926 - binary_accuracy: 0.8149\n",
"Epoch 2/5\n",
"1500/1500 [==============================] - 2s 1ms/step - loss: 0.3925 - binary_accuracy: 0.8161\n",
"Epoch 3/5\n",
"1500/1500 [==============================] - 2s 1ms/step - loss: 0.3922 - binary_accuracy: 0.8150\n",
"Epoch 4/5\n",
"1500/1500 [==============================] - 2s 1ms/step - loss: 0.3927 - binary_accuracy: 0.8146\n",
"Epoch 5/5\n",
"1500/1500 [==============================] - 2s 1ms/step - loss: 0.3926 - binary_accuracy: 0.8143\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a-TyrAertSsr",
"outputId": "20f2996b-b2c8-494d-8e1b-c612acb2ede0"
},
"source": [
"# Print the layers of the model with their output shapes and parameter counts.\n",
"model.summary()"
],
"execution_count": 35,
"outputs": [
{
"output_type": "stream",
"text": [
"Model: \"sequential_6\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"dense_28 (Dense) (None, 32) 416 \n",
"_________________________________________________________________\n",
"dense_29 (Dense) (None, 16) 528 \n",
"_________________________________________________________________\n",
"dense_30 (Dense) (None, 2) 34 \n",
"=================================================================\n",
"Total params: 978\n",
"Trainable params: 978\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tBzh-0yLWV1P",
"outputId": "8886d20a-5a2f-4385-bb9d-a033d4f4d50b"
},
"source": [
"# evaluate() returns the loss followed by the single compiled metric.\n",
"results = model.evaluate(X_test, Y_test_onehot, batch_size=64)\n",
"test_loss, test_acc = results\n",
"print('test loss: ', test_loss)\n",
"print('test acc: ', test_acc)"
],
"execution_count": 41,
"outputs": [
{
"output_type": "stream",
"text": [
"94/94 [==============================] - 0s 1ms/step - loss: 0.3933 - binary_accuracy: 0.8112\n",
"test loss: 0.3933383822441101\n",
"test acc: 0.8111666440963745\n"
],
"name": "stdout"
}
]
}
]
}