commit 13abcc45512f4d9363b2fee2668b26fbeb395d8d Author: jakubknczny Date: Sun Mar 21 18:41:48 2021 +0100 add code for Zadanie 1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c38fa4e --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea +*.iml diff --git a/README.md b/README.md new file mode 100644 index 0000000..1d738fb --- /dev/null +++ b/README.md @@ -0,0 +1,12 @@ +## Project for the Inżynieria Uczenia Maszynowego class. + +The scope of this project is to propose a classifier based on the Smart Grid Stability dataset: +https://www.kaggle.com/pcbreviglieri/smart-grid-stability +while using proper ML tools in a correct way. + +### Zadanie 1 +script.sh downloads and unzips the dataset and executes python_script.py, +which then normalizes the data, divides the dataset into train, test and validation subsets +and prints a short summary of the dataset as well as its subsets. + +ium01.ipynb is a notebook used to develop the previously mentioned scripts. diff --git a/ium01.ipynb b/ium01.ipynb new file mode 100644 index 0000000..d5944f2 --- /dev/null +++ b/ium01.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"ium01.ipynb","provenance":[],"collapsed_sections":[],"mount_file_id":"1Z43Re5xIaiFOO8c1uCDSbP5Xf4BxmRqM","authorship_tag":"ABX9TyNb+bVyOCogjiRTMUYEJ5AR"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"zn8GQjYWnbcX"},"source":["# Notebook for the first subtask of the Inżynieria Uczenia Maszynowego class project.\n","This notebook downloads, normalizes and prints a short summary of the dataset I will be working on and its subsets.\n","\n","Link to the dataset at 
Kaggle.com:\n","\n","https://www.kaggle.com/pcbreviglieri/smart-grid-stability"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Z14xGWuJnWwq","executionInfo":{"status":"ok","timestamp":1616345223048,"user_tz":-60,"elapsed":21202,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}},"outputId":"d221b1c6-8331-4124-f2f2-52cfbaeb3283"},"source":["# google colab related stuff\n","from google.colab import drive\n","drive.mount('drive')"],"execution_count":1,"outputs":[{"output_type":"stream","text":["Mounted at /gdrive\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"mROvxIELsVv1"},"source":["* Click in Colab GUI to allow Colab access and modify Google Drive files"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hVfCOcburj5P","executionInfo":{"status":"ok","timestamp":1616345349978,"user_tz":-60,"elapsed":4575,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}},"outputId":"510fdcf3-baa5-4103-a438-22f363f9e10e"},"source":["!mkdir ~/.kaggle\n","!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n","!chmod +x ~/.kaggle/kaggle.json\n","!pip install -q kaggle"],"execution_count":9,"outputs":[{"output_type":"stream","text":["mkdir: cannot create directory ‘/root/.kaggle’: File exists\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"EYeZaE3Cxf5i"},"source":["# script"]},{"cell_type":"markdown","metadata":{"id":"SRF-igrsma-A"},"source":["download data"]},{"cell_type":"code","metadata":{"id":"3UjQJzTawfKH","executionInfo":{"status":"ok","timestamp":1616345365360,"user_tz":-60,"elapsed":3560,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n","!unzip smart-grid-stability.zip >>/dev/null 
2>&1"],"execution_count":10,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"mkK6wZ2zmhdQ"},"source":["read the data as pandas data frame"]},{"cell_type":"code","metadata":{"id":"JcPbvjeixwQa","executionInfo":{"status":"ok","timestamp":1616345367508,"user_tz":-60,"elapsed":915,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["import pandas as pd\n","\n","df = pd.read_csv('smart_grid_stability_augmented.csv')"],"execution_count":11,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x81Ip-6fmnfr"},"source":["normalize values, so they are all between 0 and 1 (included)"]},{"cell_type":"code","metadata":{"id":"7QZX5c2ZMpTj","executionInfo":{"status":"ok","timestamp":1616345371911,"user_tz":-60,"elapsed":1367,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["from sklearn import preprocessing\n","\n","min_max_scaler = preprocessing.MinMaxScaler()\n","df_norm_array = min_max_scaler.fit_transform(df.iloc[:,0:-1])\n","df_norm = pd.DataFrame(data=df_norm_array,\n"," columns=df.columns[:-1])\n","df_norm['stabf'] = df['stabf']"],"execution_count":12,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"hjAT_K-Cmzhq"},"source":["divide the data into train, test and validation subsets"]},{"cell_type":"code","metadata":{"id":"MvI7kiL0UPc8","executionInfo":{"status":"ok","timestamp":1616345374785,"user_tz":-60,"elapsed":851,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["from sklearn.model_selection import train_test_split\n","\n","train, testAndValid = train_test_split(\n"," df_norm,\n"," test_size=0.2,\n"," random_state=42,\n"," stratify=df_norm['stabf'])\n","\n","test, valid =train_test_split(\n"," testAndValid,\n"," test_size=0.5,\n"," random_state=42,\n"," stratify=testAndValid['stabf'])"],"execution_count":13,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"FdUL87MgnE2G"},"source":["print short summary of 
the dataset and its subsets"]},{"cell_type":"code","metadata":{"id":"WUrX63SGcHSB","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616345377648,"user_tz":-60,"elapsed":932,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}},"outputId":"59b0aae8-ddef-4da3-f669-74744f04d3d8"},"source":["def namestr(obj, namespace):\n"," return [name for name in namespace if namespace[name] is obj]\n","\n","dataset = df_norm\n","for x in [dataset, train, test, valid]:\n"," print([q for q in namestr(x, globals()) if len(q) == max([len(w) for w in namestr(x, globals())])][-1]) \n"," print(\"size:\", len(x))\n"," print(x.describe(include='all'))\n"," print(\"class distribution\", x.value_counts('stabf'))"],"execution_count":14,"outputs":[{"output_type":"stream","text":["dataset\n","size: 60000\n"," tau1 tau2 ... stab stabf\n","count 60000.000000 60000.000000 ... 60000.000000 60000\n","unique NaN NaN ... NaN 2\n","top NaN NaN ... NaN unstable\n","freq NaN NaN ... NaN 38280\n","mean 0.499986 0.500001 ... 0.507411 NaN\n","std 0.288717 0.288687 ... 0.194136 NaN\n","min 0.000000 0.000000 ... 0.000000 NaN\n","25% 0.249940 0.249994 ... 0.342879 NaN\n","50% 0.499987 0.499999 ... 0.514830 NaN\n","75% 0.749988 0.749998 ... 0.660687 NaN\n","max 1.000000 1.000000 ... 1.000000 NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable 38280\n","stable 21720\n","dtype: int64\n","train\n","size: 48000\n"," tau1 tau2 ... stab stabf\n","count 48000.000000 48000.000000 ... 48000.000000 48000\n","unique NaN NaN ... NaN 2\n","top NaN NaN ... NaN unstable\n","freq NaN NaN ... NaN 30624\n","mean 0.499540 0.499693 ... 0.507241 NaN\n","std 0.288985 0.288427 ... 0.194052 NaN\n","min 0.000000 0.000000 ... 0.000000 NaN\n","25% 0.249152 0.250486 ... 0.342846 NaN\n","50% 0.499484 0.498519 ... 0.514544 NaN\n","75% 0.750688 0.748670 ... 0.660581 NaN\n","max 1.000000 1.000000 ... 
1.000000 NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable 30624\n","stable 17376\n","dtype: int64\n","test\n","size: 6000\n"," tau1 tau2 ... stab stabf\n","count 6000.000000 6000.000000 ... 6000.000000 6000\n","unique NaN NaN ... NaN 2\n","top NaN NaN ... NaN unstable\n","freq NaN NaN ... NaN 3828\n","mean 0.506892 0.503728 ... 0.508099 NaN\n","std 0.288297 0.289193 ... 0.193904 NaN\n","min 0.000000 0.000235 ... 0.000000 NaN\n","25% 0.257491 0.253063 ... 0.342817 NaN\n","50% 0.512256 0.505947 ... 0.517085 NaN\n","75% 0.756686 0.760497 ... 0.661664 NaN\n","max 0.999950 0.999837 ... 1.000000 NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable 3828\n","stable 2172\n","dtype: int64\n","valid\n","size: 6000\n"," tau1 tau2 ... stab stabf\n","count 6000.000000 6000.000000 ... 6000.000000 6000\n","unique NaN NaN ... NaN 2\n","top NaN NaN ... NaN unstable\n","freq NaN NaN ... NaN 3828\n","mean 0.496651 0.498741 ... 0.508078 NaN\n","std 0.286937 0.290278 ... 0.195063 NaN\n","min 0.000000 0.000000 ... 0.000000 NaN\n","25% 0.247513 0.243721 ... 0.343408 NaN\n","50% 0.495008 0.505151 ... 0.515111 NaN\n","75% 0.738276 0.750593 ... 0.660326 NaN\n","max 1.000000 1.000000 ... 
"""Split and normalize the Smart Grid Stability dataset, then print a summary.

Reads ``smart_grid_stability_augmented.csv`` from the working directory,
makes a stratified 80/10/10 train/test/validation split, min-max scales every
non-label column and prints size, ``describe()`` and class distribution for
the whole dataset and for each subset.
"""
import pandas as pd

DATA_PATH = 'smart_grid_stability_augmented.csv'
LABEL_COLUMN = 'stabf'
RANDOM_STATE = 42


def namestr(obj, namespace):
    """Return every name in ``namespace`` that is bound to exactly ``obj``.

    Matching is by identity (``is``), not equality. Kept for backward
    compatibility; the summary loop no longer relies on it because identity
    lookups in ``globals()`` also match aliases and loop variables.
    """
    return [name for name, value in namespace.items() if value is obj]


def split_dataset(df, label_column=LABEL_COLUMN, random_state=RANDOM_STATE):
    """Split ``df`` into stratified train (80%), test (10%) and valid (10%).

    Returns:
        tuple: ``(train, test, valid)`` DataFrames that keep ``df``'s index.
    """
    # Imported lazily so the pandas-only helpers in this module stay
    # importable in environments without scikit-learn.
    from sklearn.model_selection import train_test_split

    train, test_and_valid = train_test_split(
        df,
        test_size=0.2,
        random_state=random_state,
        stratify=df[label_column])
    test, valid = train_test_split(
        test_and_valid,
        test_size=0.5,
        random_state=random_state,
        stratify=test_and_valid[label_column])
    return train, test, valid


def normalize(frames, fit_on, label_column=LABEL_COLUMN):
    """Min-max scale every non-label column of each frame in ``frames``.

    The scaler is fitted on ``fit_on`` only (the training subset) and then
    applied to every frame, so statistics of held-out rows never influence
    the normalization. Held-out values may therefore fall slightly outside
    the [0, 1] range.

    Args:
        frames: mapping of display name -> DataFrame to transform.
        fit_on: DataFrame used to fit the scaler.
        label_column: column copied through unscaled.

    Returns:
        dict: same keys as ``frames``, values are the scaled DataFrames with
        the label column appended last and the original index preserved.
    """
    from sklearn import preprocessing

    # Select features by name instead of assuming the label is the last column.
    feature_columns = [col for col in fit_on.columns if col != label_column]
    # NOTE(review): 'stab' looks like the continuous counterpart of the
    # 'stabf' label - confirm, and drop it before training a classifier.
    scaler = preprocessing.MinMaxScaler().fit(fit_on[feature_columns])

    normalized = {}
    for name, frame in frames.items():
        scaled = pd.DataFrame(
            data=scaler.transform(frame[feature_columns]),
            columns=feature_columns,
            index=frame.index)
        scaled[label_column] = frame[label_column]
        normalized[name] = scaled
    return normalized


def print_summary(name, frame, label_column=LABEL_COLUMN):
    """Print the name, row count, ``describe()`` table and class counts of ``frame``."""
    print(name)
    print("size:", len(frame))
    print(frame.describe(include='all'))
    print("class distribution", frame.value_counts(label_column))


def main():
    """Load the CSV, split it, normalize the subsets and print their summaries."""
    df = pd.read_csv(DATA_PATH)
    train, test, valid = split_dataset(df)
    # Explicit names replace the former globals()-based reverse lookup.
    frames = {'dataset': df, 'train': train, 'test': test, 'valid': valid}
    for name, frame in normalize(frames, fit_on=train).items():
        print_summary(name, frame)


if __name__ == '__main__':
    main()