{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"ium01.ipynb","provenance":[],"collapsed_sections":[],"mount_file_id":"1Z43Re5xIaiFOO8c1uCDSbP5Xf4BxmRqM","authorship_tag":"ABX9TyNb+bVyOCogjiRTMUYEJ5AR"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"zn8GQjYWnbcX"},"source":["# Notebook for first substask of Inżynieria Uczenia Maszynowego class project.\n","This workbook downloads, normalizes and prints short summary of the dataset I will be working on and its subsets.\n","\n","Link to the dataset at Kaggle.com:\n","\n","https://www.kaggle.com/pcbreviglieri/smart-grid-stability"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Z14xGWuJnWwq","executionInfo":{"status":"ok","timestamp":1616345223048,"user_tz":-60,"elapsed":21202,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}},"outputId":"d221b1c6-8331-4124-f2f2-52cfbaeb3283"},"source":["# google colab related stuff\n","from google.colab import drive\n","drive.mount('drive')"],"execution_count":1,"outputs":[{"output_type":"stream","text":["Mounted at /gdrive\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"mROvxIELsVv1"},"source":["* Click in Colab GUI to allow Colab access and modify Google Drive files"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"hVfCOcburj5P","executionInfo":{"status":"ok","timestamp":1616345349978,"user_tz":-60,"elapsed":4575,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}},"outputId":"510fdcf3-baa5-4103-a438-22f363f9e10e"},"source":["!mkdir ~/.kaggle\n","!cp drive/MyDrive/kaggle.json ~/.kaggle/.\n","!chmod +x ~/.kaggle/kaggle.json\n","!pip install -q kaggle"],"execution_count":9,"outputs":[{"output_type":"stream","text":["mkdir: cannot create directory ‘/root/.kaggle’: File exists\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"EYeZaE3Cxf5i"},"source":["# script"]},{"cell_type":"markdown","metadata":{"id":"SRF-igrsma-A"},"source":["download data"]},{"cell_type":"code","metadata":{"id":"3UjQJzTawfKH","executionInfo":{"status":"ok","timestamp":1616345365360,"user_tz":-60,"elapsed":3560,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1\n","!unzip smart-grid-stability.zip >>/dev/null 2>&1"],"execution_count":10,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"mkK6wZ2zmhdQ"},"source":["read the data as pandas data frame"]},{"cell_type":"code","metadata":{"id":"JcPbvjeixwQa","executionInfo":{"status":"ok","timestamp":1616345367508,"user_tz":-60,"elapsed":915,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["import pandas as pd\n","\n","df = pd.read_csv('smart_grid_stability_augmented.csv')"],"execution_count":11,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x81Ip-6fmnfr"},"source":["normalize values, so they are all between 0 and 1 (included)"]},{"cell_type":"code","metadata":{"id":"7QZX5c2ZMpTj","executionInfo":{"status":"ok","timestamp":1616345371911,"user_tz":-60,"elapsed":1367,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["from sklearn import preprocessing\n","\n","min_max_scaler = preprocessing.MinMaxScaler()\n","df_norm_array = min_max_scaler.fit_transform(df.iloc[:,0:-1])\n","df_norm = pd.DataFrame(data=df_norm_array,\n","                       columns=df.columns[:-1])\n","df_norm['stabf'] = df['stabf']"],"execution_count":12,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"hjAT_K-Cmzhq"},"source":["divide the data into train, test and validation subsets"]},{"cell_type":"code","metadata":{"id":"MvI7kiL0UPc8","executionInfo":{"status":"ok","timestamp":1616345374785,"user_tz":-60,"elapsed":851,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}}},"source":["from sklearn.model_selection import train_test_split\n","\n","train, testAndValid = train_test_split(\n","    df_norm,\n","    test_size=0.2,\n","    random_state=42,\n","    stratify=df_norm['stabf'])\n","\n","test, valid =train_test_split(\n","    testAndValid,\n","    test_size=0.5,\n","    random_state=42,\n","    stratify=testAndValid['stabf'])"],"execution_count":13,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"FdUL87MgnE2G"},"source":["print short summary of the dataset and its subsets"]},{"cell_type":"code","metadata":{"id":"WUrX63SGcHSB","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1616345377648,"user_tz":-60,"elapsed":932,"user":{"displayName":"jadenadjezioro","photoUrl":"","userId":"13576387580000290170"}},"outputId":"59b0aae8-ddef-4da3-f669-74744f04d3d8"},"source":["def namestr(obj, namespace):\n","  return [name for name in namespace if namespace[name] is obj]\n","\n","dataset = df_norm\n","for x in [dataset, train, test, valid]:\n","  print([q for q in namestr(x, globals()) if len(q) == max([len(w) for w in namestr(x, globals())])][-1]) \n","  print(\"size:\", len(x))\n","  print(x.describe(include='all'))\n","  print(\"class distribution\", x.value_counts('stabf'))"],"execution_count":14,"outputs":[{"output_type":"stream","text":["dataset\n","size: 60000\n","                tau1          tau2  ...          stab     stabf\n","count   60000.000000  60000.000000  ...  60000.000000     60000\n","unique           NaN           NaN  ...           NaN         2\n","top              NaN           NaN  ...           NaN  unstable\n","freq             NaN           NaN  ...           NaN     38280\n","mean        0.499986      0.500001  ...      0.507411       NaN\n","std         0.288717      0.288687  ...      0.194136       NaN\n","min         0.000000      0.000000  ...      0.000000       NaN\n","25%         0.249940      0.249994  ...      0.342879       NaN\n","50%         0.499987      0.499999  ...      0.514830       NaN\n","75%         0.749988      0.749998  ...      0.660687       NaN\n","max         1.000000      1.000000  ...      1.000000       NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable    38280\n","stable      21720\n","dtype: int64\n","train\n","size: 48000\n","                tau1          tau2  ...          stab     stabf\n","count   48000.000000  48000.000000  ...  48000.000000     48000\n","unique           NaN           NaN  ...           NaN         2\n","top              NaN           NaN  ...           NaN  unstable\n","freq             NaN           NaN  ...           NaN     30624\n","mean        0.499540      0.499693  ...      0.507241       NaN\n","std         0.288985      0.288427  ...      0.194052       NaN\n","min         0.000000      0.000000  ...      0.000000       NaN\n","25%         0.249152      0.250486  ...      0.342846       NaN\n","50%         0.499484      0.498519  ...      0.514544       NaN\n","75%         0.750688      0.748670  ...      0.660581       NaN\n","max         1.000000      1.000000  ...      1.000000       NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable    30624\n","stable      17376\n","dtype: int64\n","test\n","size: 6000\n","               tau1         tau2  ...         stab     stabf\n","count   6000.000000  6000.000000  ...  6000.000000      6000\n","unique          NaN          NaN  ...          NaN         2\n","top             NaN          NaN  ...          NaN  unstable\n","freq            NaN          NaN  ...          NaN      3828\n","mean       0.506892     0.503728  ...     0.508099       NaN\n","std        0.288297     0.289193  ...     0.193904       NaN\n","min        0.000000     0.000235  ...     0.000000       NaN\n","25%        0.257491     0.253063  ...     0.342817       NaN\n","50%        0.512256     0.505947  ...     0.517085       NaN\n","75%        0.756686     0.760497  ...     0.661664       NaN\n","max        0.999950     0.999837  ...     1.000000       NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable    3828\n","stable      2172\n","dtype: int64\n","valid\n","size: 6000\n","               tau1         tau2  ...         stab     stabf\n","count   6000.000000  6000.000000  ...  6000.000000      6000\n","unique          NaN          NaN  ...          NaN         2\n","top             NaN          NaN  ...          NaN  unstable\n","freq            NaN          NaN  ...          NaN      3828\n","mean       0.496651     0.498741  ...     0.508078       NaN\n","std        0.286937     0.290278  ...     0.195063       NaN\n","min        0.000000     0.000000  ...     0.000000       NaN\n","25%        0.247513     0.243721  ...     0.343408       NaN\n","50%        0.495008     0.505151  ...     0.515111       NaN\n","75%        0.738276     0.750593  ...     0.660326       NaN\n","max        1.000000     1.000000  ...     1.000000       NaN\n","\n","[11 rows x 14 columns]\n","class distribution stabf\n","unstable    3828\n","stable      2172\n","dtype: int64\n"],"name":"stdout"}]}]}