add code for Zadanie 1

2021-03-21 18:41:48 +01:00 · 2021-03-21 18:41:48 +01:00 · 13abcc4551
commit 13abcc4551
5 changed files with 57 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 .idea
 *.iml
--- a/README.md
+++ b/README.md
@ -0,0 +1,12 @@
 ## Project for Inżynieria Uczenia Maszynowego class.
 The scope of this project is to propose a classifier based on the Smart Grid Stability dataset:
 https://www.kaggle.com/pcbreviglieri/smart-grid-stability
 while using proper ML tools in a correct way.
 ### Zadanie 1
 script.sh downloads and unzips the dataset and executes python_script.py,
 which then normalizes the data, divides the dataset into train, test and validation subsets
 and prints a short summary of the dataset as well as its subsets.
 ium01.ipynb is a notebook used to develop the previously mentioned scripts.
--- a/ium01.ipynb
+++ b/ium01.ipynb
--- a/python_script.py
+++ b/python_script.py
@ -0,0 +1,36 @@
 import pandas as pd
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split
 # Load the dataset from the CSV expected in the current working directory.
 df = pd.read_csv('smart_grid_stability_augmented.csv')
 # Every column except the last one is min-max scaled; the last column is
 # presumably the 'stabf' class label (it is re-attached unscaled below) --
 # verify against the CSV header.
 feature_columns = df.columns[:-1]
 min_max_scaler = preprocessing.MinMaxScaler()
 # NOTE(review): the scaler is fitted on the full dataset before it is split,
 # so the test/validation rows influence the scaling parameters -- confirm
 # this is acceptable for the downstream classifier.
 df_norm_array = min_max_scaler.fit_transform(df.iloc[:, :-1])
 df_norm = pd.DataFrame(data=df_norm_array, columns=feature_columns)
 df_norm['stabf'] = df['stabf']
 # First split: 80% train, 20% held out, stratified on the class label.
 train, testAndValid = train_test_split(
      df_norm, test_size=0.2, random_state=42, stratify=df_norm['stabf'])
 # Second split: the held-out 20% is halved into test and validation subsets
 # (10% of the dataset each), again stratified on the class label.
 test, valid = train_test_split(
      testAndValid, test_size=0.5, random_state=42,
      stratify=testAndValid['stabf'])
 def namestr(obj, namespace):
      """Return every name in *namespace* that is bound to *obj* itself.

      Matching is by identity (``is``), not equality, and the names are
      returned in the namespace's iteration order.
      """
      return [name for name, value in namespace.items() if value is obj]
 # 'dataset' is the full normalised frame; it is summarised next to its subsets.
 dataset = df_norm
 # Labels are spelled out explicitly instead of being recovered by scanning
 # globals() for aliases of each frame: that lookup depended on which global
 # names happened to exist (including the loop variable itself) and was
 # recomputed twice per iteration. The printed labels are unchanged.
 for label, frame in [('dataset', dataset), ('train', train),
                      ('test', test), ('valid', valid)]:
      print(label)
      print("size:", len(frame))
      # include='all' also summarises the non-numeric 'stabf' column.
      print(frame.describe(include='all'))
      print("class distribution", frame.value_counts('stabf'))
--- a/script.sh
+++ b/script.sh
@ -0,0 +1,6 @@
 #!/bin/bash
 # Download the Smart Grid Stability dataset, unpack it and run the Python
 # summary script. Tool output stays silenced as before, but each step is now
 # checked so python never runs against a missing or stale CSV.
 kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >/dev/null 2>&1 || {
      echo "script.sh: downloading the dataset with kaggle failed" >&2
      exit 1
 }
 # -o overwrites existing files without prompting; without it a re-run waits
 # on an invisible "replace?" prompt because unzip's output goes to /dev/null.
 unzip -o smart-grid-stability.zip >/dev/null 2>&1 || {
      echo "script.sh: unpacking smart-grid-stability.zip failed" >&2
      exit 1
 }
 python python_script.py