"""Standardize the smart-grid stability data and report split statistics.

Reads ``smart_grid_stability_augmented.csv``, standardizes every column
except the last one, re-attaches the ``stabf`` label column, and produces a
stratified 80/10/10 train/test/validation split.  A summary of the full
dataset and of each split is printed to stdout.
"""
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

df = pd.read_csv('smart_grid_stability_augmented.csv')

# Every column but the last is treated as a numeric feature.
# NOTE(review): assumes 'stabf' is the last column of the CSV -- confirm.
# NOTE(review): the scaler is fitted on the full dataset *before* splitting,
# so test/validation statistics influence the scaling (data leakage).
# Fitting on `train` only would be cleaner, but it would change every
# numeric value produced below, so it is flagged here rather than changed.
scaler = preprocessing.StandardScaler().fit(df.iloc[:, 0:-1])
df_norm_array = scaler.transform(df.iloc[:, 0:-1])
df_norm = pd.DataFrame(data=df_norm_array, columns=df.columns[:-1])
df_norm['stabf'] = df['stabf']

# 80 % train, then the remaining 20 % is halved into test and validation.
# Both splits are stratified on the class label and use a fixed seed.
train, testAndValid = train_test_split(
    df_norm,
    test_size=0.2,
    random_state=42,
    stratify=df_norm['stabf'],
)
test, valid = train_test_split(
    testAndValid,
    test_size=0.5,
    random_state=42,
    stratify=testAndValid['stabf'],
)


def namestr(obj, namespace):
    """Return every name in *namespace* that is bound to *obj* (by identity).

    Retained for backward compatibility; the report below no longer relies
    on it to discover labels.
    """
    return [name for name in namespace if namespace[name] is obj]


dataset = df_norm

# Explicit labels replace the previous reflection over globals(), which
# picked the longest global name bound to each frame and only yielded
# 'dataset' (rather than 'df_norm', same length) thanks to definition order.
for label, frame in (
    ('dataset', dataset),
    ('train', train),
    ('test', test),
    ('valid', valid),
):
    print(label)
    print("size:", len(frame))
    print(frame.describe(include='all'))
    print("class distribution", frame.value_counts('stabf'))
    print('===============================================================')