38 lines
1.1 KiB
Python
38 lines
1.1 KiB
Python
import pandas as pd
|
|
|
|
from sklearn import preprocessing
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
# Load the raw table. Every column except the last one is standardised
# below; the 'stabf' column is copied over unscaled afterwards.
# NOTE(review): presumably 'stabf' is the last column (the class label) --
# confirm against the CSV header.
df = pd.read_csv('smart_grid_stability_augmented.csv')

# NOTE(review): the scaler is fitted on all rows before the split further
# down, so held-out rows contribute to the statistics used for scaling.
# Fit on the training part only if that leakage matters for the experiment.
scaler = preprocessing.StandardScaler()
scaler.fit(df.iloc[:, :-1])
df_norm_array = scaler.transform(df.iloc[:, :-1])

# Wrap the scaled values in a DataFrame that reuses the original column
# names, then re-attach the unscaled 'stabf' column.
df_norm = pd.DataFrame(df_norm_array, columns=df.columns[:-1])
df_norm['stabf'] = df['stabf']

# First cut: 80% of the rows go to train, 20% are held out; the split is
# stratified on 'stabf' and seeded for reproducibility.
train, testAndValid = train_test_split(
    df_norm, test_size=0.2, random_state=42, stratify=df_norm['stabf'])

# Second cut: the held-out rows are halved into test and valid, again
# stratified on 'stabf' with the same seed.
test, valid = train_test_split(
    testAndValid, test_size=0.5, random_state=42,
    stratify=testAndValid['stabf'])
|
|
|
|
|
|
def namestr(obj, namespace):
    """Return every name in *namespace* that is bound to *obj* itself.

    Matching is by identity (``is``), not equality, so an equal but
    distinct object is not reported.

    Args:
        obj: The object to look for.
        namespace: A mapping of names to objects, e.g. ``globals()``.

    Returns:
        A list of matching names in the mapping's iteration order; empty
        when nothing in *namespace* refers to *obj*.
    """
    return [name for name, value in namespace.items() if value is obj]
|
|
|
|
|
|
# The full normalised table is reported under the name 'dataset'.
dataset = df_norm

# Each frame is paired with an explicit label. Recovering the label by
# searching globals() for the longest alias of the object is fragile: any
# additional global that happens to reference the same frame could change
# the printed name, and the lookup was repeated for every candidate name.
for label, frame in (
    ('dataset', dataset),
    ('train', train),
    ('test', test),
    ('valid', valid),
):
    print(label)
    print("size:", len(frame))
    # include='all' makes describe() summarise every column, not only the
    # numeric ones.
    print(frame.describe(include='all'))
    # Row count per distinct 'stabf' value.
    print("class distribution", frame.value_counts('stabf'))
    print('===============================================================')
|