9.0 KiB
9.0 KiB
Notebook for the first subtask of the Inżynieria Uczenia Maszynowego (Machine Learning Engineering) class project.
This notebook downloads and normalizes the dataset I will be working on, then prints a short summary of the dataset and its subsets.
Link to the dataset at Kaggle.com: https://www.kaggle.com/datasets/pcbreviglieri/smart-grid-stability
# Google Colab setup: mount Google Drive into this runtime at the relative
# path 'drive' (the kaggle.json copy step reads from drive/MyDrive).
from google.colab import drive
drive.mount('drive')
Mounted at /gdrive
- When prompted in the Colab GUI, allow Colab to access and modify your Google Drive files
!mkdir ~/.kaggle
!cp drive/MyDrive/kaggle.json ~/.kaggle/.
!chmod +x ~/.kaggle/kaggle.json
!pip install -q kaggle
mkdir: cannot create directory ‘/root/.kaggle’: File exists
script
download data
!kaggle datasets download -d 'pcbreviglieri/smart-grid-stability' >>/dev/null 2>&1
!unzip smart-grid-stability.zip >>/dev/null 2>&1
read the data into a pandas DataFrame
import pandas as pd
# Load the CSV into a DataFrame.
# NOTE(review): presumably this file is extracted from the downloaded
# smart-grid-stability.zip archive — confirm.
df = pd.read_csv('smart_grid_stability_augmented.csv')
normalize the values so that they all lie between 0 and 1 (inclusive)
from sklearn import preprocessing

# Rescale every column except the last one to the [0, 1] range; the last
# column ('stabf', the class label) is copied over unchanged afterwards.
# NOTE(review): the scaler is fitted on the full dataset before it is split
# into train/test/valid, so all subsets share the same min/max statistics —
# confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
df_norm_array = min_max_scaler.fit_transform(df.iloc[:, :-1])
df_norm = pd.DataFrame(df_norm_array, columns=df.columns[:-1])
df_norm['stabf'] = df['stabf']
divide the data into train, test and validation subsets
from sklearn.model_selection import train_test_split

# First hold out 20% of the rows, then cut that hold-out in half, giving an
# 80% / 10% / 10% train / test / valid partition. Both splits are stratified
# on 'stabf' and use a fixed seed so the partition is reproducible.
train, testAndValid = train_test_split(
    df_norm, stratify=df_norm['stabf'], test_size=0.2, random_state=42
)
test, valid = train_test_split(
    testAndValid, stratify=testAndValid['stabf'], test_size=0.5, random_state=42
)
print a short summary of the dataset and its subsets
def namestr(obj, namespace):
    """Return the names in *namespace* that are bound to *obj* itself.

    Matching is by identity (``is``), not equality, so a distinct but equal
    object does not count. Names are returned in the iteration order of
    *namespace*; the list is empty when nothing matches.
    """
    matches = []
    for name in namespace:
        if namespace[name] is obj:
            matches.append(name)
    return matches
# Module-level alias for the full normalized dataset.
dataset = df_norm

# Pair every frame with its label explicitly. Recovering the label by scanning
# globals() for identity matches is fragile: the loop variable and any alias
# of the same frame also match, so the printed name depends on name lengths
# and on the order in which the globals were defined.
for label, subset in (
    ("dataset", dataset),
    ("train", train),
    ("test", test),
    ("valid", valid),
):
    print(label)
    print("size:", len(subset))
    print(subset.describe(include='all'))
    print("class distribution", subset.value_counts('stabf'))
dataset size: 60000 tau1 tau2 ... stab stabf count 60000.000000 60000.000000 ... 60000.000000 60000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 38280 mean 0.499986 0.500001 ... 0.507411 NaN std 0.288717 0.288687 ... 0.194136 NaN min 0.000000 0.000000 ... 0.000000 NaN 25% 0.249940 0.249994 ... 0.342879 NaN 50% 0.499987 0.499999 ... 0.514830 NaN 75% 0.749988 0.749998 ... 0.660687 NaN max 1.000000 1.000000 ... 1.000000 NaN [11 rows x 14 columns] class distribution stabf unstable 38280 stable 21720 dtype: int64 train size: 48000 tau1 tau2 ... stab stabf count 48000.000000 48000.000000 ... 48000.000000 48000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 30624 mean 0.499540 0.499693 ... 0.507241 NaN std 0.288985 0.288427 ... 0.194052 NaN min 0.000000 0.000000 ... 0.000000 NaN 25% 0.249152 0.250486 ... 0.342846 NaN 50% 0.499484 0.498519 ... 0.514544 NaN 75% 0.750688 0.748670 ... 0.660581 NaN max 1.000000 1.000000 ... 1.000000 NaN [11 rows x 14 columns] class distribution stabf unstable 30624 stable 17376 dtype: int64 test size: 6000 tau1 tau2 ... stab stabf count 6000.000000 6000.000000 ... 6000.000000 6000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 3828 mean 0.506892 0.503728 ... 0.508099 NaN std 0.288297 0.289193 ... 0.193904 NaN min 0.000000 0.000235 ... 0.000000 NaN 25% 0.257491 0.253063 ... 0.342817 NaN 50% 0.512256 0.505947 ... 0.517085 NaN 75% 0.756686 0.760497 ... 0.661664 NaN max 0.999950 0.999837 ... 1.000000 NaN [11 rows x 14 columns] class distribution stabf unstable 3828 stable 2172 dtype: int64 valid size: 6000 tau1 tau2 ... stab stabf count 6000.000000 6000.000000 ... 6000.000000 6000 unique NaN NaN ... NaN 2 top NaN NaN ... NaN unstable freq NaN NaN ... NaN 3828 mean 0.496651 0.498741 ... 0.508078 NaN std 0.286937 0.290278 ... 0.195063 NaN min 0.000000 0.000000 ... 0.000000 NaN 25% 0.247513 0.243721 ... 0.343408 NaN 50% 0.495008 0.505151 ... 
0.515111 NaN 75% 0.738276 0.750593 ... 0.660326 NaN max 1.000000 1.000000 ... 1.000000 NaN [11 rows x 14 columns] class distribution stabf unstable 3828 stable 2172 dtype: int64