add code for Zadanie 1
This commit is contained in:
commit
13abcc4551
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
.idea
|
||||||
|
*.iml
|
12
README.md
Normal file
12
README.md
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
## Project for Inżynieria Uczenia Maszynowego class.
|
||||||
|
|
||||||
|
The scope of this project is to propose a classifier based on the Smart Grid Stability dataset:
|
||||||
|
https://www.kaggle.com/pcbreviglieri/smart-grid-stability
|
||||||
|
while using proper ML tools in a correct way.
|
||||||
|
|
||||||
|
### Zadanie 1
|
||||||
|
script.sh downloads and unzips the dataset and executes python_script.py,
|
||||||
|
which then normalizes the data, divides the dataset into train, test and validation subsets
|
||||||
|
and prints a short summary of the dataset as well as its subsets.
|
||||||
|
|
||||||
|
ium01.ipynb is a notebook used to develop the previously mentioned scripts.
|
1
ium01.ipynb
Normal file
1
ium01.ipynb
Normal file
File diff suppressed because one or more lines are too long
36
python_script.py
Normal file
36
python_script.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
# Load the dataset CSV from the current working directory.
df = pd.read_csv('smart_grid_stability_augmented.csv')

# Min-max scale every column except the last one; the 'stabf' column is
# re-attached unscaled afterwards (presumably it is that last, label column).
# NOTE(review): the scaler is fitted on the full dataset before the split
# below, so the held-out rows influence the scaling -- confirm this is
# intended.
min_max_scaler = preprocessing.MinMaxScaler()
df_norm_array = min_max_scaler.fit_transform(df.iloc[:, :-1])
df_norm = pd.DataFrame(df_norm_array, columns=df.columns[:-1]).assign(
    stabf=df['stabf'],
)

# First split: 80% train, 20% held out, stratified on 'stabf'.
train, testAndValid = train_test_split(
    df_norm,
    stratify=df_norm['stabf'],
    test_size=0.2,
    random_state=42,
)

# Second split: divide the held-out rows evenly into test and validation
# subsets, again stratified on 'stabf'.
test, valid = train_test_split(
    testAndValid,
    stratify=testAndValid['stabf'],
    test_size=0.5,
    random_state=42,
)
|
||||||
|
|
||||||
|
|
||||||
|
def namestr(obj, namespace):
    """Return the names in *namespace* that are bound to *obj* itself.

    Matching is by identity (``is``), not equality, and the names are
    returned in the iteration order of *namespace*.
    """
    matches = []
    for candidate in namespace:
        if namespace[candidate] is obj:
            matches.append(candidate)
    return matches
|
||||||
|
|
||||||
|
|
||||||
|
dataset = df_norm

# Print a short summary of the full (normalised) dataset and of each subset.
# Each frame is paired with its label explicitly instead of recovering the
# label by scanning globals() for aliases of the object: that lookup depended
# on which other module-level names happened to reference the same frame
# (the loop variable included) and was evaluated twice per iteration.
for label, x in (
    ("dataset", dataset),
    ("train", train),
    ("test", test),
    ("valid", valid),
):
    print(label)
    print("size:", len(x))
    # include='all' so that non-numeric columns are summarised as well.
    print(x.describe(include='all'))
    print("class distribution", x.value_counts('stabf'))
|
Loading…
Reference in New Issue
Block a user