# ium_464914/IUM_2.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def split(data):
    # 80/20 train/test, then 75/25 train/val within the training portion,
    # giving roughly a 60/20/20 split overall
    forest_train, forest_test = train_test_split(data, test_size=0.2, random_state=1)
    forest_train, forest_val = train_test_split(forest_train, test_size=0.25, random_state=1)
    return forest_train, forest_test, forest_val
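
# Optional: a minimal sketch (not called anywhere in this script) of a
# stratified variant of split(), assuming 'Cover_Type' is the label column
# (it is referenced in normalization() below). train_test_split's `stratify`
# argument keeps the class proportions similar across the three splits.
def split_stratified(data):
    forest_train, forest_test = train_test_split(
        data, test_size=0.2, random_state=1, stratify=data['Cover_Type'])
    forest_train, forest_val = train_test_split(
        forest_train, test_size=0.25, random_state=1,
        stratify=forest_train['Cover_Type'])
    return forest_train, forest_test, forest_val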
def normalization(data):
    scaler = StandardScaler()
    # Standardize every column except the one-hot Soil_Type columns
    # and the Cover_Type label
    columns_to_normalize = data.columns[~data.columns.str.startswith('Soil_Type')]
    columns_to_normalize = columns_to_normalize.to_list()
    columns_to_normalize.remove('Cover_Type')
    data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])
    return data
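
# Optional: a minimal sketch (not used below) of an alternative normalization
# that fits the scaler on the training split only and reuses it for the test
# and validation splits, so their statistics do not influence the scaling.
# Column selection mirrors normalization() above.
def normalization_fit_on_train(train, test, val):
    columns = train.columns[~train.columns.str.startswith('Soil_Type')].to_list()
    columns.remove('Cover_Type')
    scaler = StandardScaler()
    train[columns] = scaler.fit_transform(train[columns])
    test[columns] = scaler.transform(test[columns])
    val[columns] = scaler.transform(val[columns])
    return train, test, val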
def preprocessing(data):
    # Shuffle the rows of the split
    data = data.sample(frac=1)
    return data
# Load the Covertype dataset and split it into train/test/validation sets
data = pd.read_csv("covtype.csv")
forest_train, forest_test, forest_val = split(data)
forest_train = preprocessing(forest_train)
forest_test = preprocessing(forest_test)
forest_val = preprocessing(forest_val)
forest_train = normalization(forest_train)
forest_test = normalization(forest_test)
forest_val = normalization(forest_val)
forest_train.to_csv('forest_train.csv', encoding='utf-8', index=False)
forest_test.to_csv('forest_test.csv', encoding='utf-8', index=False)
forest_val.to_csv('forest_val.csv', encoding='utf-8', index=False)
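
# Optional sanity check (not part of the original pipeline): the three files
# should hold roughly 60/20/20 of the original rows.
print(len(forest_train), len(forest_test), len(forest_val), len(data))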