From 4709691adf5c1873eae668b710c56b0fb5f2add5 Mon Sep 17 00:00:00 2001 From: Alicja Szulecka <73056579+AliSzu@users.noreply.github.com> Date: Sat, 13 Apr 2024 19:07:21 +0200 Subject: [PATCH] first script --- IUM_2.py | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/IUM_2.py b/IUM_2.py index 5fa7a25..e2807fb 100644 --- a/IUM_2.py +++ b/IUM_2.py @@ -3,38 +3,36 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler def split(data): - meteorite_train, meteorite_test = train_test_split(data, test_size=0.2, random_state=1) - meteorite_train, meteorite_val = train_test_split(meteorite_train, test_size=0.25, random_state=1) - return meteorite_train, meteorite_test, meteorite_val + forest_train, forest_test = train_test_split(data, test_size=0.2, random_state=1) + forest_train, forest_val = train_test_split(forest_train, test_size=0.25, random_state=1) + return forest_train, forest_test, forest_val def normalization(data): scaler = StandardScaler() - data['mass'] = scaler.fit_transform(data[['mass']]) + columns_to_normalize = data.columns[~data.columns.str.startswith('Soil_Type')] + columns_to_normalize = columns_to_normalize.to_list() + columns_to_normalize.remove('Cover_Type') + data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize]) return data def preprocessing(data): - data = data.dropna(subset=['reclat']) - - incorrect_years_index = data.loc[(data['year'] > 2016) | (data['year'] < 860)].index - incorrect_location_index = data.loc[(data['reclat'] == 0) & (data['reclong'] == 0)].index - - data.drop(incorrect_years_index.union(incorrect_location_index), inplace=True) - data.loc[(data['mass'].isnull()) & (data['name'].str.startswith('Österplana')), 'mass'] = 0 + #shuffle + data = data.sample(frac = 1) return data -data = pd.read_csv("meteorite-landings.csv") -meteorite_train, meteorite_test, meteorite_val = split(data) +data = pd.read_csv("covtype.csv") +forest_train, forest_test, forest_val = split(data) -meteorite_train = normalization(meteorite_train) -meteorite_test = normalization(meteorite_test) -meteorite_val = normalization(meteorite_val) +forest_train = preprocessing(forest_train) +forest_test = preprocessing(forest_test) +forest_val = preprocessing(forest_val) -meteorite_train = normalization(meteorite_train) -meteorite_test = normalization(meteorite_test) -meteorite_val = normalization(meteorite_val) +forest_train = normalization(forest_train) +forest_test = normalization(forest_test) +forest_val = normalization(forest_val) -meteorite_train.to_csv('meteorite_train.csv', encoding='utf-8') -meteorite_test.to_csv('meteorite_test.csv', encoding='utf-8') -meteorite_val.to_csv('meteorite_val.csv', encoding='utf-8') +forest_train.to_csv('forest_train.csv', encoding='utf-8', index=False) +forest_test.to_csv('forest_test.csv', encoding='utf-8', index=False) +forest_val.to_csv('forest_val.csv', encoding='utf-8', index=False)