"""Clean, split and scale the meteorite landings dataset into three CSV files."""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def split(data):
    """Split *data* into train, test and validation subsets.

    The first split holds out 20 % of the rows for testing; the second takes
    25 % of the remaining rows for validation. Both use ``random_state=1``.

    Returns:
        A ``(train, test, val)`` tuple -- note that the test set comes
        before the validation set.
    """
    meteorite_train, meteorite_test = train_test_split(
        data, test_size=0.2, random_state=1
    )
    meteorite_train, meteorite_val = train_test_split(
        meteorite_train, test_size=0.25, random_state=1
    )
    return meteorite_train, meteorite_test, meteorite_val


def normalization(data, scaler=None):
    """Return a copy of *data* with the ``mass`` column standardized.

    Args:
        data: DataFrame containing a ``mass`` column.
        scaler: Optional, already fitted scaler. When given, it is only
            applied (``transform``), so validation/test data can be scaled
            with the statistics learned from the training data. When
            omitted, a fresh ``StandardScaler`` is fitted on *data* itself.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # Work on a copy: the splits are slices of the original frame and
    # assigning into them directly triggers SettingWithCopy problems.
    data = data.copy()
    if scaler is None:
        data['mass'] = StandardScaler().fit_transform(data[['mass']])
    else:
        data['mass'] = scaler.transform(data[['mass']])
    return data


def preprocessing(data):
    """Return a cleaned copy of *data*.

    Steps:
        1. Drop rows without a ``reclat`` value.
        2. Drop rows whose ``year`` is after 2016 or before 860.
        3. Drop rows located at exactly ``reclat == 0`` and ``reclong == 0``.
        4. Set a missing ``mass`` to 0 for rows whose ``name`` starts with
           'Österplana'.

    Rows with a missing ``year`` are kept, because both comparisons in
    step 2 evaluate to False for NaN.
    """
    data = data.dropna(subset=['reclat'])

    incorrect_year = (data['year'] > 2016) | (data['year'] < 860)
    incorrect_location = (data['reclat'] == 0) & (data['reclong'] == 0)
    # Explicit copy so the assignment below never writes into a view.
    data = data.loc[~(incorrect_year | incorrect_location)].copy()

    # na=False keeps the mask strictly boolean when a name is missing.
    missing_mass = data['mass'].isnull() & data['name'].str.startswith(
        'Österplana', na=False
    )
    data.loc[missing_mass, 'mass'] = 0
    return data


def main():
    """Read the raw CSV, build the three subsets and write them to disk."""
    data = pd.read_csv("meteorite-landings.csv")
    meteorite_train, meteorite_test, meteorite_val = split(data)

    # Clean every subset after splitting, so the row-to-subset assignment
    # produced by the fixed random_state stays the same.
    meteorite_train = preprocessing(meteorite_train)
    meteorite_test = preprocessing(meteorite_test)
    meteorite_val = preprocessing(meteorite_val)

    # Fit the scaler on the training data only and reuse it for the other
    # subsets, so all three share one scale and no test/validation
    # statistics leak into the transformation.
    scaler = StandardScaler().fit(meteorite_train[['mass']])
    meteorite_train = normalization(meteorite_train, scaler)
    meteorite_test = normalization(meteorite_test, scaler)
    meteorite_val = normalization(meteorite_val, scaler)

    meteorite_train.to_csv('meteorite_train.csv', encoding='utf-8')
    meteorite_test.to_csv('meteorite_test.csv', encoding='utf-8')
    meteorite_val.to_csv('meteorite_val.csv', encoding='utf-8')


if __name__ == "__main__":
    main()