# Prepare the avocado dataset: load the CSV, one-hot encode the text columns
# and min-max scale every column into one numeric frame (`avocados_normed`).
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# Header names of the raw CSV (only the first data row is parsed for this).
# Not referenced again by the code visible in this section.
cols = list(pd.read_csv("data/avocado.csv", nrows=1))

# The column pandas labels "Unnamed: 0" is renamed to "Week".
avocados = pd.read_csv(
    "data/avocado.csv").rename(columns={"Unnamed: 0": "Week"})

# Columns stored with the generic `object` dtype are treated as categorical.
obj_cols = (avocados.dtypes == "object")
object_cols = list(obj_cols[obj_cols].index)
print("Object columns: ", object_cols)

# One-hot encode the categorical columns.
# The encoder is left at its default sparse output and densified with
# `.toarray()`: the former `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, where it raises a TypeError.
enc = OneHotEncoder(handle_unknown="ignore")
ohe_df = pd.DataFrame(enc.fit_transform(avocados[object_cols]).toarray())
# The encoded frame gets a fresh RangeIndex; realign it with the source rows
# so the column-wise concat below pairs rows correctly.
ohe_df.index = avocados.index

# Swap the original text columns for their indicator columns. The indicator
# columns keep their default integer labels (0, 1, 2, ...).
avocados = pd.concat([avocados.drop(object_cols, axis=1), ohe_df], axis=1)
all_cols = avocados.columns
print(all_cols)

# Min-max scale every column. `.values` hands the scaler a bare array, so the
# mixed string/integer column labels never reach scikit-learn.
# NOTE(review): the scaler is fitted on the full frame, before any
# train/validation/test split made later in the file, so statistics of the
# held-out rows influence the scaling of the training rows. Every column still
# in the frame is scaled (nothing is dropped beforehand). Left unchanged to
# keep the generated files identical; consider fitting on the training split
# only.
mM = MinMaxScaler()
avocados_normed = pd.DataFrame(
    mM.fit_transform(avocados.values), columns=all_cols)
print(avocados_normed.head())
# Preview of the encoded frame (the un-scaled `avocados`, not the scaled copy).
print(avocados.head())

# First carve off 2000 rows for testing, then take 2249 of the remaining rows
# for validation. Both calls use the same fixed seed so the split is
# reproducible.
remainder, avocado_test = train_test_split(
    avocados_normed, test_size=2000, random_state=3337)
avocado_train, avocado_valid = train_test_split(
    remainder, test_size=2249, random_state=3337)

# Summary statistics per split. The last entry is printed without the extra
# trailing newline argument, matching the original output exactly.
report = [
    ("Train\n", avocado_train, ["\n"]),
    ("Valid\n", avocado_valid, ["\n"]),
    ("Test\n", avocado_test, []),
]
for heading, frame, trailer in report:
    print(heading, frame.describe(include="all"), *trailer)

# Write each split to its own CSV file, without the index column.
outputs = {
    "data/avocado.data.train": avocado_train,
    "data/avocado.data.valid": avocado_valid,
    "data/avocado.data.test": avocado_test,
}
for path, frame in outputs.items():
    frame.to_csv(path, index=False)