Patch for scripts/grab_avocado.py.

Summary of the change:
  * imports OneHotEncoder alongside StandardScaler and MinMaxScaler;
  * reads the CSV without usecols=cols[1:] and renames the "Unnamed: 0"
    column to 'Week';
  * one-hot encodes 'region' and 'type', appends the encoded frames, and
    drops the 'type', 'region' and 'Date' columns;
  * moves the StandardScaler step ahead of the encoding and re-wraps
    several long statements.

NOTE(review): this copy was rebuilt from a version whose line breaks had
been collapsed. Blank context lines and continuation-line indentation are
reconstructed (hunk line counts match the header: 22 old, 41 new); confirm
against the repository before applying.

diff --git a/scripts/grab_avocado.py b/scripts/grab_avocado.py
index 86d3cf9..7d3406e 100644
--- a/scripts/grab_avocado.py
+++ b/scripts/grab_avocado.py
@@ -1,22 +1,41 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 
 cols = list(pd.read_csv("data/avocado.csv", nrows=1))
 # print("###\n", cols, "\n###")
-avocados = pd.read_csv("data/avocado.csv", usecols=cols[1:])
+avocados = pd.read_csv(
+    "data/avocado.csv").rename(columns={"Unnamed: 0": 'Week'})
 avocados.describe(include="all")
 
-float_cols = ['AveragePrice','Total Volume','4046','4225','4770','Total Bags','Small Bags','Large Bags','XLarge Bags']
+# * columns containing float values to
+float_cols = ['AveragePrice', 'Total Volume', '4046', '4225',
+              '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
+
+avocados.loc[:, float_cols] = StandardScaler(
+).fit_transform(avocados.loc[:, float_cols])
+
+enc = OneHotEncoder(handle_unknown='ignore')
+encoded_region = enc.fit_transform(
+    avocados['region'].to_numpy().reshape(-1, 1)).toarray()
+encoded_region_frame = pd.DataFrame(
+    encoded_region, columns=enc.get_feature_names_out())
+encoded_types = enc.fit_transform(
+    avocados['type'].to_numpy().reshape(-1, 1)).toarray()
+encoded_types_frame = pd.DataFrame(
+    encoded_types, columns=enc.get_feature_names_out())
+avocados = pd.concat([avocados, encoded_types_frame, encoded_region_frame], axis=1).drop(
+    ['type', 'region', 'Date'], axis=1)
 
-avocados.loc[:, float_cols] = StandardScaler().fit_transform(avocados.loc[:, float_cols])
 print(avocados.head())
 
 # avocados.loc[:, float_cols] = MinMaxScaler().fit_transform(avocados.loc[:, float_cols])
 # print(avocados.head())
 
-avocado_train, avocado_test = train_test_split(avocados, test_size=2000, random_state=3337)
-avocado_train, avocado_valid = train_test_split(avocado_train, test_size=2249, random_state=3337)
+avocado_train, avocado_test = train_test_split(
+    avocados, test_size=2000, random_state=3337)
+avocado_train, avocado_valid = train_test_split(
+    avocado_train, test_size=2249, random_state=3337)
 
 print("Train\n", avocado_train.describe(include="all"), "\n")
 print("Valid\n", avocado_valid.describe(include="all"), "\n")