2022-04-03 19:39:46 +02:00
|
|
|
import pandas as pd
|
|
|
|
from sklearn.model_selection import train_test_split
|
2022-04-23 16:47:43 +02:00
|
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
|
2022-04-03 19:39:46 +02:00
|
|
|
|
2022-04-03 20:17:21 +02:00
|
|
|
# Peek at the first data row only, just to collect the CSV's column names.
cols = pd.read_csv("data/avocado.csv", nrows=1).columns.tolist()
|
2022-04-03 19:39:46 +02:00
|
|
|
# print("###\n", cols, "\n###")
|
2022-04-23 16:47:43 +02:00
|
|
|
# Load the full data set and give the unnamed first CSV column the name 'Week'.
avocados = pd.read_csv("data/avocado.csv")
avocados = avocados.rename(columns={"Unnamed: 0": 'Week'})
print(avocados.describe(include="all"))

# Detach the target column: ``pop`` returns it and removes it from the frame,
# so ``avocados`` holds only the remaining columns from here on.
avg_prices = avocados.pop('AveragePrice')
|
|
|
|
|
2022-04-03 19:39:46 +02:00
|
|
|
|
2022-04-24 13:32:00 +02:00
|
|
|
# * Retrieve the target column
|
|
|
|
# y = avocados.AveragePrice
|
|
|
|
# avocados.drop(['AveragePrice'], axis=1, inplace=True)
|
|
|
|
|
|
|
|
# * columns containing numerical values for...
|
|
|
|
# ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
|
|
|
|
# fcols = (avocados.dtypes != 'object')
|
|
|
|
# float_cols = list(fcols[fcols].index)
|
|
|
|
# print("Numerical columns: ", float_cols)
|
|
|
|
# # * ...standardization
|
|
|
|
# avocados.loc[:, float_cols] = StandardScaler(
|
|
|
|
# ).fit_transform(avocados.loc[:, float_cols])
|
|
|
|
|
|
|
|
# * columns containing objects for...
|
|
|
|
# Boolean mask over the columns: True where the column dtype is 'object'.
obj_cols = avocados.dtypes == 'object'
# Names of the flagged columns, in frame order.
object_cols = [name for name, is_object in obj_cols.items() if is_object]
print("Object columns: ", object_cols)
|
|
|
|
# * ...OHE
|
|
|
|
# Encoder producing a dense array and ignoring categories unseen during fit.
# scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
# 1.4 removed the old name, so try the current keyword first and fall back to
# the legacy one on older releases (which reject the new keyword with a
# TypeError).
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
|
|
|
|
# encoded_region = enc.fit_transform(
|
|
|
|
# avocados['region'].to_numpy().reshape(-1, 1)).toarray()
|
|
|
|
# encoded_region_frame = pd.DataFrame(
|
|
|
|
# encoded_region, columns=enc.get_feature_names_out())
|
|
|
|
# encoded_types = enc.fit_transform(
|
|
|
|
# avocados['type'].to_numpy().reshape(-1, 1)).toarray()
|
|
|
|
# encoded_types_frame = pd.DataFrame(
|
|
|
|
# encoded_types, columns=enc.get_feature_names_out())
|
|
|
|
# One-hot encode every object column, reusing the source row index so the
# encoded frame lines up with ``avocados`` when the two are concatenated.
# NOTE(review): no column names are passed here, so the encoded columns carry
# pandas' default integer labels.
ohe_df = pd.DataFrame(
    enc.fit_transform(avocados[object_cols]),
    index=avocados.index,
)

# Swap the raw object columns for their encoded counterparts.
avocados = pd.concat([avocados.drop(columns=object_cols), ohe_df], axis=1)

# Remember the final feature column labels for the normalisation step.
all_cols = avocados.columns
print(all_cols)
|
|
|
|
# avocados = pd.concat([avocados, ohe_df], axis=1)
|
|
|
|
# * Time for normalization
|
|
|
|
# Min-max scale every feature column and rebuild a labelled frame from the
# resulting array.
# NOTE(review): the scaler is fitted on the complete data set, i.e. before the
# train/valid/test split performed further down.
mM = MinMaxScaler()
scaled_features = pd.DataFrame(
    mM.fit_transform(avocados.values),
    columns=all_cols,
)

# Put the unscaled target back in front of the scaled features.
avocados_normed = pd.concat([avg_prices, scaled_features], axis=1)

print(avocados_normed.head())
|
2022-04-03 19:39:46 +02:00
|
|
|
|
|
|
|
# avocados.loc[:, float_cols] = MinMaxScaler().fit_transform(avocados.loc[:, float_cols])
|
|
|
|
# print(avocados.head())
|
|
|
|
|
2022-04-23 16:47:43 +02:00
|
|
|
# Two-stage split with a fixed seed: first set aside 2000 rows for testing,
# then carve 2249 validation rows out of what is left.
SPLIT_SEED = 3337
remainder, avocado_test = train_test_split(
    avocados_normed, test_size=2000, random_state=SPLIT_SEED)
avocado_train, avocado_valid = train_test_split(
    remainder, test_size=2249, random_state=SPLIT_SEED)
|
2022-04-03 19:39:46 +02:00
|
|
|
|
|
|
|
# Print summary statistics for each split; the train and valid reports are
# followed by an extra newline argument, the test report is not.
for heading, subset in (("Train\n", avocado_train), ("Valid\n", avocado_valid)):
    print(heading, subset.describe(include="all"), "\n")
print("Test\n", avocado_test.describe(include="all"))
|
|
|
|
|
2022-04-03 20:17:21 +02:00
|
|
|
# Write each split to its own CSV file, without the row index.
for target_path, subset in (
    ("data/avocado.data.train", avocado_train),
    ("data/avocado.data.valid", avocado_valid),
    ("data/avocado.data.test", avocado_test),
):
    subset.to_csv(target_path, index=False)
|