From 715a0efe64924f24e2b916cb5e7a90e3fc12820c Mon Sep 17 00:00:00 2001 From: MatOgr Date: Sun, 24 Apr 2022 13:32:00 +0200 Subject: [PATCH] Data preprocessing change (OHE + MinMax) --- Dockerfile | 9 ++- requirements.txt | 4 +- scripts/grab_avocado.py | 54 ++++++++++----- scripts/model.py | 148 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+), 21 deletions(-) create mode 100644 scripts/model.py diff --git a/Dockerfile b/Dockerfile index 02c0529..2a763ee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,11 +15,16 @@ ARG KAGGLE_KEY # Copy scripts to the catalog COPY ./scripts/. / -# COPY ./kaggle.json /root/.kaggle/kaggle.json +# * For local (Jenkins) processing +# COPY ./kaggle.json /root/.kaggle/kaggle.json # Run the copied script RUN chmod +x /load_data.sh RUN /load_data.sh RUN chmod +x /grab_avocado.py -RUN python3 /grab_avocado.py \ No newline at end of file +RUN python3 /grab_avocado.py + +# Run the model and train it +RUN chmod +x /model.py +RUN python3 /model.py \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index cf7d49b..33909ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ kaggle pandas -sklearn \ No newline at end of file +numpy +sklearn +torch \ No newline at end of file diff --git a/scripts/grab_avocado.py b/scripts/grab_avocado.py index 7d3406e..342d036 100644 --- a/scripts/grab_avocado.py +++ b/scripts/grab_avocado.py @@ -8,32 +8,50 @@ avocados = pd.read_csv( "data/avocado.csv").rename(columns={"Unnamed: 0": 'Week'}) avocados.describe(include="all") -# * columns containing float values to -float_cols = ['AveragePrice', 'Total Volume', '4046', '4225', - '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags'] +# * Retrieve the target column +# y = avocados.AveragePrice +# avocados.drop(['AveragePrice'], axis=1, inplace=True) -avocados.loc[:, float_cols] = StandardScaler( -).fit_transform(avocados.loc[:, float_cols]) +# * columns containing numerical values for... 
+# ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags'] +# fcols = (avocados.dtypes != 'object') +# float_cols = list(fcols[fcols].index) +# print("Numerical columns: ", float_cols) +# # * ...standardization +# avocados.loc[:, float_cols] = StandardScaler( +# ).fit_transform(avocados.loc[:, float_cols]) -enc = OneHotEncoder(handle_unknown='ignore') -encoded_region = enc.fit_transform( - avocados['region'].to_numpy().reshape(-1, 1)).toarray() -encoded_region_frame = pd.DataFrame( - encoded_region, columns=enc.get_feature_names_out()) -encoded_types = enc.fit_transform( - avocados['type'].to_numpy().reshape(-1, 1)).toarray() -encoded_types_frame = pd.DataFrame( - encoded_types, columns=enc.get_feature_names_out()) -avocados = pd.concat([avocados, encoded_types_frame, encoded_region_frame], axis=1).drop( - ['type', 'region', 'Date'], axis=1) +# * columns containing objects for... +obj_cols = (avocados.dtypes == 'object') +object_cols = list(obj_cols[obj_cols].index) +print("Object columns: ", object_cols) +# * ...OHE +enc = OneHotEncoder(handle_unknown='ignore', sparse=False) +# encoded_region = enc.fit_transform( +# avocados['region'].to_numpy().reshape(-1, 1)).toarray() +# encoded_region_frame = pd.DataFrame( +# encoded_region, columns=enc.get_feature_names_out()) +# encoded_types = enc.fit_transform( +# avocados['type'].to_numpy().reshape(-1, 1)).toarray() +# encoded_types_frame = pd.DataFrame( +# encoded_types, columns=enc.get_feature_names_out()) +ohe_df = pd.DataFrame(enc.fit_transform(avocados[object_cols])) +ohe_df.index = avocados.index +avocados = pd.concat([avocados.drop(object_cols, axis=1), ohe_df], axis=1) +all_cols = avocados.columns +print(all_cols) +# avocados = pd.concat([avocados, ohe_df], axis=1) +# * Time for normalization +mM = MinMaxScaler() +avocados_normed = pd.DataFrame(mM.fit_transform(avocados.values), columns=all_cols) -print(avocados.head()) +print(avocados_normed.head()) # avocados.loc[:, 
float_cols] = MinMaxScaler().fit_transform(avocados.loc[:, float_cols]) # print(avocados.head()) avocado_train, avocado_test = train_test_split( - avocados, test_size=2000, random_state=3337) + avocados_normed, test_size=2000, random_state=3337) avocado_train, avocado_valid = train_test_split( avocado_train, test_size=2249, random_state=3337) diff --git a/scripts/model.py b/scripts/model.py new file mode 100644 index 0000000..cd2bd46 --- /dev/null +++ b/scripts/model.py @@ -0,0 +1,148 @@ +import pandas as pd +import numpy as np +from sklearn.metrics import mean_squared_error + +import torch +from torch import nn +from torch.utils import data as t_u_data + +print( + f"PyTorch working?\t →\t{torch.__version__}\nLooks like potatoe...but seems to be fine") + + +# * Customized Dataset class (base provided by PyTorch) +class AvocadoDataset(t_u_data.Dataset): + def __init__(self, path: str, target: str = 'AveragePrice'): + data = pd.read_csv(path) + self.y = data.values[:, 1].astype('float32') + self.y = self.y.reshape((len(self.y), 1)) + self.x_shape = data.drop([target], axis=1).shape + self.x_data = data.drop( + [target], axis=1).values.astype('float32') + # print("Data shape is: ", self.x_data.shape) + + def __len__(self): + return len(self.x_data) + + def __getitem__(self, idx): + return [self.x_data[idx], self.y[idx]] + + def get_shape(self): + return self.x_shape + + def get_splits(self, n_test=0.33): + test_size = round(n_test * len(self.x_data)) + train_size = len(self.x_data) - test_size + return t_u_data.random_split(self, [train_size, test_size]) + + +class AvocadoRegressor(nn.Module): + def __init__(self, input_dim): + super(AvocadoRegressor, self).__init__() + self.hidden1 = nn.Linear(input_dim, 32) + nn.init.xavier_uniform_(self.hidden1.weight) + self.act1 = nn.ReLU() + self.hidden2 = nn.Linear(32, 8) + nn.init.xavier_uniform_(self.hidden2.weight) + self.act2 = nn.ReLU() + self.hidden3 = nn.Linear(8, 1) + nn.init.xavier_uniform_(self.hidden3.weight) + + def 
forward(self, x): + x = self.hidden1(x) + x = self.act1(x) + x = self.hidden2(x) + x = self.act2(x) + x = self.hidden3(x) + return x + + +def prepare_data(path): + dataset = AvocadoDataset(path) + train, test = dataset.get_splits() + train_dl = t_u_data.DataLoader(train, batch_size=32, shuffle=True) + test_dl = t_u_data.DataLoader(test, batch_size=1024, shuffle=False) + return train_dl, test_dl + + +def train_model(train_dl, model, epochs=100): + criterion = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + to_compare = None + + for epoch in range(epochs): + if epoch == 0: + print(f"Epoch: {epoch+1}") + if epoch > 0 and (epoch+1) % 10 == 0: + print( + f"Epoch: {epoch+1}\tloss\t→\t{mean_squared_error(to_compare[1].detach().numpy(), to_compare[0].detach().numpy())}") + for i, (inputs, targets) in enumerate(train_dl): + optimizer.zero_grad() + yhat = model(inputs) + # * For loss value inspection + to_compare = (yhat, targets) + loss = criterion(yhat, targets) + loss.backward() + optimizer.step() + + +def evaluate_model(test_dl, model): + predictions, actuals = list(), list() + for _, (inputs, targets) in enumerate(test_dl): + yhat = model(inputs) + # * retrieve numpy array + yhat = yhat.detach().numpy() + actual = targets.numpy() + actual = actual.reshape((len(actual), 1)) + # * store predictions + predictions.append(yhat) + actuals.append(actual) + predictions, actuals = np.vstack(predictions), np.vstack(actuals) + # * return MSE value + return mean_squared_error(actuals, predictions) + + +def predict(row, model): + row = row[0].flatten() + yhat = model(row) + yhat = yhat.detach().numpy() + return yhat + + +if __name__ == '__main__': + + # * Paths to data + avocado_train = './data/avocado.data.train' + avocado_valid = './data/avocado.data.valid' + avocado_test = './data/avocado.data.test' + + # * Data preparation + train_dl = t_u_data.DataLoader(AvocadoDataset( + avocado_train), batch_size=32, shuffle=True) + validate_dl = 
t_u_data.DataLoader(AvocadoDataset( + avocado_valid), batch_size=128, shuffle=True) + test_dl = t_u_data.DataLoader(AvocadoDataset( + avocado_test), batch_size=1, shuffle=False) + print(f""" + Train set size: {len(train_dl.dataset)}, + Validate set size: {len(validate_dl.dataset)} + Test set size: {len(test_dl.dataset)} + """) + + # * Model definition + # ! 66 - in case only regions and type are used (among all the categorical vals) + model = AvocadoRegressor(235) + + # * Train model + print("Let's start the training, mate!") + train_model(train_dl, model) + + # * Evaluate model + mse = evaluate_model(validate_dl, model) + print(f"\nEvaluation\t→\tMSE: {mse}, RMSE: {np.sqrt(mse)}") + + # * Prediction + predictions = [(predict(row, model)[0], row[1].item()) for row in test_dl] + preds_df = pd.DataFrame(predictions, columns=["Prediction", "Target"]) + print("\nNow predictions - hey ho, let's go!\n", preds_df.head()) + preds_df.to_csv("./data/predictions.csv", index=False)