diff --git a/Dockerfile b/Dockerfile
index 02c0529..2a763ee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,11 +15,16 @@ ARG KAGGLE_KEY
 
 # Copy scripts to the catalog
 COPY ./scripts/. /
-# COPY ./kaggle.json /root/.kaggle/kaggle.json
+# * For local (Jenkins) processing
+# COPY ./kaggle.json /root/.kaggle/kaggle.json
 
 # Run the copied script
 RUN chmod +x /load_data.sh
 RUN /load_data.sh
 
 RUN chmod +x /grab_avocado.py
-RUN python3 /grab_avocado.py
\ No newline at end of file
+RUN python3 /grab_avocado.py
+
+# Run the model and train it
+RUN chmod +x /model.py
+RUN python3 /model.py
\ No newline at end of file
diff --git a/jenkins/Jenkinsfile_docker b/jenkins/Jenkinsfile_docker
deleted file mode 100644
index 6d43c77..0000000
--- a/jenkins/Jenkinsfile_docker
+++ /dev/null
@@ -1,46 +0,0 @@
-// pipeline {
-
-//     agent {
-//         dockerfile {
-//             additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} -t s478841-create-dataset"
-//         }
-//     }
-
-//     stages {
-//         stage('Simple data stats') {
-//             steps {
-//                 sh 'chmod u+x ./scripts/data_stats.sh'
-//                 sh """
-//                     docker run
-//                 """
-//                 sh './scripts/data_stats.sh'
-//             }
-//         }
-//     }
-
-//     post {
-//         always {
-//             archiveArtifacts artifacts: 'data/*',
-//                 onlyIfSuccessful: true
-//         }
-//     }
-// }
-
-node {
-    checkout scm
-
-    stage('Load Docker image & data') {
-        def dataImage = docker.build('s478841-image', "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} .")
-
-        dataImage.inside('--name kaggload -v $WORKSPACE:/data -u root') {
-            // sh 'chmod u+x ./scripts/data_stats.sh'
-            // sh './scripts/data_stats.sh'
-            sh 'cp /app/data/* /data/data'
-            sh 'echo Data loaded'
-        }
-    }
-
-    stage('Archive arifacts') {
-        archiveArtifacts artifacts: '*data/avocado.data*', onlyIfSuccessful: true
-    }
-}
diff --git a/jenkins/docker.Jenkinsfile b/jenkins/docker.Jenkinsfile
new file mode 100644
index 0000000..7d02f49
--- /dev/null
+++ b/jenkins/docker.Jenkinsfile
@@ -0,0 +1,18 @@
+node {
+    checkout scm
+
+    stage('Load Docker image & data') {
+        def dataImage = docker.build('s478841-image', "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} .")
+
+        dataImage.inside('--name kaggload -v $WORKSPACE:/data -u root') {
+            // sh 'chmod u+x ./scripts/data_stats.sh'
+            // sh './scripts/data_stats.sh'
+            sh 'cp /app/data/* /data/data'
+            sh 'echo Data loaded'
+        }
+    }
+
+    stage('Archive artifacts') {
+        archiveArtifacts artifacts: '*data/avocado.data*', onlyIfSuccessful: true
+    }
+}
diff --git a/jenkins/Jenkinstats_docker b/jenkins/dockerstats.Jenkinsfile
similarity index 100%
rename from jenkins/Jenkinstats_docker
rename to jenkins/dockerstats.Jenkinsfile
diff --git a/jenkins/Jenkinstats b/jenkins/stats.Jenkinsfile
similarity index 100%
rename from jenkins/Jenkinstats
rename to jenkins/stats.Jenkinsfile
diff --git a/requirements.txt b/requirements.txt
index cf7d49b..33909ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 kaggle
 pandas
-sklearn
\ No newline at end of file
+numpy
+sklearn
+torch
\ No newline at end of file
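Note on the build args: `KAGGLE_USERNAME` and `KAGGLE_KEY` are visible to `RUN` steps as environment variables, which is what lets `load_data.sh` authenticate against Kaggle. The script itself is not part of this diff; a minimal Python sketch of the equivalent download step, assuming the `kaggle` package from requirements.txt and the usual `neuromusic/avocado-prices` dataset slug (both assumptions on my part):

```python
# Hypothetical equivalent of load_data.sh (not shown in this diff):
# authenticate via KAGGLE_USERNAME / KAGGLE_KEY and pull the dataset into ./data.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()  # reads KAGGLE_USERNAME / KAGGLE_KEY (or ~/.kaggle/kaggle.json)
api.dataset_download_files("neuromusic/avocado-prices", path="data", unzip=True)
```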
diff --git a/scripts/grab_avocado.py b/scripts/grab_avocado.py
index 86d3cf9..342d036 100644
--- a/scripts/grab_avocado.py
+++ b/scripts/grab_avocado.py
@@ -1,22 +1,59 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
 
 cols = list(pd.read_csv("data/avocado.csv", nrows=1))
 # print("###\n", cols, "\n###")
-avocados = pd.read_csv("data/avocado.csv", usecols=cols[1:])
+avocados = pd.read_csv(
+    "data/avocado.csv").rename(columns={"Unnamed: 0": 'Week'})
 
 avocados.describe(include="all")
 
-float_cols = ['AveragePrice','Total Volume','4046','4225','4770','Total Bags','Small Bags','Large Bags','XLarge Bags']
+# * Retrieve the target column
+# y = avocados.AveragePrice
+# avocados.drop(['AveragePrice'], axis=1, inplace=True)
 
-avocados.loc[:, float_cols] = StandardScaler().fit_transform(avocados.loc[:, float_cols])
-print(avocados.head())
+# * columns containing numerical values for...
+# ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
+# fcols = (avocados.dtypes != 'object')
+# float_cols = list(fcols[fcols].index)
+# print("Numerical columns: ", float_cols)
+# # * ...standardization
+# avocados.loc[:, float_cols] = StandardScaler(
+# ).fit_transform(avocados.loc[:, float_cols])
+
+# * columns containing objects for...
+obj_cols = (avocados.dtypes == 'object')
+object_cols = list(obj_cols[obj_cols].index)
+print("Object columns: ", object_cols)
+# * ...OHE
+enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
+# encoded_region = enc.fit_transform(
+#     avocados['region'].to_numpy().reshape(-1, 1)).toarray()
+# encoded_region_frame = pd.DataFrame(
+#     encoded_region, columns=enc.get_feature_names_out())
+# encoded_types = enc.fit_transform(
+#     avocados['type'].to_numpy().reshape(-1, 1)).toarray()
+# encoded_types_frame = pd.DataFrame(
+#     encoded_types, columns=enc.get_feature_names_out())
+ohe_df = pd.DataFrame(enc.fit_transform(avocados[object_cols]))
+ohe_df.index = avocados.index
+avocados = pd.concat([avocados.drop(object_cols, axis=1), ohe_df], axis=1)
+all_cols = avocados.columns
+print(all_cols)
+# avocados = pd.concat([avocados, ohe_df], axis=1)
+# * Time for normalization
+mM = MinMaxScaler()
+avocados_normed = pd.DataFrame(mM.fit_transform(avocados.values), columns=all_cols)
+
+print(avocados_normed.head())
 
 # avocados.loc[:, float_cols] = MinMaxScaler().fit_transform(avocados.loc[:, float_cols])
 # print(avocados.head())
 
-avocado_train, avocado_test = train_test_split(avocados, test_size=2000, random_state=3337)
-avocado_train, avocado_valid = train_test_split(avocado_train, test_size=2249, random_state=3337)
+avocado_train, avocado_test = train_test_split(
+    avocados_normed, test_size=2000, random_state=3337)
+avocado_train, avocado_valid = train_test_split(
+    avocado_train, test_size=2249, random_state=3337)
 
 print("Train\n", avocado_train.describe(include="all"), "\n")
 print("Valid\n", avocado_valid.describe(include="all"), "\n")
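One quirk of the new encoding block above: `pd.DataFrame(enc.fit_transform(...))` leaves the one-hot columns with bare integer names. If readable names matter downstream, the same step can be written as follows (a sketch; `get_feature_names_out` needs scikit-learn ≥ 1.0, and newer releases spell the `sparse=` kwarg `sparse_output=`):

```python
# Sketch: the same OHE step as in the diff, but with readable column names.
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe_df = pd.DataFrame(
    enc.fit_transform(avocados[object_cols]),
    columns=enc.get_feature_names_out(object_cols),
    index=avocados.index,
)
```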
is: ", self.x_data.shape) + + def __len__(self): + return len(self.x_data) + + def __getitem__(self, idx): + return [self.x_data[idx], self.y[idx]] + + def get_shape(self): + return self.x_shape + + def get_splits(self, n_test=0.33): + test_size = round(n_test * len(self.x_data)) + train_size = len(self.x_data) - test_size + return t_u_data.random_split(self, [train_size, test_size]) + + +class AvocadoRegressor(nn.Module): + def __init__(self, input_dim): + super(AvocadoRegressor, self).__init__() + self.hidden1 = nn.Linear(input_dim, 32) + nn.init.xavier_uniform_(self.hidden1.weight) + self.act1 = nn.ReLU() + self.hidden2 = nn.Linear(32, 8) + nn.init.xavier_uniform_(self.hidden2.weight) + self.act2 = nn.ReLU() + self.hidden3 = nn.Linear(8, 1) + nn.init.xavier_uniform_(self.hidden3.weight) + + def forward(self, x): + x = self.hidden1(x) + x = self.act1(x) + x = self.hidden2(x) + x = self.act2(x) + x = self.hidden3(x) + return x + + +def prepare_data(path): + dataset = AvocadoDataset(path) + train, test = dataset.get_splits() + train_dl = t_u_data.DataLoader(train, batch_size=32, shuffle=True) + test_dl = t_u_data.DataLoader(test, batch_size=1024, shuffle=False) + return train_dl, test_dl + + +def train_model(train_dl, model, epochs=100): + criterion = nn.MSELoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + to_compare = None + + for epoch in range(epochs): + if epoch == 0: + print(f"Epoch: {epoch+1}") + if epoch > 0 and (epoch+1) % 10 == 0: + print( + f"Epoch: {epoch+1}\tloss\t→\t{mean_squared_error(to_compare[1].detach().numpy(), to_compare[0].detach().numpy())}") + for i, (inputs, targets) in enumerate(train_dl): + optimizer.zero_grad() + yhat = model(inputs) + # * For loss value inspection + to_compare = (yhat, targets) + loss = criterion(yhat, targets) + loss.backward() + optimizer.step() + + +def evaluate_model(test_dl, model): + predictions, actuals = list(), list() + for _, (inputs, targets) in enumerate(test_dl): + yhat = model(inputs) + # * retrieve numpy array + yhat = yhat.detach().numpy() + actual = targets.numpy() + actual = actual.reshape((len(actual), 1)) + # * store predictions + predictions.append(yhat) + actuals.append(actual) + predictions, actuals = np.vstack(predictions), np.vstack(actuals) + # * return MSE value + return mean_squared_error(actuals, predictions) + + +def predict(row, model): + row = row[0].flatten() + yhat = model(row) + yhat = yhat.detach().numpy() + return yhat + + +if __name__ == '__main__': + + # * Paths to data + avocado_train = './data/avocado.data.train' + avocado_valid = './data/avocado.data.valid' + avocado_test = './data/avocado.data.test' + + # * Data preparation + train_dl = t_u_data.DataLoader(AvocadoDataset( + avocado_train), batch_size=32, shuffle=True) + validate_dl = t_u_data.DataLoader(AvocadoDataset( + avocado_valid), batch_size=128, shuffle=True) + test_dl = t_u_data.DataLoader(AvocadoDataset( + avocado_test), batch_size=1, shuffle=False) + print(f""" + Train set size: {len(train_dl.dataset)}, + Validate set size: {len(validate_dl.dataset)} + Test set size: {len(test_dl.dataset)} + """) + + # * Model definition + # ! 
+
+
+if __name__ == '__main__':
+
+    # * Paths to data
+    avocado_train = './data/avocado.data.train'
+    avocado_valid = './data/avocado.data.valid'
+    avocado_test = './data/avocado.data.test'
+
+    # * Data preparation
+    train_dl = t_u_data.DataLoader(AvocadoDataset(
+        avocado_train), batch_size=32, shuffle=True)
+    validate_dl = t_u_data.DataLoader(AvocadoDataset(
+        avocado_valid), batch_size=128, shuffle=True)
+    test_dl = t_u_data.DataLoader(AvocadoDataset(
+        avocado_test), batch_size=1, shuffle=False)
+    print(f"""
+    Train set size: {len(train_dl.dataset)},
+    Validate set size: {len(validate_dl.dataset)}
+    Test set size: {len(test_dl.dataset)}
+    """)
+
+    # * Model definition
+    # ! Input width is taken from the data itself: 235 features with the full
+    # ! one-hot encoding, 66 in case only region and type are used (among all
+    # ! the categorical values)
+    model = AvocadoRegressor(train_dl.dataset.get_shape()[1])
+
+    # * Train model
+    print("Let's start the training, mate!")
+    train_model(train_dl, model)
+
+    # * Evaluate model
+    mse = evaluate_model(validate_dl, model)
+    print(f"\nEvaluation\t→\tMSE: {mse}, RMSE: {np.sqrt(mse)}")
+
+    # * Prediction
+    predictions = [(predict(row, model)[0], row[1].item()) for row in test_dl]
+    preds_df = pd.DataFrame(predictions, columns=["Prediction", "Target"])
+    print("\nNow predictions - hey ho, let's go!\n", preds_df.head())
+    preds_df.to_csv("./data/predictions.csv", index=False)
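Since the Dockerfile now trains the model at image build time, the weights only exist inside that image layer. Nothing in this diff persists them; if that is wanted later, a state-dict round trip is the standard PyTorch pattern (a sketch with a hypothetical path):

```python
# Sketch: persist the trained weights and restore them elsewhere.
# The path and the 235-feature width are assumptions, not part of the diff.
import torch

torch.save(model.state_dict(), "./data/avocado_model.pt")

restored = AvocadoRegressor(235)  # must match the width used at training time
restored.load_state_dict(torch.load("./data/avocado_model.pt"))
restored.eval()  # switch off training-only behaviour before inference
```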
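And a quick way to sanity-check the `predictions.csv` artifact that `model.py` writes (a sketch; assumes the file produced by the run above):

```python
# Sketch: recompute RMSE from the saved predictions file.
import numpy as np
import pandas as pd

preds = pd.read_csv("./data/predictions.csv")
rmse = np.sqrt(((preds["Prediction"] - preds["Target"]) ** 2).mean())
print(f"Test RMSE: {rmse:.4f}")
```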