Merge branch 'develop'
commit 40b02d2187
Dockerfile
@@ -15,11 +15,16 @@ ARG KAGGLE_KEY

# Copy scripts to the root directory
COPY ./scripts/. /
# * For local (Jenkins) processing
# COPY ./kaggle.json /root/.kaggle/kaggle.json

# Run the copied scripts
RUN chmod +x /load_data.sh
RUN /load_data.sh

RUN chmod +x /grab_avocado.py
RUN python3 /grab_avocado.py

# Run the model and train it
RUN chmod +x /model.py
RUN python3 /model.py
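For context: /load_data.sh is presumably what pulls the raw Kaggle dataset, authenticated through the KAGGLE_USERNAME/KAGGLE_KEY build args above. A minimal Python sketch of that step; the dataset slug is an assumption, not confirmed by this diff:

# Hypothetical sketch of the download performed by /load_data.sh.
# The kaggle package authenticates on import, reading the KAGGLE_USERNAME /
# KAGGLE_KEY environment variables supplied as build args.
import kaggle

kaggle.api.dataset_download_files(
    'neuromusic/avocado-prices',  # assumed dataset slug
    path='data', unzip=True)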
Jenkinsfile (deleted)
@@ -1,46 +0,0 @@
// pipeline {
//     agent {
//         dockerfile {
//             additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} -t s478841-create-dataset"
//         }
//     }

//     stages {
//         stage('Simple data stats') {
//             steps {
//                 sh 'chmod u+x ./scripts/data_stats.sh'
//                 sh """
//                     docker run
//                 """
//                 sh './scripts/data_stats.sh'
//             }
//         }
//     }

//     post {
//         always {
//             archiveArtifacts artifacts: 'data/*',
//                 onlyIfSuccessful: true
//         }
//     }
// }

node {
    checkout scm

    stage('Load Docker image & data') {
        def dataImage = docker.build('s478841-image', "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} .")

        dataImage.inside('--name kaggload -v $WORKSPACE:/data -u root') {
            // sh 'chmod u+x ./scripts/data_stats.sh'
            // sh './scripts/data_stats.sh'
            sh 'cp /app/data/* /data/data'
            sh 'echo Data loaded'
        }
    }

    stage('Archive artifacts') {
        archiveArtifacts artifacts: '*data/avocado.data*', onlyIfSuccessful: true
    }
}
jenkins/docker.Jenkinsfile (new file, 18 lines)
@@ -0,0 +1,18 @@
node {
    checkout scm

    stage('Load Docker image & data') {
        def dataImage = docker.build('s478841-image', "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY} .")

        dataImage.inside('--name kaggload -v $WORKSPACE:/data -u root') {
            // sh 'chmod u+x ./scripts/data_stats.sh'
            // sh './scripts/data_stats.sh'
            sh 'cp /app/data/* /data/data'
            sh 'echo Data loaded'
        }
    }

    stage('Archive artifacts') {
        archiveArtifacts artifacts: '*data/avocado.data*', onlyIfSuccessful: true
    }
}
requirements.txt
@@ -1,3 +1,5 @@
kaggle
pandas
numpy
sklearn
torch
scripts/grab_avocado.py
@@ -1,22 +1,59 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

cols = list(pd.read_csv("data/avocado.csv", nrows=1))
# print("###\n", cols, "\n###")
avocados = pd.read_csv("data/avocado.csv", usecols=cols[1:])
avocados = pd.read_csv(
    "data/avocado.csv").rename(columns={"Unnamed: 0": 'Week'})
avocados.describe(include="all")

float_cols = ['AveragePrice', 'Total Volume', '4046', '4225', '4770',
              'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
# * Retrieve the target column
# y = avocados.AveragePrice
# avocados.drop(['AveragePrice'], axis=1, inplace=True)

avocados.loc[:, float_cols] = StandardScaler().fit_transform(avocados.loc[:, float_cols])
print(avocados.head())
# * columns containing numerical values for...
# ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
# fcols = (avocados.dtypes != 'object')
# float_cols = list(fcols[fcols].index)
# print("Numerical columns: ", float_cols)
# # * ...standardization
# avocados.loc[:, float_cols] = StandardScaler(
# ).fit_transform(avocados.loc[:, float_cols])

# * columns containing objects for...
obj_cols = (avocados.dtypes == 'object')
object_cols = list(obj_cols[obj_cols].index)
print("Object columns: ", object_cols)
# * ...OHE
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
# encoded_region = enc.fit_transform(
#     avocados['region'].to_numpy().reshape(-1, 1)).toarray()
# encoded_region_frame = pd.DataFrame(
#     encoded_region, columns=enc.get_feature_names_out())
# encoded_types = enc.fit_transform(
#     avocados['type'].to_numpy().reshape(-1, 1)).toarray()
# encoded_types_frame = pd.DataFrame(
#     encoded_types, columns=enc.get_feature_names_out())
ohe_df = pd.DataFrame(enc.fit_transform(avocados[object_cols]))
ohe_df.index = avocados.index
avocados = pd.concat([avocados.drop(object_cols, axis=1), ohe_df], axis=1)
all_cols = avocados.columns
print(all_cols)
# avocados = pd.concat([avocados, ohe_df], axis=1)
# * Time for normalization
mM = MinMaxScaler()
avocados_normed = pd.DataFrame(mM.fit_transform(avocados.values), columns=all_cols)

print(avocados_normed.head())

# avocados.loc[:, float_cols] = MinMaxScaler().fit_transform(avocados.loc[:, float_cols])
# print(avocados.head())

avocado_train, avocado_test = train_test_split(avocados, test_size=2000, random_state=3337)
avocado_train, avocado_valid = train_test_split(avocado_train, test_size=2249, random_state=3337)
avocado_train, avocado_test = train_test_split(
    avocados_normed, test_size=2000, random_state=3337)
avocado_train, avocado_valid = train_test_split(
    avocado_train, test_size=2249, random_state=3337)

print("Train\n", avocado_train.describe(include="all"), "\n")
print("Valid\n", avocado_valid.describe(include="all"), "\n")
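This hunk ends before the splits are written to disk, but scripts/model.py below reads ./data/avocado.data.train, .valid, and .test, so the save step at the end of the script is assumed to look roughly like this sketch:

# Assumed save step (not shown in this hunk): persist the splits under the
# filenames that scripts/model.py reads back.
avocado_train.to_csv('./data/avocado.data.train', index=False)
avocado_valid.to_csv('./data/avocado.data.valid', index=False)
avocado_test.to_csv('./data/avocado.data.test', index=False)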
scripts/model.py (new file, 148 lines)
@@ -0,0 +1,148 @@
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

import torch
from torch import nn
from torch.utils import data as t_u_data

print(
    f"PyTorch working?\t →\t{torch.__version__}\nLooks like potato... but seems to be fine")

# * Customized Dataset class (base provided by PyTorch)
class AvocadoDataset(t_u_data.Dataset):
    def __init__(self, path: str, target: str = 'AveragePrice'):
        data = pd.read_csv(path)
        # assumes column 1 of the saved splits holds the target ('AveragePrice')
        self.y = data.values[:, 1].astype('float32')
        self.y = self.y.reshape((len(self.y), 1))
        self.x_shape = data.drop([target], axis=1).shape
        self.x_data = data.drop(
            [target], axis=1).values.astype('float32')
        # print("Data shape is: ", self.x_data.shape)

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):
        return [self.x_data[idx], self.y[idx]]

    def get_shape(self):
        return self.x_shape

    def get_splits(self, n_test=0.33):
        test_size = round(n_test * len(self.x_data))
        train_size = len(self.x_data) - test_size
        return t_u_data.random_split(self, [train_size, test_size])

class AvocadoRegressor(nn.Module):
    def __init__(self, input_dim):
        super(AvocadoRegressor, self).__init__()
        self.hidden1 = nn.Linear(input_dim, 32)
        nn.init.xavier_uniform_(self.hidden1.weight)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(32, 8)
        nn.init.xavier_uniform_(self.hidden2.weight)
        self.act2 = nn.ReLU()
        self.hidden3 = nn.Linear(8, 1)
        nn.init.xavier_uniform_(self.hidden3.weight)

    def forward(self, x):
        x = self.hidden1(x)
        x = self.act1(x)
        x = self.hidden2(x)
        x = self.act2(x)
        x = self.hidden3(x)
        return x


def prepare_data(path):
    dataset = AvocadoDataset(path)
    train, test = dataset.get_splits()
    train_dl = t_u_data.DataLoader(train, batch_size=32, shuffle=True)
    test_dl = t_u_data.DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl

def train_model(train_dl, model, epochs=100):
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    to_compare = None

    for epoch in range(epochs):
        if epoch == 0:
            print(f"Epoch: {epoch+1}")
        if epoch > 0 and (epoch+1) % 10 == 0:
            print(
                f"Epoch: {epoch+1}\tloss\t→\t{mean_squared_error(to_compare[1].detach().numpy(), to_compare[0].detach().numpy())}")
        for i, (inputs, targets) in enumerate(train_dl):
            optimizer.zero_grad()
            yhat = model(inputs)
            # * For loss value inspection
            to_compare = (yhat, targets)
            loss = criterion(yhat, targets)
            loss.backward()
            optimizer.step()

def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for _, (inputs, targets) in enumerate(test_dl):
        yhat = model(inputs)
        # * retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # * store predictions
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = np.vstack(predictions), np.vstack(actuals)
    # * return MSE value
    return mean_squared_error(actuals, predictions)

def predict(row, model):
    row = row[0].flatten()
    yhat = model(row)
    yhat = yhat.detach().numpy()
    return yhat

if __name__ == '__main__':

    # * Paths to data
    avocado_train = './data/avocado.data.train'
    avocado_valid = './data/avocado.data.valid'
    avocado_test = './data/avocado.data.test'

    # * Data preparation
    train_dl = t_u_data.DataLoader(AvocadoDataset(
        avocado_train), batch_size=32, shuffle=True)
    validate_dl = t_u_data.DataLoader(AvocadoDataset(
        avocado_valid), batch_size=128, shuffle=True)
    test_dl = t_u_data.DataLoader(AvocadoDataset(
        avocado_test), batch_size=1, shuffle=False)
    print(f"""
    Train set size: {len(train_dl.dataset)},
    Validate set size: {len(validate_dl.dataset)}
    Test set size: {len(test_dl.dataset)}
    """)

    # * Model definition
    # ! 66 - in case only regions and type are used (among all the categorical vals)
    model = AvocadoRegressor(235)

    # * Train model
    print("Let's start the training, mate!")
    train_model(train_dl, model)

    # * Evaluate model
    mse = evaluate_model(validate_dl, model)
    print(f"\nEvaluation\t→\tMSE: {mse}, RMSE: {np.sqrt(mse)}")

    # * Prediction
    predictions = [(predict(row, model)[0], row[1].item()) for row in test_dl]
    preds_df = pd.DataFrame(predictions, columns=["Prediction", "Target"])
    print("\nNow predictions - hey ho, let's go!\n", preds_df.head())
    preds_df.to_csv("./data/predictions.csv", index=False)
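One possible refinement, using only the helpers defined in this file: derive the input width from the training split instead of hard-coding 235 (apparently the feature count after one-hot encoding). A minimal sketch:

# Sketch: infer input_dim from the data via AvocadoDataset.get_shape()
# rather than hard-coding the post-OHE feature count.
ds = AvocadoDataset('./data/avocado.data.train')
model = AvocadoRegressor(ds.get_shape()[1])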