Initial MLFlow setup
Some checks failed
s444409-training/pipeline/head There was a failure building this commit

This commit is contained in:
emkarcinos 2022-05-09 11:20:41 +02:00 committed by Marcin Kostrzewski
parent b6f47e9fef
commit 66c7e1c583
3 changed files with 47 additions and 38 deletions

View File

@ -34,7 +34,7 @@ pipeline {
} }
stage('Train model') { stage('Train model') {
steps { steps {
sh "python train_model.py with 'epochs=${params.EPOCHS}' 'batch_size=${params.BATCHSIZE}'" sh "python train_model.py -e ${params.EPOCHS} -b ${params.BATCHSIZE}"
} }
} }
stage('Archive model and evaluate it') { stage('Archive model and evaluate it') {

View File

@ -5,3 +5,4 @@ numpy~=1.22.3
matplotlib==3.5.2 matplotlib==3.5.2
sacred==0.8.2 sacred==0.8.2
pymongo==4.1.1 pymongo==4.1.1
mlflow==1.25.1

View File

@ -1,12 +1,9 @@
import argparse
import numpy as np
import pandas as pd
import torch import torch
from sacred.observers import FileStorageObserver, MongoObserver import mlflow
import argparse
from torch import nn from torch import nn
from torch.utils.data import DataLoader, Dataset from torch.utils.data import DataLoader
from sacred import Experiment from urllib.parse import urlparse
from model import PlantsDataset, MLP, train, test from model import PlantsDataset, MLP, train, test
@ -15,8 +12,23 @@ default_epochs = 5
device = "cuda" if torch.cuda.is_available() else "cpu" device = "cuda" if torch.cuda.is_available() else "cpu"
mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("s444409")
def setup_args(argv=None):
    """Parse the training script's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the original zero-argument call did.

    Returns:
        An ``argparse.Namespace`` with integer attributes ``batchSize``
        and ``epochs``.
    """
    args_parser = argparse.ArgumentParser(prefix_chars='-')
    # The defaults are module-level names (default_batch_size,
    # default_epochs) that must be defined before this function is called.
    args_parser.add_argument(
        '-b', '--batchSize', type=int, default=default_batch_size,
        help='number of samples per training batch',
    )
    args_parser.add_argument(
        '-e', '--epochs', type=int, default=default_epochs,
        help='number of training epochs',
    )
    # Passing argv through lets callers and tests supply arguments directly
    # instead of mutating sys.argv.
    return args_parser.parse_args(argv)
if __name__ == "__main__":
args = setup_args()
batch_size = args.batchSize
epochs = args.epochs
def main(batch_size, epochs, _run):
print(f"Using {device} device") print(f"Using {device} device")
plant_test = PlantsDataset('data/Plant_1_Generation_Data.csv.test') plant_test = PlantsDataset('data/Plant_1_Generation_Data.csv.test')
@ -35,37 +47,33 @@ def main(batch_size, epochs, _run):
loss_fn = nn.MSELoss() loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for t in range(epochs):
print(f"Epoch {t + 1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
last_loss = test(test_dataloader, model, loss_fn)
_run.log_scalar('training.loss', last_loss, t)
print("Done!") print("Done!")
torch.save(model.state_dict(), './model_out') torch.save(model.state_dict(), './model_out')
print("Model saved in ./model_out file.") print("Model saved in ./model_out file.")
with mlflow.start_run() as run:
print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("epochs", epochs)
for t in range(epochs):
print(f"Epoch {t + 1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
last_loss = test(test_dataloader, model, loss_fn)
mlflow.log_metric("rmse", last_loss)
def setup_experiment(): with torch.no_grad():
ex = Experiment('Predict power output for a given time') preds = model(plant_test.x_train)
ex.observers.append(FileStorageObserver('sacred_runs')) signature = mlflow.models.signature.infer_signature(plant_test.x_train.numpy(), preds.numpy())
ex.observers.append(MongoObserver(url='mongodb://admin:IUM_2021@172.17.0.1:27017', tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
db_name='sacred')) if tracking_url_type_store != "file":
return ex mlflow.pytorch.log_model(
model,
"s444409-power-plant-model",
ex = setup_experiment() registered_model_name="s444409PowerPlant",
signature=signature
)
@ex.config else:
def experiment_config(): mlflow.pytorch.log_model(model, "s444409-power-plant-model", signature=signature)
batch_size = 64
epochs = 5
@ex.automain
def run(batch_size, epochs, _run):
main(batch_size, epochs, _run)
ex.add_artifact('model_out')