From 06894754f85a7c5755fec00c4806a493d70d7757 Mon Sep 17 00:00:00 2001
From: Jan Nowak
Date: Sun, 16 May 2021 13:28:57 +0200
Subject: [PATCH] Added sacred.

---
 .gitignore       |   1 +
 Dockerfile       |   1 +
 dlgssdpytorch.py | 191 ++++++++++++++++++++++++++---------------------
 3 files changed, 108 insertions(+), 85 deletions(-)

diff --git a/.gitignore b/.gitignore
index 26b8055..5417086 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ venv
 metrics.tsv
 *.pt
 plot.png
+my_runs

diff --git a/Dockerfile b/Dockerfile
index b5b81d0..fc9e32e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,6 +12,7 @@ RUN chmod -R 777 /.kaggle
 COPY ./requirments.txt ./
 RUN pip3 install -r requirments.txt
 RUN pip3 install torch==1.8.1+cpu torchvision==0.9.1+cpu torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
+RUN pip3 install sacred
 
 # Create the /app directory in the container (if it does not exist) and switch to it (all subsequent RUN, CMD, ENTRYPOINT, COPY and ADD commands will be executed in it)
 WORKDIR /app

diff --git a/dlgssdpytorch.py b/dlgssdpytorch.py
index 67558ed..e7cb5a7 100644
--- a/dlgssdpytorch.py
+++ b/dlgssdpytorch.py
@@ -5,21 +5,12 @@ import torch.nn as nn
 import torch.optim as optim
 from torch.utils.data import Dataset, TensorDataset, DataLoader
 import argparse
+from sacred import Experiment
+from sacred.observers import MongoObserver, FileStorageObserver
 
-parser = argparse.ArgumentParser(description='Model training program')
-parser.add_argument('-l', '--lr', type=float, default=1e-3, help="Learning rate (lr)", required=False)
-parser.add_argument('-e', '--epochs', type=int, default=100, help="Number of epochs", required=False)
-args = parser.parse_args()
-
-lr = args.lr
-n_epochs = args.epochs
-
-train_dataset = torch.load('train_dataset.pt')
-#val_dataset = torch.load('val_dataset.pt')
-
-train_loader = DataLoader(dataset=train_dataset)
-#val_loader = DataLoader(dataset=val_dataset)
-
+ex = Experiment("426206", interactive=False, save_git_info=False)
+ex.observers.append(FileStorageObserver('my_runs'))
+ex.observers.append(MongoObserver(url='mongodb://mongo_user:mongo_password_IUM_2021@172.17.0.1:27017', db_name='sacred'))
 class LayerLinearRegression(nn.Module):
     def __init__(self):
         super().__init__()
@@ -29,81 +20,111 @@ class LayerLinearRegression(nn.Module):
     def forward(self, x):
         # Now it only takes a call to the layer to make predictions
         return self.linear(x)
-
-model = LayerLinearRegression()
-# Checks model's parameters
-#print(model.state_dict())
-
-loss_fn = nn.MSELoss(reduction='mean')
-optimizer = optim.SGD(model.parameters(), lr=lr)
+# parser = argparse.ArgumentParser(description='Model training program')
+# parser.add_argument('-l', '--lr', type=float, default=1e-3, help="Learning rate (lr)", required=False)
+# parser.add_argument('-e', '--epochs', type=int, default=100, help="Number of epochs", required=False)
+# args = parser.parse_args()
+# python3 dlgssdpytorch.py with lr=0.01 n_epochs=10
 
-def make_train_step(model, loss_fn, optimizer):
-    # Builds function that performs a step in the train loop
-    def train_step(x, y):
-        # Sets model to TRAIN mode
-        model.train()
-        # Makes predictions
-        yhat = model(x)
-        # Computes loss
-        loss = loss_fn(y, yhat)
-        # Computes gradients
-        loss.backward()
-        # Updates parameters and zeroes gradients
-        optimizer.step()
-        optimizer.zero_grad()
-        # Returns the loss
-        return loss.item()
-
-    # Returns the function that will be called inside the train loop
-    return train_step
+@ex.config
+def my_config():
+    lr = 1e-3
+    n_epochs = 100
 
-# Creates the train_step function for our model, loss function and optimizer
-train_step = make_train_step(model, loss_fn, optimizer)
-training_losses = []
-validation_losses = []
-#print(model.state_dict())
-# For each epoch...
-for epoch in range(n_epochs):
-    losses = []
-    # Uses loader to fetch one mini-batch for training
-    for x_batch, y_batch in train_loader:
-        # NOW, sends the mini-batch data to the device
-        # so it matches location of the MODEL
-        # x_batch = x_batch.to(device)
-        # y_batch = y_batch.to(device)
-        # One step of training
-        loss = train_step(x_batch, y_batch)
-        losses.append(loss)
-    training_loss = np.mean(losses)
-    training_losses.append(training_loss)
+@ex.capture
+def train(lr, n_epochs, _run):
+    train_dataset = torch.load('train_dataset.pt')
+    #val_dataset = torch.load('val_dataset.pt')
+
+    train_loader = DataLoader(dataset=train_dataset)
+    #val_loader = DataLoader(dataset=val_dataset)
+
+    model = LayerLinearRegression()
+    # Checks model's parameters
+    #print(model.state_dict())
+
+    loss_fn = nn.MSELoss(reduction='mean')
+    optimizer = optim.SGD(model.parameters(), lr=lr)
+
+    def make_train_step(model, loss_fn, optimizer):
+        # Builds function that performs a step in the train loop
+        def train_step(x, y):
+            # Sets model to TRAIN mode
+            model.train()
+            # Makes predictions
+            yhat = model(x)
+            # Computes loss
+            loss = loss_fn(y, yhat)
+            # Computes gradients
+            loss.backward()
+            # Updates parameters and zeroes gradients
+            optimizer.step()
+            optimizer.zero_grad()
+            # Returns the loss
+            return loss.item()
+
+        # Returns the function that will be called inside the train loop
+        return train_step
+
+    # Creates the train_step function for our model, loss function and optimizer
+    train_step = make_train_step(model, loss_fn, optimizer)
+    training_losses = []
+    validation_losses = []
+    #print(model.state_dict())
+    # For each epoch...
+    for epoch in range(n_epochs):
+
+        _run.log_scalar("Epoch", epoch)
+        losses = []
+        # Uses loader to fetch one mini-batch for training
+        for x_batch, y_batch in train_loader:
+            # NOW, sends the mini-batch data to the device
+            # so it matches location of the MODEL
+            # x_batch = x_batch.to(device)
+            # y_batch = y_batch.to(device)
+            # One step of training
+            loss = train_step(x_batch, y_batch)
+            losses.append(loss)
+        training_loss = np.mean(losses)
+        training_losses.append(training_loss)
+
+        _run.log_scalar("MSE", float(training_loss))
 
-    # After finishing training steps for all mini-batches,
-    # it is time for evaluation!
-    # Evaluation is no longer needed here, because it is done in evaluation.py. The per-epoch evaluation preview can be re-enabled, though.
-    # # We tell PyTorch to NOT use autograd...
-    # # Do you remember why?
-    # with torch.no_grad():
-    #     val_losses = []
-    #     # Uses loader to fetch one mini-batch for validation
-    #     for x_val, y_val in val_loader:
-    #         # Again, sends data to same device as model
-    #         # x_val = x_val.to(device)
-    #         # y_val = y_val.to(device)
-
-    #         model.eval()
-    #         # Makes predictions
-    #         yhat = model(x_val)
-    #         # Computes validation loss
-    #         val_loss = loss_fn(y_val, yhat)
-    #         val_losses.append(val_loss.item())
-    #     validation_loss = np.mean(val_losses)
-    #     validation_losses.append(validation_loss)
-
-    # print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
-    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t")
+        # After finishing training steps for all mini-batches,
+        # it is time for evaluation!
+        # Evaluation is no longer needed here, because it is done in evaluation.py. The per-epoch evaluation preview can be re-enabled, though.
+        # # We tell PyTorch to NOT use autograd...
+        # # Do you remember why?
+        # with torch.no_grad():
+        #     val_losses = []
+        #     # Uses loader to fetch one mini-batch for validation
+        #     for x_val, y_val in val_loader:
+        #         # Again, sends data to same device as model
+        #         # x_val = x_val.to(device)
+        #         # y_val = y_val.to(device)
+
+        #         model.eval()
+        #         # Makes predictions
+        #         yhat = model(x_val)
+        #         # Computes validation loss
+        #         val_loss = loss_fn(y_val, yhat)
+        #         val_losses.append(val_loss.item())
+        #     validation_loss = np.mean(val_losses)
+        #     validation_losses.append(validation_loss)
+
+        # print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")
+        print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t")
 
-torch.save({
-    'model_state_dict': model.state_dict(),
-    'optimizer_state_dict': optimizer.state_dict(),
-    'loss': lr,
-    }, 'model.pt')
+    torch.save({
+        'model_state_dict': model.state_dict(),
+        'optimizer_state_dict': optimizer.state_dict(),
+        'loss': lr,
+        }, 'model.pt')
+
+@ex.automain
+def my_main(lr, n_epochs, _run):
+    train(lr, n_epochs, _run)
+    # @ex.automain already runs the experiment when the script is executed,
+    # so no separate ex.run() call is needed; just attach the trained model as an artifact.
+    ex.add_artifact('model.pt')
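Usage note: with the Sacred setup above, the hyperparameters declared in my_config can be overridden from the command line with Sacred's `with` syntax, or the experiment can be started from Python via ex.run(config_updates=...). The sketch below is a minimal example, not a definitive workflow; the override values are illustrative, and it assumes the script is importable as dlgssdpytorch and that train_dataset.pt is present in the working directory.

    # Command-line run, as in the comment inside the patch:
    #   python3 dlgssdpytorch.py with lr=0.01 n_epochs=10

    # Programmatic run sketch:
    from dlgssdpytorch import ex

    # Override the config values from my_config for this run only.
    run = ex.run(config_updates={'lr': 0.01, 'n_epochs': 10})
    print(run.config['lr'], run.config['n_epochs'])

Each run is then written by the FileStorageObserver to my_runs/<run_id>/ (config.json, run.json, metrics.json with the logged "Epoch" and "MSE" series, and the model.pt artifact); the MongoObserver additionally stores the run in the 'sacred' database when the configured MongoDB instance is reachable.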