From bd67201997d2529ccd255ff2a63ae0ac818d07b2 Mon Sep 17 00:00:00 2001 From: Aleksander Mendoza Date: Sun, 9 May 2021 18:35:29 +0200 Subject: [PATCH] sacred --- evaluation.Jenkinsfile | 4 +- train-model.Jenkinsfile | 4 +- train_model.py | 192 ++++++++++++++++++++++------------------ 3 files changed, 111 insertions(+), 89 deletions(-) diff --git a/evaluation.Jenkinsfile b/evaluation.Jenkinsfile index a3bb860..90eb6c6 100644 --- a/evaluation.Jenkinsfile +++ b/evaluation.Jenkinsfile @@ -8,7 +8,7 @@ pipeline { steps { git 'https://git.wmi.amu.edu.pl/s434749/ium_434749.git' copyArtifacts fingerprintArtifacts: true, projectName: 's434749-training', selector: lastSuccessful() - sh 'python3 train_model.py eval' + sh 'python3 train_model.py with "mode=eval"' script{ def results = readFile "${env.WORKSPACE}/results.txt" } @@ -17,7 +17,7 @@ pipeline { post { success { emailext body: 'Evaluation of CNN for english phonetic embeddings has finished successfully!\n'+results, subject: 's434749 evaluation finished', to: '26ab8f35.uam.onmicrosoft.com@emea.teams.ms' - archiveArtifacts 'results.txt' + archiveArtifacts 'results.txt, sacred_file_observer' } } } diff --git a/train-model.Jenkinsfile b/train-model.Jenkinsfile index c52601f..155e7d0 100644 --- a/train-model.Jenkinsfile +++ b/train-model.Jenkinsfile @@ -8,13 +8,13 @@ pipeline { steps { git 'https://git.wmi.amu.edu.pl/s434749/ium_434749.git' copyArtifacts fingerprintArtifacts: true, projectName: 's434749-create-dataset', selector: lastSuccessful() - sh 'python3 train_model.py train' + sh 'python3 train_model.py' } post { success { emailext body: 'Training of CNN for english phonetic embeddings has finished successfully', subject: 's434749 training finished', to: '26ab8f35.uam.onmicrosoft.com@emea.teams.ms' - archiveArtifacts 'cnn.pth' + archiveArtifacts 'cnn.pth,sacred_file_observer' } } } diff --git a/train_model.py b/train_model.py index 52e847d..1bb0c99 100644 --- a/train_model.py +++ b/train_model.py @@ -9,6 +9,7 @@ import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim +from sacred.observers import FileStorageObserver, MongoObserver from torch.utils.data import Dataset, DataLoader import re import random @@ -16,60 +17,22 @@ import os import sys from tqdm import tqdm from Levenshtein import distance as levenshtein_distance +from sacred import Experiment -DATA_FILE = 'preprocessed.tsv' -EPOCHS = 14 -TEACHER_FORCING_PROBABILITY = 0.4 -LEARNING_RATE = 0.01 -BATCH_SIZE = 512 - -DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') - -OUT_LOOKUP = ['', 'b', 'a', 'ʊ', 't', 'k', 'ə', 'z', 'ɔ', 'ɹ', 's', 'j', 'u', 'm', 'f', 'ɪ', 'o', 'ɡ', 'ɛ', 'n', - 'e', 'd', - 'ɫ', 'w', 'i', 'p', 'ɑ', 'ɝ', 'θ', 'v', 'h', 'æ', 'ŋ', 'ʃ', 'ʒ', 'ð'] - -IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', - 'u', 'v', 'w', 'x', 'y', 'z'] - -IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)} - -OUT_ALPHABET = {letter: idx for idx, letter in enumerate(OUT_LOOKUP)} - -TOTAL_OUT_LEN = 0 - -DATA: [(torch.tensor, torch.tensor)] = [] - -TEXT: [str] = [] - -MAX_LEN = 32 - -with open(DATA_FILE) as f: - for line in f: - text, phonemes = line.split("\t") - TEXT.append(text) - assert len(text) <= MAX_LEN, text - text = torch.tensor([IN_ALPHABET[letter] for letter in text], dtype=torch.int) - DATA.append((text, phonemes)) - - -def collate(batch: [(torch.tensor, str)]): - batch_text = torch.zeros((len(batch), len(IN_ALPHABET), MAX_LEN)) - batch_phonemes = list(map(lambda x: x[1], batch)) - for i, (sample, _) in enumerate(batch): - for chr_pos, index in enumerate(sample): - batch_text[i, index, chr_pos] = 1 - return batch_text, batch_phonemes - +ex = Experiment("CNN") +ex.observers.append(FileStorageObserver('sacred_file_observer')) +ex.observers.append(MongoObserver(url='mongodb://mongo_user:mongo_password_IUM_2021@localhost:27017', + db_name='sacred')) +device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') class CNN(nn.Module): - def __init__(self, kernel_size, hidden_layers, channels, embedding_size): + def __init__(self, kernel_size, hidden_layers, channels, embedding_size, in_alphabet, max_len): super(CNN, self).__init__() - self.input_conv = nn.Conv1d(in_channels=len(IN_ALPHABET), out_channels=channels, kernel_size=kernel_size) + self.input_conv = nn.Conv1d(in_channels=len(in_alphabet), out_channels=channels, kernel_size=kernel_size) self.conv_hidden = nn.ModuleList( [nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=kernel_size) for _ in range(hidden_layers)]) - self.last_layer_size = (MAX_LEN - (kernel_size - 1) * (hidden_layers + 1)) * channels + self.last_layer_size = (max_len - (kernel_size - 1) * (hidden_layers + 1)) * channels self.lin = nn.Linear(self.last_layer_size, embedding_size) def forward(self, x): @@ -83,34 +46,40 @@ class CNN(nn.Module): return x -outer_bar = tqdm(total=EPOCHS, position=0) -inner_bar = tqdm(total=len(DATA), position=1) - - def dist(a: [str], b: [str]): - return torch.tensor([levenshtein_distance(a[i], b[i]) for i in range(len(a))], dtype=torch.float, device=DEVICE) + return torch.tensor([levenshtein_distance(a[i], b[i]) for i in range(len(a))], dtype=torch.float, device=device) -def train_model(model): +def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_size): optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), - lr=LEARNING_RATE) - loss_snapshots = [] + lr=learning_rate) + outer_bar = tqdm(total=epochs, position=0) + inner_bar = tqdm(total=len(data), position=1) outer_bar.reset() outer_bar.set_description("Epochs") - data_loader = DataLoader(dataset=DATA, drop_last=True, - batch_size=3 * BATCH_SIZE, + + def collate(batch: [(torch.tensor, str)]): + batch_text = torch.zeros((len(batch), len(in_alphabet), max_len)) + batch_phonemes = list(map(lambda x: x[1], batch)) + for i, (sample, _) in enumerate(batch): + for chr_pos, index in enumerate(sample): + batch_text[i, index, chr_pos] = 1 + return batch_text, batch_phonemes + + data_loader = DataLoader(dataset=data, drop_last=True, + batch_size=3 * batch_size, collate_fn=collate, shuffle=True) - for epoch in range(EPOCHS): + for epoch in range(epochs): total_loss = 0 inner_bar.reset() for batch_text, batch_phonemes in data_loader: optimizer.zero_grad() - anchor, positive, negative = batch_text.to(DEVICE).split(BATCH_SIZE) - ph_anchor = batch_phonemes[:BATCH_SIZE] - ph_positive = batch_phonemes[BATCH_SIZE:2 * BATCH_SIZE] - ph_negative = batch_phonemes[2 * BATCH_SIZE:] + anchor, positive, negative = batch_text.to(device).split(batch_size) + ph_anchor = batch_phonemes[:batch_size] + ph_positive = batch_phonemes[batch_size:2 * batch_size] + ph_negative = batch_phonemes[2 * batch_size:] embedded_anchor = model(anchor) embedded_positive = model(positive) embedded_negative = model(negative) @@ -126,11 +95,11 @@ def train_model(model): + (estimated_pos_dist - estimated_neg_dist - (actual_pos_dist - actual_neg_dist)).clip(min=0)) loss.backward() optimizer.step() - inner_bar.update(3 * BATCH_SIZE) + inner_bar.update(3 * batch_size) loss_scalar = loss.item() total_loss += loss_scalar inner_bar.set_description("loss %.2f" % loss_scalar) - loss_snapshots.append(total_loss / len(DATA) * 3) + ex.log_scalar("avg_loss", total_loss / len(data) * 3) # print() # print("Total epoch loss:", total_loss) # print("Total epoch avg loss:", total_loss / TOTAL_TRAINING_OUT_LEN) @@ -142,46 +111,99 @@ def train_model(model): outer_bar.update(1) -def evaluate_monte_carlo(model, repeats): +def evaluate_monte_carlo(model, repeats, data, batch_size, in_alphabet, max_len): with torch.no_grad(): i = 0 diff = 0 - outer_bar.reset(total=repeats) + outer_bar = tqdm(total=repeats, position=0) + inner_bar = tqdm(total=len(data), position=1) outer_bar.set_description("Epochs") + + def collate(batch: [(torch.tensor, str)]): + batch_text = torch.zeros((len(batch), len(in_alphabet), max_len)) + batch_phonemes = list(map(lambda x: x[1], batch)) + for i, (sample, _) in enumerate(batch): + for chr_pos, index in enumerate(sample): + batch_text[i, index, chr_pos] = 1 + return batch_text, batch_phonemes + for _ in range(repeats): - data_loader = DataLoader(dataset=DATA, drop_last=True, - batch_size=2 * BATCH_SIZE, + data_loader = DataLoader(dataset=data, drop_last=True, + batch_size=2 * batch_size, collate_fn=collate, shuffle=True) inner_bar.reset() for batch_text, batch_phonemes in data_loader: - positive, negative = batch_text.to(DEVICE).split(BATCH_SIZE) - ph_positive = batch_phonemes[0:BATCH_SIZE] - ph_negative = batch_phonemes[BATCH_SIZE:] + positive, negative = batch_text.to(device).split(batch_size) + ph_positive = batch_phonemes[0:batch_size] + ph_negative = batch_phonemes[batch_size:] embedded_positive = model(positive) embedded_negative = model(negative) estimated_dist = torch.linalg.norm(embedded_negative - embedded_positive, dim=1) actual_dist = dist(ph_negative, ph_positive) diff += sum(abs(estimated_dist - actual_dist)) - i += BATCH_SIZE - inner_bar.update(2 * BATCH_SIZE) + i += batch_size + inner_bar.update(2 * batch_size) outer_bar.update(1) with open('results.txt', 'w+') as r: print("Average estimation error " + str(diff.item() / i)) r.write("Average estimation error " + str(diff.item() / i) + "\n") + ex.log_scalar("avg_estim_error", diff.item() / i) -cnn = CNN(kernel_size=3, hidden_layers=14, channels=MAX_LEN, embedding_size=MAX_LEN).to(DEVICE) -if os.path.isfile('cnn.pth'): - cnn.load_state_dict(torch.load('cnn.pth', map_location=torch.device('cpu'))) -else: - if len(sys.argv) > 1 and sys.argv[1] == 'train': - train_model(cnn) - torch.save(cnn.state_dict(), 'cnn.pth') +@ex.config +def cfg(): + kernel_size = 3 + hidden_layers = 14 + data_file = 'preprocessed.tsv' + epochs = 14 + mode = 'train' + teacher_forcing_probability = 0.4 + learning_rate = 0.01 + batch_size = 512 + max_len = 32 + total_out_len = 0 + model_file = 'cnn.pth' + out_lookup = ['', 'b', 'a', 'ʊ', 't', 'k', 'ə', 'z', 'ɔ', 'ɹ', 's', 'j', 'u', 'm', 'f', 'ɪ', 'o', 'ɡ', 'ɛ', 'n', + 'e', 'd', + 'ɫ', 'w', 'i', 'p', 'ɑ', 'ɝ', 'θ', 'v', 'h', 'æ', 'ŋ', 'ʃ', 'ʒ', 'ð'] + in_lookup = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', + 'u', 'v', 'w', 'x', 'y', 'z'] + + +@ex.automain +def run(kernel_size, hidden_layers, data_file, epochs, teacher_forcing_probability, learning_rate, batch_size, max_len, + total_out_len, model_file, out_lookup, in_lookup, mode): + in_alphabet = {letter: idx for idx, letter in enumerate(in_lookup)} + + out_alphabet = {letter: idx for idx, letter in enumerate(out_lookup)} + + data: [(torch.tensor, torch.tensor)] = [] + + texts: [str] = [] + + with open(data_file) as f: + for line in f: + text, phonemes = line.split("\t") + texts.append(text) + assert len(text) <= max_len, text + text = torch.tensor([in_alphabet[letter] for letter in text], dtype=torch.int) + data.append((text, phonemes)) + + cnn = CNN(kernel_size=kernel_size, hidden_layers=hidden_layers, channels=max_len, embedding_size=max_len, + in_alphabet=in_alphabet, max_len=max_len).to(device) + if os.path.isfile(model_file): + cnn.load_state_dict(torch.load(model_file, map_location=torch.device('cpu'))) else: - print("cnn.pth missing!") - exit(2) + if mode == 'train': + train_model(cnn, learning_rate, in_alphabet, max_len, data, epochs, batch_size) + torch.save(cnn.state_dict(), model_file) + ex.add_artifact(model_file) + else: + print(model_file + " missing!") + exit(2) + + if mode == 'eval': + cnn.eval() + evaluate_monte_carlo(cnn, 1, data, batch_size, in_alphabet, max_len) -if len(sys.argv) > 1 and sys.argv[1] == 'eval': - cnn.eval() - evaluate_monte_carlo(cnn, 1)