diff --git a/Dockerfile b/Dockerfile
index de96505..0c9ea54 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ ENV PYTHONIOENCODING=utf-8
 
 # Install the necessary dependencies. Note the "-y" flag (assume yes)
 RUN apt update && apt install -y python3 python3-pip git locales
-RUN pip3 install requests python-Levenshtein tqdm sacred pymongo
+RUN pip3 install requests python-Levenshtein tqdm sacred pymongo mlflow
 RUN pip3 install torch==1.8.1+cpu torchvision==0.9.1+cpu torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
 
 RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen en_US.UTF-8
diff --git a/MLProject b/MLProject
new file mode 100644
index 0000000..69cfb36
--- /dev/null
+++ b/MLProject
@@ -0,0 +1,24 @@
+name: cnn
+
+conda_env: conda.yaml
+# Can have a docker_env instead of a conda_env, e.g.
+# docker_env:
+#    image: mlflow-docker-example
+
+entry_points:
+  main:
+    parameters:
+      data_file: path
+      regularization: {type: float, default: 0.1}
+      batch_size: {type: int, default: 32}
+      learning_rate: {type: float, default: 0.001}
+      epochs: {type: int, default: 2}
+    command: "python train_model.py with 'batch_size={batch_size}' 'learning_rate={learning_rate}' 'epochs={epochs}'"
+  validate:
+    parameters:
+      data_file: path
+      regularization: {type: float, default: 0.1}
+      batch_size: {type: int, default: 32}
+      learning_rate: {type: float, default: 0.001}
+      epochs: {type: int, default: 2}
+    command: "python train_model.py with 'batch_size={batch_size}' 'learning_rate={learning_rate}' 'epochs={epochs}'"
diff --git a/conda.yaml b/conda.yaml
new file mode 100644
index 0000000..c37690a
--- /dev/null
+++ b/conda.yaml
@@ -0,0 +1,16 @@
+name: cnn
+channels:
+  - defaults
+dependencies:
+  - python=3.6
+  - pip
+  - pip:
+    - mlflow==1.17.0
+    - requests==2.25.1
+    - tqdm==4.59.0
+    - pymongo==3.11.3
+    - torch==1.8.1+cpu
+    - torchvision==0.9.1+cpu
+    - torchaudio==0.8.1
+    - python-Levenshtein==0.12.2
+    - sacred==0.8.2
\ No newline at end of file
diff --git a/train_model.py b/train_model.py
index 0b2b8bf..7e1097a 100644
--- a/train_model.py
+++ b/train_model.py
@@ -19,6 +19,16 @@ from tqdm import tqdm
 from Levenshtein import distance as levenshtein_distance
 from sacred import Experiment
 import traceback
+from mlflow import log_metric, log_param, log_artifact
+import mlflow
+
+import logging
+
+logging.basicConfig(level=logging.WARN)
+logger = logging.getLogger(__name__)
+
+mlflow.set_tracking_uri("http://172.17.0.1:5000")
+mlflow.set_experiment("s434749")
 
 ex = Experiment("CNN")
 ex.observers.append(FileStorageObserver('sacred_file_observer'))
@@ -56,6 +66,20 @@ def dist(a: [str], b: [str]):
     return torch.tensor([levenshtein_distance(a[i], b[i]) for i in range(len(a))], dtype=torch.float, device=device)
 
 
+def encode(batch: [(torch.tensor, str)], in_alphabet, max_len):
+    batch_text = torch.zeros((len(batch), len(in_alphabet), max_len))
+    batch_phonemes = list(map(lambda x: x[1], batch))
+    for i, (sample, _) in enumerate(batch):
+        for chr_pos, index in enumerate(sample):
+            batch_text[i, index, chr_pos] = 1
+    return batch_text, batch_phonemes
+
+
+def encode_str(batch: [(str, str)], in_alphabet, max_len):
+    batch = [(torch.tensor([in_alphabet[letter] for letter in in_str], dtype=torch.int), out_str) for in_str, out_str in batch]
+    return encode(batch, in_alphabet, max_len)
+
+
 def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_size):
     optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=learning_rate)
 
@@ -65,12 +89,7 @@ def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_
     outer_bar.set_description("Epochs")
 
     def collate(batch: [(torch.tensor, str)]):
-        batch_text = torch.zeros((len(batch), len(in_alphabet), max_len))
-        batch_phonemes = list(map(lambda x: x[1], batch))
-        for i, (sample, _) in enumerate(batch):
-            for chr_pos, index in enumerate(sample):
-                batch_text[i, index, chr_pos] = 1
-        return batch_text, batch_phonemes
+        return encode(batch, in_alphabet, max_len)
 
     data_loader = DataLoader(dataset=data, drop_last=True,
                              batch_size=3 * batch_size,
@@ -106,6 +125,7 @@ def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_
             total_loss += loss_scalar
             inner_bar.set_description("loss %.2f" % loss_scalar)
         ex.log_scalar("avg_loss", total_loss / len(data) * 3)
+        log_metric("avg_loss", total_loss / len(data) * 3)
         # print()
         # print("Total epoch loss:", total_loss)
         # print("Total epoch avg loss:", total_loss / TOTAL_TRAINING_OUT_LEN)
@@ -126,12 +146,7 @@ def evaluate_monte_carlo(model, repeats, data, batch_size, in_alphabet, max_len)
     outer_bar.set_description("Epochs")
 
     def collate(batch: [(torch.tensor, str)]):
-        batch_text = torch.zeros((len(batch), len(in_alphabet), max_len))
-        batch_phonemes = list(map(lambda x: x[1], batch))
-        for i, (sample, _) in enumerate(batch):
-            for chr_pos, index in enumerate(sample):
-                batch_text[i, index, chr_pos] = 1
-        return batch_text, batch_phonemes
+        return encode(batch, in_alphabet, max_len)
 
     for _ in range(repeats):
         data_loader = DataLoader(dataset=data, drop_last=True,
@@ -177,38 +192,56 @@ def cfg():
                    'u', 'v', 'w', 'x', 'y', 'z']
 
 
+def signature(model, in_alphabet, max_len):
+    mock_x = [('abc', 'xyz'), ('hey', 'man')]
+    mock_text, _ = encode_str(mock_x, in_alphabet, max_len)
+    mock_y = model(mock_text)
+    return mlflow.models.signature.infer_signature(mock_text.numpy(), mock_y.detach().numpy())
+
 @ex.automain
 def run(kernel_size, hidden_layers, data_file, epochs, teacher_forcing_probability, learning_rate, batch_size,
         max_len, total_out_len, model_file, out_lookup, in_lookup, mode):
-    in_alphabet = {letter: idx for idx, letter in enumerate(in_lookup)}
+    with mlflow.start_run():
+        log_param("kernel_size", kernel_size)
+        log_param("hidden_layers", hidden_layers)
+        log_param("data_file", data_file)
+        log_param("epochs", epochs)
+        log_param("learning_rate", learning_rate)
+        log_param("batch_size", batch_size)
+        log_param("max_len", max_len)
+        in_alphabet = {letter: idx for idx, letter in enumerate(in_lookup)}
 
-    out_alphabet = {letter: idx for idx, letter in enumerate(out_lookup)}
+        out_alphabet = {letter: idx for idx, letter in enumerate(out_lookup)}
 
-    data: [(torch.tensor, torch.tensor)] = []
+        data: [(torch.tensor, torch.tensor)] = []
 
-    texts: [str] = []
+        texts: [str] = []
 
-    with open(data_file) as f:
-        for line in f:
-            text, phonemes = line.split("\t")
-            texts.append(text)
-            assert len(text) <= max_len, text
-            text = torch.tensor([in_alphabet[letter] for letter in text], dtype=torch.int)
-            data.append((text, phonemes))
+        with open(data_file) as f:
+            for line in f:
+                text, phonemes = line.split("\t")
+                texts.append(text)
+                assert len(text) <= max_len, text
+                text = torch.tensor([in_alphabet[letter] for letter in text], dtype=torch.int)
+                data.append((text, phonemes))
 
-    cnn = CNN(kernel_size=kernel_size, hidden_layers=hidden_layers, channels=max_len, embedding_size=max_len,
-              in_alphabet=in_alphabet, max_len=max_len).to(device)
-    if os.path.isfile(model_file):
-        cnn.load_state_dict(torch.load(model_file, map_location=torch.device('cpu')))
-    else:
-        if mode == 'train':
-            train_model(cnn, learning_rate, in_alphabet, max_len, data, epochs, batch_size)
-            torch.save(cnn.state_dict(), model_file)
-            ex.add_artifact(model_file)
+        cnn = CNN(kernel_size=kernel_size, hidden_layers=hidden_layers, channels=max_len, embedding_size=max_len,
+                  in_alphabet=in_alphabet, max_len=max_len).to(device)
+        if os.path.isfile(model_file):
+            cnn.load_state_dict(torch.load(model_file, map_location=torch.device('cpu')))
         else:
-            print(model_file + " missing!")
-            exit(2)
+            if mode == 'train':
+                train_model(cnn, learning_rate, in_alphabet, max_len, data, epochs, batch_size)
+                torch.save(cnn.state_dict(), model_file)
+                ex.add_artifact(model_file)
 
-    if mode == 'eval':
-        cnn.eval()
-        evaluate_monte_carlo(cnn, 1, data, batch_size, in_alphabet, max_len)
+                mlflow.pytorch.log_model(cnn, "cnn-model", registered_model_name="PhoneticEdDistEmbeddings",
+                                         signature=signature(cnn, in_alphabet, max_len))
+                log_artifact(model_file)
+            else:
+                print(model_file + " missing!")
+                exit(2)
+
+        if mode == 'eval':
+            cnn.eval()
+            evaluate_monte_carlo(cnn, 1, data, batch_size, in_alphabet, max_len)
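For quick verification, the snippet below shows one way the new MLproject entry point and the registered model could be exercised from Python. It is a minimal sketch: the tracking URI matches the one hard-coded in train_model.py, while the data file name, parameter values, and model version are illustrative assumptions rather than part of this change.

    import mlflow

    # Point at the same tracking server the training script uses (assumed reachable).
    mlflow.set_tracking_uri("http://172.17.0.1:5000")

    # Launch the "main" entry point declared in MLProject; parameter values are examples only.
    submitted = mlflow.projects.run(
        uri=".",
        entry_point="main",
        parameters={"data_file": "train.tsv", "batch_size": 32,
                    "learning_rate": 0.001, "epochs": 2},
    )
    print("run id:", submitted.run_id)

    # Load the model registered via mlflow.pytorch.log_model (the version number is hypothetical).
    cnn = mlflow.pytorch.load_model("models:/PhoneticEdDistEmbeddings/1")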