mlflow

2021-05-23 17:00:43 +02:00 · 2021-05-23 17:00:43 +02:00 · 099bfb8540
commit 099bfb8540
parent 3e0649786b
4 changed files with 111 additions and 38 deletions
--- a/2
+++ b/2
@ -5,7 +5,7 @@ ENV PYTHONIOENCODING=utf-8
 # Instalujemy niezbędne zależności. Zwróć uwagę na flagę "-y" (assume yes)
 RUN apt update && apt install -y python3 python3-pip git locales

-RUN pip3 install requests python-Levenshtein tqdm sacred pymongo
+RUN pip3 install requests python-Levenshtein tqdm sacred pymongo mlflow
 RUN pip3 install torch==1.8.1+cpu torchvision==0.9.1+cpu torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

 RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen en_US.UTF-8
--- a/24
+++ b/24
@ -0,0 +1,24 @@
+name: cnn
+
+# Environment definition shipped with this project (see conda.yaml).
+conda_env: conda.yaml
+# Can have a docker_env instead of a conda_env, e.g.
+# docker_env:
+#    image:  mlflow-docker-example
+
+# MLflow fills "{name}" placeholders in "command" with the parameter values;
+# a leading "$" is not part of the syntax and would be passed through literally.
+# NOTE(review): data_file and regularization are declared below but are not
+# forwarded to train_model.py by either command - confirm this is intended.
+entry_points:
+  main:
+    parameters:
+      data_file: path
+      regularization: {type: float, default: 0.1}
+      batch_size: {type: int, default: 32}
+      learning_rate: {type: float, default: 0.001}
+      epochs: {type: int, default: 2}
+    command: "python train_model.py with 'batch_size={batch_size}' 'learning_rate={learning_rate}' 'epochs={epochs}'"
+  # NOTE(review): this entry point currently runs the same command as "main".
+  validate:
+    parameters:
+      data_file: path
+      regularization: {type: float, default: 0.1}
+      batch_size: {type: int, default: 32}
+      learning_rate: {type: float, default: 0.001}
+      epochs: {type: int, default: 2}
+    command: "python train_model.py with 'batch_size={batch_size}' 'learning_rate={learning_rate}' 'epochs={epochs}'"
--- a/conda.yaml
+++ b/conda.yaml
@ -0,0 +1,16 @@
+name: cnn
+channels:
+  - defaults
+dependencies:
+  - python=3.6
+  - pip
+  - pip:
+    # The "+cpu" builds of torch/torchvision are served from the PyTorch wheel
+    # index (the same index the Dockerfile passes to pip with -f).
+    - --find-links https://download.pytorch.org/whl/torch_stable.html
+    - mlflow==1.17.0
+    - requests==2.25.1
+    - tqdm==4.59.0
+    - pymongo==3.11.3
+    - torch==1.8.1+cpu
+    - torchvision==0.9.1+cpu
+    - torchaudio==0.8.1
+    # pip requirement specifiers pin versions with "==", not "-".
+    - python-Levenshtein==0.12.2
+    - sacred==0.8.2
--- a/train_model.py
+++ b/train_model.py
@ -19,6 +19,16 @@ from tqdm import tqdm
 from Levenshtein import distance as levenshtein_distance
 from sacred import Experiment
 import traceback
+from mlflow import log_metric, log_param, log_artifacts
+import mlflow
+
+import logging
+
+logging.basicConfig(level=logging.WARN)
+logger = logging.getLogger(__name__)
+
+mlflow.set_tracking_uri("http://172.17.0.1:5000")
+mlflow.set_experiment("s434749")

 ex = Experiment("CNN")
 ex.observers.append(FileStorageObserver('sacred_file_observer'))
@ -56,6 +66,20 @@ def dist(a: [str], b: [str]):
    return torch.tensor([levenshtein_distance(a[i], b[i]) for i in range(len(a))], dtype=torch.float, device=device)


+def encode(batch: [(torch.tensor, str)], in_alphabet, max_len):
+    """One-hot encode a batch of index sequences.
+
+    Each batch item is a pair ``(indices, target)``. The result is a pair
+    ``(one_hot, targets)`` where ``one_hot`` is a zero-initialised tensor of
+    shape ``(len(batch), len(in_alphabet), max_len)`` with
+    ``one_hot[i, indices[p], p] == 1`` for every position ``p`` of item ``i``,
+    and ``targets`` is the list of second elements in batch order.
+    """
+    one_hot = torch.zeros((len(batch), len(in_alphabet), max_len))
+    targets = [pair[1] for pair in batch]
+    for row, (indices, _) in enumerate(batch):
+        for position, symbol in enumerate(indices):
+            # Mark which alphabet symbol occupies this character position.
+            one_hot[row, symbol, position] = 1
+    return one_hot, targets
+
+
+def encode_str(batch: [(str, str)], in_alphabet, max_len):
+    """One-hot encode a batch of raw ``(input_text, target_text)`` string pairs.
+
+    Every letter of ``input_text`` is looked up in ``in_alphabet`` (a mapping
+    from letter to index; a letter missing from it raises the mapping's lookup
+    error), the resulting index tensors are passed to ``encode`` together with
+    the untouched targets, and ``encode``'s ``(one_hot, targets)`` pair is
+    returned.
+    """
+    indexed = [(torch.tensor([in_alphabet[letter] for letter in in_str], dtype=torch.int), out_str)
+               for in_str, out_str in batch]
+    # encode() needs the alphabet and the maximum length to size its output tensor.
+    return encode(indexed, in_alphabet, max_len)
+
+
 def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_size):
    optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                           lr=learning_rate)
@ -65,12 +89,7 @@ def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_
    outer_bar.set_description("Epochs")

    def collate(batch: [(torch.tensor, str)]):
-        batch_text = torch.zeros((len(batch), len(in_alphabet), max_len))
-        batch_phonemes = list(map(lambda x: x[1], batch))
-        for i, (sample, _) in enumerate(batch):
-            for chr_pos, index in enumerate(sample):
-                batch_text[i, index, chr_pos] = 1
-        return batch_text, batch_phonemes
+        return encode(batch, in_alphabet, max_len)

    data_loader = DataLoader(dataset=data, drop_last=True,
                             batch_size=3 * batch_size,
@ -106,6 +125,7 @@ def train_model(model, learning_rate, in_alphabet, max_len, data, epochs, batch_
            total_loss += loss_scalar
            inner_bar.set_description("loss %.2f" % loss_scalar)
        ex.log_scalar("avg_loss", total_loss / len(data) * 3)
+        log_metric("avg_loss", total_loss / len(data) * 3)
        # print()
        # print("Total epoch loss:", total_loss)
        # print("Total epoch avg loss:", total_loss / TOTAL_TRAINING_OUT_LEN)
@ -126,12 +146,7 @@ def evaluate_monte_carlo(model, repeats, data, batch_size, in_alphabet, max_len)
        outer_bar.set_description("Epochs")

        def collate(batch: [(torch.tensor, str)]):
-            batch_text = torch.zeros((len(batch), len(in_alphabet), max_len))
-            batch_phonemes = list(map(lambda x: x[1], batch))
-            for i, (sample, _) in enumerate(batch):
-                for chr_pos, index in enumerate(sample):
-                    batch_text[i, index, chr_pos] = 1
-            return batch_text, batch_phonemes
+            return encode(batch, in_alphabet, max_len)

        for _ in range(repeats):
            data_loader = DataLoader(dataset=data, drop_last=True,
@ -177,38 +192,56 @@ def cfg():
                 'u', 'v', 'w', 'x', 'y', 'z']


+def signature(model, in_alphabet, max_len):
+    """Infer an MLflow model signature from a small mock batch.
+
+    Two fixed string pairs are encoded with ``encode_str`` and fed through
+    ``model``; the signature is inferred from the encoded input tensor (what
+    the model actually consumes) and the model output, both converted to
+    numpy arrays because ``infer_signature`` does not take torch tensors or
+    plain lists of strings.
+
+    NOTE(review): assumes ``model(...)`` returns a single tensor - confirm
+    against the CNN's ``forward``.
+    """
+    from mlflow.models.signature import infer_signature
+
+    mock_x = [('abc', 'xyz'), ('hey', 'man')]
+    mock_text, _ = encode_str(mock_x, in_alphabet, max_len)
+    # No gradients are needed for a shape/dtype probe; run on the module-level device.
+    with torch.no_grad():
+        mock_y = model(mock_text.to(device))
+    return infer_signature(mock_text.numpy(), mock_y.cpu().numpy())
+
@ex.automain
 def run(kernel_size, hidden_layers, data_file, epochs, teacher_forcing_probability, learning_rate, batch_size, max_len,
        total_out_len, model_file, out_lookup, in_lookup, mode):
+    """Sacred entry point: train or evaluate the CNN inside one MLflow run.
+
+    Logs the hyper-parameters to MLflow, reads ``data_file`` (one
+    ``text<TAB>phonemes`` pair per line; every text must be at most
+    ``max_len`` characters), then either loads weights from ``model_file``
+    when that file exists or, in ``mode == 'train'``, trains a new model,
+    saves it and logs it to Sacred and MLflow. If the file is missing and
+    the mode is not ``'train'`` the process exits with status 2. In
+    ``mode == 'eval'`` the model is evaluated with ``evaluate_monte_carlo``.
+
+    ``teacher_forcing_probability`` and ``total_out_len`` are accepted from
+    the Sacred config but are not used in this function body.
+    """
-    in_alphabet = {letter: idx for idx, letter in enumerate(in_lookup)}
+    with mlflow.start_run():
+        log_param("kernel_size", kernel_size)
+        log_param("hidden_layers", hidden_layers)
+        log_param("data_file", data_file)
+        log_param("epochs", epochs)
+        log_param("learning_rate", learning_rate)
+        log_param("batch_size", batch_size)
+        log_param("max_len", max_len)
+        in_alphabet = {letter: idx for idx, letter in enumerate(in_lookup)}

-    out_alphabet = {letter: idx for idx, letter in enumerate(out_lookup)}
+        out_alphabet = {letter: idx for idx, letter in enumerate(out_lookup)}

-    data: [(torch.tensor, torch.tensor)] = []
+        data: [(torch.tensor, torch.tensor)] = []

-    texts: [str] = []
+        texts: [str] = []

-    with open(data_file) as f:
-        for line in f:
-            text, phonemes = line.split("\t")
-            texts.append(text)
-            assert len(text) <= max_len, text
-            text = torch.tensor([in_alphabet[letter] for letter in text], dtype=torch.int)
-            data.append((text, phonemes))
+        with open(data_file) as f:
+            for line in f:
+                text, phonemes = line.split("\t")
+                texts.append(text)
+                assert len(text) <= max_len, text
+                text = torch.tensor([in_alphabet[letter] for letter in text], dtype=torch.int)
+                data.append((text, phonemes))

-    cnn = CNN(kernel_size=kernel_size, hidden_layers=hidden_layers, channels=max_len, embedding_size=max_len,
-              in_alphabet=in_alphabet, max_len=max_len).to(device)
-    if os.path.isfile(model_file):
-        cnn.load_state_dict(torch.load(model_file, map_location=torch.device('cpu')))
-    else:
-        if mode == 'train':
-            train_model(cnn, learning_rate, in_alphabet, max_len, data, epochs, batch_size)
-            torch.save(cnn.state_dict(), model_file)
-            ex.add_artifact(model_file)
+        cnn = CNN(kernel_size=kernel_size, hidden_layers=hidden_layers, channels=max_len, embedding_size=max_len,
+                  in_alphabet=in_alphabet, max_len=max_len).to(device)
+        if os.path.isfile(model_file):
+            cnn.load_state_dict(torch.load(model_file, map_location=torch.device('cpu')))
        else:
-            print(model_file + " missing!")
-            exit(2)
+            if mode == 'train':
+                train_model(cnn, learning_rate, in_alphabet, max_len, data, epochs, batch_size)
+                torch.save(cnn.state_dict(), model_file)
+                ex.add_artifact(model_file)

-    if mode == 'eval':
-        cnn.eval()
-        evaluate_monte_carlo(cnn, 1, data, batch_size, in_alphabet, max_len)
+                mlflow.pytorch.log_model(cnn, "cnn-model", registered_model_name="PhoneticEdDistEmbeddings",
+                                         signature=signature(cnn,in_alphabet, max_len))
+                # model_file is a single file: log_artifact uploads one file,
+                # whereas log_artifacts expects a directory.
+                mlflow.log_artifact(model_file)
+            else:
+                print(model_file + " missing!")
+                exit(2)
+
+        if mode == 'eval':
+            cnn.eval()
+            evaluate_monte_carlo(cnn, 1, data, batch_size, in_alphabet, max_len)