From 2d8f539aa5046b713dd32b9f4bb2e4478d741be2 Mon Sep 17 00:00:00 2001
From: Alagris
Date: Sun, 25 Apr 2021 20:55:45 +0200
Subject: [PATCH] cnn

---
 Dockerfile              |   2 +-
 create_dataset.py       |   5 +-
 train-model.Jenkinsfile |  21 +++++
 train_model.py          | 217 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 242 insertions(+), 3 deletions(-)
 create mode 100644 train-model.Jenkinsfile
 create mode 100644 train_model.py

diff --git a/Dockerfile b/Dockerfile
index 7c090ab..89e0244 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ ENV PYTHONIOENCODING=utf-8
 # Instalujemy niezbędne zależności. Zwróć uwagę na flagę "-y" (assume yes)
 RUN apt update && apt install -y python3 python3-pip git locales
 
-RUN pip3 install requests
+RUN pip3 install requests numpy matplotlib tqdm python-Levenshtein torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
 
 RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen en_US.UTF-8
 ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'
diff --git a/create_dataset.py b/create_dataset.py
index d71c3c6..7bd356d 100644
--- a/create_dataset.py
+++ b/create_dataset.py
@@ -13,14 +13,15 @@ with open(DATA_FILE) as f, open(PREPROCESSED, 'w+') as p:
     for line in f:
         text, phonemes = line.strip().split("\t")
         phonemes = phonemes.split(",")[0]
-        phonemes = '^' + re.sub(r'[/\'ˈˌ]', '', phonemes) + '$'
-        text = '^' + re.sub(r'[^a-z]', '', text.strip()) + '$'
+        phonemes = re.sub(r'[/\'ˈˌ]', '', phonemes)
+        text = re.sub(r'[^a-z]', '', text.strip())
         for letter in phonemes:
             out_alph.add(letter)
         for letter in text:
             in_alph.add(letter)
         p.write(text + '\t' + phonemes+'\n')
 
+
 print(in_alph)
 print(out_alph)
 with open('in_alphabet', 'w+') as p:
diff --git a/train-model.Jenkinsfile b/train-model.Jenkinsfile
new file mode 100644
index 0000000..e36ed8b
--- /dev/null
+++ b/train-model.Jenkinsfile
@@ -0,0 +1,21 @@
+pipeline {
+    agent {
+        dockerfile true
+    }
+
+    stages {
+        stage('Build') {
+            steps {
+                git 'https://git.wmi.amu.edu.pl/s434749/ium_434749.git'
+                copyArtifacts fingerprintArtifacts: true, projectName: 's434749-create-dataset', selector: lastSuccessful()
+                sh 'python3 train_model.py'
+            }
+
+            post {
+                success {
+                    archiveArtifacts 'cnn.pth'
+                }
+            }
+        }
+    }
+}
diff --git a/train_model.py b/train_model.py
new file mode 100644
index 0000000..62f2b4b
--- /dev/null
+++ b/train_model.py
@@ -0,0 +1,217 @@
+# https://arxiv.org/pdf/2001.11692.pdf
+
+
+import numpy as np
+import unicodedata
+from time import sleep
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+import matplotlib.pyplot as plt
+import re
+import random
+import os
+from tqdm import tqdm
+from Levenshtein import distance as levenshtein_distance
+
+DATA_FILE = 'preprocessed.tsv'
+EPOCHS = 14
+TEACHER_FORCING_PROBABILITY = 0.4
+LEARNING_RATE = 0.01
+BATCH_SIZE = 512
+
+DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+
+# Index 0 of both lookup tables is the empty string.
+OUT_LOOKUP = ['', 'b', 'a', 'ʊ', 't', 'k', 'ə', 'z', 'ɔ', 'ɹ', 's', 'j', 'u', 'm', 'f', 'ɪ', 'o', 'ɡ', 'ɛ', 'n',
+              'e', 'd',
+              'ɫ', 'w', 'i', 'p', 'ɑ', 'ɝ', 'θ', 'v', 'h', 'æ', 'ŋ', 'ʃ', 'ʒ', 'ð']
+
+IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+             'u', 'v', 'w', 'x', 'y', 'z']
+
+IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}
+
+OUT_ALPHABET = {letter: idx for idx, letter in enumerate(OUT_LOOKUP)}
+
+TOTAL_OUT_LEN = 0
+
+# (letter-index tensor, phoneme string) pairs read from DATA_FILE.
+DATA: [(torch.tensor, str)] = []
+
+TEXT: [str] = []
+
+# Length of the position axis of the one-hot input built by collate().
+MAX_LEN = 32
+
+with open(DATA_FILE) as f:
+    for line in f:
+        # Strip the line terminator so it does not end up in the phoneme string.
+        text, phonemes = line.rstrip("\n").split("\t")
+        TEXT.append(text)
+        assert len(text) <= MAX_LEN, text
+        text = torch.tensor([IN_ALPHABET[letter] for letter in text], dtype=torch.int)
+        DATA.append((text, phonemes))
+
+
+def collate(batch: [(torch.tensor, str)]):
+    """One-hot encode a list of (letter indices, phonemes) samples.
+
+    Returns a tensor of shape (len(batch), len(IN_ALPHABET), MAX_LEN) holding a 1
+    at [sample, letter index, position] for every letter (all other entries are
+    0), together with the phoneme strings in the same order.
+    """
+    batch_text = torch.zeros((len(batch), len(IN_ALPHABET), MAX_LEN))
+    batch_phonemes = [phonemes for _, phonemes in batch]
+    for i, (sample, _) in enumerate(batch):
+        for chr_pos, index in enumerate(sample):
+            batch_text[i, index, chr_pos] = 1
+    return batch_text, batch_phonemes
+
+
+class CNN(nn.Module):
+    """Stack of unpadded 1-D convolutions with ReLU, followed by a linear layer.
+
+    Each of the hidden_layers + 1 convolutions shortens the sequence by
+    kernel_size - 1, and the linear layer is sized for an input of length MAX_LEN.
+    """
+
+    def __init__(self, kernel_size, hidden_layers, channels, embedding_size):
+        super(CNN, self).__init__()
+        self.input_conv = nn.Conv1d(in_channels=len(IN_ALPHABET), out_channels=channels, kernel_size=kernel_size)
+        self.conv_hidden = nn.ModuleList(
+            [nn.Conv1d(in_channels=channels, out_channels=channels, kernel_size=kernel_size) for _ in
+             range(hidden_layers)])
+        # Flattened size after the last convolution: remaining length * channels.
+        self.last_layer_size = (MAX_LEN - (kernel_size - 1) * (hidden_layers + 1)) * channels
+        self.lin = nn.Linear(self.last_layer_size, embedding_size)
+
+    def forward(self, x):
+        """Embed a (batch, len(IN_ALPHABET), MAX_LEN) tensor as (batch, embedding_size)."""
+        x = self.input_conv(x)
+        x = F.relu(x, inplace=True)
+        for c in self.conv_hidden:
+            x = c(x)
+            x = F.relu(x, inplace=True)
+        x = x.view(x.size()[0], self.last_layer_size)
+        x = self.lin(x)
+        return x
+
+
+outer_bar = tqdm(total=EPOCHS, position=0)
+inner_bar = tqdm(total=len(DATA), position=1)
+
+
+def dist(a: [str], b: [str]):
+    """Return the Levenshtein distance of each pair (a[i], b[i]) as a float tensor on DEVICE."""
+    return torch.tensor([levenshtein_distance(x, y) for x, y in zip(a, b)], dtype=torch.float, device=DEVICE)
+
+
+def train_model(model):
+    """Train model so that embedding distances approximate phoneme edit distances.
+
+    Every batch of 3 * BATCH_SIZE samples is split into anchor, positive and
+    negative thirds. The loss sums the absolute errors between the three pairwise
+    embedding distances and the matching Levenshtein distances, plus a hinge term
+    that is non-zero when the estimated (positive - negative) gap exceeds the
+    actual one. The average loss of every epoch is plotted with matplotlib.
+    """
+    plt.ion()
+    optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
+                           lr=LEARNING_RATE)
+    loss_snapshots = []
+    outer_bar.reset()
+    outer_bar.set_description("Epochs")
+    data_loader = DataLoader(dataset=DATA, drop_last=True,
+                             batch_size=3 * BATCH_SIZE,
+                             collate_fn=collate,
+                             shuffle=True)
+    for _ in range(EPOCHS):
+        total_loss = 0
+        inner_bar.reset()
+
+        for batch_text, batch_phonemes in data_loader:
+            optimizer.zero_grad()
+            anchor, positive, negative = batch_text.to(DEVICE).split(BATCH_SIZE)
+            ph_anchor = batch_phonemes[:BATCH_SIZE]
+            ph_positive = batch_phonemes[BATCH_SIZE:2 * BATCH_SIZE]
+            ph_negative = batch_phonemes[2 * BATCH_SIZE:]
+            embedded_anchor = model(anchor)
+            embedded_positive = model(positive)
+            embedded_negative = model(negative)
+            estimated_pos_dist = torch.linalg.norm(embedded_anchor - embedded_positive, dim=1)
+            estimated_neg_dist = torch.linalg.norm(embedded_anchor - embedded_negative, dim=1)
+            estimated_pos_neg_dist = torch.linalg.norm(embedded_positive - embedded_negative, dim=1)
+            actual_pos_dist = dist(ph_anchor, ph_positive)
+            actual_neg_dist = dist(ph_anchor, ph_negative)
+            actual_pos_neg_dist = dist(ph_positive, ph_negative)
+            # Tensor.sum() reduces in one vectorized op instead of element-wise Python addition.
+            loss = (abs(estimated_neg_dist - actual_neg_dist)
+                    + abs(estimated_pos_dist - actual_pos_dist)
+                    + abs(estimated_pos_neg_dist - actual_pos_neg_dist)
+                    + (estimated_pos_dist - estimated_neg_dist - (actual_pos_dist - actual_neg_dist)).clip(min=0)).sum()
+            loss.backward()
+            optimizer.step()
+            inner_bar.update(3 * BATCH_SIZE)
+            loss_scalar = loss.item()
+            total_loss += loss_scalar
+            inner_bar.set_description("loss %.2f" % loss_scalar)
+        # Every triplet consumes three samples, hence the factor 3.
+        loss_snapshots.append(total_loss / len(DATA) * 3)
+        plt.clf()
+        plt.plot(loss_snapshots, label="Avg loss ")
+        plt.legend(loc="upper left")
+        plt.pause(interval=0.01)
+        outer_bar.set_description("Epochs")
+        outer_bar.update(1)
+    plt.ioff()
+
+
+def evaluate_monte_carlo(model, repeats):
+    """Print the mean absolute error between embedding and Levenshtein distances.
+
+    On each of the repeats passes the data is reshuffled, and every batch of
+    2 * BATCH_SIZE samples is split into two halves that are compared pairwise.
+    """
+    with torch.no_grad():
+        i = 0
+        # Accumulate a plain float so the result is defined even when no batch is produced.
+        diff = 0.0
+        outer_bar.reset(total=repeats)
+        outer_bar.set_description("Epochs")
+        for _ in range(repeats):
+            data_loader = DataLoader(dataset=DATA, drop_last=True,
+                                     batch_size=2 * BATCH_SIZE,
+                                     collate_fn=collate,
+                                     shuffle=True)
+            inner_bar.reset()
+            for batch_text, batch_phonemes in data_loader:
+                positive, negative = batch_text.to(DEVICE).split(BATCH_SIZE)
+                ph_positive = batch_phonemes[0:BATCH_SIZE]
+                ph_negative = batch_phonemes[BATCH_SIZE:]
+                embedded_positive = model(positive)
+                embedded_negative = model(negative)
+                estimated_dist = torch.linalg.norm(embedded_negative - embedded_positive, dim=1)
+                actual_dist = dist(ph_negative, ph_positive)
+                diff += abs(estimated_dist - actual_dist).sum().item()
+                i += BATCH_SIZE
+                inner_bar.update(2 * BATCH_SIZE)
+            outer_bar.update(1)
+    average = diff / i if i else float('nan')
+    print("Average estimation error " + str(average))
+
+
+cnn = CNN(kernel_size=3, hidden_layers=14, channels=MAX_LEN, embedding_size=MAX_LEN).to(DEVICE)
+# Reuse saved weights when cnn.pth exists; otherwise train and save them.
+if os.path.isfile('cnn.pth'):
+    cnn.load_state_dict(torch.load('cnn.pth', map_location=torch.device('cpu')))
+else:
+    train_model(cnn)
+    torch.save(cnn.state_dict(), 'cnn.pth')
+
+cnn.eval()
+print("Training finished! Starting evaluation!")
+evaluate_monte_carlo(cnn, 1)