git force reset history
commit a72ad9e743
PH.py | 298 (new file)
@@ -0,0 +1,298 @@
import math
import unicodedata
from time import sleep
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import re
import random
import os
from tqdm import tqdm

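# Hyperparameters and data source: the en_US word list from the open-dict-data ipa-dict
# repository (one "word<TAB>IPA pronunciation(s)" entry per line), downloaded on first run.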
DATA_FILE = 'en_US.txt'
EPOCHS = 4000
TEACHER_FORCING_PROBABILITY = 0.4
LEARNING_RATE = 0.01
BATCH_SIZE = 1024

plt.ion()

if not os.path.isfile(DATA_FILE):
    import requests

    with open(DATA_FILE, 'wb') as f:
        f.write(requests.get('https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/' + DATA_FILE).content)

DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

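# Symbol inventories: IPA phonemes for the output side, lowercase Latin letters for the input
# side. Index 0 ('') is the padding symbol; '^' and '$' mark start and end of a sequence.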
OUT_LOOKUP = ['', 'b', 'a', 'ʊ', 't', 'k', 'ə', 'z', 'ɔ', 'ɹ', 's', 'j', 'u', 'm', 'f', 'ɪ', 'o', 'ɡ', 'ɛ', 'n',
              'e', 'd',
              'ɫ', 'w', 'i', 'p', 'ɑ', 'ɝ', 'θ', 'v', 'h', 'æ', 'ŋ', 'ʃ', 'ʒ', 'ð', '^', '$']

IN_LOOKUP = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
             'u', 'v', 'w', 'x', 'y', 'z', '$', '^']


def extract_alphabet():
    """
    This function was used to extract the alphabets, which were then hard-coded above
    to speed up loading.
    """
    with open(DATA_FILE) as f:
        in_alph = set()
        out_alph = set()
        for line in f:
            text, phonemes = line.strip().split("\t")
            phonemes = phonemes.split(",")[0]
            for letter in phonemes:
                out_alph.add(letter)
            for letter in text:
                in_alph.add(letter)
        return in_alph, out_alph


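# Symbol-to-index maps used to encode words and pronunciations as integer tensors.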
IN_ALPHABET = {letter: idx for idx, letter in enumerate(IN_LOOKUP)}

OUT_ALPHABET = {letter: idx for idx, letter in enumerate(OUT_LOOKUP)}

TOTAL_OUT_LEN = 0

DATA: List[Tuple[torch.Tensor, torch.Tensor]] = []

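# Load the dictionary: keep only the first listed pronunciation, strip slashes, apostrophes
# and stress marks, wrap each sequence in '^' ... '$' and encode it with the maps above.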
with open(DATA_FILE) as f:
    for line in f:
        text, phonemes = line.split("\t")
        phonemes = phonemes.strip().split(",")[0]
        phonemes = '^' + re.sub(r'[/\'ˈˌ]', '', phonemes) + '$'
        text = '^' + re.sub(r'[^a-z]', '', text.strip()) + '$'
        text = torch.tensor([IN_ALPHABET[letter] for letter in text], dtype=torch.int)
        phonemes = torch.tensor([OUT_ALPHABET[letter] for letter in phonemes], dtype=torch.int)
        DATA.append((text, phonemes))

random.shuffle(DATA)
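# 50/50 train/evaluation split, with the training set truncated to a whole number of batches.
# The *_OUT_LEN totals count target phonemes (excluding the leading '^') and serve as the
# denominators for the accuracy curves plotted during training.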
# DATA = DATA[:2000]
print("Number of samples ", len(DATA))
TRAINING_SET_SIZE = int(len(DATA) * 0.5)
TRAINING_SET_SIZE -= TRAINING_SET_SIZE % BATCH_SIZE
EVAL = DATA[TRAINING_SET_SIZE:]
DATA = DATA[:TRAINING_SET_SIZE]
assert len(DATA) % BATCH_SIZE == 0
print("Training samples ", len(DATA))
print("Evaluation samples ", len(EVAL))
print("Input alphabet ", IN_LOOKUP)
print("Output alphabet ", OUT_LOOKUP)
TOTAL_TRAINING_OUT_LEN = 0
TOTAL_EVALUATION_OUT_LEN = 0
for text, phonemes in DATA:
    TOTAL_TRAINING_OUT_LEN += len(phonemes)
for text, phonemes in EVAL:
    TOTAL_EVALUATION_OUT_LEN += len(phonemes)
TOTAL_EVALUATION_OUT_LEN -= len(EVAL)  # do not count the beginning of line ^ character
TOTAL_TRAINING_OUT_LEN -= len(DATA)
print("Total output length in training set", TOTAL_TRAINING_OUT_LEN)
print("Total output length in evaluation set", TOTAL_EVALUATION_OUT_LEN)


def shuffle_but_keep_sorted_by_output_lengths(data: List[Tuple[torch.Tensor, torch.Tensor]]):
    random.shuffle(data)
    data.sort(reverse=True, key=lambda x: len(x[1]))  # sort with respect to output lengths


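# Pad a batch of variable-length (word, phonemes) pairs into fixed-size tensors and keep the
# true lengths; inputs are sorted by length, as required by pack_padded_sequence.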
def collate(batch: List[Tuple[torch.Tensor, torch.Tensor]]):
    batch.sort(reverse=True, key=lambda x: len(x[0]))  # sort with respect to input lengths
    in_lengths = [len(entry[0]) for entry in batch]
    max_in_len = max(in_lengths)
    out_lengths = [len(entry[1]) for entry in batch]
    max_out_len = max(out_lengths)
    padded_in = torch.zeros((len(batch), max_in_len), dtype=torch.int)
    padded_out = torch.zeros((len(batch), max_out_len), dtype=torch.long)
    for i in range(0, len(batch)):
        padded_in[i, :len(batch[i][0])] = batch[i][0]
        padded_out[i, :len(batch[i][1])] = batch[i][1]
    return padded_in, in_lengths, padded_out, out_lengths


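# Encoder: character embedding followed by a multi-layer LSTM over the packed input word.
# Its final (hidden, cell) state is handed to the decoder as the initial state.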
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, layers):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_layers = layers
        self.embedding = nn.Embedding(num_embeddings=len(IN_ALPHABET),
                                      embedding_dim=hidden_size,
                                      padding_idx=IN_ALPHABET[''])
        self.gru = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           num_layers=self.hidden_layers,
                           batch_first=True)
        # self.lin = nn.Linear(hidden_size, hidden_size)

    def forward(self, padded_in, in_lengths):
        batch_size = len(in_lengths)
        # hidden_state, cell_state = hidden
        # assert hidden_state.size() == (self.hidden_layers, batch_size, self.hidden_size)
        # assert cell_state.size() == (self.hidden_layers, batch_size, self.hidden_size)
        embedded = self.embedding(padded_in)
        assert embedded.size() == (batch_size, max(in_lengths), self.hidden_size)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, in_lengths, batch_first=True)
        gru_out, hidden = self.gru(packed)
        unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out, batch_first=True)
        assert unpacked.size() == (batch_size, max(in_lengths), self.hidden_size)
        # assert hidden.size() == (self.hidden_layers, batch_size, self.hidden_size)
        # h, cell_state = hidden
        # final_hidden = self.lin(h)
        return unpacked, hidden

    def init_hidden(self, batch_size, device):
        hidden_state = torch.zeros(self.hidden_layers, batch_size, self.hidden_size, device=device)
        cell_state = torch.zeros(self.hidden_layers, batch_size, self.hidden_size, device=device)
        return hidden_state, cell_state


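# Decoder: embeds the previous output phoneme and advances its LSTM by a single step,
# returning log-probabilities over the phoneme alphabet for the next symbol.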
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, layers):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_layers = layers
        self.embedding = nn.Embedding(num_embeddings=len(OUT_ALPHABET),
                                      embedding_dim=hidden_size,
                                      padding_idx=OUT_ALPHABET[''])
        self.gru = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           num_layers=self.hidden_layers,
                           batch_first=True)
        self.out = nn.Linear(hidden_size, len(OUT_ALPHABET))
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, padded_out, hidden):
        batch_size = len(padded_out)
        padded_out = padded_out.unsqueeze(1)
        seq_length = 1
        hidden_state, cell_state = hidden
        assert hidden_state.size() == (self.hidden_layers, batch_size, self.hidden_size)
        assert cell_state.size() == (self.hidden_layers, batch_size, self.hidden_size)
        embedded = self.embedding(padded_out)
        assert embedded.size() == (batch_size, seq_length, self.hidden_size)
        gru_out, hidden = self.gru(embedded, hidden)
        # assert hidden.size() == (self.hidden_layers, batch_size, self.hidden_size)
        assert gru_out.size() == (batch_size, seq_length, self.hidden_size)
        lin = self.out(gru_out)
        assert lin.size() == (batch_size, seq_length, len(OUT_ALPHABET))
        probabilities = self.softmax(lin)
        assert probabilities.size() == (batch_size, seq_length, len(OUT_ALPHABET))
        return probabilities, hidden


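# Run one batch through encoder and decoder. The decoder is stepped one position at a time;
# with probability teacher_forcing_prob the ground-truth previous phoneme is fed back in,
# otherwise the model's own argmax prediction is used. Returns the accumulated loss (when a
# criterion is given) and the number of correctly predicted symbols.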
def run(encoder, decoder, batch_in, i_lengths, batch_out, o_lengths, teacher_forcing_prob, criterion):
    batch_in = batch_in.to(DEVICE)
    batch_out = batch_out.to(DEVICE)
    out_seq_len = batch_out.size()[1]
    in_seq_len = batch_in.size()[1]
    assert batch_in.size() == (BATCH_SIZE, in_seq_len)
    assert batch_out.size() == (BATCH_SIZE, out_seq_len)
    loss = 0
    total_correct_predictions = 0
    encoder_output, hidden = encoder(batch_in, i_lengths)
    output = batch_out[:, 0]
    for i in range(out_seq_len - 1):
        if random.random() < teacher_forcing_prob:
            out = batch_out[:, i]
        else:
            out = output
        output, hidden = decoder(out, hidden)
        output = output.squeeze(1)
        expected_output = batch_out[:, i + 1]
        if criterion is not None:
            loss += criterion(output, expected_output)
        argmax_output = torch.argmax(output, 1)
        with torch.no_grad():
            total_correct_predictions += (argmax_output == expected_output).sum().item()
        output = argmax_output
    return loss, total_correct_predictions


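# Evaluation: run the held-out set with teacher forcing disabled and count correct symbols.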
eval_bar = tqdm(total=len(EVAL), position=2)


def eval_model(encoder, decoder):
    eval_bar.reset()
    eval_bar.set_description("Evaluation")
    with torch.no_grad():
        total_correct_predictions = 0
        for batch_in, i_lengths, batch_out, o_lengths in DataLoader(dataset=EVAL, drop_last=True,
                                                                    batch_size=BATCH_SIZE,
                                                                    collate_fn=collate):
            loss, correct_predictions = run(encoder=encoder,
                                            decoder=decoder,
                                            criterion=None,
                                            i_lengths=i_lengths,
                                            o_lengths=o_lengths,
                                            batch_in=batch_in,
                                            batch_out=batch_out,
                                            teacher_forcing_prob=0)
            total_correct_predictions += correct_predictions
            eval_bar.update(BATCH_SIZE)
        return total_correct_predictions


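# Training: Adam for both networks, NLL loss that ignores the padding index, and a live plot
# of training vs. evaluation accuracy refreshed after every epoch.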
outer_bar = tqdm(total=EPOCHS, position=0)
inner_bar = tqdm(total=len(DATA), position=1)


def train_model(encoder, decoder):
    encoder_optimizer = optim.Adam(filter(lambda x: x.requires_grad, encoder.parameters()),
                                   lr=LEARNING_RATE)
    decoder_optimizer = optim.Adam(filter(lambda x: x.requires_grad, decoder.parameters()),
                                   lr=LEARNING_RATE)
    criterion = nn.NLLLoss(ignore_index=OUT_ALPHABET[''])
    train_snapshots_percentage = [0]
    train_snapshots = [0]
    eval_snapshots = [eval_model(encoder, decoder)]
    eval_snapshots_percentage = [eval_snapshots[0] / TOTAL_EVALUATION_OUT_LEN]
    outer_bar.reset()
    outer_bar.set_description("Epochs")
    for epoch in range(EPOCHS):
        shuffle_but_keep_sorted_by_output_lengths(DATA)
        total_loss = 0
        total_correct_predictions = 0
        inner_bar.reset()
        for batch_in, i_lengths, batch_out, o_lengths in DataLoader(dataset=DATA, drop_last=True,
                                                                    batch_size=BATCH_SIZE,
                                                                    collate_fn=collate):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            loss, correct_predictions = run(encoder=encoder,
                                            decoder=decoder,
                                            criterion=criterion,
                                            i_lengths=i_lengths,
                                            o_lengths=o_lengths,
                                            batch_in=batch_in,
                                            batch_out=batch_out,
                                            teacher_forcing_prob=TEACHER_FORCING_PROBABILITY)
            total_correct_predictions += correct_predictions
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()
            inner_bar.update(BATCH_SIZE)
            loss_scalar = loss.item()
            total_loss += loss_scalar
            inner_bar.set_description("Avg loss %.2f" % (loss_scalar / batch_out.size()[1]))
        train_snapshots.append(total_correct_predictions)
        train_snapshots_percentage.append(total_correct_predictions / TOTAL_TRAINING_OUT_LEN)
        eval_snapshots.append(eval_model(encoder, decoder))
        eval_snapshots_percentage.append(eval_snapshots[-1] / TOTAL_EVALUATION_OUT_LEN)
        plt.clf()
        plt.plot(train_snapshots_percentage, label="Training %")
        plt.plot(eval_snapshots_percentage, label="Evaluation %")
        plt.legend(loc="upper left")
        plt.pause(interval=0.01)
        # print()
        # print("Total epoch loss:", total_loss)
        # print("Total epoch avg loss:", total_loss / TOTAL_TRAINING_OUT_LEN)
        # print("Training snapshots:", train_snapshots)
        # print("Training snapshots(%):", train_snapshots_percentage)
        # print("Evaluation snapshots:", eval_snapshots)
        # print("Evaluation snapshots(%):", eval_snapshots_percentage)
        outer_bar.set_description("Epochs")
        outer_bar.update(1)


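# Not part of the original file: a minimal greedy-decoding sketch showing how a trained
# encoder/decoder pair could be used to transcribe a single word. The function name
# `transcribe` and the `max_len` cap are illustrative assumptions, not an existing API;
# only symbols defined above are used.
def transcribe(encoder, decoder, word, max_len=50):
    with torch.no_grad():
        text = '^' + re.sub(r'[^a-z]', '', word.lower()) + '$'
        batch_in = torch.tensor([[IN_ALPHABET[letter] for letter in text]],
                                dtype=torch.int, device=DEVICE)
        _, hidden = encoder(batch_in, [len(text)])  # encode the whole word
        token = torch.tensor([OUT_ALPHABET['^']], dtype=torch.long, device=DEVICE)
        phonemes = []
        for _ in range(max_len):
            probabilities, hidden = decoder(token, hidden)  # one decoding step
            token = torch.argmax(probabilities.squeeze(1), 1)
            symbol = OUT_LOOKUP[token.item()]
            if symbol == '$':  # stop at the end-of-sequence marker
                break
            phonemes.append(symbol)
        return ''.join(phonemes)
# Example, assuming handles to the trained modules are kept: transcribe(encoder, decoder, "python")
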
train_model(EncoderRNN(32, 4).to(DEVICE), DecoderRNN(32, 4).to(DEVICE))