diff --git a/.gitignore b/.gitignore
index 1c18d74..44533c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,6 @@
 *.o
 .DS_Store
 .token
+train/
+test-A/
+dev-0/
diff --git a/gonito.yaml b/gonito.yaml
new file mode 100644
index 0000000..f46b209
--- /dev/null
+++ b/gonito.yaml
@@ -0,0 +1,20 @@
+description: trigram model prediction
+tags:
+  - neural-network
+  - trigram
+params:
+  epochs: 5
+  learning-rate: 0.001
+  vocab_size: 30000
+  embed_size: 300
+  hidden_size: 150
+  batch_size: 1000
+unwanted-params:
+  - model-file
+  - vocab-file
+param-files:
+  - "*.yaml"
+  - config/*.yaml
+links:
+  - title: "repository"
+    url: "https://git.wmi.amu.edu.pl/s444465/challenging-america-word-gap-prediction.git"
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e69de29
diff --git a/neural_trigram.py b/neural_trigram.py
new file mode 100644
index 0000000..5bee801
--- /dev/null
+++ b/neural_trigram.py
@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+
+import itertools
+import lzma
+import numpy as np
+import regex as re
+import torch
+from torch import nn
+from torch.utils.data import IterableDataset, DataLoader
+from torchtext.vocab import build_vocab_from_iterator
+
+vocab_size = 30000
+embed_size = 300
+hidden_size = 150
+batch_size = 1000
+device = 'cuda'
+train_path = 'train/in.tsv.xz'
+model_path = 'model.bin'
+
+# Commented out IPython magic to ensure Python compatibility.
+from google.colab import drive
+drive.mount('modelowanie_jezyka', force_remount=True)
+# %cd /content/modelowanie_jezyka/MyDrive/modelowanie_jezyka
+
+def process_line(line):
+    separated = line.split('\t')
+    left = separated[6].replace(r'\n', ' ').strip()
+    right = separated[7].replace(r'\n', ' ').strip()
+    line = left + ' ' + right
+    return line
+
+
+def get_line(line):
+    line = process_line(line)
+    for word in line.split():
+        yield word
+
+def get_word_lines_from_file(file_name):
+    i = 0
+    with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:
+        for line in fid:
+            if i > 100_000:
+                break
+            i += 1
+            yield get_line(line)
+
+def double_look_ahead_iterator(gen):
+    prev_prev = None
+    prev = None
+    for item in gen:
+        if prev_prev is not None:
+            yield np.asarray((prev_prev, prev, item))
+        prev_prev = prev
+        prev = item
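+
+# A minimal sketch of what double_look_ahead_iterator yields; the ids below
+# are illustrative, not real vocabulary indices:
+#
+#     list(double_look_ahead_iterator(iter([5, 8, 2, 9])))
+#     # -> [array([5, 8, 2]), array([8, 2, 9])]
+#
+# The first two ids of each triple are the model input, the third the target.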
+
+def prediction(words, model, top):
+    words_tensor = [train_dataset.vocab.forward([word]) for word in words]
+    word_t = torch.tensor(words_tensor).view(-1).to(device)
+    out = model(word_t)
+    top = torch.topk(out[0], top)
+    top_indices = top.indices.tolist()
+    top_probs = top.values.tolist()
+    top_words = vocab.lookup_tokens(top_indices)
+    zipped = list(zip(top_words, top_probs))
+    # Re-emit the <unk> mass as an empty token at the end of the line, as the
+    # gonito output format expects; if <unk> is not among the candidates,
+    # sacrifice the last candidate instead.
+    unk = None
+    for index, element in enumerate(zipped):
+        if '<unk>' in element:
+            unk = zipped.pop(index)
+            zipped.append(('', unk[1]))
+            break
+    if unk is None:
+        zipped[-1] = ('', zipped[-1][1])
+    return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])
+
+def create_outputs(folder_name, model, top):
+    print(f'Creating outputs in {folder_name}')
+    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
+        with open(f'{folder_name}/out-top={top}.tsv', 'w', encoding='utf-8', newline='\n') as f:
+            for line in fid:
+                separated = line.split('\t')
+                # The last two words of the left context feed the trigram model.
+                prefix = separated[6].replace(r'\n', ' ').split()[-2:]
+                output_line = prediction(prefix, model, top)
+                f.write(output_line + '\n')
+
+def train_model(lr):
+    model = TrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)
+    data = DataLoader(train_dataset, batch_size=batch_size)
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    criterion = torch.nn.NLLLoss()
+
+    model.train()
+    step = 0
+    for batch in data:
+        x = batch[:, :2]
+        y = batch[:, 2]
+        x = x.to(device)
+        y = y.to(device)
+        optimizer.zero_grad()
+        predicted = model(x)
+        # NLLLoss expects log-probabilities, hence the explicit log of the
+        # softmax output.
+        loss = criterion(torch.log(predicted), y)
+        if step % 100 == 0:
+            print(step, loss)
+        step += 1
+        loss.backward()
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
+        optimizer.step()
+
+    torch.save(model.state_dict(), model_path)
+
+def with_hyperparams():
+    train_model(lr=0.001)
+    model = TrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)
+    model.load_state_dict(torch.load(model_path))
+    model.eval()
+    for top in [100, 200, 400, 600, 800]:
+        create_outputs('dev-0', model, top)
+        create_outputs('test-A', model, top)
+
+"""### Classes"""
+
+class Trigrams(IterableDataset):
+    def __init__(self, text_file, vocabulary_size):
+        self.vocab = build_vocab_from_iterator(
+            get_word_lines_from_file(text_file),
+            max_tokens=vocabulary_size,
+            specials=['<unk>'])
+        self.vocab.set_default_index(self.vocab['<unk>'])
+        self.vocabulary_size = vocabulary_size
+        self.text_file = text_file
+
+    def __iter__(self):
+        return double_look_ahead_iterator(
+            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
+
+class TrigramNeuralLanguageModel(nn.Module):
+    def __init__(self, vocabulary_size, embedding_size, hidden_size):
+        super(TrigramNeuralLanguageModel, self).__init__()
+        self.embedding_size = embedding_size
+        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
+        self.linear = nn.Linear(2 * embedding_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.linear2 = nn.Linear(hidden_size, vocabulary_size)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        # Concatenate the embeddings of the two context words before the MLP.
+        x = self.embedding(x).view((-1, 2 * self.embedding_size))
+        x = self.linear(x)
+        x = self.relu(x)
+        x = self.linear2(x)
+        return self.softmax(x)
+
+vocab = build_vocab_from_iterator(
+    get_word_lines_from_file(train_path),
+    max_tokens=vocab_size,
+    specials=['<unk>']
+)
+
+vocab.set_default_index(vocab['<unk>'])
+train_dataset = Trigrams(train_path, vocab_size)
+with_hyperparams()
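
Note: a minimal, self-contained shape check mirroring the forward pass of TrigramNeuralLanguageModel, using the hyperparameters from gonito.yaml; the random batch stands in for one DataLoader batch over the Trigrams dataset and is illustrative only:

import torch
from torch import nn

# Stand-in for one DataLoader batch over Trigrams: 1000 trigrams of token ids.
batch = torch.randint(0, 30000, (1000, 3))
x, y = batch[:, :2], batch[:, 2]                  # (1000, 2) contexts, (1000,) targets

embedding = nn.Embedding(30000, 300)
e = embedding(x).view(-1, 2 * 300)                # (1000, 600): both context embeddings concatenated
h = nn.ReLU()(nn.Linear(600, 150)(e))             # (1000, 150): hidden layer
p = nn.Softmax(dim=1)(nn.Linear(150, 30000)(h))   # (1000, 30000): next-word distribution

assert p.shape == (1000, 30000)
assert torch.allclose(p.sum(dim=1), torch.ones(1000), atol=1e-5)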