import csv
import itertools
import os

import pandas as pd
import regex as re
import torch
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator


def data_preprocessing(text):
    # Lowercase, undo the newline escapes stored in the TSV, expand a few
    # contractions and strip punctuation.
    return re.sub(
        r'\p{P}',
        '',
        text.lower()
        .replace('-\\n', '')
        .replace('\\n', ' ')
        .replace("'ll", " will")
        .replace("-", "")
        .replace("'ve", " have")
        .replace("'s", " is"))


def get_words_from_line(line, s=True):
    line = line.rstrip()
    if s:
        yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    if s:
        yield '</s>'


def get_word_lines_from_file(data):
    for line in data:
        yield get_words_from_line(line)


class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1))

    def forward(self, x):
        return self.model(x)


def look_ahead_iterator(gen):
    # Turn a stream of tokens into a stream of (previous, current) bigram pairs.
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item


class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t]
             for t in itertools.chain.from_iterable(
                 get_word_lines_from_file(self.text_file))))


in_file = 'train/in.tsv.xz'
out_file = 'train/expected.tsv'

train_set = pd.read_csv(
    in_file,
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=35000)

train_labels = pd.read_csv(
    out_file,
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=35000)

# Rebuild the training text: left context (column 6), the expected gap word
# from expected.tsv, then the right context (column 7).
data = train_set[6] + ' ' + train_labels[0] + ' ' + train_set[7]
data = data.apply(data_preprocessing)

vocab_size = 30000
embed_size = 150

bigram_data = Bigrams(data, vocab_size)

device = 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)

if not os.path.exists('model1.bin'):
    data = DataLoader(bigram_data, batch_size=5000)
    optimizer = torch.optim.Adam(model.parameters())
    # NLLLoss on log(softmax) is equivalent to cross-entropy over the vocabulary.
    criterion = torch.nn.NLLLoss()

    model.train()
    step = 0
    for x, y in data:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')
else:
    model.load_state_dict(torch.load('model1.bin'))

vocab = bigram_data.vocab

# Fallback distribution used when no context word is available; the trailing
# ':0.77' assigns the remaining probability mass to unseen words.
prediction = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'


def predict_word(w):
    # Return the top continuations of the given context word as 'word:prob' pairs.
    ixs = torch.tensor(vocab.forward(w)).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    pred_str = ''
    for word, prob in zip(top_words, top_probs):
        pred_str += f'{word}:{prob} '
    return pred_str


def predict(path):
    # Read the left context (column 6) and write one prediction line per row.
    x = pd.read_csv(
        f'{path}/in.tsv.xz',
        sep='\t',
        header=None,
        quoting=csv.QUOTE_NONE,
        on_bad_lines='skip',
        encoding='UTF-8')[6]
    x = x.apply(data_preprocessing)

    with open(f'{path}/out.tsv', 'w+', encoding='UTF-8') as out_tsv:
        for row in x:
            # Use the last word of the left context as the bigram history.
            before = None
            for before in get_words_from_line(data_preprocessing(str(row)), False):
                pass
            if before is None:
                # Empty context: fall back to the global distribution.
                pred_str = prediction
            else:
                pred_str = predict_word([before])
            out_tsv.write(pred_str.strip() + '\n')


predict('dev-0')
predict('test-A')
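

# Example usage (a minimal, optional sanity check; not part of the original pipeline):
# query the trained model for likely continuations of a single context word.
# The context word 'the' is an arbitrary illustrative choice.
print(predict_word(['the']))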