# Simple bigram neural language model for the word-gap prediction task:
# train on the first 200k rows of train/in.tsv.xz + train/expected.tsv,
# then write out.tsv predictions for dev-0/ and test-A/.
import csv
import itertools
from os.path import exists

import pandas as pd
import regex as re
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator

IN_INPUT_PATH = "train/in.tsv.xz"
IN_OUTPUT_PATH = "train/expected.tsv"
VOCAB_SIZE = 30000
EMBED_SIZE = 150
BATCH_SIZE = 8000
DEV_PATH = "dev-0/"
TEST_PATH = "test-A/"
DEVICE = "cpu"  # set to "cuda" if a GPU is available


def clean(text):
    # Undo the dataset's escaped line breaks, then strip punctuation.
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)


def get_words_from_line(line, specials=True):
    line = line.rstrip()
    if specials:
        yield "<s>"  # sentence-start marker
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    if specials:
        yield "</s>"  # sentence-end marker


def get_word_lines_from_data(d):
    for line in d:
        yield get_words_from_line(line)


def look_ahead_iterator(gen):
    # Turn a token stream into a stream of (previous, current) bigram pairs.
    w1 = None
    for item in gen:
        if w1 is not None:
            yield (w1, item)
        w1 = item


class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1),  # explicit dim avoids the implicit-dim warning
        )

    def forward(self, x):
        return self.model(x)


class Bigrams(torch.utils.data.IterableDataset):
    def __init__(self, data, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens=vocabulary_size,
            specials=["<unk>"],
        )
        # Map out-of-vocabulary tokens to <unk>.
        self.vocab.set_default_index(self.vocab["<unk>"])
        self.vocabulary_size = vocabulary_size
        self.data = data

    def __iter__(self):
        return look_ahead_iterator(
            (
                self.vocab[t]
                for t in itertools.chain.from_iterable(
                    get_word_lines_from_data(self.data)
                )
            )
        )


def get_dataset():
    X_train = pd.read_csv(
        IN_INPUT_PATH,
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        nrows=200000,
        on_bad_lines="skip",
        encoding="UTF-8",
    )
    Y_train = pd.read_csv(
        IN_OUTPUT_PATH,
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        nrows=200000,
        on_bad_lines="skip",
        encoding="UTF-8",
    )
    # Column 6 is the left context and column 7 the right context; splice the
    # expected gap word (Y_train column 0) in between, joined with spaces so
    # the gap word does not fuse with its neighbours.
    X_train = X_train[[6, 7]]
    X_train = pd.concat([X_train, Y_train], axis=1)
    X_train = X_train[6] + " " + X_train[0] + " " + X_train[7]
    X_train = X_train.apply(clean)
    return Bigrams(X_train, VOCAB_SIZE)


dataset = get_dataset()


def get_model():
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
    if not exists("nn_model.bin"):
        data = DataLoader(dataset, batch_size=BATCH_SIZE)
        optimizer = torch.optim.Adam(model.parameters())
        # The model outputs probabilities, so log them before NLLLoss
        # (LogSoftmax + NLLLoss would be the numerically safer pairing).
        criterion = torch.nn.NLLLoss()
        model.train()
        step = 0
        for _epoch in range(2):  # two passes over the training data
            for x, y in data:
                x = x.to(DEVICE)
                y = y.to(DEVICE)
                optimizer.zero_grad()
                y_predicted = model(x)
                loss = criterion(torch.log(y_predicted), y)
                if step % 100 == 0:
                    print(step, loss.item())
                step += 1
                loss.backward()
                optimizer.step()
        torch.save(model.state_dict(), "nn_model.bin")
    else:
        model.load_state_dict(
            torch.load("nn_model.bin", map_location=torch.device("cpu"))
        )
    return model


vocab = dataset.vocab
model = get_model()
model.eval()


def predict(ws):
    # ws is a list of context words; return the top 8 candidates for the next
    # word in the "word:probability" format expected in out.tsv.
    ixs = torch.tensor(vocab.forward(ws)).to(DEVICE)
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    pred_str = ""
    for word, prob in zip(top_words, top_probs):
        pred_str += f"{word}:{prob} "
    return pred_str


def predict_input(file):
    X_test = pd.read_csv(
        f"{file}in.tsv.xz",
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        on_bad_lines="skip",
        encoding="UTF-8",
    )[6]
    X_test = X_test.apply(clean)
    with open(f"{file}out.tsv", "w", encoding="UTF-8") as f:
        for row in X_test:
            # Predict from the last word of the left context.
            before = None
            for before in get_words_from_line(str(row), specials=False):
                pass
            if before is None:
                # Empty context: fall back to a fixed distribution (the
                # trailing ":0.2" is the unassigned probability mass).
                pred_str = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 :0.2"
            else:
                pred_str = predict([before])
            f.write(pred_str.strip() + "\n")


predict_input(DEV_PATH)
predict_input(TEST_PATH)