import itertools
import lzma

import regex as re
import torch
from nltk.tokenize import RegexpTokenizer
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator

VOCAB_SIZE = 40000
EMBED_SIZE = 100
DEVICE = "cuda"

tokenizer = RegexpTokenizer(r"\w+")


def read_file(file):
    # Yield the cleaned left context (column 6) of each TSV line.
    for line in file:
        text = line.split("\t")
        yield re.sub(
            r"[^\w\d'\s]+",
            "",
            re.sub(" +", " ", text[6].replace("\\n", " ").replace("\n", "").lower()),
        )


def get_words(line):
    # Split a line into lowercased word/punctuation tokens,
    # wrapped in sentence-boundary markers.
    line = line.rstrip()
    yield "<s>"
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    yield "</s>"


def get_line(file_path):
    # Iterate over the xz-compressed TSV file and yield a token generator
    # built from the concatenated left and right contexts (columns 6 and 7).
    with lzma.open(file_path, mode="rt") as file:
        for line in file:
            text = line.split("\t")
            yield get_words(
                re.sub(
                    r"[^\w\d'\s]+",
                    "",
                    re.sub(
                        " +",
                        " ",
                        " ".join([text[6], text[7]])
                        .replace("\\n", " ")
                        .replace("\n", "")
                        .lower(),
                    ),
                )
            )


def build_vocab():
    vocab = build_vocab_from_iterator(
        get_line("train/in.tsv.xz"), max_tokens=VOCAB_SIZE, specials=["<unk>"]
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab


def look_ahead_iterator(gen):
    # Turn a stream of tokens into a stream of (previous, current) bigrams.
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item


class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        return self.model(x)


class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_line(text_file), max_tokens=vocabulary_size, specials=["<unk>"]
        )
        self.vocab.set_default_index(self.vocab["<unk>"])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        # Map every token to its vocabulary index and emit (prev, next) pairs.
        return look_ahead_iterator(
            (
                self.vocab[t]
                for t in itertools.chain.from_iterable(get_line(self.text_file))
            )
        )


vocab = build_vocab()


def train():
    batch_size = 10000
    train_dataset = Bigrams("train/in.tsv.xz", VOCAB_SIZE)
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for x, y in train_data_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        ypredicted = model(x)
        # The model outputs softmax probabilities, so take the log before NLLLoss.
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), "model1.bin")


def predict(word, model):
    ixs = torch.tensor(vocab.forward([word])).to(DEVICE)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    str_predictions = ""
    lht = 1.0  # probability mass left over for all other words
    for pred_word in zip(top_words, top_indices, top_probs):
        if lht - pred_word[2] >= 0:
            str_predictions += f"{pred_word[0]}:{pred_word[2]} "
            lht -= pred_word[2]
    if lht != 1.0:
        # Trailing bare ":p" entry carries the remaining probability mass.
        str_predictions += f":{lht}"
    return str_predictions


def generate_predictions(input_file, output_file, model):
    with open(output_file, "w") as outputf:
        with lzma.open(input_file, mode="rt") as file:
            for text in read_file(file):
                tokens = tokenizer.tokenize(text)
                if len(tokens) < 4:
                    # Too little context: fall back to a fixed distribution.
                    prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
                else:
                    prediction = predict(tokens[-1], model)
                outputf.write(prediction + "\n")


if __name__ == "__main__":
    train()
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
    model.load_state_dict(torch.load("model1.bin"))
    model.eval()
    generate_predictions("dev-0/in.tsv.xz", "dev-0/out.tsv", model)
    generate_predictions("test-A/in.tsv.xz", "test-A/out.tsv", model)
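
# Example (hypothetical) output line as produced by predict() above: the top-8
# candidate words with their softmax probabilities, followed by a bare ":p"
# entry for the leftover probability mass assigned to all other words, e.g.:
#   the:0.31 of:0.12 and:0.08 to:0.05 a:0.04 in:0.03 was:0.02 is:0.02 :0.33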