# Data preparation for an n-gram GRU language model: split the raw corpus
# into train/validation files, build the vocabulary and turn the token
# streams into fixed-size windows of token ids.
from collections import Counter

import nltk
import torch
import numpy as np
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

nltk.download('punkt')

NGRAMS = 5
BATCH_SIZE = 128
EPOCHS = 15

# Vocabulary layout relied on by the rest of the script: ids 0..15000 are
# regular tokens and ids 15001..15004 are the four special tokens (the
# generation code rejects every candidate id above 15000).
_REGULAR_TOKEN_COUNT = 15001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split the raw corpus line-wise and persist both parts so they can be
# re-read as plain text below.
with open("train/train.tsv", "r", encoding="utf8") as lalka_path:
    lines = lalka_path.readlines()

train, test = train_test_split(lines, test_size=0.2)

with open("train/train_train.tsv", "w", encoding="utf8") as out_train_file:
    out_train_file.writelines(train)

with open("train/train_test.tsv", "w", encoding="utf8") as out_test_file:
    out_test_file.writelines(test)

lalka_path_train = 'train/train_train.tsv'
lalka_path_valid = 'train/train_test.tsv'

# Read through a context manager so the file handle is closed promptly.
with open(lalka_path_train, encoding="utf8") as train_file:
    corpora_train = train_file.read()
corpora_train_tokenized = [
    token.lower() for token in word_tokenize(corpora_train)
]

# Keep the most frequent tokens.  Truncating the alphabetically sorted
# vocabulary would discard every token past the cut-off letter regardless of
# how often it occurs.  Ties are broken alphabetically to stay deterministic.
token_counts = Counter(corpora_train_tokenized)
if len(token_counts) < _REGULAR_TOKEN_COUNT:
    raise ValueError(
        f"training corpus has only {len(token_counts)} distinct tokens; "
        f"at least {_REGULAR_TOKEN_COUNT} are required because the special "
        "tokens must sit at ids 15001-15004"
    )
most_frequent = sorted(
    token_counts, key=lambda tok: (-token_counts[tok], tok)
)[:_REGULAR_TOKEN_COUNT]
vocab_itos = sorted(most_frequent)

# Special tokens occupy ids 15001..15004.
# NOTE(review): all four literals are the empty string, so in `vocab_stoi`
# they collapse to a single entry ('' -> 15004).  Presumably they were meant
# to be four distinct markers (unknown / start / end / padding) whose names
# got lost -- confirm and restore them together with every `vocab_stoi['']`
# lookup in this file.
vocab_itos += ["", "", "", ""]

# Later duplicates overwrite earlier ones, exactly like an explicit loop.
vocab_stoi = {token: i for i, token in enumerate(vocab_itos)}


def get_token_id(dataset):
    """Map a sequence of tokens to vocabulary ids.

    The result starts with ``NGRAMS`` copies of the special-token id, then
    holds one id per token (tokens missing from ``vocab_stoi`` are replaced
    by the special-token id), and ends with one more special-token id.

    Args:
        dataset: iterable of lower-cased token strings.

    Returns:
        list[int]: the id sequence described above.
    """
    token_ids = [vocab_stoi['']] * (NGRAMS - 1) + [vocab_stoi['']]
    fallback_id = vocab_stoi['']
    token_ids.extend(vocab_stoi.get(token, fallback_id) for token in dataset)
    token_ids.append(vocab_stoi[''])
    return token_ids


def get_samples(dataset):
    """Return every contiguous window of ``NGRAMS`` items from ``dataset``.

    Args:
        dataset: sequence supporting ``len`` and slicing.

    Returns:
        list: ``len(dataset) - NGRAMS + 1`` windows, or an empty list when
        ``dataset`` is shorter than ``NGRAMS``.
    """
    # "+ 1" so that the final window (the one ending on the last item) is
    # produced as well; without it the last element is never a target.
    return [
        dataset[i:i + NGRAMS] for i in range(len(dataset) - NGRAMS + 1)
    ]


train_ids = get_token_id(corpora_train_tokenized)
train_ids = get_samples(train_ids)
# Explicit integer dtype: embedding lookups need long indices, and this
# matches how the validation tensor is built.
train_ids = torch.tensor(train_ids, dtype=torch.long, device=device)

with open(lalka_path_valid, encoding="utf8") as valid_file:
    corpora_valid = valid_file.read()
corpora_valid_tokenized = [
    token.lower() for token in word_tokenize(corpora_valid)
]
valid_ids = get_token_id(corpora_valid_tokenized)
# Turn the validation id stream into a tensor of NGRAMS-sized windows.
valid_ids = torch.tensor(
    get_samples(valid_ids), dtype=torch.long, device=device
)


class GRU(torch.nn.Module):
    """Next-token predictor: embedding -> single-layer GRU -> linear layer."""

    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)
        self.rec = torch.nn.GRU(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))

    def forward(self, x):
        """Return one row of vocabulary logits per input sequence.

        Args:
            x: integer tensor of token ids, batch dimension first.

        Returns:
            Tensor of logits over the vocabulary computed from the GRU's
            final hidden state.
        """
        emb = self.emb(x)
        output, h_n = self.rec(emb)
        # h_n carries a leading num_layers dimension; there is one layer.
        hidden = h_n.squeeze(0)
        return self.fc1(hidden)


lm = GRU().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(), lr=0.0001)


def get_ppl(dataset_ids):
    """Compute the perplexity of ``lm`` on a tensor of n-gram windows.

    The first ``NGRAMS - 1`` columns of every row are the context and the
    last column is the target.  Switches ``lm`` to eval mode.

    Args:
        dataset_ids: 2-D integer tensor with ``NGRAMS`` columns.

    Returns:
        ``exp`` of the mean cross-entropy over all rows, or ``nan`` when
        ``dataset_ids`` has no rows.
    """
    lm.eval()
    sample_count = 0
    loss_sum = 0.0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for start in range(0, len(dataset_ids), BATCH_SIZE):
            X = dataset_ids[start:start + BATCH_SIZE, :NGRAMS - 1]
            Y = dataset_ids[start:start + BATCH_SIZE, NGRAMS - 1]
            predictions = lm(X)
            loss = criterion(predictions, Y)
            # criterion returns the batch mean; weight it by the batch size
            # so a shorter final batch is averaged correctly.
            loss_sum += loss.item() * len(Y)
            sample_count += len(Y)
    if sample_count == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return float("nan")
    return np.exp(loss_sum / sample_count)


history_ppl_train = []
history_ppl_valid = []

for epoch in range(EPOCHS):
    batches = 0
    loss_sum = 0
    # get_ppl() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    lm.train()
    total = len(train_ids) - BATCH_SIZE + 1
    # Only full batches are used for training.
    for i in range(0, total, BATCH_SIZE):
        print('batches: ' + str(batches))
        X = train_ids[i: i + BATCH_SIZE, :NGRAMS - 1]
        Y = train_ids[i: i + BATCH_SIZE, NGRAMS - 1]
        predictions = lm(X)
        loss = criterion(predictions, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        batches += 1

    ppl_train = get_ppl(train_ids)
    ppl_valid = get_ppl(valid_ids)
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()

# Sanity check: predict the token following a sample sentence.
tokenized = [
    token.lower()
    for token in word_tokenize('Gości innych nie widział oprócz spółleśników')
]
# Words missing from the vocabulary fall back to the special-token id.
ids = [vocab_stoi.get(word, vocab_stoi['']) for word in tokenized]

lm.eval()
ids = torch.tensor(ids, dtype=torch.long, device=device)
with torch.no_grad():
    preds = lm(ids.unsqueeze(0))
# Bare expression: shows the predicted token when run as a notebook cell and
# has no effect when run as a script.
vocab_itos[torch.argmax(torch.softmax(preds, 1), 1).item()]
# Seed the generator with a one-word prompt.
tokenized = [token.lower() for token in word_tokenize('Lalka')]
# Words missing from the vocabulary fall back to the special-token id.
ids = [vocab_stoi.get(word, vocab_stoi['']) for word in tokenized]
ids = torch.tensor([ids], dtype=torch.long, device=device)

candidates_number = 10


def _generate_token(context_ids):
    """Sample the next token for ``context_ids`` and extend the context.

    One of the ``candidates_number`` most probable next tokens is picked
    uniformly at random; picks with an id above 15000 (the special-token
    slots) are rejected and redrawn.  The chosen token is printed.

    Args:
        context_ids: integer tensor holding a single row of token ids.

    Returns:
        tuple: ``(token_text, new_context)`` where ``new_context`` is
        ``context_ids`` with the sampled id appended.
    """
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        preds = lm(context_ids)
    candidates = torch.topk(
        torch.softmax(preds, 1), candidates_number
    )[1][0].cpu().numpy()
    candidate = 15001
    while candidate > 15000:
        candidate = candidates[np.random.randint(candidates_number)]
    candidate = int(candidate)
    print(vocab_itos[candidate])
    new_context = torch.cat(
        (context_ids, torch.tensor([[candidate]], device=device)), 1
    )
    return vocab_itos[candidate], new_context


# NOTE(review): the context grows by one token per prediction and is never
# truncated, so every forward pass re-reads the whole history even though the
# model was trained on NGRAMS - 1 context tokens.  Consider keeping only the
# most recent tokens -- left unchanged here because it would alter outputs.
for _ in range(30):
    _, ids = _generate_token(ids)

print('starting outs...')

# One generated token is written per input line.
with open("dev-0/in.tsv", "r", encoding="UTF-8") as dev_path:
    nr_of_dev_lines = len(dev_path.readlines())

with open("test-A/in.tsv", "r", encoding="UTF-8") as test_a_path:
    nr_of_test_a_lines = len(test_a_path.readlines())

with open("dev-0/out.tsv", "w", encoding="UTF-8") as out_dev_file:
    for _ in range(nr_of_dev_lines):
        token, ids = _generate_token(ids)
        out_dev_file.write(token + '\n')

with open("test-A/out.tsv", "w", encoding="UTF-8") as out_test_file:
    # Use the test-A line count: iterating over the dev-0 count produced an
    # output file whose length did not match test-A/in.tsv.
    for _ in range(nr_of_test_a_lines):
        token, ids = _generate_token(ids)
        out_test_file.write(token + '\n')