import os

import numpy as np
import torch
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

nltk.download('punkt')

dir_path = os.path.dirname(os.path.realpath(__file__))

NGRAMS = 5
BATCH_SIZE = 128
EPOCHS = 15

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split the training corpus into a training and a validation part (80/20).
with open(os.path.join(dir_path, "train", "train.tsv"), "r", encoding="utf8") as lalka_path:
    lines = lalka_path.readlines()

train, test = train_test_split(lines, test_size=0.2)

with open(os.path.join(dir_path, "train", "train_train.tsv"), "w", encoding="utf8") as out_train_file:
    for line in train:
        out_train_file.write(line)

with open(os.path.join(dir_path, "train", "train_test.tsv"), "w", encoding="utf8") as out_test_file:
    for line in test:
        out_test_file.write(line)

lalka_path_train = os.path.join(dir_path, "train", "train_train.tsv")
lalka_path_valid = os.path.join(dir_path, "train", "train_test.tsv")

# Tokenize and lowercase the training part.
corpora_train = open(lalka_path_train, encoding="utf8").read()
corpora_train_tokenized = [token.lower() for token in word_tokenize(corpora_train)]

# Vocabulary: the first 15005 unique tokens in sorted order; the last four
# slots are overwritten with special tokens.
vocab_itos = sorted(set(corpora_train_tokenized))
vocab_itos = vocab_itos[:15005]
vocab_itos[15001] = "<unk>"
vocab_itos[15002] = "<pad>"
vocab_itos[15003] = "<bos>"
vocab_itos[15004] = "<eos>"

vocab_stoi = {token: i for i, token in enumerate(vocab_itos)}


def get_token_id(dataset):
    """Map tokens to ids, padded at the front and terminated with <eos>."""
    token_ids = [vocab_stoi['<pad>']] * (NGRAMS - 1) + [vocab_stoi['<bos>']]
    for token in dataset:
        try:
            token_ids.append(vocab_stoi[token])
        except KeyError:
            token_ids.append(vocab_stoi['<unk>'])
    token_ids.append(vocab_stoi['<eos>'])
    return token_ids


def get_samples(dataset):
    """Slice the id sequence into overlapping NGRAMS-long windows."""
    samples = []
    for i in range(len(dataset) - NGRAMS):
        samples.append(dataset[i:i + NGRAMS])
    return samples


train_ids = get_token_id(corpora_train_tokenized)
train_ids = get_samples(train_ids)
train_ids = torch.tensor(train_ids, dtype=torch.long, device=device)

corpora_valid = open(lalka_path_valid, encoding="utf8").read()
corpora_valid_tokenized = [token.lower() for token in word_tokenize(corpora_valid)]

valid_ids = get_token_id(corpora_valid_tokenized)
valid_ids = torch.tensor(get_samples(valid_ids), dtype=torch.long, device=device)


class GRU(torch.nn.Module):
    def __init__(self):
        super(GRU, self).__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)
        self.rec = torch.nn.GRU(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))

    def forward(self, x):
        emb = self.emb(x)
        output, h_n = self.rec(emb)
        hidden = h_n.squeeze(0)      # last hidden state of the single GRU layer
        out = self.fc1(hidden)       # logits over the vocabulary
        return out


lm = GRU().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(), lr=0.0001)


def get_ppl(dataset_ids):
    """Perplexity = exp of the mean cross-entropy over full batches."""
    lm.eval()
    batches = 0
    loss_sum = 0
    with torch.no_grad():
        for i in range(0, len(dataset_ids) - BATCH_SIZE + 1, BATCH_SIZE):
            X = dataset_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
            Y = dataset_ids[i:i + BATCH_SIZE, NGRAMS - 1]
            predictions = lm(X)
            loss = criterion(predictions, Y)
            loss_sum += loss.item()
            batches += 1
    return np.exp(loss_sum / batches)


history_ppl_train = []
history_ppl_valid = []

for epoch in range(EPOCHS):
    batches = 0
    loss_sum = 0
    lm.train()
    total = len(train_ids) - BATCH_SIZE + 1
    for i in range(0, total, BATCH_SIZE):
        print('batches: ' + str(batches))
        X = train_ids[i:i + BATCH_SIZE, :NGRAMS - 1]   # context: first NGRAMS-1 tokens
        Y = train_ids[i:i + BATCH_SIZE, NGRAMS - 1]    # target: the final token
        predictions = lm(X)
        loss = criterion(predictions, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        batches += 1

    ppl_train = get_ppl(train_ids)
    ppl_valid = get_ppl(valid_ids)
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()
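# Optional: the perplexity histories collected above are not used further in this
# script. A minimal plotting sketch, assuming matplotlib is installed; the figure
# and the "ppl_history.png" filename are illustrative additions, not part of the
# original pipeline.
import matplotlib.pyplot as plt

plt.plot(range(1, len(history_ppl_train) + 1), history_ppl_train, label="train ppl")
plt.plot(range(1, len(history_ppl_valid) + 1), history_ppl_valid, label="valid ppl")
plt.xlabel("epoch")
plt.ylabel("perplexity")
plt.legend()
plt.savefig("ppl_history.png")
plt.close()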
# Quick check: predict the next word after a sample sentence from the corpus.
tokenized = [token.lower() for token in word_tokenize('Gości innych nie widział oprócz spółleśników')]
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<unk>'])

lm.eval()
ids = torch.tensor(ids, dtype=torch.long, device=device)
preds = lm(ids.unsqueeze(0))
print(vocab_itos[torch.argmax(torch.softmax(preds, 1), 1).item()])

# Generate 30 tokens starting from the prompt 'Lalka', sampling uniformly from the
# top-10 candidates and skipping the special tokens (ids above 15000).
tokenized = [token.lower() for token in word_tokenize('Lalka')]
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<unk>'])

ids = torch.tensor([ids], dtype=torch.long, device=device)

candidates_number = 10
for i in range(30):
    preds = lm(ids)
    candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
    candidate = 15001
    while candidate > 15000:
        candidate = candidates[np.random.randint(candidates_number)]
    print(vocab_itos[candidate])
    ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)

print('starting outs...')

with open(os.path.join(dir_path, "dev-0", "in.tsv"), "r", encoding="UTF-8") as dev_path:
    nr_of_dev_lines = len(dev_path.readlines())

with open(os.path.join(dir_path, "test-A", "in.tsv"), "r", encoding="UTF-8") as test_a_path:
    nr_of_test_a_lines = len(test_a_path.readlines())

# One prediction line per dev-0 input line: two sampled candidates with fixed
# weights, plus the rest of the probability mass on the empty wildcard.
with open(os.path.join(dir_path, "dev-0", "out.tsv"), "w", encoding="UTF-8") as out_dev_file:
    for i in range(nr_of_dev_lines):
        preds = lm(ids)
        candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
        candidate = 15001
        while candidate > 15000:
            candidate = candidates[np.random.randint(candidates_number)]
        candidate2 = 15001
        while candidate2 > 15000 or candidate2 == candidate:
            candidate2 = candidates[np.random.randint(candidates_number)]
        print(vocab_itos[candidate])
        ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
        out_dev_file.write(vocab_itos[candidate] + ':0.3 ' + vocab_itos[candidate2] + ':0.2 :0.5' + '\n')

with open(os.path.join(dir_path, "test-A", "out.tsv"), "w", encoding="UTF-8") as out_test_file:
    for i in range(nr_of_test_a_lines):
        preds = lm(ids)
        candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
        candidate = 15001
        while candidate > 15000:
            candidate = candidates[np.random.randint(candidates_number)]
        print(vocab_itos[candidate])
        ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
        out_test_file.write(vocab_itos[candidate] + '\n')
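# Optional: a minimal checkpointing sketch so the generation and out.tsv steps could be
# rerun without retraining. torch.save/torch.load are standard PyTorch calls, but the
# "gru_lm.pt" filename and the dictionary layout are assumptions added here.
checkpoint = {
    "model_state": lm.state_dict(),
    "vocab_itos": vocab_itos,
    "ngrams": NGRAMS,
}
torch.save(checkpoint, os.path.join(dir_path, "gru_lm.pt"))

# To reload later:
#   state = torch.load(os.path.join(dir_path, "gru_lm.pt"), map_location=device)
#   lm.load_state_dict(state["model_state"])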