import numpy as np
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split the training corpus into a train/validation part (80/20).
with open("train/train.tsv", "r") as lalka_path:
    lines = lalka_path.readlines()

train, test = train_test_split(lines, test_size=0.2)

with open("train/train_train.tsv", "w") as out_train_file:
    for i in train:
        out_train_file.write(i)

with open("train/train_test.tsv", "w") as out_test_file:
    for i in test:
        out_test_file.write(i)

lalka_path_train = 'train/train_train.tsv'
lalka_path_valid = 'train/train_test.tsv'

with open(lalka_path_train) as f:
    corpora_train = f.read()
corpora_train_tokenized = list(word_tokenize(corpora_train))
corpora_train_tokenized = [token.lower() for token in corpora_train_tokenized]

# Keep the first 15 005 distinct tokens (in sorted order) and overwrite the
# last four slots with special tokens. The original special-token markers were
# stripped from the listing (angle-bracketed strings lost as markup), so the
# names below are assumptions; any four distinct markers work.
vocab_itos = sorted(set(corpora_train_tokenized))
vocab_itos = vocab_itos[:15005]
vocab_itos[15001] = "<unk>"
vocab_itos[15002] = "<pad>"
vocab_itos[15003] = "<bos>"
vocab_itos[15004] = "<eos>"

vocab_stoi = dict()
for i, token in enumerate(vocab_itos):
    vocab_stoi[token] = i

NGRAMS = 5

def get_token_id(dataset):
    # Left-pad the context and mark the start of the sequence.
    token_ids = [vocab_stoi['<pad>']] * (NGRAMS - 1) + [vocab_stoi['<bos>']]
    for token in dataset:
        try:
            token_ids.append(vocab_stoi[token])
        except KeyError:
            token_ids.append(vocab_stoi['<unk>'])
    token_ids.append(vocab_stoi['<eos>'])
    return token_ids

train_ids = get_token_id(corpora_train_tokenized)

def get_samples(dataset):
    # Sliding window of NGRAMS consecutive token ids:
    # the first NGRAMS-1 ids are the context, the last one is the target.
    samples = []
    for i in range(len(dataset) - NGRAMS):
        samples.append(dataset[i:i + NGRAMS])
    return samples

train_ids = get_samples(train_ids)
train_ids = torch.tensor(train_ids, dtype=torch.long, device=device)

with open(lalka_path_valid) as f:
    corpora_valid = f.read()
corpora_valid_tokenized = list(word_tokenize(corpora_valid))
corpora_valid_tokenized = [token.lower() for token in corpora_valid_tokenized]

valid_ids = get_token_id(corpora_valid_tokenized)
valid_ids = torch.tensor(get_samples(valid_ids), dtype=torch.long, device=device)

class GRU(torch.nn.Module):
    def __init__(self):
        super(GRU, self).__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)
        self.rec = torch.nn.GRU(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))
        # self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x):
        emb = self.emb(x)
        # emb = self.dropout(emb)
        output, h_n = self.rec(emb)
        hidden = h_n.squeeze(0)  # final hidden state of the single GRU layer
        out = self.fc1(hidden)
        # out = self.dropout(out)
        return out

lm = GRU().to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(), lr=0.0001)

BATCH_SIZE = 128
EPOCHS = 15

def get_ppl(dataset_ids):
    lm.eval()
    batches = 0
    loss_sum = 0
    with torch.no_grad():
        for i in range(0, len(dataset_ids) - BATCH_SIZE + 1, BATCH_SIZE):
            X = dataset_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
            Y = dataset_ids[i:i + BATCH_SIZE, NGRAMS - 1]
            predictions = lm(X)
            loss = criterion(predictions, Y)
            loss_sum += loss.item()
            batches += 1
    # Perplexity is the exponential of the mean cross-entropy.
    return np.exp(loss_sum / batches)

history_ppl_train = []
history_ppl_valid = []

for epoch in range(EPOCHS):
    batches = 0
    loss_sum = 0
    lm.train()
    for i in tqdm(range(0, len(train_ids) - BATCH_SIZE + 1, BATCH_SIZE)):
        X = train_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
        Y = train_ids[i:i + BATCH_SIZE, NGRAMS - 1]
        predictions = lm(X)
        loss = criterion(predictions, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        batches += 1

    ppl_train = get_ppl(train_ids)
    ppl_valid = get_ppl(valid_ids)
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()
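# The loop above collects history_ppl_train / history_ppl_valid but never
# visualizes them. A minimal sketch of such a plot, assuming matplotlib is
# available (it is an added dependency, not part of the original listing):
import matplotlib.pyplot as plt

plt.plot(range(1, len(history_ppl_train) + 1), history_ppl_train, label='train')
plt.plot(range(1, len(history_ppl_valid) + 1), history_ppl_valid, label='valid')
plt.xlabel('epoch')
plt.ylabel('perplexity')
plt.legend()
plt.show()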
# Predict the most probable next word after a sample context.
tokenized = list(word_tokenize('Gości innych nie widział oprócz spółleśników'))
tokenized = [token.lower() for token in tokenized]

ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<unk>'])

lm.eval()
ids = torch.tensor(ids, dtype=torch.long, device=device)
preds = lm(ids.unsqueeze(0))
# Notebook-style bare expression: displays the predicted next word.
vocab_itos[torch.argmax(torch.softmax(preds, 1), 1).item()]

# Free-running generation: 30 words, each sampled uniformly from the
# top-10 candidates, resampling whenever a special token (id > 15000) comes up.
tokenized = list(word_tokenize('Lalka'))
tokenized = [token.lower() for token in tokenized]

ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<unk>'])

ids = torch.tensor([ids], dtype=torch.long, device=device)

candidates_number = 10
for i in range(30):
    preds = lm(ids)
    candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
    candidate = 15001
    while candidate > 15000:
        candidate = candidates[np.random.randint(candidates_number)]
    print(vocab_itos[candidate])
    ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)

# Produce one generated word per input line for dev-0 and test-A.
with open("dev-0/in.tsv", "r") as dev_path:
    nr_of_dev_lines = len(dev_path.readlines())

with open("test-A/in.tsv", "r") as test_a_path:
    nr_of_test_a_lines = len(test_a_path.readlines())

with open("dev-0/out.tsv", "w") as out_dev_file:
    for i in range(nr_of_dev_lines):
        preds = lm(ids)
        candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
        candidate = 15001
        while candidate > 15000:
            candidate = candidates[np.random.randint(candidates_number)]
        print(vocab_itos[candidate])
        ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
        out_dev_file.write(vocab_itos[candidate] + '\n')

with open("test-A/out.tsv", "w") as out_test_file:
    for i in range(nr_of_test_a_lines):
        preds = lm(ids)
        candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
        candidate = 15001
        while candidate > 15000:
            candidate = candidates[np.random.randint(candidates_number)]
        print(vocab_itos[candidate])
        ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
        out_test_file.write(vocab_itos[candidate] + '\n')
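# The same top-k rejection-sampling step is written out three times above
# (free-running generation, dev-0, test-A). A sketch of a helper that factors
# it out; sample_next is a hypothetical name, not part of the original code.
def sample_next(ids, k=10, max_regular_id=15000):
    """Sample one next-token id uniformly from the model's top-k candidates,
    resampling whenever a special token (id > max_regular_id) is drawn."""
    preds = lm(ids)
    candidates = torch.topk(torch.softmax(preds, 1), k)[1][0].cpu().numpy()
    candidate = candidates[np.random.randint(k)]
    while candidate > max_regular_id:
        candidate = candidates[np.random.randint(k)]
    return int(candidate)

# Usage mirroring the loops above:
#     candidate = sample_next(ids, candidates_number)
#     ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)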