import numpy as np
import torch
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' models: nltk.download('punkt')

# Use the GPU when one is available (the original relied on a `device` defined elsewhere).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Split the corpus into a training and a validation part before anything reads the
# split files. NOTE: the split parameters were not given in the original; a 90/10
# split without shuffling is an assumption.
with open("train/train.tsv", "r") as lalka:
    lines = lalka.readlines()
train, test = train_test_split(lines, test_size=0.1, shuffle=False)
with open("train/train_train.tsv", "w") as out_train:
    for line in train:
        out_train.write(line)
with open("train/train_test.tsv", "w") as out_test:
    for line in test:
        out_test.write(line)

lalka_path_train = 'train/train_train.tsv'
lalka_path_valid = 'train/train_test.tsv'

with open(lalka_path_train) as f:
    corpora_train = f.read()
corpora_train_tokenized = [token.lower() for token in word_tokenize(corpora_train)]

# Vocabulary: tokens sorted alphabetically, truncated to 15005 entries; the last four
# slots are overwritten with special tokens. Their strings were lost in the original
# dump, so <bos>/<eos>/<unk>/<pad> are assumed here. Ids above 15000 are treated as
# special during generation below.
vocab_itos = sorted(set(corpora_train_tokenized))
vocab_itos = vocab_itos[:15005]
vocab_itos[15001] = "<bos>"
vocab_itos[15002] = "<eos>"
vocab_itos[15003] = "<unk>"
vocab_itos[15004] = "<pad>"

vocab_stoi = {token: i for i, token in enumerate(vocab_itos)}

NGRAMS = 5

def get_token_id(dataset):
    # Left-pad so the first real token already has a full (NGRAMS-1)-token context,
    # map out-of-vocabulary tokens to <unk>, and close the sequence with <eos>.
    token_ids = [vocab_stoi['<pad>']] * (NGRAMS - 1) + [vocab_stoi['<bos>']]
    for token in dataset:
        try:
            token_ids.append(vocab_stoi[token])
        except KeyError:
            token_ids.append(vocab_stoi['<unk>'])
    token_ids.append(vocab_stoi['<eos>'])
    return token_ids

def get_samples(dataset):
    # Sliding window of NGRAMS consecutive ids: the first NGRAMS-1 are the context,
    # the last one is the prediction target.
    samples = []
    for i in range(len(dataset) - NGRAMS):
        samples.append(dataset[i:i + NGRAMS])
    return samples

train_ids = get_token_id(corpora_train_tokenized)
train_ids = torch.tensor(get_samples(train_ids), dtype=torch.long, device=device)

with open(lalka_path_valid) as f:
    corpora_valid = f.read()
# Lowercasing was missing here in the original, sending every capitalized token to <unk>.
corpora_valid_tokenized = [token.lower() for token in word_tokenize(corpora_valid)]
valid_ids = get_token_id(corpora_valid_tokenized)
valid_ids = torch.tensor(get_samples(valid_ids), dtype=torch.long, device=device)

class GRU(torch.nn.Module):
    def __init__(self):
        super(GRU, self).__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)
        self.rec = torch.nn.GRU(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))

    def forward(self, x):
        emb = self.emb(x)
        output, h_n = self.rec(emb)
        hidden = h_n.squeeze(0)  # final hidden state of the single GRU layer
        return self.fc1(hidden)

lm = GRU().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(), lr=0.0001)

BATCH_SIZE = 128
EPOCHS = 15

def get_ppl(dataset_ids):
    # Perplexity is exp of the mean cross-entropy over full batches.
    lm.eval()
    batches = 0
    loss_sum = 0
    with torch.no_grad():
        for i in range(0, len(dataset_ids) - BATCH_SIZE + 1, BATCH_SIZE):
            X = dataset_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
            Y = dataset_ids[i:i + BATCH_SIZE, NGRAMS - 1]
            loss_sum += criterion(lm(X), Y).item()
            batches += 1
    return np.exp(loss_sum / batches)

history_ppl_train = []
history_ppl_valid = []
for epoch in range(EPOCHS):
    lm.train()
    for i in range(0, len(train_ids) - BATCH_SIZE + 1, BATCH_SIZE):
        X = train_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
        Y = train_ids[i:i + BATCH_SIZE, NGRAMS - 1]
        loss = criterion(lm(X), Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    history_ppl_train.append(get_ppl(train_ids))
    history_ppl_valid.append(get_ppl(valid_ids))
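# Not part of the original script: a minimal sketch for inspecting the perplexity
# curves collected above, assuming matplotlib is installed.
import matplotlib.pyplot as plt

plt.plot(range(1, EPOCHS + 1), history_ppl_train, label='train')
plt.plot(range(1, EPOCHS + 1), history_ppl_valid, label='valid')
plt.xlabel('epoch')
plt.ylabel('perplexity')
plt.legend()
plt.show()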
# Predict a single next word: use the NGRAMS-1 tokens preceding the final word of
# the sentence as context. (Lowercasing is added here; the vocabulary is lowercased.)
tokenized = [token.lower() for token in word_tokenize('Gości innych nie widział oprócz spółleśników')]
tokenized = tokenized[-NGRAMS:-1]
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<unk>'])

lm.eval()
ids = torch.tensor(ids, dtype=torch.long, device=device)
preds = lm(ids.unsqueeze(0))
print(vocab_itos[torch.argmax(torch.softmax(preds, 1), 1).item()])

# Free generation: starting from the seed word, sample 30 tokens, each drawn
# uniformly from the model's top-10 candidates; ids above 15000 are the special
# tokens and get resampled.
tokenized = [token.lower() for token in word_tokenize('Lalka')]
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<unk>'])
ids = torch.tensor([ids], dtype=torch.long, device=device)

candidates_number = 10
for i in range(30):
    preds = lm(ids)
    candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
    candidate = 15001
    while candidate > 15000:
        candidate = candidates[np.random.randint(candidates_number)]
    print(vocab_itos[candidate])
    ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)

# Write one sampled word per input line of dev-0 and test-A. The original used an
# undefined `id_list`, mismatched file handles, and the dev line count in the test
# loop; seeding the context from the sequence generated above is an assumption.
with open("dev-0/in.tsv", "r") as dev_in:
    dev_len = len(dev_in.readlines())
with open("test-A/in.tsv", "r") as test_in:
    test_len = len(test_in.readlines())

id_list = ids  # assumed seed: continue from the generation above
with open("dev-0/out.tsv", "w") as out_dev:
    for i in range(dev_len):
        preds = lm(id_list)
        candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
        candidate = 15001
        while candidate > 15000:
            candidate = candidates[np.random.randint(candidates_number)]
        id_list = torch.cat((id_list, torch.tensor([[candidate]], device=device)), 1)
        out_dev.write(vocab_itos[candidate] + '\n')

with open("test-A/out.tsv", "w") as out_test:
    for i in range(test_len):
        preds = lm(id_list)
        candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
        candidate = 15001
        while candidate > 15000:
            candidate = candidates[np.random.randint(candidates_number)]
        id_list = torch.cat((id_list, torch.tensor([[candidate]], device=device)), 1)
        out_test.write(vocab_itos[candidate] + '\n')
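# Not in the original: the three sampling loops above repeat the same top-k logic,
# so a small helper is sketched here for reference (the name and signature are
# hypothetical).
def sample_top_k(model, context_ids, k=10):
    """Sample one token id uniformly from the model's top-k candidates,
    resampling whenever a special-token id (> 15000) comes up."""
    preds = model(context_ids)
    candidates = torch.topk(torch.softmax(preds, 1), k)[1][0].cpu().numpy()
    candidate = 15001
    while candidate > 15000:
        candidate = candidates[np.random.randint(k)]
    return int(candidate)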