From 706d35e834ef2686b37a3beaf75f722e35afd69d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Helena=20Ga=C5=82=C4=85zka?=
Date: Mon, 5 Jul 2021 09:39:18 +0200
Subject: [PATCH 1/2] Zaktualizuj 'guessword.py'

---
 guessword.py | 449 ++++++++++++++++++++++++++-------------------
 1 file changed, 226 insertions(+), 223 deletions(-)

diff --git a/guessword.py b/guessword.py
index 25388f5..3336de5 100644
--- a/guessword.py
+++ b/guessword.py
@@ -1,223 +1,226 @@
-import os
-import numpy as np
-import torch
-from sklearn.model_selection import train_test_split
-import nltk
-from nltk.tokenize import word_tokenize
-nltk.download('punkt')
-dir_path = os.path.dirname(os.path.realpath(__file__))
-
-NGRAMS = 5
-BATCH_SIZE = 128
-EPOCHS = 15
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-with open(dir_path + "\\train\\train.tsv", "r", encoding="utf8") as lalka_path:
-    lines = lalka_path.readlines()
-
-train, test = train_test_split(lines, test_size = 0.2)
-
-with open(dir_path + "\\train\\train_train.tsv", "w", encoding="utf8") as out_train_file:
-    for i in train:
-        out_train_file.write(i)
-
-with open(dir_path + "\\train\\train_test.tsv", "w", encoding="utf8") as out_test_file:
-    for i in test:
-        out_test_file.write(i)
-
-lalka_path_train= dir_path + '\\train\\train_train.tsv'
-lalka_path_valid= dir_path + '\\train\\train_test.tsv'
-
-corpora_train = open(lalka_path_train, encoding="utf8").read()
-corpora_train_tokenized = list(word_tokenize(corpora_train))
-corpora_train_tokenized = [token.lower() for token in corpora_train_tokenized]
-
-vocab_itos = sorted(set(corpora_train_tokenized))
-vocab_itos = vocab_itos[:15005]
-vocab_itos[15001] = ""
-vocab_itos[15002] = ""
-vocab_itos[15003] = ""
-vocab_itos[15004] = ""
-
-vocab_stoi = dict()
-
-for i, token in enumerate(vocab_itos):
-    vocab_stoi[token] = i
-
-def get_token_id(dataset):
-    token_ids = [vocab_stoi['']] * (NGRAMS-1) + [vocab_stoi['']]
-    for token in dataset:
-        try:
-            token_ids.append(vocab_stoi[token])
-        except KeyError:
-            token_ids.append(vocab_stoi[''])
-    token_ids.append(vocab_stoi[''])
-    return token_ids
-
-def get_samples(dataset):
-    samples = []
-    for i in range(len(dataset)-NGRAMS):
-        samples.append(dataset[i:i+NGRAMS])
-    return samples
-
-train_ids = get_token_id(corpora_train_tokenized)
-
-train_ids = get_samples(train_ids)
-
-train_ids = torch.tensor(train_ids, device = device)
-
-corpora_valid = open(lalka_path_valid, encoding="utf8").read()
-
-corpora_valid_tokenized = list(word_tokenize(corpora_valid))
-corpora_valid_tokenized = [token.lower() for token in corpora_valid_tokenized]
-
-valid_ids = get_token_id(corpora_valid_tokenized)
-
-valid_ids = torch.tensor(get_samples(valid_ids), dtype = torch.long, device = device)
-
-class GRU(torch.nn.Module):
-
-    def __init__(self):
-        super(GRU, self).__init__()
-        self.emb = torch.nn.Embedding(len(vocab_itos),100)
-        self.rec = torch.nn.GRU(100, 256, 1, batch_first = True)
-        self.fc1 = torch.nn.Linear( 256 ,len(vocab_itos))
-
-    def forward(self, x):
-        emb = self.emb(x)
-        output, h_n = self.rec(emb)
-        hidden = h_n.squeeze(0)
-        out = self.fc1(hidden)
-        return out
-
-lm = GRU().to(device)
-criterion = torch.nn.CrossEntropyLoss()
-optimizer = torch.optim.Adam(lm.parameters(),lr=0.0001)
-
-def get_ppl(dataset_ids):
-    lm.eval()
-
-    batches = 0
-    loss_sum =0
-
-    for i in range(0, len(dataset_ids)-BATCH_SIZE+1, BATCH_SIZE):
-        X = dataset_ids[i:i+BATCH_SIZE,:NGRAMS-1]
-        Y = dataset_ids[i:i+BATCH_SIZE,NGRAMS-1]
-
-        predictions = lm(X)
-        loss = criterion(predictions,Y)
-        loss_sum += loss.item()
-        batches += 1
-
-    return np.exp(loss_sum / batches)
-
-history_ppl_train = []
-history_ppl_valid = []
-
-for epoch in range(EPOCHS):
-    batches = 0
-    loss_sum = 0
-    lm.train()
-    total = len(train_ids)-BATCH_SIZE+1
-    for i in range(0, total, BATCH_SIZE):
-        print('batches: ' + str(batches))
-
-        X = train_ids[i : i + BATCH_SIZE, :NGRAMS - 1]
-        Y = train_ids[i : i + BATCH_SIZE, NGRAMS - 1]
-
-        predictions = lm(X)
-        loss = criterion(predictions,Y)
-
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        loss_sum += loss.item()
-        batches += 1
-
-    ppl_train = get_ppl(train_ids)
-    ppl_valid = get_ppl(valid_ids)
-
-    history_ppl_train.append(ppl_train)
-    history_ppl_valid.append(ppl_valid)
-
-    print('epoch: ', epoch)
-    print('train ppl: ', ppl_train)
-    print('valid ppl: ', ppl_valid)
-    print()
-
-tokenized = list(word_tokenize('Gości innych nie widział oprócz spółleśników'))
-tokenized = [token.lower() for token in tokenized]
-
-ids = []
-for word in tokenized:
-    if word in vocab_stoi:
-        ids.append(vocab_stoi[word])
-    else:
-        ids.append(vocab_stoi[''])
-
-lm.eval()
-
-ids = torch.tensor(ids, dtype = torch.long, device = device)
-preds= lm(ids.unsqueeze(0))
-vocab_itos[torch.argmax(torch.softmax(preds,1),1).item()]
-
-tokenized = list(word_tokenize('Lalka'))
-tokenized = [token.lower() for token in tokenized]
-
-ids = []
-
-for word in tokenized:
-    if word in vocab_stoi:
-        ids.append(vocab_stoi[word])
-    else:
-        ids.append(vocab_stoi[''])
-
-ids = torch.tensor([ids], dtype = torch.long, device = device)
-
-candidates_number = 10
-
-for i in range(30):
-    preds= lm(ids)
-    candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
-    candidate = 15001
-
-    while candidate > 15000:
-        candidate = candidates[np.random.randint(candidates_number)]
-
-    print(vocab_itos[candidate])
-    ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
-
-print('starting outs...')
-
-with open(dir_path + "\\dev-0\\in.tsv", "r", encoding="UTF-8") as dev_path:
-    nr_of_dev_lines = len(dev_path.readlines())
-
-with open(dir_path + "\\test-A\\in.tsv", "r", encoding="UTF-8") as test_a_path:
-    nr_of_test_a_lines = len(test_a_path.readlines())
-
-with open(dir_path + "\\dev-0\\out.tsv", "w", encoding="UTF-8") as out_dev_file:
-    for i in range(nr_of_dev_lines):
-        preds= lm(ids)
-        candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
-        candidate = 15001
-        while candidate > 15000:
-            candidate = candidates[np.random.randint(candidates_number)]
-        candidate2 = 15001
-        while candidate2 > 15000 or candidate2 == candidate:
-            candidate2 = candidates[np.random.randint(candidates_number)]
-        print(vocab_itos[candidate])
-        ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
-        out_dev_file.write(vocab_itos[candidate] + ':0.3 ' + vocab_itos[candidate2] + ':0.2 :0.5' '\n')
-
-with open(dir_path + "\\test-A\\out.tsv", "w", encoding="UTF-8") as out_test_file:
-    for i in range(nr_of_dev_lines):
-        preds= lm(ids)
-        candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
-        candidate = 15001
-        while candidate > 15000:
-            candidate = candidates[np.random.randint(candidates_number)]
-        print(vocab_itos[candidate])
-        ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
-        out_test_file.write(vocab_itos[candidate] + '\n')
\ No newline at end of file
+import os
+import numpy as np
+import torch
+from sklearn.model_selection import train_test_split
+import nltk
+from nltk.tokenize import word_tokenize
+nltk.download('punkt')
+dir_path = os.path.dirname(os.path.realpath(__file__))
+
+NGRAMS = 5
+BATCH_SIZE = 128
+EPOCHS = 15
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+with open(dir_path + "\\train\\train.tsv", "r", encoding="utf8") as lalka_path:
+    lines = lalka_path.readlines()
+
+train, test = train_test_split(lines, test_size = 0.2)
+
+with open(dir_path + "\\train\\train_train.tsv", "w", encoding="utf8") as out_train_file:
+    for i in train:
+        out_train_file.write(i)
+
+with open(dir_path + "\\train\\train_test.tsv", "w", encoding="utf8") as out_test_file:
+    for i in test:
+        out_test_file.write(i)
+
+lalka_path_train= dir_path + '\\train\\train_train.tsv'
+lalka_path_valid= dir_path + '\\train\\train_test.tsv'
+
+corpora_train = open(lalka_path_train, encoding="utf8").read()
+corpora_train_tokenized = list(word_tokenize(corpora_train))
+corpora_train_tokenized = [token.lower() for token in corpora_train_tokenized]
+
+vocab_itos = sorted(set(corpora_train_tokenized))
+vocab_itos = vocab_itos[:15005]
+vocab_itos[15001] = ""
+vocab_itos[15002] = ""
+vocab_itos[15003] = ""
+vocab_itos[15004] = ""
+
+vocab_stoi = dict()
+
+for i, token in enumerate(vocab_itos):
+    vocab_stoi[token] = i
+
+def get_token_id(dataset):
+    token_ids = [vocab_stoi['']] * (NGRAMS-1) + [vocab_stoi['']]
+    for token in dataset:
+        try:
+            token_ids.append(vocab_stoi[token])
+        except KeyError:
+            token_ids.append(vocab_stoi[''])
+    token_ids.append(vocab_stoi[''])
+    return token_ids
+
+def get_samples(dataset):
+    samples = []
+    for i in range(len(dataset)-NGRAMS):
+        samples.append(dataset[i:i+NGRAMS])
+    return samples
+
+train_ids = get_token_id(corpora_train_tokenized)
+
+train_ids = get_samples(train_ids)
+
+train_ids = torch.tensor(train_ids, device = device)
+
+corpora_valid = open(lalka_path_valid, encoding="utf8").read()
+
+corpora_valid_tokenized = list(word_tokenize(corpora_valid))
+corpora_valid_tokenized = [token.lower() for token in corpora_valid_tokenized]
+
+valid_ids = get_token_id(corpora_valid_tokenized)
+
+valid_ids = torch.tensor(get_samples(valid_ids), dtype = torch.long, device = device)
+
+class GRU(torch.nn.Module):
+
+    def __init__(self):
+        super(GRU, self).__init__()
+        self.emb = torch.nn.Embedding(len(vocab_itos),100)
+        self.rec = torch.nn.GRU(100, 256, 1, batch_first = True)
+        self.fc1 = torch.nn.Linear( 256 ,len(vocab_itos))
+
+    def forward(self, x):
+        emb = self.emb(x)
+        output, h_n = self.rec(emb)
+        hidden = h_n.squeeze(0)
+        out = self.fc1(hidden)
+        return out
+
+lm = GRU().to(device)
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(lm.parameters(),lr=0.0001)
+
+def get_ppl(dataset_ids):
+    lm.eval()
+
+    batches = 0
+    loss_sum =0
+
+    for i in range(0, len(dataset_ids)-BATCH_SIZE+1, BATCH_SIZE):
+        X = dataset_ids[i:i+BATCH_SIZE,:NGRAMS-1]
+        Y = dataset_ids[i:i+BATCH_SIZE,NGRAMS-1]
+
+        predictions = lm(X)
+        loss = criterion(predictions,Y)
+        loss_sum += loss.item()
+        batches += 1
+
+    return np.exp(loss_sum / batches)
+
+history_ppl_train = []
+history_ppl_valid = []
+
+for epoch in range(EPOCHS):
+    batches = 0
+    loss_sum = 0
+    lm.train()
+    total = len(train_ids)-BATCH_SIZE+1
+    for i in range(0, total, BATCH_SIZE):
+        print('batches: ' + str(batches))
+
+        X = train_ids[i : i + BATCH_SIZE, :NGRAMS - 1]
+        Y = train_ids[i : i + BATCH_SIZE, NGRAMS - 1]
+
+        predictions = lm(X)
+        loss = criterion(predictions,Y)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        loss_sum += loss.item()
+        batches += 1
+
+    ppl_train = get_ppl(train_ids)
+    ppl_valid = get_ppl(valid_ids)
+
+    history_ppl_train.append(ppl_train)
+    history_ppl_valid.append(ppl_valid)
+
+    print('epoch: ', epoch)
+    print('train ppl: ', ppl_train)
+    print('valid ppl: ', ppl_valid)
+    print()
+
+tokenized = list(word_tokenize('Gości innych nie widział oprócz spółleśników'))
+tokenized = [token.lower() for token in tokenized]
+
+ids = []
+for word in tokenized:
+    if word in vocab_stoi:
+        ids.append(vocab_stoi[word])
+    else:
+        ids.append(vocab_stoi[''])
+
+lm.eval()
+
+ids = torch.tensor(ids, dtype = torch.long, device = device)
+preds= lm(ids.unsqueeze(0))
+vocab_itos[torch.argmax(torch.softmax(preds,1),1).item()]
+
+tokenized = list(word_tokenize('Lalka'))
+tokenized = [token.lower() for token in tokenized]
+
+ids = []
+
+for word in tokenized:
+    if word in vocab_stoi:
+        ids.append(vocab_stoi[word])
+    else:
+        ids.append(vocab_stoi[''])
+
+ids = torch.tensor([ids], dtype = torch.long, device = device)
+
+candidates_number = 10
+
+for i in range(30):
+    preds= lm(ids)
+    candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
+    candidate = 15001
+
+    while candidate > 15000:
+        candidate = candidates[np.random.randint(candidates_number)]
+
+    print(vocab_itos[candidate])
+    ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
+
+print('starting outs...')
+
+with open(dir_path + "\\dev-0\\in.tsv", "r", encoding="UTF-8") as dev_path:
+    nr_of_dev_lines = len(dev_path.readlines())
+
+with open(dir_path + "\\test-A\\in.tsv", "r", encoding="UTF-8") as test_a_path:
+    nr_of_test_a_lines = len(test_a_path.readlines())
+
+with open(dir_path + "\\dev-0\\out.tsv", "w", encoding="UTF-8") as out_dev_file:
+    for i in range(nr_of_dev_lines):
+        preds= lm(ids)
+        candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
+        candidate = 15001
+        while candidate > 15000:
+            candidate = candidates[np.random.randint(candidates_number)]
+        candidate2 = 15001
+        while candidate2 > 15000 or candidate2 == candidate:
+            candidate2 = candidates[np.random.randint(candidates_number)]
+        print(vocab_itos[candidate])
+        ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
+        out_dev_file.write(vocab_itos[candidate] + ':0.3 ' + vocab_itos[candidate2] + ':0.2 :0.5' '\n')
+
+with open(dir_path + "\\test-A\\out.tsv", "w", encoding="UTF-8") as out_test_file:
+    for i in range(nr_of_dev_lines):
+        preds= lm(ids)
+        candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
+        candidate = 15001
+        while candidate > 15000:
+            candidate = candidates[np.random.randint(candidates_number)]
+        candidate2 = 15001
+        while candidate2 > 15000 or candidate2 == candidate:
+            candidate2 = candidates[np.random.randint(candidates_number)]
+        print(vocab_itos[candidate])
+        ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
+        out_test_file.write(vocab_itos[candidate] + ':0.3 ' + vocab_itos[candidate2] + ':0.2 :0.5' '\n')
\ No newline at end of file

From c0285ec280a3d6a51a4bd9b926143398b1bec7fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Helena=20Ga=C5=82=C4=85zka?=
Date: Mon, 5 Jul 2021 09:46:22 +0200
Subject: [PATCH 2/2] Zaktualizuj 'guessword.py'

---
 guessword.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guessword.py b/guessword.py
index 3336de5..95e577b 100644
--- a/guessword.py
+++ b/guessword.py
@@ -218,7 +218,7 @@ with open(dir_path + "\\test-A\\out.tsv", "w", encoding="UTF-8") as out_test_fil
         candidate = 15001
         while candidate > 15000:
             candidate = candidates[np.random.randint(candidates_number)]
-        candidate2 = 15001
+        candidate2 = 15001
         while candidate2 > 15000 or candidate2 == candidate:
             candidate2 = candidates[np.random.randint(candidates_number)]
         print(vocab_itos[candidate])