Zaktualizuj 'guessword.py'

This commit is contained in:
Helena Gałązka 2021-07-05 09:39:18 +02:00
parent 5073323495
commit 706d35e834

View File

@@ -1,223 +1,226 @@
import os
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
# Fetch the tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt')

# Directory containing this script; every data path is resolved relative to it.
dir_path = os.path.dirname(os.path.realpath(__file__))
NGRAMS = 5  # sample width: NGRAMS - 1 context tokens followed by 1 target token
BATCH_SIZE = 128
EPOCHS = 15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths are built with os.path.join so the script is not tied to Windows
# path separators.
train_dir = os.path.join(dir_path, "train")
lalka_path_train = os.path.join(train_dir, "train_train.tsv")
lalka_path_valid = os.path.join(train_dir, "train_test.tsv")

# Split the raw corpus lines 80/20 and persist both parts to disk.
with open(os.path.join(train_dir, "train.tsv"), "r", encoding="utf8") as lalka_path:
    lines = lalka_path.readlines()
train, test = train_test_split(lines, test_size=0.2)
with open(lalka_path_train, "w", encoding="utf8") as out_train_file:
    out_train_file.writelines(train)
with open(lalka_path_valid, "w", encoding="utf8") as out_test_file:
    out_test_file.writelines(test)

# Read the training part back (closing the handle) and tokenize it in lowercase.
with open(lalka_path_train, encoding="utf8") as train_file:
    corpora_train = train_file.read()
corpora_train_tokenized = [token.lower() for token in word_tokenize(corpora_train)]

# Vocabulary: the alphabetically first 15001 distinct tokens (ids 0..15000)
# followed by the four special tokens.  For corpora with at least 15005
# distinct tokens this yields exactly the same id assignment as overwriting
# slots 15001..15004, but it no longer raises IndexError on smaller corpora.
# NOTE(review): on such smaller corpora the special tokens get ids <= 15000,
# which the hard-coded "candidate > 15000" filter in the generation code does
# not exclude.
vocab_itos = sorted(set(corpora_train_tokenized))[:15001]
vocab_itos += ["<UNK>", "<BOS>", "<EOS>", "<PAD>"]
vocab_stoi = {token: index for index, token in enumerate(vocab_itos)}
def get_token_id(dataset, vocab=None, ngrams=None):
    """Convert a token sequence into a padded list of vocabulary ids.

    The result is ``ngrams - 1`` ``<PAD>`` ids, one ``<BOS>`` id, the id of
    every token in *dataset* (``<UNK>`` for tokens missing from the
    vocabulary) and a final ``<EOS>`` id.

    Args:
        dataset: iterable of token strings.
        vocab: token -> id mapping containing the four special tokens;
            defaults to the module-level ``vocab_stoi``.
        ngrams: n-gram width that determines the amount of left padding;
            defaults to the module-level ``NGRAMS``.

    Returns:
        list of integer token ids.
    """
    if vocab is None:
        vocab = vocab_stoi
    if ngrams is None:
        ngrams = NGRAMS
    unk_id = vocab['<UNK>']
    token_ids = [vocab['<PAD>']] * (ngrams - 1) + [vocab['<BOS>']]
    # dict.get avoids raising and catching a KeyError for every unknown token.
    token_ids.extend(vocab.get(token, unk_id) for token in dataset)
    token_ids.append(vocab['<EOS>'])
    return token_ids
def get_samples(dataset, ngrams=None):
    """Return every contiguous window of ``ngrams`` items from *dataset*.

    Args:
        dataset: sequence supporting ``len`` and slicing (e.g. a list of ids).
        ngrams: window width; defaults to the module-level ``NGRAMS``.

    Returns:
        list of slices, in order of their start position.  The list is empty
        when *dataset* is shorter than one window.
    """
    if ngrams is None:
        ngrams = NGRAMS
    # A sequence of length L holds L - ngrams + 1 windows; the "+ 1" keeps the
    # final window, which would otherwise be silently dropped.
    return [dataset[start:start + ngrams]
            for start in range(len(dataset) - ngrams + 1)]
# Training samples: one row per n-gram.  dtype is given explicitly so the
# tensor is integer-typed even if no sample was produced (matches valid_ids).
train_ids = get_samples(get_token_id(corpora_train_tokenized))
train_ids = torch.tensor(train_ids, dtype=torch.long, device=device)

# Validation samples are built the same way from the held-out split; the file
# is read inside a context manager so the handle is closed.
with open(lalka_path_valid, encoding="utf8") as valid_file:
    corpora_valid = valid_file.read()
corpora_valid_tokenized = [token.lower() for token in word_tokenize(corpora_valid)]
valid_ids = get_samples(get_token_id(corpora_valid_tokenized))
valid_ids = torch.tensor(valid_ids, dtype=torch.long, device=device)
class GRU(torch.nn.Module):
    """Next-token language model: embedding -> single-layer GRU -> linear.

    Args:
        vocab_size: number of embedding rows and of output logits; defaults
            to ``len(vocab_itos)`` (module-level vocabulary).
        emb_dim: embedding width (default 100).
        hidden_dim: GRU hidden-state width (default 256).
    """

    def __init__(self, vocab_size=None, emb_dim=100, hidden_dim=256):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab_itos)
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        self.rec = torch.nn.GRU(emb_dim, hidden_dim, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for a batch of token-id sequences.

        Args:
            x: integer tensor indexed as (batch, sequence), since the GRU is
                created with ``batch_first=True``.

        Returns:
            Tensor with one row of vocabulary logits per batch element.
        """
        emb = self.emb(x)
        _, h_n = self.rec(emb)
        # h_n has a leading num_layers axis (size 1 here); drop it.
        hidden = h_n.squeeze(0)
        return self.fc1(hidden)
# Build the language model on the selected device, together with the
# cross-entropy objective and the Adam optimizer used to train it.
lm = GRU()
lm = lm.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(), lr=1e-4)
def get_ppl(dataset_ids, model=None, loss_fn=None, batch_size=None):
    """Compute the perplexity of a model over a tensor of n-gram samples.

    Every row of *dataset_ids* is one sample: all columns but the last are
    the context and the last column is the target token (rows produced by
    ``get_samples`` are ``NGRAMS`` wide, so this is the ``NGRAMS - 1`` split).

    Args:
        dataset_ids: 2-D integer tensor of samples.
        model: module mapping a context batch to logits; defaults to the
            module-level ``lm``.
        loss_fn: loss returning the mean loss of a batch; defaults to the
            module-level ``criterion``.
        batch_size: rows evaluated per forward pass; defaults to
            ``BATCH_SIZE``.

    Returns:
        ``exp`` of the mean per-sample loss over the whole dataset.

    Raises:
        ValueError: if *dataset_ids* contains no samples.

    Note:
        The model is left in evaluation mode.
    """
    model = lm if model is None else model
    loss_fn = criterion if loss_fn is None else loss_fn
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    total_rows = len(dataset_ids)
    if total_rows == 0:
        raise ValueError("cannot compute perplexity of an empty dataset")

    model.eval()
    loss_sum = 0.0
    # Evaluation needs no gradients; skipping the autograd graph saves memory.
    with torch.no_grad():
        # Step over every row, including a final batch smaller than batch_size.
        for start in range(0, total_rows, batch_size):
            batch = dataset_ids[start:start + batch_size]
            X = batch[:, :-1]
            Y = batch[:, -1]
            # Weight each batch mean by its size so the tail batch counts fairly.
            loss_sum += loss_fn(model(X), Y).item() * len(Y)
    return np.exp(loss_sum / total_rows)
# Train for EPOCHS passes over the training samples and record the train and
# validation perplexity measured after every pass.
history_ppl_train, history_ppl_valid = [], []
for epoch in range(EPOCHS):
    lm.train()
    batches, loss_sum = 0, 0
    # Only full batches are used: the last start index leaves room for
    # BATCH_SIZE rows.
    total = len(train_ids) - BATCH_SIZE + 1
    for start in range(0, total, BATCH_SIZE):
        print('batches: ' + str(batches))
        batch = train_ids[start:start + BATCH_SIZE]
        # The first NGRAMS - 1 columns are the context, the last is the target.
        X, Y = batch[:, :NGRAMS - 1], batch[:, NGRAMS - 1]
        optimizer.zero_grad()
        loss = criterion(lm(X), Y)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        batches += 1

    ppl_train, ppl_valid = get_ppl(train_ids), get_ppl(valid_ids)
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()
# Number of most probable next tokens the generator samples from.
candidates_number = 10


def _encode(text):
    """Tokenize *text*, lowercase it and map each token to its id (<UNK> if unseen)."""
    unk_id = vocab_stoi['<UNK>']
    return [vocab_stoi.get(token.lower(), unk_id) for token in word_tokenize(text)]


def _top_candidates(ids):
    """Return the ids of the candidates_number most probable next tokens after *ids*."""
    preds = lm(ids)
    return torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()


def _pick_candidate(candidates, exclude=None):
    """Uniformly draw a regular-word id from *candidates*, never returning *exclude*."""
    # Ids above 15000 are the special tokens (<UNK>, <BOS>, <EOS>, <PAD>).
    # Filtering first gives the same uniform choice as retrying until a valid
    # id comes up, but cannot loop forever when no valid id exists.
    allowed = [int(c) for c in candidates
               if c <= 15000 and (exclude is None or c != exclude)]
    if not allowed:
        raise ValueError("no regular-word candidate among the top predictions")
    return allowed[np.random.randint(len(allowed))]


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    with open(path, "r", encoding="UTF-8") as in_file:
        return sum(1 for _ in in_file)


def _write_predictions(out_path, line_count, ids):
    """Write *line_count* prediction lines to *out_path*; return the extended context."""
    with open(out_path, "w", encoding="UTF-8") as out_file:
        for _ in range(line_count):
            candidates = _top_candidates(ids)
            candidate = _pick_candidate(candidates)
            candidate2 = _pick_candidate(candidates, exclude=candidate)
            print(vocab_itos[candidate])
            ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
            out_file.write(vocab_itos[candidate] + ':0.3 ' + vocab_itos[candidate2] + ':0.2 :0.5' '\n')
    return ids


lm.eval()
# Pure inference from here on: no gradients are needed.
with torch.no_grad():
    # Smoke test: most probable continuation of a fixed sentence.  The value
    # used to be computed and discarded; it is printed so the check is visible.
    ids = torch.tensor(_encode('Gości innych nie widział oprócz spółleśników'),
                       dtype=torch.long, device=device)
    preds = lm(ids.unsqueeze(0))
    print(vocab_itos[torch.argmax(torch.softmax(preds, 1), 1).item()])

    # Generate 30 words starting from a one-word prompt.
    # NOTE(review): the context grows by one token per step and is re-read in
    # full on every prediction, while training contexts are NGRAMS - 1 tokens
    # long; consider truncating the context.
    ids = torch.tensor([_encode('Lalka')], dtype=torch.long, device=device)
    for _ in range(30):
        candidate = _pick_candidate(_top_candidates(ids))
        print(vocab_itos[candidate])
        ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)

    print('starting outs...')
    nr_of_dev_lines = _count_lines(os.path.join(dir_path, "dev-0", "in.tsv"))
    nr_of_test_a_lines = _count_lines(os.path.join(dir_path, "test-A", "in.tsv"))

    # One output line per input line of the matching split.  test-A uses its
    # own line count (not the dev-0 count) and the same "word:prob" line
    # format as dev-0.
    ids = _write_predictions(os.path.join(dir_path, "dev-0", "out.tsv"),
                             nr_of_dev_lines, ids)
    ids = _write_predictions(os.path.join(dir_path, "test-A", "out.tsv"),
                             nr_of_test_a_lines, ids)
import os
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
# Fetch the tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt')

# Directory containing this script; every data path is resolved relative to it.
dir_path = os.path.dirname(os.path.realpath(__file__))
NGRAMS = 5  # sample width: NGRAMS - 1 context tokens followed by 1 target token
BATCH_SIZE = 128
EPOCHS = 15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths are built with os.path.join so the script is not tied to Windows
# path separators.
train_dir = os.path.join(dir_path, "train")
lalka_path_train = os.path.join(train_dir, "train_train.tsv")
lalka_path_valid = os.path.join(train_dir, "train_test.tsv")

# Split the raw corpus lines 80/20 and persist both parts to disk.
with open(os.path.join(train_dir, "train.tsv"), "r", encoding="utf8") as lalka_path:
    lines = lalka_path.readlines()
train, test = train_test_split(lines, test_size=0.2)
with open(lalka_path_train, "w", encoding="utf8") as out_train_file:
    out_train_file.writelines(train)
with open(lalka_path_valid, "w", encoding="utf8") as out_test_file:
    out_test_file.writelines(test)

# Read the training part back (closing the handle) and tokenize it in lowercase.
with open(lalka_path_train, encoding="utf8") as train_file:
    corpora_train = train_file.read()
corpora_train_tokenized = [token.lower() for token in word_tokenize(corpora_train)]

# Vocabulary: the alphabetically first 15001 distinct tokens (ids 0..15000)
# followed by the four special tokens.  For corpora with at least 15005
# distinct tokens this yields exactly the same id assignment as overwriting
# slots 15001..15004, but it no longer raises IndexError on smaller corpora.
# NOTE(review): on such smaller corpora the special tokens get ids <= 15000,
# which the hard-coded "candidate > 15000" filter in the generation code does
# not exclude.
vocab_itos = sorted(set(corpora_train_tokenized))[:15001]
vocab_itos += ["<UNK>", "<BOS>", "<EOS>", "<PAD>"]
vocab_stoi = {token: index for index, token in enumerate(vocab_itos)}
def get_token_id(dataset, vocab=None, ngrams=None):
    """Convert a token sequence into a padded list of vocabulary ids.

    The result is ``ngrams - 1`` ``<PAD>`` ids, one ``<BOS>`` id, the id of
    every token in *dataset* (``<UNK>`` for tokens missing from the
    vocabulary) and a final ``<EOS>`` id.

    Args:
        dataset: iterable of token strings.
        vocab: token -> id mapping containing the four special tokens;
            defaults to the module-level ``vocab_stoi``.
        ngrams: n-gram width that determines the amount of left padding;
            defaults to the module-level ``NGRAMS``.

    Returns:
        list of integer token ids.
    """
    if vocab is None:
        vocab = vocab_stoi
    if ngrams is None:
        ngrams = NGRAMS
    unk_id = vocab['<UNK>']
    token_ids = [vocab['<PAD>']] * (ngrams - 1) + [vocab['<BOS>']]
    # dict.get avoids raising and catching a KeyError for every unknown token.
    token_ids.extend(vocab.get(token, unk_id) for token in dataset)
    token_ids.append(vocab['<EOS>'])
    return token_ids
def get_samples(dataset, ngrams=None):
    """Return every contiguous window of ``ngrams`` items from *dataset*.

    Args:
        dataset: sequence supporting ``len`` and slicing (e.g. a list of ids).
        ngrams: window width; defaults to the module-level ``NGRAMS``.

    Returns:
        list of slices, in order of their start position.  The list is empty
        when *dataset* is shorter than one window.
    """
    if ngrams is None:
        ngrams = NGRAMS
    # A sequence of length L holds L - ngrams + 1 windows; the "+ 1" keeps the
    # final window, which would otherwise be silently dropped.
    return [dataset[start:start + ngrams]
            for start in range(len(dataset) - ngrams + 1)]
# Training samples: one row per n-gram.  dtype is given explicitly so the
# tensor is integer-typed even if no sample was produced (matches valid_ids).
train_ids = get_samples(get_token_id(corpora_train_tokenized))
train_ids = torch.tensor(train_ids, dtype=torch.long, device=device)

# Validation samples are built the same way from the held-out split; the file
# is read inside a context manager so the handle is closed.
with open(lalka_path_valid, encoding="utf8") as valid_file:
    corpora_valid = valid_file.read()
corpora_valid_tokenized = [token.lower() for token in word_tokenize(corpora_valid)]
valid_ids = get_samples(get_token_id(corpora_valid_tokenized))
valid_ids = torch.tensor(valid_ids, dtype=torch.long, device=device)
class GRU(torch.nn.Module):
    """Next-token language model: embedding -> single-layer GRU -> linear.

    Args:
        vocab_size: number of embedding rows and of output logits; defaults
            to ``len(vocab_itos)`` (module-level vocabulary).
        emb_dim: embedding width (default 100).
        hidden_dim: GRU hidden-state width (default 256).
    """

    def __init__(self, vocab_size=None, emb_dim=100, hidden_dim=256):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab_itos)
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        self.rec = torch.nn.GRU(emb_dim, hidden_dim, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for a batch of token-id sequences.

        Args:
            x: integer tensor indexed as (batch, sequence), since the GRU is
                created with ``batch_first=True``.

        Returns:
            Tensor with one row of vocabulary logits per batch element.
        """
        emb = self.emb(x)
        _, h_n = self.rec(emb)
        # h_n has a leading num_layers axis (size 1 here); drop it.
        hidden = h_n.squeeze(0)
        return self.fc1(hidden)
# Build the language model on the selected device, together with the
# cross-entropy objective and the Adam optimizer used to train it.
lm = GRU()
lm = lm.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(), lr=1e-4)
def get_ppl(dataset_ids, model=None, loss_fn=None, batch_size=None):
    """Compute the perplexity of a model over a tensor of n-gram samples.

    Every row of *dataset_ids* is one sample: all columns but the last are
    the context and the last column is the target token (rows produced by
    ``get_samples`` are ``NGRAMS`` wide, so this is the ``NGRAMS - 1`` split).

    Args:
        dataset_ids: 2-D integer tensor of samples.
        model: module mapping a context batch to logits; defaults to the
            module-level ``lm``.
        loss_fn: loss returning the mean loss of a batch; defaults to the
            module-level ``criterion``.
        batch_size: rows evaluated per forward pass; defaults to
            ``BATCH_SIZE``.

    Returns:
        ``exp`` of the mean per-sample loss over the whole dataset.

    Raises:
        ValueError: if *dataset_ids* contains no samples.

    Note:
        The model is left in evaluation mode.
    """
    model = lm if model is None else model
    loss_fn = criterion if loss_fn is None else loss_fn
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    total_rows = len(dataset_ids)
    if total_rows == 0:
        raise ValueError("cannot compute perplexity of an empty dataset")

    model.eval()
    loss_sum = 0.0
    # Evaluation needs no gradients; skipping the autograd graph saves memory.
    with torch.no_grad():
        # Step over every row, including a final batch smaller than batch_size.
        for start in range(0, total_rows, batch_size):
            batch = dataset_ids[start:start + batch_size]
            X = batch[:, :-1]
            Y = batch[:, -1]
            # Weight each batch mean by its size so the tail batch counts fairly.
            loss_sum += loss_fn(model(X), Y).item() * len(Y)
    return np.exp(loss_sum / total_rows)
# Train for EPOCHS passes over the training samples and record the train and
# validation perplexity measured after every pass.
history_ppl_train, history_ppl_valid = [], []
for epoch in range(EPOCHS):
    lm.train()
    batches, loss_sum = 0, 0
    # Only full batches are used: the last start index leaves room for
    # BATCH_SIZE rows.
    total = len(train_ids) - BATCH_SIZE + 1
    for start in range(0, total, BATCH_SIZE):
        print('batches: ' + str(batches))
        batch = train_ids[start:start + BATCH_SIZE]
        # The first NGRAMS - 1 columns are the context, the last is the target.
        X, Y = batch[:, :NGRAMS - 1], batch[:, NGRAMS - 1]
        optimizer.zero_grad()
        loss = criterion(lm(X), Y)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        batches += 1

    ppl_train, ppl_valid = get_ppl(train_ids), get_ppl(valid_ids)
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()
# Number of most probable next tokens the generator samples from.
candidates_number = 10


def _encode(text):
    """Tokenize *text*, lowercase it and map each token to its id (<UNK> if unseen)."""
    unk_id = vocab_stoi['<UNK>']
    return [vocab_stoi.get(token.lower(), unk_id) for token in word_tokenize(text)]


def _top_candidates(ids):
    """Return the ids of the candidates_number most probable next tokens after *ids*."""
    preds = lm(ids)
    return torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()


def _pick_candidate(candidates, exclude=None):
    """Uniformly draw a regular-word id from *candidates*, never returning *exclude*."""
    # Ids above 15000 are the special tokens (<UNK>, <BOS>, <EOS>, <PAD>).
    # Filtering first gives the same uniform choice as retrying until a valid
    # id comes up, but cannot loop forever when no valid id exists.
    allowed = [int(c) for c in candidates
               if c <= 15000 and (exclude is None or c != exclude)]
    if not allowed:
        raise ValueError("no regular-word candidate among the top predictions")
    return allowed[np.random.randint(len(allowed))]


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    with open(path, "r", encoding="UTF-8") as in_file:
        return sum(1 for _ in in_file)


def _write_predictions(out_path, line_count, ids):
    """Write *line_count* prediction lines to *out_path*; return the extended context."""
    with open(out_path, "w", encoding="UTF-8") as out_file:
        for _ in range(line_count):
            candidates = _top_candidates(ids)
            candidate = _pick_candidate(candidates)
            candidate2 = _pick_candidate(candidates, exclude=candidate)
            print(vocab_itos[candidate])
            ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
            out_file.write(vocab_itos[candidate] + ':0.3 ' + vocab_itos[candidate2] + ':0.2 :0.5' '\n')
    return ids


lm.eval()
# Pure inference from here on: no gradients are needed.
with torch.no_grad():
    # Smoke test: most probable continuation of a fixed sentence.  The value
    # used to be computed and discarded; it is printed so the check is visible.
    ids = torch.tensor(_encode('Gości innych nie widział oprócz spółleśników'),
                       dtype=torch.long, device=device)
    preds = lm(ids.unsqueeze(0))
    print(vocab_itos[torch.argmax(torch.softmax(preds, 1), 1).item()])

    # Generate 30 words starting from a one-word prompt.
    # NOTE(review): the context grows by one token per step and is re-read in
    # full on every prediction, while training contexts are NGRAMS - 1 tokens
    # long; consider truncating the context.
    ids = torch.tensor([_encode('Lalka')], dtype=torch.long, device=device)
    for _ in range(30):
        candidate = _pick_candidate(_top_candidates(ids))
        print(vocab_itos[candidate])
        ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)

    print('starting outs...')
    nr_of_dev_lines = _count_lines(os.path.join(dir_path, "dev-0", "in.tsv"))
    nr_of_test_a_lines = _count_lines(os.path.join(dir_path, "test-A", "in.tsv"))

    # One output line per input line of the matching split.  test-A uses its
    # own line count (not the dev-0 count).
    ids = _write_predictions(os.path.join(dir_path, "dev-0", "out.tsv"),
                             nr_of_dev_lines, ids)
    ids = _write_predictions(os.path.join(dir_path, "test-A", "out.tsv"),
                             nr_of_test_a_lines, ids)