fraszosia@gmail.com 2022-05-29 23:56:57 +02:00
parent 51220186a3
commit 4edb9baefc
4 changed files with 16485 additions and 18182 deletions

5748  run.ipynb  (file diff suppressed because it is too large)
 469  run.py     (diff shown below)

@@ -1,130 +1,389 @@
#!/usr/bin/env python
# coding: utf-8
# TRIGRAM MODEL - we take the two previous words into account
# In[2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import numpy as np
from collections import Counter
import re
import lzma
import csv
import math
def read_data(folder_name, test_data=False):
    all_data = lzma.open(f'{folder_name}/in.tsv.xz').read().decode('UTF-8').split('\n')
    data = [line.split('\t') for line in all_data][:-1]
    data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in data]
    if not test_data:
        words = []
        with open(f'{folder_name}/expected.tsv') as file:
            tsv_file = csv.reader(file, delimiter="\t")
            for line in tsv_file:
                words.append(line[0])
        return data, words
    return data

# In[3]:
device = 'cuda'

# In[4]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        sequence_length,
    ):
        self.sequence_length = sequence_length
        self.words = self.load()
        self.uniq_words = self.get_uniq_words()
        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load(self):
        data = lzma.open('train/in.tsv.xz').read().decode('UTF-8').split('\n')
        data = [line.split('\t') for line in data][:-1]
        data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in data]
        words = []
        with open('train/expected.tsv') as file:
            tsv_file = csv.reader(file, delimiter="\t")
            for line in tsv_file:
                words.append(line[0])
        text = []
        # for i in range(len(data) - 1):
        for i in range(5000):
            t = data[i][0] + ' ' + words[i] + ' ' + data[i][1] + ' '
            text += [t.replace('\\n', ' ')]
        text = ' '.join(text).lower()
        text = re.sub('[^a-z ]', '', text)
        text = text.split(' ')
        return text

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )


train_data, train_words = read_data('train')

def print_example(data, words, idx):
    print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')

# print_example(train_data, train_words, 13)

def generate_N_grams(text, ngram=1, no_punctuation=True):
    text = re.sub(r'[\-] ', '', text).lower()
    if no_punctuation:
        text = re.sub(r'[^\w\s]', ' ', text)
    words = [word for word in text.split()]
    temp = zip(*[words[i:] for i in range(0, ngram)])
    ans = [' '.join(ngram) for ngram in temp]
    return ans

N_grams = []
for i in range(len(train_data[:5000])):
    N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)
    N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)
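# A minimal sanity check of generate_N_grams (an illustrative sketch, not part of the
# original pipeline): punctuation is stripped and the n-grams come back as space-joined strings.
assert generate_N_grams('The quick, brown fox', 2) == ['the quick', 'quick brown', 'brown fox']
assert generate_N_grams('The quick, brown fox', 3) == ['the quick brown', 'quick brown fox']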
# In[5]:
def check_prob(N_grams):
    count = {}
    for i in N_grams:
        i = i.rsplit(maxsplit=1)
        if i[0] in count:
            if i[1] in count[i[0]]:
                count[i[0]][i[1]] += 1
            else:
                count[i[0]][i[1]] = 1
        else:
            count[i[0]] = {i[1]: 1}
    # normalise counts into conditional probabilities P(next word | context)
    for word in count:
        s = sum(count[word].values())
        for i in count[word]:
            count[word][i] = count[word][i] / s
    return count
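# Toy example of check_prob (a sketch, not in the original notebook): each key is an
# n-gram context and its value maps the following word to a relative frequency.
assert check_prob(['it is a', 'it is the', 'it is a']) == {'it is': {'a': 2 / 3, 'the': 1 / 3}}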
dataset = Dataset(5)
# In[6]:
dataset[200]
# In[7]:
[dataset.index_to_word[x] for x in [ 0, 231, 19, 98, 189]]
# In[8]:
[dataset.index_to_word[x] for x in [231, 19, 98, 189, 5]]
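# The two lookups above show one training pair: the target window is the input window
# shifted one position to the right, so the model learns to predict the next word.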
# In[9]:
input_tensor = torch.tensor([[ 0, 231, 19, 98, 189]], dtype=torch.int32).to(device)
# In[ ]:
class Model(nn.Module):
def __init__(self, vocab_size):
super(Model, self).__init__()
self.lstm_size = 128
self.embedding_dim = 128
self.num_layers = 3
self.embedding = nn.Embedding(
num_embeddings=vocab_size,
embedding_dim=self.embedding_dim,
)
self.lstm = nn.LSTM(
input_size=self.lstm_size,
hidden_size=self.lstm_size,
num_layers=self.num_layers,
dropout=0.2,
)
self.fc = nn.Linear(self.lstm_size, vocab_size)
def forward(self, x, prev_state = None):
embed = self.embedding(x)
output, state = self.lstm(embed, prev_state)
logits = self.fc(output)
return logits, state
def init_state(self, sequence_length):
return (torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device),
torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device))
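# Shape sketch (illustrative only, the toy sizes below are made up): for token ids of
# shape (batch, seq) the embedding gives (batch, seq, 128) and the linear head returns
# vocabulary logits per position. Note the LSTM is built without batch_first=True, so it
# actually reads the first dimension as the time axis; the shapes still line up, which is
# why init_state puts sequence_length where the batch size would normally go.
#
# toy_model = Model(vocab_size=10).to(device)
# toy_logits, _ = toy_model(torch.randint(0, 10, (2, 5)).to(device))
# toy_logits.shape  # torch.Size([2, 5, 10])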
# In[ ]:
model = Model(vocab_size=len(dataset.uniq_words)).to(device)
# In[ ]:
y_pred, state_h = model(input_tensor)
# In[ ]:
y_pred
# In[ ]:
y_pred.shape
# In[ ]:
def train(dataset, model, max_epochs, batch_size):
model.train()
dataloader = DataLoader(dataset, batch_size=batch_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(max_epochs):
for batch, (x, y) in enumerate(dataloader):
optimizer.zero_grad()
x = x.to(device)
y = y.to(device)
y_pred, state_h = model(x)
loss = criterion(y_pred.transpose(1, 2), y)
loss.backward()
optimizer.step()
print({ 'epoch': epoch, 'update in batch': batch, '/' : len(dataloader), 'loss': loss.item() })
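# Why the transpose in the loss (explanatory sketch with dummy sizes): nn.CrossEntropyLoss
# expects class scores as (batch, num_classes, seq) when the targets are (batch, seq),
# while the model returns logits as (batch, seq, vocab_size), hence y_pred.transpose(1, 2).
#
# criterion = nn.CrossEntropyLoss()
# logits = torch.randn(64, 5, 1000)          # (batch, seq, vocab)
# targets = torch.randint(0, 1000, (64, 5))  # (batch, seq)
# loss = criterion(logits.transpose(1, 2), targets)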
# In[ ]:
model = Model(vocab_size = len(dataset.uniq_words)).to(device)
train(dataset, model, 1, 64)
# In[ ]:
def predict(dataset, model, text, next_words=5):
model.eval()
words = text.split(' ')
state_h = model.init_state(len(words))
res = []
x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
y_pred, state_h = model(x, state_h)
last_word_logits = y_pred[0][-1]
p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
tmp = sorted(zip(p, range(len(p))), reverse=True)[:next_words]
for w in tmp:
res.append((dataset.index_to_word[w[1]], w[0]))
return res
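# The sort-and-slice above is a manual top-k over the softmax distribution; an equivalent
# formulation (sketch) would be:
#
# probs = torch.nn.functional.softmax(last_word_logits, dim=0)
# top = torch.topk(probs, next_words)
# res = [(dataset.index_to_word[int(i)], float(p)) for p, i in zip(top.values, top.indices)]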
def predict2(dataset, model, model2, text, text2, next_words=5):
model.eval()
model2.eval()
words = text.split(' ')
words2 = text2.split(' ')
words2.reverse()
state_h = model.init_state(len(words))
state_h_2 = model2.init_state(len(words2))
res = []
x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
x2 = torch.tensor([[dataset.word_to_index[w] for w in words2]]).to(device)
y_pred, state_h = model(x, state_h)
y_pred_2, state_h_2 = model2(x2, state_h_2)
last_word_logits = y_pred[0][-1]
last_word_logits_2 = y_pred_2[0][-1]
p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
p2 = torch.nn.functional.softmax(last_word_logits_2, dim=0).detach().cpu().numpy()
p_mean = [(g + h) / 2 for g, h in zip(p, p2)]
tmp = sorted(zip(p_mean, range(len(p_mean))), reverse=True)[:next_words]
for w in tmp:
res.append((dataset.index_to_word[w[1]], w[0]))
return res
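# predict2 ensembles the forward model (left context) with the model trained on reversed
# text (right context read backwards) by averaging the two softmax distributions: if the
# forward model gives a word probability 0.30 and the reversed model gives it 0.10, the
# ensembled score is (0.30 + 0.10) / 2 = 0.20.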
# In[ ]:
predict(dataset, model, 'it is a')
# In[69]:
dev_data = lzma.open(f'dev-0/in.tsv.xz').read().decode('UTF-8').split('\n')
dev_data = [line.split('\t') for line in dev_data][:-1]
dev_data1 = [re.sub('[^a-z ]', '', i[6].replace('\\n', ' ').lower()).strip() for i in dev_data]
dev_data2 = [re.sub('[^a-z ]', '', i[7].replace('\\n', ' ').lower()).strip() for i in dev_data]
# In[23]:
dev_data[0]
# In[54]:
print(predict(dataset, model, ' '.join(dev_data1[9].split()[-1:])))
# In[66]:
class ReversedDataset(torch.utils.data.Dataset):
def __init__(
self,
sequence_length,
):
self.sequence_length = sequence_length
self.words = self.load()
self.uniq_words = self.get_uniq_words()
self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
self.words_indexes = [self.word_to_index[w] for w in self.words]
def load(self):
data = lzma.open(f'train/in.tsv.xz').read().decode('UTF-8').split('\n')
data = [line.split('\t') for line in data][:-1]
        data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in data]
words = []
with open(f'train/expected.tsv') as file:
tsv_file = csv.reader(file, delimiter="\t")
for line in tsv_file:
words.append(line[0])
text = []
# for i in range(len(data) - 1):
for i in range(5000):
t = data[i][0] + ' ' + words[i] + ' ' + data[i][1] + ' '
text += [t.replace('\\n', ' ')]
        text = ' '.join(text).lower()
        text = re.sub('[^a-z ]', '', text)
        text = text.split(' ')
        text.reverse()
        return text

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )


probs = check_prob(N_grams)
dev_data, dev_words = read_data('dev-0')

def find_word(word_1, word_2):
    tmp_probs = {}
    if word_1 in probs:
        if word_2 in probs:
            for i in probs[word_1]:
                if i in probs[word_2]:
                    tmp_probs[i] = probs[word_1][i] * probs[word_2][i]
                    if tmp_probs[i] == 1:
                        tmp_probs[i] = 0.1
                else:
                    tmp_probs[i] = probs[word_1][i] / 5
        else:
            tmp_probs = probs[word_1]
    else:
        tmp_probs = {}
    sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]
    tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])
    s = 1 - sum(n for _, n in sorted_list)
    if s == 0:
        s = 0.01
    tmm += ' :' + str(s)
    if tmp_probs == {}:
        return ':1'
    return tmm

def find_words(data):
    found_words = []
    for i in data:
        t = i[0]
        t = re.sub(r'[\-] ', '', t).lower()
        t = re.sub(r'[^\w\s]', ' ', t)
        words = [word for word in t.split()]
        found_words.append(find_word(words[-1], ' '.join(words[-2:])))
    return found_words

dev_found_words = find_words(dev_data)

def save_data(folder, words):
    f = open(f'{folder}/out.tsv', 'w')
    f.write('\n'.join(words) + '\n')
    f.close()

save_data('dev-0', dev_found_words)

test_data = read_data('test-A', True)
test_found_words = find_words(test_data)
save_data('test-A', test_found_words)
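# Output format of the n-gram pipeline: each out.tsv line holds space-separated
# word:probability pairs plus a trailing ':mass' entry for all remaining words,
# e.g. 'the:0.6 :0.4' (illustrative values); find_word falls back to ':1' when it
# has no candidate at all.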
# In[67]:
dataset_2 = ReversedDataset(5)
input_tensor_2 = torch.tensor([[ 0, 231, 19, 98, 189]], dtype=torch.int32).to(device)
model_2 = Model(vocab_size=len(dataset_2.uniq_words)).to(device)
y_pred_2, state_h_2 = model_2(input_tensor_2)
train(dataset_2, model_2, 1, 64)
# In[96]:
n = 2
f = open("dev-0/out.tsv", "w")
for i in range(len(dev_data1)):
d1 = dev_data1[i]
d2 = dev_data2[i]
try:
tmp = predict2(dataset, model, model_2, ' '.join(d1.split()[-n:]), ' '.join(d2.split()[:n]))
f.writelines(' '.join([f'{i[0]}:{i[1]}' for i in tmp]) + ' :0.3\n')
except:
f.writelines(':1\n')
f.close()
# In[95]:
len(dev_data1)
# In[93]:
test_data = lzma.open(f'test-A/in.tsv.xz').read().decode('UTF-8').split('\n')
test_data = [line.split('\t') for line in test_data][:-1]
test_data1 = [re.sub('[^a-z ]', '', i[6].replace('\\n', ' ').lower()).strip() for i in test_data]
test_data2 = [re.sub('[^a-z ]', '', i[7].replace('\\n', ' ').lower()).strip() for i in test_data]
n = 2
f = open("test-A/out.tsv", "w")
for i in range(len(test_data1)):
d1 = test_data1[i]
d2 = test_data2[i]
try:
tmp = predict2(dataset, model, model_2, ' '.join(d1.split()[-n:]), ' '.join(d2.split()[:n]))
f.writelines(' '.join([f'{i[0]}:{i[1]}' for i in tmp]) + ' :0.3\n')
except:
f.writelines(':1\n')
f.close()
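# Each emitted line therefore carries the five ensembled word:probability pairs plus a
# fixed ':0.3' remainder for unseen words; when a lookup fails (e.g. a context word is
# missing from the training vocabulary, so word_to_index raises KeyError) the bare
# except fires and the line degrades to ':1', putting all mass on the unknown token.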
