470619
This commit is contained in:
parent
51220186a3
commit
4edb9baefc
21036
dev-0/out.tsv
21036
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
447
run.py
447
run.py
@ -1,130 +1,389 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
# MODEL TRIGRAMOWY - uwzględniamy dwa poprzednie słowa
|
# In[2]:
|
||||||
|
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn, optim
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
import numpy as np
|
||||||
|
from collections import Counter
|
||||||
|
import re
|
||||||
import lzma
|
import lzma
|
||||||
import csv
|
import csv
|
||||||
import re
|
|
||||||
import math
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(folder_name, test_data=False):
|
# In[3]:
|
||||||
|
|
||||||
all_data = lzma.open(f'{folder_name}/in.tsv.xz').read().decode('UTF-8').split('\n')
|
|
||||||
data = [line.split('\t') for line in all_data][:-1]
|
|
||||||
data = [[i[6].replace('\\n', ' '), i[7].replace('\\n', ' ')] for i in data]
|
|
||||||
|
|
||||||
if not test_data:
|
device = 'cuda'
|
||||||
|
|
||||||
|
|
||||||
|
# In[4]:
|
||||||
|
|
||||||
|
|
||||||
|
class Dataset(torch.utils.data.Dataset):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
sequence_length,
|
||||||
|
):
|
||||||
|
self.sequence_length = sequence_length
|
||||||
|
self.words = self.load()
|
||||||
|
self.uniq_words = self.get_uniq_words()
|
||||||
|
|
||||||
|
self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
|
||||||
|
self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
|
||||||
|
|
||||||
|
self.words_indexes = [self.word_to_index[w] for w in self.words]
|
||||||
|
|
||||||
|
def load(self):
|
||||||
|
data = lzma.open(f'train/in.tsv.xz').read().decode('UTF-8').split('\n')
|
||||||
|
data = [line.split('\t') for line in data][:-1]
|
||||||
|
data = [[i[6].replace('\\\\n', ' '), i[7].replace('\\\\n', ' ')] for i in data]
|
||||||
|
|
||||||
words = []
|
words = []
|
||||||
with open(f'{folder_name}/expected.tsv') as file:
|
with open(f'train/expected.tsv') as file:
|
||||||
tsv_file = csv.reader(file, delimiter="\t")
|
tsv_file = csv.reader(file, delimiter="\t")
|
||||||
for line in tsv_file:
|
for line in tsv_file:
|
||||||
words.append(line[0])
|
words.append(line[0])
|
||||||
|
|
||||||
return data, words
|
text = []
|
||||||
|
# for i in range(len(data) - 1):
|
||||||
|
for i in range(5000):
|
||||||
|
t = data[i][0] + ' ' + words[i] + ' ' + data[i][1] + ' '
|
||||||
|
text += [t.replace('\\n', ' ')]
|
||||||
|
|
||||||
return data
|
text = ' '.join(text).lower()
|
||||||
|
text = re.sub('[^a-z ]', '', text)
|
||||||
train_data, train_words = read_data('train')
|
text = text.split(' ')
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def print_example(data, words, idx):
|
def get_uniq_words(self):
|
||||||
print(f'{data[idx][0]} _____{words[idx].upper()}_____ {data[idx][1]}')
|
word_counts = Counter(self.words)
|
||||||
|
return sorted(word_counts, key=word_counts.get, reverse=True)
|
||||||
|
|
||||||
# print_example(train_data, train_words, 13)
|
def __len__(self):
|
||||||
|
return len(self.words_indexes) - self.sequence_length
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return (
|
||||||
|
torch.tensor(self.words_indexes[index:index+self.sequence_length]),
|
||||||
|
torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def generate_N_grams(text, ngram=1, no_punctuation=True):
|
# In[5]:
|
||||||
text = re.sub(r'[\-] ', '', text).lower()
|
|
||||||
if no_punctuation:
|
|
||||||
text = re.sub(r'[^\w\s]', ' ', text)
|
|
||||||
words=[word for word in text.split()]
|
|
||||||
temp=zip(*[words[i:] for i in range(0,ngram)])
|
|
||||||
ans=[' '.join(ngram) for ngram in temp]
|
|
||||||
return ans
|
|
||||||
|
|
||||||
N_grams = []
|
|
||||||
for i in range(len(train_data[:5000])):
|
|
||||||
N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 2)
|
|
||||||
N_grams += generate_N_grams(f'{train_data[i][0]} {train_words[i]} {train_data[i][1]}', 3)
|
|
||||||
|
|
||||||
|
|
||||||
def check_prob(N_grams):
|
dataset = Dataset(5)
|
||||||
count = {}
|
|
||||||
for i in N_grams:
|
|
||||||
i = i.rsplit(maxsplit=1)
|
|
||||||
if i[0] in count:
|
|
||||||
if i[1] in count[i[0]]:
|
|
||||||
count[i[0]][i[1]] += 1
|
|
||||||
else:
|
|
||||||
count[i[0]][i[1]] = 1
|
|
||||||
else:
|
|
||||||
count[i[0]] = {i[1]: 1}
|
|
||||||
|
|
||||||
for word in count:
|
|
||||||
s = sum(count[word].values())
|
|
||||||
for i in count[word]:
|
|
||||||
count[word][i] = count[word][i] / s
|
|
||||||
|
|
||||||
return count
|
|
||||||
|
|
||||||
probs = check_prob(N_grams)
|
|
||||||
|
|
||||||
|
|
||||||
dev_data, dev_words = read_data('dev-0')
|
# In[6]:
|
||||||
|
|
||||||
|
|
||||||
def find_word(word_1, word_2):
|
dataset[200]
|
||||||
tmp_probs = {}
|
|
||||||
if word_1 in probs:
|
|
||||||
if word_2 in probs:
|
|
||||||
for i in probs[word_1]:
|
|
||||||
if i in probs[word_2]:
|
|
||||||
tmp_probs[i] = probs[word_1][i] * probs[word_2][i]
|
|
||||||
if tmp_probs[i] == 1:
|
|
||||||
tmp_probs[i] = 0.1
|
|
||||||
else:
|
|
||||||
tmp_probs[i] = probs[word_1][i] / 5
|
|
||||||
else:
|
|
||||||
tmp_probs = probs[word_1]
|
|
||||||
else:
|
|
||||||
tmp_probs = {}
|
|
||||||
|
|
||||||
sorted_list = sorted(tmp_probs.items(), key=lambda x: x[1], reverse=True)[:1]
|
|
||||||
tmm = ' '.join([i[0] + ':' + str(i[1]) for i in sorted_list])
|
|
||||||
s = 1 - sum(n for _, n in sorted_list)
|
|
||||||
if s == 0:
|
|
||||||
s = 0.01
|
|
||||||
tmm += ' :' + str(s)
|
|
||||||
if tmp_probs == {}:
|
|
||||||
return ':1'
|
|
||||||
return tmm
|
|
||||||
|
|
||||||
|
|
||||||
def find_words(data):
|
# In[7]:
|
||||||
found_words = []
|
|
||||||
for i in data:
|
|
||||||
t = i[0]
|
|
||||||
t = re.sub(r'[\-] ', '', t).lower()
|
|
||||||
if True:
|
|
||||||
t = re.sub(r'[^\w\s]', ' ', t)
|
|
||||||
words=[word for word in t.split()]
|
|
||||||
found_words.append(find_word(words[-1], ' '.join(words[-2:])))
|
|
||||||
return found_words
|
|
||||||
|
|
||||||
dev_found_words = find_words(dev_data)
|
|
||||||
|
|
||||||
|
|
||||||
def save_data(folder, words):
|
[dataset.index_to_word[x] for x in [ 0, 231, 19, 98, 189]]
|
||||||
f = open(f'{folder}/out.tsv', 'w')
|
|
||||||
f.write('\n'.join(words) + '\n')
|
|
||||||
|
# In[8]:
|
||||||
|
|
||||||
|
|
||||||
|
[dataset.index_to_word[x] for x in [231, 19, 98, 189, 5]]
|
||||||
|
|
||||||
|
|
||||||
|
# In[9]:
|
||||||
|
|
||||||
|
|
||||||
|
input_tensor = torch.tensor([[ 0, 231, 19, 98, 189]], dtype=torch.int32).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
class Model(nn.Module):
|
||||||
|
def __init__(self, vocab_size):
|
||||||
|
super(Model, self).__init__()
|
||||||
|
self.lstm_size = 128
|
||||||
|
self.embedding_dim = 128
|
||||||
|
self.num_layers = 3
|
||||||
|
|
||||||
|
self.embedding = nn.Embedding(
|
||||||
|
num_embeddings=vocab_size,
|
||||||
|
embedding_dim=self.embedding_dim,
|
||||||
|
)
|
||||||
|
self.lstm = nn.LSTM(
|
||||||
|
input_size=self.lstm_size,
|
||||||
|
hidden_size=self.lstm_size,
|
||||||
|
num_layers=self.num_layers,
|
||||||
|
dropout=0.2,
|
||||||
|
)
|
||||||
|
self.fc = nn.Linear(self.lstm_size, vocab_size)
|
||||||
|
|
||||||
|
def forward(self, x, prev_state = None):
|
||||||
|
embed = self.embedding(x)
|
||||||
|
output, state = self.lstm(embed, prev_state)
|
||||||
|
logits = self.fc(output)
|
||||||
|
return logits, state
|
||||||
|
|
||||||
|
def init_state(self, sequence_length):
|
||||||
|
return (torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device),
|
||||||
|
torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device))
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
model = Model(len(dataset)).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
y_pred, state_h = model(input_tensor)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
y_pred
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
y_pred.shape
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def train(dataset, model, max_epochs, batch_size):
|
||||||
|
model.train()
|
||||||
|
|
||||||
|
dataloader = DataLoader(dataset, batch_size=batch_size)
|
||||||
|
criterion = nn.CrossEntropyLoss()
|
||||||
|
optimizer = optim.Adam(model.parameters(), lr=0.001)
|
||||||
|
|
||||||
|
for epoch in range(max_epochs):
|
||||||
|
for batch, (x, y) in enumerate(dataloader):
|
||||||
|
optimizer.zero_grad()
|
||||||
|
x = x.to(device)
|
||||||
|
y = y.to(device)
|
||||||
|
|
||||||
|
y_pred, state_h = model(x)
|
||||||
|
loss = criterion(y_pred.transpose(1, 2), y)
|
||||||
|
|
||||||
|
loss.backward()
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
print({ 'epoch': epoch, 'update in batch': batch, '/' : len(dataloader), 'loss': loss.item() })
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
model = Model(vocab_size = len(dataset.uniq_words)).to(device)
|
||||||
|
train(dataset, model, 1, 64)
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
def predict(dataset, model, text, next_words=5):
|
||||||
|
model.eval()
|
||||||
|
words = text.split(' ')
|
||||||
|
state_h = model.init_state(len(words))
|
||||||
|
res = []
|
||||||
|
|
||||||
|
x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
|
||||||
|
y_pred, state_h = model(x, state_h)
|
||||||
|
|
||||||
|
last_word_logits = y_pred[0][-1]
|
||||||
|
p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
|
||||||
|
tmp = sorted(zip(p, range(len(p))), reverse=True)[:next_words]
|
||||||
|
for w in tmp:
|
||||||
|
res.append((dataset.index_to_word[w[1]], w[0]))
|
||||||
|
|
||||||
|
return res
|
||||||
|
|
||||||
|
def predict2(dataset, model, model2, text, text2, next_words=5):
|
||||||
|
model.eval()
|
||||||
|
model2.eval()
|
||||||
|
words = text.split(' ')
|
||||||
|
words2 = text2.split(' ')
|
||||||
|
words2.reverse()
|
||||||
|
state_h = model.init_state(len(words))
|
||||||
|
state_h_2 = model2.init_state(len(words))
|
||||||
|
res = []
|
||||||
|
|
||||||
|
x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
|
||||||
|
x2 = torch.tensor([[dataset.word_to_index[w] for w in words2]]).to(device)
|
||||||
|
y_pred, state_h = model(x, state_h)
|
||||||
|
y_pred_2, state_h_2 = model2(x2, state_h_2)
|
||||||
|
|
||||||
|
last_word_logits = y_pred[0][-1]
|
||||||
|
last_word_logits_2 = y_pred_2[0][-1]
|
||||||
|
p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
|
||||||
|
p2 = torch.nn.functional.softmax(last_word_logits_2, dim=0).detach().cpu().numpy()
|
||||||
|
|
||||||
|
p_mean = [(g + h) / 2 for g, h in zip(p, p2)]
|
||||||
|
|
||||||
|
tmp = sorted(zip(p_mean, range(len(p_mean))), reverse=True)[:next_words]
|
||||||
|
for w in tmp:
|
||||||
|
res.append((dataset.index_to_word[w[1]], w[0]))
|
||||||
|
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
# In[ ]:
|
||||||
|
|
||||||
|
|
||||||
|
predict(dataset, model, 'it is a')
|
||||||
|
|
||||||
|
|
||||||
|
# In[69]:
|
||||||
|
|
||||||
|
|
||||||
|
dev_data = lzma.open(f'dev-0/in.tsv.xz').read().decode('UTF-8').split('\n')
|
||||||
|
dev_data = [line.split('\t') for line in dev_data][:-1]
|
||||||
|
dev_data1 = [re.sub('[^a-z ]', '', i[6].replace('\\n', ' ').lower()).strip() for i in dev_data]
|
||||||
|
dev_data2 = [re.sub('[^a-z ]', '', i[7].replace('\\n', ' ').lower()).strip() for i in dev_data]
|
||||||
|
|
||||||
|
|
||||||
|
# In[23]:
|
||||||
|
|
||||||
|
|
||||||
|
dev_data[0]
|
||||||
|
|
||||||
|
|
||||||
|
# In[54]:
|
||||||
|
|
||||||
|
|
||||||
|
print(predict(dataset, model, ' '.join(dev_data[9].split()[-1:])))
|
||||||
|
|
||||||
|
|
||||||
|
# In[66]:
|
||||||
|
|
||||||
|
|
||||||
|
class ReversedDataset(torch.utils.data.Dataset):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
sequence_length,
|
||||||
|
):
|
||||||
|
self.sequence_length = sequence_length
|
||||||
|
self.words = self.load()
|
||||||
|
self.uniq_words = self.get_uniq_words()
|
||||||
|
|
||||||
|
self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
|
||||||
|
self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
|
||||||
|
|
||||||
|
self.words_indexes = [self.word_to_index[w] for w in self.words]
|
||||||
|
|
||||||
|
def load(self):
|
||||||
|
data = lzma.open(f'train/in.tsv.xz').read().decode('UTF-8').split('\n')
|
||||||
|
data = [line.split('\t') for line in data][:-1]
|
||||||
|
data = [[i[6].replace('\\\\n', ' '), i[7].replace('\\\\n', ' ')] for i in data]
|
||||||
|
|
||||||
|
words = []
|
||||||
|
with open(f'train/expected.tsv') as file:
|
||||||
|
tsv_file = csv.reader(file, delimiter="\t")
|
||||||
|
for line in tsv_file:
|
||||||
|
words.append(line[0])
|
||||||
|
|
||||||
|
text = []
|
||||||
|
# for i in range(len(data) - 1):
|
||||||
|
for i in range(5000):
|
||||||
|
t = data[i][0] + ' ' + words[i] + ' ' + data[i][1] + ' '
|
||||||
|
text += [t.replace('\\n', ' ')]
|
||||||
|
|
||||||
|
text = ' '.join(text).lower()
|
||||||
|
text = re.sub('[^a-z ]', '', text)
|
||||||
|
text = text.split(' ')
|
||||||
|
text.reverse()
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def get_uniq_words(self):
|
||||||
|
word_counts = Counter(self.words)
|
||||||
|
return sorted(word_counts, key=word_counts.get, reverse=True)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.words_indexes) - self.sequence_length
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return (
|
||||||
|
torch.tensor(self.words_indexes[index:index+self.sequence_length]),
|
||||||
|
torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# In[67]:
|
||||||
|
|
||||||
|
|
||||||
|
dataset_2 = ReversedDataset(5)
|
||||||
|
input_tensor_2 = torch.tensor([[ 0, 231, 19, 98, 189]], dtype=torch.int32).to(device)
|
||||||
|
|
||||||
|
model_2 = Model(len(dataset_2)).to(device)
|
||||||
|
y_pred_2, state_h_2 = model(input_tensor_2)
|
||||||
|
model_2 = Model(vocab_size = len(dataset_2.uniq_words)).to(device)
|
||||||
|
train(dataset_2, model_2, 1, 64)
|
||||||
|
|
||||||
|
|
||||||
|
# In[96]:
|
||||||
|
|
||||||
|
|
||||||
|
n = 2
|
||||||
|
|
||||||
|
f = open("dev-0/out.tsv", "w")
|
||||||
|
|
||||||
|
for i in range(len(dev_data1)):
|
||||||
|
d1 = dev_data1[i]
|
||||||
|
d2 = dev_data2[i]
|
||||||
|
try:
|
||||||
|
tmp = predict2(dataset, model, model_2, ' '.join(d1.split()[-n:]), ' '.join(d2.split()[:n]))
|
||||||
|
f.writelines(' '.join([f'{i[0]}:{i[1]}' for i in tmp]) + ' :0.3\n')
|
||||||
|
except:
|
||||||
|
f.writelines(':1\n')
|
||||||
|
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
save_data('dev-0', dev_found_words)
|
|
||||||
|
|
||||||
|
|
||||||
test_data = read_data('test-A', True)
|
# In[95]:
|
||||||
test_found_words = find_words(test_data)
|
|
||||||
save_data('test-A', test_found_words)
|
|
||||||
|
len(dev_data1)
|
||||||
|
|
||||||
|
|
||||||
|
# In[93]:
|
||||||
|
|
||||||
|
|
||||||
|
test_data = lzma.open(f'test-A/in.tsv.xz').read().decode('UTF-8').split('\n')
|
||||||
|
test_data = [line.split('\t') for line in test_data][:-1]
|
||||||
|
test_data1 = [re.sub('[^a-z ]', '', i[6].replace('\\n', ' ').lower()).strip() for i in test_data]
|
||||||
|
test_data2 = [re.sub('[^a-z ]', '', i[7].replace('\\n', ' ').lower()).strip() for i in test_data]
|
||||||
|
|
||||||
|
|
||||||
|
n = 2
|
||||||
|
|
||||||
|
f = open("test-A/out.tsv", "w")
|
||||||
|
|
||||||
|
for i in range(len(test_data1)):
|
||||||
|
d1 = test_data1[i]
|
||||||
|
d2 = test_data2[i]
|
||||||
|
try:
|
||||||
|
tmp = predict2(dataset, model, model_2, ' '.join(d1.split()[-n:]), ' '.join(d2.split()[:n]))
|
||||||
|
f.writelines(' '.join([f'{i[0]}:{i[1]}' for i in tmp]) + ' :0.3\n')
|
||||||
|
except:
|
||||||
|
f.writelines(':1\n')
|
||||||
|
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
|
7414
test-A/out.tsv
7414
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user