Compare commits

...

3 Commits
master ... gru

9 changed files with 18290 additions and 54053 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,233 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoConfig
# In[2]:
import lzma
def read_xz_file(fname):
with lzma.open(fname, mode='rt', encoding='utf-8') as f:
return [line.strip() for line in f.readlines()]
def read_file(fname):
with open(fname, mode='rt', encoding='utf-8') as f:
return [line.strip() for line in f.readlines()]
def get_contexts(input_text):
all_fields = input_text.replace(r'\n', ' ').split('\t')
return {'left': all_fields[6], 'right': all_fields[7]}
bos = '<|endoftext|>'
eos = '<|EOS|>'
def compose_sentences(raw_input, labels):
result = []
for input, label in zip(raw_input, labels):
context = get_contexts(input)
result.append(f'{bos} {context["left"]} {label} {eos}')
result.append(f'{bos} {label} {context["right"]} {eos}')
return result
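# compose_sentences: each gap yields two training sentences, with the expected word (the label)
# placed after its left context in one and before its right context in the other, so the LM
# sees the missing word from both directions.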
# In[6]:
pad = '<|pad|>'
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
num_add_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# In[4]:
class AmericaDataset(Dataset):
def __init__(self, tokenizer, data):
self.tokenizer = tokenizer
self.sentences = []
for entry in data:
self.sentences.append(
torch.tensor(self.tokenizer.encode(entry, padding=True))
)
def __len__(self):
return len(self.sentences)
def __getitem__(self, item):
return self.sentences[item]
# In[5]:
train_input_raw = read_xz_file('challenging-america-word-gap-prediction/train/in.tsv.xz')
train_labels = read_file('challenging-america-word-gap-prediction/train/expected.tsv')
train_sentences = compose_sentences(train_input_raw, train_labels)
train_dataset = AmericaDataset(tokenizer, train_sentences)
# In[11]:
config = AutoConfig.from_pretrained('distilgpt2', bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id, output_hidden_states=False, return_dict_in_generate=True)
model = GPT2LMHeadModel.from_pretrained('distilgpt2', config=config)
model.resize_token_embeddings(len(tokenizer))
device = torch.device('cuda')
model.to(device)
# In[8]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
if packed_tensor is None:
return new_tensor, True, None
if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
return packed_tensor, False, new_tensor
else:
packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
return packed_tensor, True, None
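# pack_tensor greedily packs encoded sentences into a single tensor of at most max_seq_len
# tokens; when the next sentence would no longer fit, the current pack is returned for a
# training step and the new sentence is carried over as the remainder.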
# In[9]:
import os
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
def train(
model,
dataset,
batch_size=16, epochs=5, lr=2e-5,
warmup_steps=200,
output_dir=".", output_prefix="gpt2",
save_model_on_epoch=False,
):
device = torch.device("cuda")
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
)
loss = 0
accumulating_batch_count = 0
input_tensor = None
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
for epoch in range(epochs):
print(f"Training epoch {epoch}")
print(loss)
for idx, entry in tqdm(enumerate(dataloader)):
(input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 1024)
if carry_on and idx != len(dataset) - 1:
continue
input_tensor = input_tensor.to(device)
outputs = model(input_tensor, labels=input_tensor)
loss = outputs[0]
loss.backward()
if (accumulating_batch_count % batch_size) == 0:
optimizer.step()
scheduler.step()
optimizer.zero_grad()
model.zero_grad()
accumulating_batch_count += 1
input_tensor = None
if save_model_on_epoch:
torch.save(
model.state_dict(),
os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
)
return model
# In[12]:
model = train(model, train_dataset)
# In[3]:
dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_input_contexts = [get_contexts(input_text) for input_text in test_input_raw]
# In[15]:
from tqdm import tqdm
tokenizer.truncation_side = 'left'
blacklist = ['ia', 'ix', 'io',
'ik']  # These tokens repeat almost every time, and they are not actual English words.
def predict_words(dataset):
preds = []
for entry in tqdm(dataset):
text = f"{entry['left']}"
src = tokenizer.encode(text, return_tensors="pt", truncation=True).to(device)
output = model.generate(src, max_length=len(src[0]) + 1, do_sample=True, top_k=0, temperature=0.8,
num_return_sequences=1, no_repeat_ngram_size=2, output_scores=True)
probs, idxs = torch.softmax(output.scores[0][-1], dim=0).topk(50)
current_output = ''
accumulated_probability = 0
for prob, token_id in zip(probs, idxs):
token = tokenizer.decode(token_id, skip_special_tokens=True).split(' ')[-1]
if not token.isalnum() or token in blacklist:
continue
prob_value = prob.item()
accumulated_probability += prob_value
current_output += f'{token.strip()}:{prob_value} '
current_output += f':{1 - accumulated_probability}'
preds.append(current_output)
return preds
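# Each prediction line is a list of space-separated `token:probability` pairs for the kept
# candidates, followed by `:p` carrying the leftover probability mass.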
# In[ ]:
dev_preds = predict_words(dev_input_contexts)
with open('challenging-america-word-gap-prediction/dev-0/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in dev_preds)
# In[ ]:
test_preds = predict_words(test_input_contexts)
with open('challenging-america-word-gap-prediction/test-A/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in test_preds)

run-gru.py · 291 (new file)

@@ -0,0 +1,291 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from collections import Counter
import torch
from torch.utils.data import Dataset
device = torch.device("cuda")
# In[2]:
import lzma
def read_xz_file(fname):
with lzma.open(fname, mode='rt', encoding='utf-8') as f:
return [line.strip() for line in f.readlines()]
def read_file(fname):
with open(fname, mode='rt', encoding='utf-8') as f:
return [line.strip() for line in f.readlines()]
def get_contexts(input_text):
all_fields = input_text.replace(r'\n', ' ').split('\t')
return {'left': all_fields[6], 'right': all_fields[7]}
def compose_sentences(raw_input, labels) -> list[str]:
result = []
for input, label in zip(raw_input, labels):
context = get_contexts(input)
result.append(f'{context["left"]} {input} {context["right"]}')
return result
# In[3]:
train_input_raw = read_xz_file('challenging-america-word-gap-prediction/train/in.tsv.xz')
train_labels = read_file('challenging-america-word-gap-prediction/train/expected.tsv')
train_sentences = compose_sentences(train_input_raw, train_labels)
# In[21]:
unk_token = '<unk>'
# In[26]:
class BaseDataset(torch.utils.data.Dataset):
def __init__(
self,
sequence_length,
sentences: list[str]
):
self.sequence_length = sequence_length
self.words = self.load(sentences)
self.uniq_words = self.get_uniq_words()
self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
self.word_to_index[unk_token] = len(self.uniq_words)
self.index_to_word[len(self.uniq_words)] = unk_token
self.words_indexes = [self.word_to_index[w] for w in self.words]
def get_uniq_words(self):
word_counts = Counter(self.words)
return sorted(word_counts, key=word_counts.get, reverse=True)
def load(self, sentences: list[str]):
raise NotImplementedError
def __len__(self):
return len(self.words_indexes) - self.sequence_length
def __getitem__(self, index):
return (
torch.tensor(self.words_indexes[index:index + self.sequence_length]),
torch.tensor(self.words_indexes[index + 1:index + self.sequence_length + 1]),
)
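# Standard next-word setup: __getitem__ returns a window of `sequence_length` token ids as the
# input and the same window shifted one position to the right as the target.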
# In[27]:
class ForwardDataset(BaseDataset):
def load(self, sentences):
words = [x.rstrip() for x in sentences if x.strip()]
words = ' '.join(words).lower()
words = words.split(' ')
return words
# In[28]:
class BackwardsDataset(ForwardDataset):
def load(self, sentences):
words = super(BackwardsDataset, self).load(sentences)
words.reverse()
return words
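# Reversing the token stream turns "predict the next word" into "predict the previous word",
# so the same architecture can score the gap from the right-hand context.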
# In[29]:
train_forwards_dataset = ForwardDataset(6, train_sentences)
train_backwards_dataset = BackwardsDataset(6, train_sentences)
# In[8]:
from torch import nn, optim
class LanguageModel(nn.Module):
def __init__(self,
vocabulary_size=12800,
embedding_size=128,
hidden_size=256,
num_layers=4
):
super(LanguageModel, self).__init__()
self.embedding_size = embedding_size
self.embedding = nn.Embedding(vocabulary_size, embedding_size)
self.gru = nn.GRU(
input_size=self.embedding_size,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=0.2,
batch_first=True
)
self.linear = nn.Linear(hidden_size, vocabulary_size)
def forward(self, x, h=None):
embeds = self.embedding(x)
out, h = self.gru(embeds, h)
out = self.linear(out)
return out, h
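# LanguageModel: embedding -> stacked GRU (4 layers by default) -> linear projection onto the
# vocabulary; the hidden state `h` can be threaded through successive calls if needed.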
# In[9]:
forward_model = LanguageModel(len(train_forwards_dataset.word_to_index)).to(device)  # vocabulary size, including the <unk> token
# In[10]:
from torch.utils.data import DataLoader
from torch import save as save_model
def train(model, dataset, max_epochs, batch_size, out_file):
model.train()
dataloader = DataLoader(dataset, batch_size=batch_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(max_epochs):
for batch, (x, y) in enumerate(dataloader):
optimizer.zero_grad()
x = x.to(device)
y = y.to(device)
y_pred, _ = model(x)
loss = criterion(y_pred.transpose(1, 2), y)
loss.backward()
optimizer.step()
print({'epoch': epoch, 'update in batch': batch, '/': len(dataloader), 'loss': loss.item()})
save_model(model.state_dict(), out_file)
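# nn.CrossEntropyLoss expects the class dimension second, hence y_pred.transpose(1, 2) to go
# from (batch, seq_len, vocab) to (batch, vocab, seq_len).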
# In[11]:
train(forward_model, train_forwards_dataset, 10, 64, 'forward_model')
# In[12]:
backwards_model = LanguageModel(len(train_backwards_dataset.word_to_index)).to(device)  # vocabulary size, including the <unk> token
train(backwards_model, train_backwards_dataset, 10, 64, 'backwards_model')
# In[13]:
dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_input_contexts = [get_contexts(input_text) for input_text in test_input_raw]
# In[82]:
from torch import topk
from tqdm import tqdm
import math
def get_pairs_tokens_probs(model, sentence, dataset, top):
preds = {}
src = torch.tensor([[dataset.word_to_index.get(w, dataset.word_to_index[unk_token]) for w in sentence]]).to(device)
output = model(src)
top = topk(torch.softmax(output[0][-1][-1], dim=0), top)  # softmax first, so the top-k values are probabilities rather than raw logits
probs, tokens = top.values.tolist(), [dataset.index_to_word[idx] for idx in top.indices.tolist()]
accumulated_probability = 0
for prob, token in zip(probs, tokens):
accumulated_probability += prob
preds[token.strip()] = prob
preds[''] = 1 - accumulated_probability
return preds
def trim_results(results: dict, top):
"""
Przycinamy resultaty do `top` najbardziej prawdopodobnych wystąpień;
prawdopodobieństwo wystąpienia pozostałych tokenów obliczamy na nowo
"""
new = dict(sorted(results.items(), key=lambda item: item[1], reverse=True))
del new['']
new = {k[0]: k[1] for k in sorted(new.items(), key=lambda item: item[1], reverse=True)[:top-1]}
new[''] = 1.0 - math.fsum(map(lambda x: float(x), new.values()))
return new
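# trim_results example (hypothetical numbers): {'the': 0.4, 'a': 0.3, 'an': 0.1, '': 0.2}
# trimmed to top=3 keeps the two most probable words and reassigns the rest of the mass:
# {'the': 0.4, 'a': 0.3, '': 0.3}.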
def merge_results(result: dict, other: dict, top):
final = {}
for left, right in zip(result.items(), other.items()):
if left[0] in final:
final[left[0]] = (final[left[0]] + left[1]) / 2
else:
final[left[0]] = left[1]
if right[0] in final:
final[right[0]] = (final[right[0]] + right[1]) / 2
else:
final[right[0]] = right[1]
return trim_results(final, top)
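# merge_results: a token proposed by both the forward and the backward model gets the average
# of its two probabilities; a token proposed by only one model keeps that probability; the
# merged dictionary is then trimmed back to `top` entries.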
def predict_words(dataset: BaseDataset, fwd_model: LanguageModel, back_model: LanguageModel, sentences: list[dict],
top=50):
preds = []
for sentence in tqdm(sentences):
left = sentence['left'].split(' ')
right = sentence['right'].split(' ')
left_results = get_pairs_tokens_probs(fwd_model, left, dataset, top)
right_results = get_pairs_tokens_probs(back_model, list(reversed(right)), dataset, top)  # the backwards model was trained on reversed text, so feed the right context last word first
merged_results = merge_results(left_results, right_results, top)
results_as_string = ''
for prob, token in merged_results.items():
results_as_string += f'{token}:{prob} '
preds.append(results_as_string)
return preds
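# The forward model scores the gap from the left context and the backwards model from the
# reversed right context; the two distributions are merged into one `token:probability` line.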
# In[83]:
dev_preds = predict_words(train_forwards_dataset, forward_model, backwards_model, dev_input_contexts)
with open('challenging-america-word-gap-prediction/dev-0/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in dev_preds)
# In[1]:
test_preds = predict_words(train_forwards_dataset, forward_model, backwards_model, test_input_contexts)
with open('challenging-america-word-gap-prediction/test-A/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in test_preds)

run.py · 87

@@ -5,12 +5,8 @@
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoConfig
device = torch.device('cuda')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model: GPT2LMHeadModel = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)
model.to(device)
# In[2]:
@@ -23,14 +19,9 @@ def read_xz_file(fname):
return [line.strip() for line in f.readlines()]
# In[3]:
dev_input_raw = read_xz_file('dev-0/in.tsv.xz')
test_input_raw = read_xz_file('test-A/in.tsv.xz')
# In[4]:
def read_file(fname):
with open(fname, mode='rt', encoding='utf-8') as f:
return [line.strip() for line in f.readlines()]
def get_contexts(input_text):
@@ -38,13 +29,53 @@ def get_contexts(input_text):
return {'left': all_fields[6], 'right': all_fields[7]}
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
bos = '<|endoftext|>'
eos = '<|EOS|>'
def compose_sentences(raw_input, labels):
result = []
for input, label in zip(raw_input, labels):
context = get_contexts(input)
result.append(f'{bos} {context["left"]} {label} {eos}')
result.append(f'{bos} {label} {context["right"]} {eos}')
return result
# In[3]:
pad = '<|pad|>'
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
num_add_tokens = tokenizer.add_special_tokens(special_tokens_dict)
config = AutoConfig.from_pretrained('distilgpt2', bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id, output_hidden_states=False, return_dict_in_generate=True)
# In[4]:
model = GPT2LMHeadModel.from_pretrained('distilgpt2', config=config)
model.resize_token_embeddings(len(tokenizer))
device = torch.device('cuda')
model.to(device)
# In[5]:
dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_input_contexts = [get_contexts(input_text) for input_text in test_input_raw]
# In[6]:
@@ -52,16 +83,28 @@ from tqdm import tqdm
tokenizer.truncation_side = 'left'
blacklist = ['ia', 'ix', 'io',
'ik']  # These tokens repeat almost every time, and they are not actual English words.
def predict_words(dataset):
preds = []
for entry in tqdm(dataset):
text = f"{entry['left']}"
text = f"{entry['right']}"
src = tokenizer.encode(text, return_tensors="pt", truncation=True).to(device)
output = model.generate(src, max_length=len(src[0]) + 1, do_sample=True, top_k=0, temperature=0.8,
num_return_sequences=1, no_repeat_ngram_size=2)
generated_word = tokenizer.decode(output[0], skip_special_tokens=True).split(' ')[-1]
preds.append(f'{generated_word.strip()}:0.99 :0.01')
output = model.generate(torch.flip(src, dims=(1,)), max_length=len(src[0]) + 1, do_sample=True, top_k=0, temperature=0.8,
num_return_sequences=1, no_repeat_ngram_size=2, output_scores=True)
probs, idxs = torch.softmax(output.scores[0][-1], dim=0).topk(30)
current_output = ''
accumulated_probability = 0
for prob, token_id in zip(probs, idxs):
token = tokenizer.decode(token_id, skip_special_tokens=True).split(' ')[-1]
if not token.isalnum() or token in blacklist:
continue
prob_value = prob.item()
accumulated_probability += prob_value
current_output += f'{token.strip()}:{prob_value} '
current_output += f':{1 - accumulated_probability}'
preds.append(current_output)
return preds
@@ -69,12 +112,14 @@ def predict_words(dataset):
dev_preds = predict_words(dev_input_contexts)
with open('dev-0/out.tsv', 'w') as f:
with open('challenging-america-word-gap-prediction/dev-0/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in dev_preds)
# In[8]:
test_preds = predict_words(test_input_contexts)
with open('test-A/out.tsv', 'w') as f:
with open('challenging-america-word-gap-prediction/test-A/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in test_preds)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large