Compare commits

...

2 Commits
master ... gpt2

8 changed files with 17999 additions and 54053 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,233 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoConfig
# In[2]:
import lzma
def read_xz_file(fname):
    """Return the lines of an xz-compressed UTF-8 text file.

    Every line has its leading and trailing whitespace removed with
    ``str.strip`` before it is added to the returned list.
    """
    stripped_lines = []
    with lzma.open(fname, mode='rt', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines
def read_file(fname):
    """Return the whitespace-stripped lines of a UTF-8 text file as a list."""
    with open(fname, mode='rt', encoding='utf-8') as handle:
        return list(map(str.strip, handle))
def get_contexts(input_text):
    """Extract the left and right contexts from one raw tab-separated row.

    Every literal backslash-plus-``n`` sequence in the row is replaced by a
    space before the row is split on tab characters.

    Args:
        input_text: One raw input row.

    Returns:
        A dict with key ``'left'`` (column index 6) and key ``'right'``
        (column index 7).

    Raises:
        IndexError: If the row has fewer than eight tab-separated columns.
    """
    all_fields = input_text.replace(r'\n', ' ').split('\t')
    return {'left': all_fields[6], 'right': all_fields[7]}


# Markers wrapped around every training sentence.
bos = '<|endoftext|>'
eos = '<|EOS|>'


def compose_sentences(raw_input, labels):
    """Build two training sentences per (row, label) pair.

    For each pair the result contains, in this order:
    ``"<bos> <left context> <label> <eos>"`` and
    ``"<bos> <label> <right context> <eos>"``.

    Args:
        raw_input: Iterable of raw tab-separated input rows.
        labels: Iterable of label strings aligned with ``raw_input``; pairs
            are formed with ``zip``, so surplus items on either side are
            ignored.

    Returns:
        A list holding two sentences for every pair.
    """
    result = []
    for row, label in zip(raw_input, labels):
        context = get_contexts(row)
        # The label belongs next to the context; the original interpolated
        # the whole raw row here and never used the label at all.
        result.append(f'{bos} {context["left"]} {label} {eos}')
        result.append(f'{bos} {label} {context["right"]} {eos}')
    return result
# In[6]:
# Special-token strings handed to the tokenizer: `eos` and `bos` are defined
# earlier in the file, `pad` is introduced here.
pad = '<|pad|>'
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}
# Load the pretrained distilgpt2 tokenizer and register the special tokens on it.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# presumably the number of tokens newly added to the vocabulary — confirm
# against the transformers documentation
num_add_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# In[4]:
class AmericaDataset(Dataset):
    """Dataset that tokenizes every text entry once, at construction time.

    Each entry is passed to ``tokenizer.encode(entry, padding=True)`` and the
    result is stored as a ``torch.tensor``; indexing returns that tensor.
    """

    def __init__(self, tokenizer, data):
        self.tokenizer = tokenizer
        self.sentences = [
            torch.tensor(self.tokenizer.encode(text, padding=True))
            for text in data
        ]

    def __getitem__(self, item):
        return self.sentences[item]

    def __len__(self):
        return len(self.sentences)
# In[5]:
# Read the xz-compressed training inputs and the plain-text expected.tsv
# (presumably one expected gap word per input row — confirm against the data).
train_input_raw = read_xz_file('challenging-america-word-gap-prediction/train/in.tsv.xz')
train_labels = read_file('challenging-america-word-gap-prediction/train/expected.tsv')
# Turn the (row, label) pairs into training sentences, then tokenize them all
# up front inside the dataset.
train_sentences = compose_sentences(train_input_raw, train_labels)
train_dataset = AmericaDataset(tokenizer, train_sentences)
# In[11]:
# Model config based on distilgpt2, with the special-token ids taken from the
# tokenizer. return_dict_in_generate=True is needed because predict_words
# reads `output.scores` from the value returned by model.generate.
config = AutoConfig.from_pretrained('distilgpt2', bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id, output_hidden_states=False, return_dict_in_generate=True)
model = GPT2LMHeadModel.from_pretrained('distilgpt2', config=config)
# Special tokens were added to the tokenizer, so the embedding matrix is
# resized to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): the device is hard-coded to CUDA; there is no CPU fallback here.
device = torch.device('cuda')
model.to(device)
# In[8]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into ``packed_tensor`` along dimension 1.

    Returns a ``(tensor, packed_ok, remainder)`` triple:

    * nothing packed yet (``packed_tensor is None``):
      ``(new_tensor, True, None)``;
    * the combined size along dimension 1 exceeds ``max_seq_len``:
      ``(packed_tensor, False, new_tensor)``, i.e. the new tensor is handed
      back untouched as the remainder;
    * otherwise: ``new_tensor`` followed by ``packed_tensor`` without its
      first column, with ``True`` and ``None``.
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size()[1] + packed_tensor.size()[1]
    if combined_len <= max_seq_len:
        merged = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return merged, True, None
    return packed_tensor, False, new_tensor
# In[9]:
import os
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
def train(
    model,
    dataset,
    batch_size=16, epochs=5, lr=2e-5,
    warmup_steps=200,
    output_dir=".", output_prefix="gpt2",
    save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` using packed sequences.

    Entries are drawn one at a time in shuffled order and merged with
    ``pack_tensor`` into sequences of at most 1024 tokens. Each packed
    sequence is fed to the model as both input and labels, and the first
    element of the model output is back-propagated as the loss.

    Args:
        model: Model to train; it is moved to the CUDA device.
        dataset: Dataset whose items are token-id tensors.
        batch_size: Number of packed sequences between optimizer steps.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate passed to ``AdamW``.
        warmup_steps: Warm-up steps of the linear schedule.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        save_model_on_epoch: When true, save ``state_dict()`` after each
            epoch as ``<output_prefix>-<epoch>.pt`` in ``output_dir``.

    Returns:
        The trained model.
    """
    max_seq_len = 1024
    device = torch.device("cuda")
    model = model.to(device)
    model.train()

    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=lr)

    # The linear schedule scales the learning rate by
    # max(0, (total - step) / (total - warmup)) after warm-up, so a
    # non-positive total (the original passed -1) pins the rate at 0 as soon
    # as warm-up ends. Packing makes the exact number of optimizer steps
    # unknown in advance, so use the upper bound "one packed sequence per
    # dataset entry".
    max_packed_batches = epochs * len(dataloader)
    total_steps = max(1, -(-max_packed_batches // batch_size))
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    accumulating_batch_count = 0
    input_tensor = None

    def run_packed_batch(packed):
        """Forward/backward one packed sequence; step every ``batch_size``."""
        nonlocal accumulating_batch_count
        packed = packed.to(device)
        outputs = model(packed, labels=packed)
        batch_loss = outputs[0]
        batch_loss.backward()
        # Gradients accumulate across packed sequences. The step fires when
        # the running count is a multiple of batch_size, which includes the
        # very first sequence (count 0), as in the original code.
        if (accumulating_batch_count % batch_size) == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()
        accumulating_batch_count += 1
        return batch_loss

    last_index = len(dataloader) - 1
    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(dataloader)):
            # NOTE(review): pack_tensor returns a lone entry unchanged, so an
            # entry longer than max_seq_len reaches the model as is — confirm
            # that dataset entries fit the model's context window.
            (input_tensor, carry_on, remainder) = pack_tensor(
                entry, input_tensor, max_seq_len
            )
            if carry_on and idx != last_index:
                continue

            loss = run_packed_batch(input_tensor)
            # Start the next pack from the entry that did not fit. The
            # original reset to None here and silently dropped that entry.
            input_tensor = remainder

        # If the last entry of the epoch did not fit, it is still pending.
        if input_tensor is not None:
            loss = run_packed_batch(input_tensor)
            input_tensor = None

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model
# In[12]:
# Fine-tune the model on the training dataset with train()'s default arguments.
model = train(model, train_dataset)
# In[3]:
# Read the dev-0 and test-A inputs and extract the left/right context of
# every row with get_contexts.
dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_input_contexts = [get_contexts(input_text) for input_text in test_input_raw]
# In[15]:
from tqdm import tqdm
# Truncate over-long inputs from the left so the tokens nearest the gap survive.
tokenizer.truncation_side = 'left'
# These tokens almost always repeat and are not words in the English language.
blacklist = ['ia', 'ix', 'io', 'ik']


def predict_words(dataset):
    """Produce one prediction line per entry from its left context.

    For every entry the left context is encoded, one extra token is
    generated, and the 50 highest softmax entries of the returned scores are
    decoded. Candidates that are not alphanumeric or that appear in
    ``blacklist`` are skipped. Each line has the form
    ``"word:prob word:prob ... :rest"``, where ``rest`` is one minus the sum
    of the probabilities that were kept.

    Args:
        dataset: Iterable of dicts with a ``'left'`` key.

    Returns:
        A list of prediction lines, one per entry.
    """
    preds = []
    for entry in tqdm(dataset):
        text = f"{entry['left']}"
        src = tokenizer.encode(text, return_tensors="pt", truncation=True).to(device)
        output = model.generate(
            src,
            max_length=len(src[0]) + 1,
            do_sample=True,
            top_k=0,
            temperature=0.8,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            output_scores=True,
        )
        probs, idxs = torch.softmax(output.scores[0][-1], dim=0).topk(50)

        pieces = []
        accumulated_probability = 0
        for prob, token_id in zip(probs, idxs):
            decoded = tokenizer.decode(token_id, skip_special_tokens=True)
            token = decoded.split(' ')[-1]
            if token in blacklist or not token.isalnum():
                continue
            prob_value = prob.item()
            accumulated_probability += prob_value
            pieces.append(f'{token.strip()}:{prob_value} ')

        # Whatever probability mass was not assigned goes to the wildcard.
        preds.append(''.join(pieces) + f':{1 - accumulated_probability}')
    return preds
# In[ ]:
# Predict for dev-0 and write one prediction line per input row.
dev_preds = predict_words(dev_input_contexts)
# Write UTF-8 explicitly: the inputs are read as UTF-8 and predicted tokens may
# be non-ASCII, while open()'s default encoding depends on the platform locale.
with open('challenging-america-word-gap-prediction/dev-0/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in dev_preds)
# In[ ]:
# Same for test-A.
test_preds = predict_words(test_input_contexts)
with open('challenging-america-word-gap-prediction/test-A/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test_preds)

87
run.py
View File

@ -5,12 +5,8 @@
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoConfig
device = torch.device('cuda')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model: GPT2LMHeadModel = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)
model.to(device)
# In[2]:
@ -23,14 +19,9 @@ def read_xz_file(fname):
return [line.strip() for line in f.readlines()]
# In[3]:
dev_input_raw = read_xz_file('dev-0/in.tsv.xz')
test_input_raw = read_xz_file('test-A/in.tsv.xz')
# In[4]:
def read_file(fname):
    """Return the whitespace-stripped lines of a UTF-8 text file as a list."""
    with open(fname, mode='rt', encoding='utf-8') as handle:
        return list(map(str.strip, handle))
def get_contexts(input_text):
@ -38,13 +29,53 @@ def get_contexts(input_text):
return {'left': all_fields[6], 'right': all_fields[7]}
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
# Markers wrapped around every training sentence.
bos = '<|endoftext|>'
eos = '<|EOS|>'


def compose_sentences(raw_input, labels):
    """Build two training sentences per (row, label) pair.

    For each pair the result contains, in this order:
    ``"<bos> <left context> <label> <eos>"`` and
    ``"<bos> <label> <right context> <eos>"``, with the contexts obtained
    from ``get_contexts``.

    Args:
        raw_input: Iterable of raw tab-separated input rows.
        labels: Iterable of label strings aligned with ``raw_input``; pairs
            are formed with ``zip``, so surplus items on either side are
            ignored.

    Returns:
        A list holding two sentences for every pair.
    """
    result = []
    for row, label in zip(raw_input, labels):
        context = get_contexts(row)
        # The label belongs next to the context; the original interpolated
        # the whole raw row here and never used the label at all.
        result.append(f'{bos} {context["left"]} {label} {eos}')
        result.append(f'{bos} {label} {context["right"]} {eos}')
    return result
# In[3]:
# Special-token strings handed to the tokenizer: `eos` and `bos` are defined
# above, `pad` is introduced here.
pad = '<|pad|>'
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}
# Load the pretrained distilgpt2 tokenizer and register the special tokens on it.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# presumably the number of tokens newly added to the vocabulary — confirm
# against the transformers documentation
num_add_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# Model config based on distilgpt2, with the special-token ids taken from the
# tokenizer. return_dict_in_generate=True is needed because predict_words
# reads `output.scores` from the value returned by model.generate.
config = AutoConfig.from_pretrained('distilgpt2', bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id, output_hidden_states=False, return_dict_in_generate=True)
# In[4]:
model = GPT2LMHeadModel.from_pretrained('distilgpt2', config=config)
# Special tokens were added to the tokenizer, so the embedding matrix is
# resized to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))
# NOTE(review): the device is hard-coded to CUDA; there is no CPU fallback here.
device = torch.device('cuda')
model.to(device)
# In[5]:
# Read the dev-0 and test-A inputs and extract the left/right context of
# every row with get_contexts.
dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_input_contexts = [get_contexts(input_text) for input_text in dev_input_raw]
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_input_contexts = [get_contexts(input_text) for input_text in test_input_raw]
# In[6]:
@ -52,16 +83,28 @@ from tqdm import tqdm
# Truncate over-long inputs from the left side.
tokenizer.truncation_side = 'left'
blacklist = ['ia', 'ix', 'io',
'ik'] # These tokens almost always repeat and are not words in the English language.
# NOTE(review): this listing comes from a diff view without +/- markers, so
# lines of the removed and of the added version of predict_words appear to be
# interleaved below — confirm which lines are current before editing.
def predict_words(dataset):
preds = []
for entry in tqdm(dataset):
# NOTE(review): the two assignments look like the removed ('left') and the
# added ('right') variant of the same line — confirm.
text = f"{entry['left']}"
text = f"{entry['right']}"
src = tokenizer.encode(text, return_tensors="pt", truncation=True).to(device)
# NOTE(review): the next four lines (generate, decode the last word, append a
# fixed ':0.99 :0.01' prediction) appear to be the removed version — confirm.
output = model.generate(src, max_length=len(src[0]) + 1, do_sample=True, top_k=0, temperature=0.8,
num_return_sequences=1, no_repeat_ngram_size=2)
generated_word = tokenizer.decode(output[0], skip_special_tokens=True).split(' ')[-1]
preds.append(f'{generated_word.strip()}:0.99 :0.01')
# Generate from the token sequence reversed along dim 1 (torch.flip) and take
# the 30 highest softmax entries of the returned scores.
output = model.generate(torch.flip(src, dims=(1,)), max_length=len(src[0]) + 1, do_sample=True, top_k=0, temperature=0.8,
num_return_sequences=1, no_repeat_ngram_size=2, output_scores=True)
probs, idxs = torch.softmax(output.scores[0][-1], dim=0).topk(30)
current_output = ''
accumulated_probability = 0
for prob, token_id in zip(probs, idxs):
# Keep only the last space-separated piece of the decoded token.
token = tokenizer.decode(token_id, skip_special_tokens=True).split(' ')[-1]
# Skip candidates that are not alphanumeric or that are blacklisted.
if not token.isalnum() or token in blacklist:
continue
prob_value = prob.item()
accumulated_probability += prob_value
current_output += f'{token.strip()}:{prob_value} '
# The trailing ':<p>' entry carries one minus the sum of the kept probabilities.
current_output += f':{1 - accumulated_probability}'
preds.append(current_output)
return preds
@ -69,12 +112,14 @@ def predict_words(dataset):
# Predict for dev-0 and write one prediction line per input row.
dev_preds = predict_words(dev_input_contexts)
# NOTE(review): diff view — the two `with open(...)` lines look like the
# removed and the added variant of the same statement (old vs new output
# path); confirm which one is current.
with open('dev-0/out.tsv', 'w') as f:
with open('challenging-america-word-gap-prediction/dev-0/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in dev_preds)
# In[8]:
# Same for test-A (again with an old and a new output path listed).
test_preds = predict_words(test_input_contexts)
with open('test-A/out.tsv', 'w') as f:
with open('challenging-america-word-gap-prediction/test-A/out.tsv', 'w') as f:
f.writelines(line + '\n' for line in test_preds)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff