hf roberta (linear top layer by hand instead of hf)
parent fb6215f9c1
commit c85f1611e6
dev-0/out.tsv: 298228 changed lines (diff suppressed because it is too large)
@@ -4,12 +4,22 @@ from datasets import load_dataset
 from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
 from torch.utils.data import DataLoader
 from transformers import AutoModelForSequenceClassification
-#from transformers import AdamW
 from torch.optim import Adam
 from transformers import get_scheduler
 import torch
 from tqdm.auto import tqdm
+import os
+import pickle
+from regressor_head import RegressorHead
+
+try:
+    os.mkdir('roberta_year_prediction')
+except Exception:
+    pass
+
+def pickle_model_save(name):
+    with open(f'roberta_year_prediction/{name}', 'wb') as f:
+        pickle.dump(model,f)
 
 if TEST:
     STEPS_EVAL = 10
@@ -29,9 +39,13 @@ train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE
 eval_dataloader_small = DataLoader(eval_dataset_small, batch_size=BATCH_SIZE)
 eval_dataloader_full = DataLoader(eval_dataset_full, batch_size=BATCH_SIZE)
 
-model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)
-optimizer = Adam(model.parameters(), lr=LR)
 
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+model = RobertaModel.from_pretrained('roberta-base')
+model.regressor_head = RegressorHead(768).to('cuda')
+model.to(device)
+
+optimizer = Adam(model.parameters(), lr=LR)
 
 num_training_steps = NUM_EPOCHS * len(train_dataloader)
 #lr_scheduler = get_scheduler(
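How the pieces fit together after this change, as a minimal sketch (the tokenizer call is illustrative; only roberta-base and the RegressorHead added later in this diff are taken from the commit):

import torch
from transformers import RobertaTokenizer, RobertaModel
from regressor_head import RegressorHead

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
# assigning the head as an attribute registers it as a submodule,
# so Adam(model.parameters()) trains it and model.to(device) moves it
model.regressor_head = RegressorHead(768)

enc = tokenizer(["sample text"], return_tensors='pt', padding=True, truncation=True)
hidden = model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])[0]  # (1, seq_len, 768)
pred = model.regressor_head(hidden)  # (1, 1), pushed toward [0, 1]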
@@ -42,10 +56,6 @@ num_training_steps = NUM_EPOCHS * len(train_dataloader)
 #)
 
-
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-model.to(device)
-
 
 progress_bar = tqdm(range(num_training_steps))
 model.train()
 
@@ -55,15 +65,15 @@ model.to(device)
 def transform_batch(batch):
     batch['input_ids'] = torch.stack(batch['input_ids']).permute(1,0).to(device)
     batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1,0).to(device)
-    batch['labels'] = batch['year_scaled'].to(device).float()
+    labels = batch['year_scaled'].to(device).float()
 
-    batch['labels'].to(device)
     batch['input_ids'].to(device)
     batch['attention_mask'].to(device)
 
-    for c in set(batch.keys()) - {'input_ids', 'attention_mask', 'labels'}:
+    for c in set(batch.keys()) - {'input_ids', 'attention_mask'}:
         del batch[c]
-    return batch
+    return batch, labels
 
 
 def eval(full = False):
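transform_batch now hands the labels back separately and keeps only input_ids and attention_mask, since RobertaModel.forward, unlike the AutoModelForSequenceClassification used before, does not accept a labels argument. A short usage sketch matching the loops below:

batch, labels = transform_batch(next(iter(train_dataloader)))
hidden = model(**batch)[0]            # last_hidden_state, (batch, seq_len, 768)
preds = model.regressor_head(hidden)  # (batch, 1)
loss = criterion(preds.squeeze(), labels)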
@@ -74,12 +84,12 @@ def eval(full = False):
     items_passed = 0
     for i, batch in enumerate(dataloader):
         items_passed += len(batch)
-        batch = transform_batch(batch)
-        labels = batch['labels']
-        del batch['labels']
-        outputs = model(**batch)
-        o = soft_clip(outputs['logits']).squeeze()
-        loss = criterion(o, labels)
+        batch, labels = transform_batch(batch)
+        outputs = model(**batch)[0]
+        outputs = model.regressor_head(outputs)
+        loss = criterion(outputs.squeeze(), labels)
         eval_loss += loss.item()
     eval_loss = (eval_loss / items_passed)
     print(f'eval loss full={full}: {eval_loss:.5f}', end = '\n')
@@ -88,11 +98,6 @@ def eval(full = False):
 
 criterion = torch.nn.MSELoss(reduction='sum').to(device)
 
-lrelu = torch.nn.LeakyReLU(0.1)
-def soft_clip(t):
-    t = lrelu(t)
-    t = -lrelu(-t + 1 ) + 1
-    return t
 
 best_eval_loss = 9999
 epochs_without_progress = 0
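The module-level soft_clip is dropped here because the same LeakyReLU construction now sits inside RegressorHead.forward (the last two lines of the new file below). For reference, what it computes:

# identity on [0, 1]; slope 0.1 outside, so out-of-range values are pulled gently toward the interval
m = torch.nn.LeakyReLU(0.1)
def soft_clip(t):
    t = m(t)            # t < 0  ->  0.1 * t
    t = -m(-t + 1) + 1  # t > 1  ->  1 + 0.1 * (t - 1)
    return t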
@@ -101,12 +106,12 @@ for epoch in range(NUM_EPOCHS):
     items_passed = 0
     for i, batch in enumerate(train_dataloader):
         items_passed += len(batch)
-        batch = transform_batch(batch)
-        labels = batch['labels']
-        del batch['labels']
-        outputs = model(**batch)
-        o = soft_clip(outputs['logits']).squeeze()
-        loss = criterion(o, labels)
+        batch, labels = transform_batch(batch)
+        outputs = model(**batch)[0]
+        outputs = model.regressor_head(outputs)
+        loss = criterion(outputs.squeeze(), labels)
         loss.backward()
         train_loss += loss.item()
         progress_bar.update(1)
@@ -123,11 +128,13 @@ for epoch in range(NUM_EPOCHS):
             eval(full = False)
 
     eval_loss = eval(full=True)
-    model.save_pretrained(f'roberta_year_prediction/epoch_{epoch}_loss{eval_loss:.5f}')
-    model.save_pretrained(f'roberta_year_prediction/epoch_last')
+    pickle_model_save(f'epoch_{epoch}')
+    pickle_model_save(f'epoch_last')
 
     if eval_loss < best_eval_loss:
-        model.save_pretrained(f'roberta_year_prediction/epoch_best')
+        pickle_model_save(f'epoch_best')
         print('\nsaving best model')
         best_eval_loss = eval_loss
     else:
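Checkpointing switches from save_pretrained to pickling the whole model object; presumably because RobertaModel.from_pretrained would not recreate the hand-attached regressor_head attribute on load (an assumption about intent, but consistent with the prediction script below, which unpickles the object). The round trip as used in this commit:

import pickle

# save (training script)
with open('roberta_year_prediction/epoch_best', 'wb') as f:
    pickle.dump(model, f)

# load (prediction script)
with open('roberta_year_prediction/epoch_best', 'rb') as f:
    model = pickle.load(f)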
@@ -1,6 +1,7 @@
 import pickle
 import torch
-from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer, RobertaModel, RobertaTokenizer
+from regressor_head import RegressorHead
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 
@@ -11,12 +12,14 @@ with open('test_dataset_A.pickle','rb') as f_p:
     test_dataset = pickle.load(f_p)
 
 device = 'cuda'
-model = AutoModelForSequenceClassification.from_pretrained('./roberta_year_prediction/epoch_best')
+with open('./roberta_year_prediction/epoch_best', 'rb') as f:
+    model = pickle.load(f)
+
 model.eval()
 model.to(device)
 
 lrelu = torch.nn.LeakyReLU(0.0)
-def soft_clip(t):
+def hard_clip(t):
     t = lrelu(t)
     t = -lrelu(-t + 1 ) + 1
     return t
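At inference time the clip uses LeakyReLU(0.0), which is just ReLU, so hard_clip is an exact clamp to [0, 1] rather than the soft version used during training. A quick sanity check:

t = torch.tensor([-0.3, 0.4, 1.7])
assert torch.allclose(hard_clip(t), torch.clamp(t, 0.0, 1.0))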
@@ -25,7 +28,7 @@ with open('scalers.pickle', 'rb') as f_scaler:
     scalers = pickle.load(f_scaler)
 
 def predict(dataset, out_f):
-    eval_dataloader = DataLoader(dataset, batch_size=50)
+    eval_dataloader = DataLoader(dataset, batch_size=20)
     outputs = []
 
     progress_bar = tqdm(range(len(eval_dataloader)))
@@ -33,15 +36,18 @@ def predict(dataset, out_f):
     for batch in eval_dataloader:
         batch['input_ids'] = torch.stack(batch['input_ids']).permute(1,0).to(device)
         batch['attention_mask'] = torch.stack(batch['attention_mask']).permute(1,0).to(device)
-        batch['labels'] = batch['year_scaled'].to(device).float()
 
-        batch['labels'].to(device)
         batch['input_ids'].to(device)
         batch['attention_mask'].to(device)
 
         for c in set(batch.keys()) - {'input_ids', 'attention_mask', 'labels'}:
             del batch[c]
-        outputs.extend(soft_clip(model(**batch).logits).tolist())
+        o = model(**batch)[0]
+        o = model.regressor_head(o)
+        o = hard_clip(o)
+
+        outputs.extend(o.tolist())
         progress_bar.update(1)
     outputs_transformed = scalers['year'].inverse_transform(outputs)
 
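The head appears to predict the year on the scaled [0, 1] axis, and scalers['year'].inverse_transform maps the clipped predictions back to calendar years. A toy round trip, assuming scalers['year'] is a fitted scikit-learn MinMaxScaler (the actual scaler and year range come from scalers.pickle and are not shown in this diff):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(np.array([[1900.0], [2000.0]]))       # toy year range, for illustration only
scaled = scaler.transform(np.array([[1975.0]]))  # -> [[0.75]]
years = scaler.inverse_transform(scaled)         # -> [[1975.0]]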
hf_roberta_base/regressor_head.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+import torch
+
+class RegressorHead(torch.nn.Module):
+    def __init__(self, in_dim):
+        super(RegressorHead, self).__init__()
+        self.linear = torch.nn.Linear(in_dim, 1)
+        self.m = torch.nn.LeakyReLU(0.1)
+    def forward(self, x):
+        x = x.mean(1)
+        x = self.linear(x)
+        x = self.m(x)
+        x = - self.m(-x + 1 ) +1
+        return x
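A small standalone check of the new head (shapes only; 768 matches roberta-base's hidden size):

import torch
from regressor_head import RegressorHead

head = RegressorHead(768)
hidden = torch.randn(2, 128, 768)  # (batch, seq_len, hidden), as returned by RobertaModel(...)[0]
out = head(hidden)                 # mean over tokens -> linear -> soft clip toward [0, 1]
print(out.shape)                   # torch.Size([2, 1])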
test-A/out.tsv: 296564 changed lines (diff suppressed because it is too large)