roberta batched training and prediction
This commit is contained in:
parent 85cee1421c
commit 46e06b748e
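The commit switches feature extraction from one sentence at a time to the batched pattern from fairseq's RoBERTa hub interface: encode each text, truncate to 512 tokens, pad the batch with collate_tokens, run extract_features once, and mean-pool over tokens. A minimal, self-contained sketch of that pattern, assuming the 'roberta.base' hub model is available (the example sentences are made up; the pad index of 1 and the [:512] truncation mirror the diff below):

import torch
from fairseq.data.data_utils import collate_tokens

# Pretrained RoBERTa from the PyTorch hub (downloads the checkpoint on first use).
roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()

sentences = ['First example sentence.', 'A much longer second example sentence.', 'Third one.']

# Encode each sentence to BPE token ids, truncate to RoBERTa's 512-token limit,
# and right-pad the batch with the pad index (1) so all rows share one length.
batch = collate_tokens([roberta.encode(s)[:512] for s in sentences], pad_idx=1)

with torch.no_grad():
    # extract_features returns (batch, seq_len, 768) for roberta.base; averaging over
    # the token dimension gives one vector per sentence (padding positions are
    # included in this mean, exactly as in the diff).
    features = roberta.extract_features(batch).mean(1)

print(features.shape)  # torch.Size([3, 768])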
@@ -1,196 +0,0 @@
-import os
-import torch
-import random
-import copy
-from fairseq.models.roberta import RobertaModel, RobertaHubInterface
-from fairseq import hub_utils
-from fairseq.data.data_utils import collate_tokens
-from tqdm import tqdm
-import numpy as np
-from sklearn.preprocessing import MinMaxScaler
-
-
-EVAL_OFTEN = True
-EVAL_EVERY = 10000
-
-
-roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
-roberta.cuda()
-device='cuda'
-
-
-train_in = [l.rstrip('\n') for l in open('../train/in.tsv',newline='\n').readlines()] # shuffled
-dev_in = [l.rstrip('\n') for l in open('../dev-0/in.tsv',newline='\n').readlines()] # shuffled
-
-train_year = [float(l.rstrip('\n')) for l in open('../train/expected.tsv',newline='\n').readlines()]
-dev_year = [float(l.rstrip('\n')) for l in open('../dev-0/expected.tsv',newline='\n').readlines()]
-
-dev_in_not_shuffled = copy.deepcopy(dev_in) # not shuffled
-test_in = [l.rstrip('\n') for l in open('../test-A/in.tsv',newline='\n').readlines()] # not shuffled
-
-c = list(zip(train_in,train_year))
-random.shuffle(c)
-train_in, train_year = zip(*c)
-c = list(zip(dev_in,dev_year))
-random.shuffle(c)
-dev_in, dev_year = zip(*c)
-
-scaler = MinMaxScaler()
-
-train_year_scaled = scaler.fit_transform(np.array(train_year).reshape(-1,1))
-dev_year_scaled = scaler.transform(np.array(dev_year).reshape(-1,1))
-
-
-class RegressorHead(torch.nn.Module):
-    def __init__(self):
-        super(RegressorHead, self).__init__()
-        self.linear1 = torch.nn.Linear(768,300)
-        self.linear2 = torch.nn.Linear(300,1)
-        self.linearxxx = torch.nn.Linear(768,1)
-        self.dropout1 = torch.nn.Dropout(0.0)
-        self.dropout2 = torch.nn.Dropout(0.0)
-        self.m = torch.nn.LeakyReLU(0.1)
-    def forward(self,x):
-        #x = self.dropout1(x)
-        #x = self.linear1(x)
-        #x = self.dropout2(x)
-        x = self.linearxxx(x)
-        x = self.m(x)
-        x = -self.m(-x +1 ) +1
-        return x
-
-regressor_head = RegressorHead().to(device)
-
-optimizer = torch.optim.Adam(list(roberta.parameters()) + list(regressor_head.parameters()), lr = 1e-6)
-criterion = torch.nn.MSELoss(reduction='sum').to(device)
-
-BATCH_SIZE = 1
-def get_train_batch(dataset_in,dataset_y):
-    for i in tqdm(range(0,len(dataset_in), BATCH_SIZE)):
-        batch_of_text = dataset_in[i:i+BATCH_SIZE]
-
-        #batch = collate_tokens([roberta.encode(p)[:512] for p in batch_of_text], pad_idx=1)
-        batch = roberta.encode(batch_of_text[0])
-        output= None
-        for j in range(0,1,512): # only first 512 tokens instead of all
-            if output is None:
-                output = roberta.extract_features(batch[j:j+512])
-            else:
-                output_new = roberta.extract_features(batch[j:j+512])
-                output = torch.cat((output, output_new),1)
-        features = torch.mean(output,1)
-        years = torch.FloatTensor(dataset_y[i:i+BATCH_SIZE]).to(device).squeeze()
-
-        yield features, years
-
-
-def eval():
-    criterion_eval = torch.nn.MSELoss(reduction='sum')
-    roberta.eval()
-    regressor_head.eval()
-    loss = 0.0
-    loss_clipped = 0.0
-    loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in,dev_year_scaled)):
-
-        x = regressor_head(batch.to(device)).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
-        original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
-
-        loss_scaled += criterion_eval(x, year).item()
-        loss += criterion_eval(original_x, original_year).item()
-        loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print(' full valid loss scaled: ' + str(np.sqrt(loss_scaled/len(dev_year))))
-    print(' full valid loss: ' + str(np.sqrt(loss/len(dev_year))))
-    print(' full valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
-
-def eval_short():
-    criterion_eval = torch.nn.MSELoss(reduction='sum')
-    roberta.eval()
-    regressor_head.eval()
-    loss = 0.0
-    loss_clipped = 0.0
-    loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in[:1000],dev_year_scaled[:1000])):
-
-        x = regressor_head(batch.to(device)).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
-        original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
-
-        loss_scaled += criterion_eval(x, year).item()
-        loss += criterion_eval(original_x, original_year).item()
-        loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print('valid loss scaled: ' + str(np.sqrt(loss_scaled/1000)))
-    print('valid loss: ' + str(np.sqrt(loss/1000)))
-    print('valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
-
-
-def train_one_epoch():
-    roberta.train()
-    regressor_head.train()
-    loss_value=0.0
-    iteration = 0
-    for batch, year in get_train_batch(train_in,train_year_scaled):
-        iteration +=1
-        roberta.zero_grad()
-        regressor_head.zero_grad()
-        #import pdb; pdb.set_trace()
-
-        x = regressor_head(batch.to(device)).squeeze()
-
-        loss = criterion(x, year)
-        loss_value += loss.item()
-        loss.backward()
-        optimizer.step()
-
-        roberta.zero_grad()
-        regressor_head.zero_grad()
-
-        if EVAL_OFTEN and (iteration > 1) and (iteration % EVAL_EVERY == 1):
-            print('train loss: ' + str(np.sqrt(loss_value / EVAL_EVERY)))
-            eval_short()
-            roberta.train()
-            regressor_head.train()
-            loss_value = 0.0
-    #print('train loss: ' + str(loss_value/len(train_year)))
-
-
-def predict_dev():
-    roberta.eval()
-    regressor_head.eval()
-    f_out = open('../dev-0/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(dev_in_not_shuffled,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-        original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
-        for y in original_x_clipped[0]:
-            f_out.write(str(y) + '\n')
-    f_out.close()
-
-def predict_test():
-    roberta.eval()
-    regressor_head.eval()
-    f_out = open('../test-A/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(test_in,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-        original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
-        for y in original_x_clipped[0]:
-            f_out.write(str(y) + '\n')
-    f_out.close()
-
-
-roberta.load_state_dict(torch.load('checkpoints/roberta_to_regressor3.pt'))
-regressor_head.load_state_dict(torch.load('checkpoints/regressor_head3.pt'))
-predict_dev()
-predict_test()
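Both the removed script above and the updated one below keep the same output activation in RegressorHead: a LeakyReLU applied twice, x = m(x) followed by x = -m(-x + 1) + 1, which acts as a "leaky clamp" to [0, 1] — identity inside the range, slope 0.1 below 0 and above 1, so predictions outside the scaled target range still receive a gradient. A standalone sketch (values checked by hand) of what that pair of calls computes:

import torch

m = torch.nn.LeakyReLU(0.1)

def leaky_clamp(x):
    # Same two lines as RegressorHead.forward: identity on [0, 1],
    # slope 0.1 outside, so out-of-range outputs are squashed but not flat.
    x = m(x)
    return -m(-x + 1) + 1

xs = torch.tensor([-1.0, 0.0, 0.5, 1.0, 2.0])
print(leaky_clamp(xs))  # -> [-0.1, 0.0, 0.5, 1.0, 1.1]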
@@ -11,15 +11,18 @@ from sklearn.preprocessing import MinMaxScaler
 
 
 EVAL_OFTEN = True
-EVAL_EVERY = 10000
+EVAL_EVERY = 50
+BATCH_SIZE = 3
+model_type = 'base' # base or large
 
 
-roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+roberta = torch.hub.load('pytorch/fairseq', f'roberta.{model_type}')
 roberta.cuda()
 device='cuda'
 
 
+# LOAD DATA
 train_in = [l.rstrip('\n') for l in open('../train/in.tsv',newline='\n').readlines()] # shuffled
 dev_in = [l.rstrip('\n') for l in open('../dev-0/in.tsv',newline='\n').readlines()] # shuffled
 
@@ -29,6 +32,7 @@ dev_year = [float(l.rstrip('\n')) for l in open('../dev-0/expected.tsv',newline=
 dev_in_not_shuffled = copy.deepcopy(dev_in) # not shuffled
 test_in = [l.rstrip('\n') for l in open('../test-A/in.tsv',newline='\n').readlines()] # not shuffled
 
+# SHUFFLE DATA
 c = list(zip(train_in,train_year))
 random.shuffle(c)
 train_in, train_year = zip(*c)
@@ -36,8 +40,8 @@ c = list(zip(dev_in,dev_year))
 random.shuffle(c)
 dev_in, dev_year = zip(*c)
 
-
+# SCALE DATA
 scaler = MinMaxScaler()
 
 train_year_scaled = scaler.fit_transform(np.array(train_year).reshape(-1,1))
 dev_year_scaled = scaler.transform(np.array(dev_year).reshape(-1,1))
@@ -45,107 +49,75 @@ dev_year_scaled = scaler.transform(np.array(dev_year).reshape(-1,1))
 class RegressorHead(torch.nn.Module):
     def __init__(self):
         super(RegressorHead, self).__init__()
-        self.linear1 = torch.nn.Linear(768,300)
-        self.linear2 = torch.nn.Linear(300,1)
-        self.linearxxx = torch.nn.Linear(768,1)
-        self.dropout1 = torch.nn.Dropout(0.0)
-        self.dropout2 = torch.nn.Dropout(0.0)
-        self.m = torch.nn.LeakyReLU(0.1)
-    def forward(self,x):
-        #x = self.dropout1(x)
-        #x = self.linear1(x)
-        #x = self.dropout2(x)
-        x = self.linearxxx(x)
+        in_dim = 768 if model_type == 'base' else 1024
+        self.linear = torch.nn.Linear(in_dim, 1)
+        self.m = torch.nn.LeakyReLU(0.1)
+    def forward(self, x):
+        x = self.linear(x)
         x = self.m(x)
-        x = -self.m(-x +1 ) +1
+        x = - self.m(-x + 1 ) +1
         return x
 
-regressor_head = RegressorHead().to(device)
-
-optimizer = torch.optim.Adam(list(roberta.parameters()) + list(regressor_head.parameters()), lr=1e-6)
-criterion = torch.nn.MSELoss(reduction='sum').to(device)
-
-BATCH_SIZE = 1
-def get_train_batch(dataset_in,dataset_y):
+def get_features_and_year(dataset_in,dataset_y):
     for i in tqdm(range(0,len(dataset_in), BATCH_SIZE)):
         batch_of_text = dataset_in[i:i+BATCH_SIZE]
 
-        #batch = collate_tokens([roberta.encode(p)[:512] for p in batch_of_text], pad_idx=1)
-        batch = roberta.encode(batch_of_text[0])
-        output= None
-        for j in range(0,1,512): # only first 512 tokens instead of all
-            if output is None:
-                output = roberta.extract_features(batch[j:j+512])
-            else:
-                output_new = roberta.extract_features(batch[j:j+512])
-                output = torch.cat((output, output_new),1)
-        features = torch.mean(output,1)
-        years = torch.FloatTensor(dataset_y[i:i+BATCH_SIZE]).to(device).squeeze()
+        batch = collate_tokens([roberta.encode(p)[:512] for p in batch_of_text], pad_idx=1)
+        features = roberta.extract_features(batch).mean(1)
+        years = torch.FloatTensor(dataset_y[i:i+BATCH_SIZE]).to(device)
 
         yield features, years
 
 
-def eval():
+def eval_dev(short=False):
     criterion_eval = torch.nn.MSELoss(reduction='sum')
     roberta.eval()
     regressor_head.eval()
+
     loss = 0.0
     loss_clipped = 0.0
     loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in,dev_year_scaled)):
-
-        x = regressor_head(batch.to(device)).squeeze()
+    if short:
+        dataset_in = dev_in[:1000]
+        dataset_years = dev_year_scaled[:1000]
+    else:
+        dataset_in = dev_in
+        dataset_years = dev_year_scaled
+
+    predictions_sum = 0
+    for batch, year in tqdm(get_features_and_year(dataset_in, dataset_years)):
+
+        predictions_sum += year.shape[0]
+        x = regressor_head(batch.to(device))
         x_clipped = torch.clamp(x,0.0,1.0)
 
         original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
         original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
         original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
 
         loss_scaled += criterion_eval(x, year).item()
         loss += criterion_eval(original_x, original_year).item()
         loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print(' full valid loss scaled: ' + str(np.sqrt(loss_scaled/len(dev_year))))
-    print(' full valid loss: ' + str(np.sqrt(loss/len(dev_year))))
-    print(' full valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
-
-def eval_short():
-    criterion_eval = torch.nn.MSELoss(reduction='sum')
-    roberta.eval()
-    regressor_head.eval()
-    loss = 0.0
-    loss_clipped = 0.0
-    loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in[:1000],dev_year_scaled[:1000])):
-
-        x = regressor_head(batch.to(device)).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
-        original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
-
-        loss_scaled += criterion_eval(x, year).item()
-        loss += criterion_eval(original_x, original_year).item()
-        loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print('valid loss scaled: ' + str(np.sqrt(loss_scaled/1000)))
-    print('valid loss: ' + str(np.sqrt(loss/1000)))
-    print('valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
+    print('valid loss scaled: ' + str(np.sqrt(loss_scaled/predictions_sum)))
+    print('valid loss: ' + str(np.sqrt(loss/predictions_sum)))
+    print('valid loss clipped: ' + str(np.sqrt(loss_clipped/predictions_sum)))
 
 
 def train_one_epoch():
     roberta.train()
     regressor_head.train()
     loss_value=0.0
     iteration = 0
-    for batch, year in get_train_batch(train_in,train_year_scaled):
+    for batch, year in get_features_and_year(train_in,train_year_scaled):
         iteration +=1
         roberta.zero_grad()
         regressor_head.zero_grad()
-        #import pdb; pdb.set_trace()
 
-        x = regressor_head(batch.to(device)).squeeze()
+        predictions = regressor_head(batch.to(device))
 
-        loss = criterion(x, year)
+        loss = criterion(predictions, year)
         loss_value += loss.item()
         loss.backward()
         optimizer.step()
@@ -155,48 +127,53 @@ def train_one_epoch():
 
 
         if EVAL_OFTEN and (iteration > 1) and (iteration % EVAL_EVERY == 1):
-            print('train loss: ' + str(np.sqrt(loss_value / EVAL_EVERY)))
-            eval_short()
+            print('train loss: ' + str(np.sqrt(loss_value / (EVAL_EVERY*BATCH_SIZE))))
+            eval_dev(True)
             roberta.train()
             regressor_head.train()
             loss_value = 0.0
-    #print('train loss: ' + str(loss_value/len(train_year)))
 
 
-def predict_dev():
+def predict(dataset='dev'):
+    if dataset=='dev':
+        f_out_path = '../dev-0/out.tsv'
+        dataset_in_not_shuffled = dev_in_not_shuffled
+    elif dataset=='test':
+        f_out_path = '../test-A/out.tsv'
+        dataset_in_not_shuffled = test_in
     roberta.eval()
     regressor_head.eval()
-    f_out = open('../dev-0/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(dev_in_not_shuffled,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
+    f_out = open(f_out_path,'w')
+    for batch, year in tqdm(get_features_and_year(dataset_in_not_shuffled, dev_year_scaled)):
+        x = regressor_head(batch)
         x_clipped = torch.clamp(x,0.0,1.0)
         original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
         for y in original_x_clipped[0]:
             f_out.write(str(y) + '\n')
     f_out.close()
 
-def predict_test():
-    roberta.eval()
-    regressor_head.eval()
-    f_out = open('../test-A/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(test_in,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-        original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
-        for y in original_x_clipped[0]:
-            f_out.write(str(y) + '\n')
-    f_out.close()
+regressor_head = RegressorHead().to(device)
+
+optimizer = torch.optim.Adam(list(roberta.parameters()) + list(regressor_head.parameters()), lr=1e-6)
+criterion = torch.nn.MSELoss(reduction='sum').to(device)
 
 
 for i in range(100):
     print('epoch ' + str(i))
     train_one_epoch()
-    eval()
-    predict_dev()
-    predict_test()
+    print(f'epoch {i} done, EVALUATION ON FULL DEV:')
+    eval_dev()
+    print('evaluation done')
+    predict('dev')
+    predict('test')
 
     torch.save(roberta.state_dict(),'checkpoints/roberta_to_regressor' + str(i) + '.pt')
     torch.save(regressor_head.state_dict(),'checkpoints/regressor_head' + str(i) + '.pt')
-predict_dev()
-predict_test()
+
+roberta.load_state_dict(torch.load('checkpoints/roberta_to_regressor1.pt'))
+regressor_head.load_state_dict(torch.load('checkpoints/regressor_head1.pt'))
+predict('dev')
+predict('test')
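For context on the numbers written to out.tsv in both versions: the year targets are min-max scaled to [0, 1] before training, and clipped predictions are mapped back to years with the same scaler. A minimal sketch of that round trip with made-up years (the real targets come from expected.tsv):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Toy stand-in for the year targets read from expected.tsv.
train_year = [1814.0, 1901.5, 2013.99]

scaler = MinMaxScaler()
train_year_scaled = scaler.fit_transform(np.array(train_year).reshape(-1, 1))  # values in [0, 1]

# A model prediction in the scaled space is mapped back to a year before it is written out.
pred_scaled = np.array([[0.25]])
pred_year = scaler.inverse_transform(pred_scaled)
print(train_year_scaled.ravel(), pred_year.ravel())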