diff --git a/roberta_regressor_head/predict.py b/roberta_regressor_head/predict.py
deleted file mode 100644
index 55f31fb..0000000
--- a/roberta_regressor_head/predict.py
+++ /dev/null
@@ -1,196 +0,0 @@
-import os
-import torch
-import random
-import copy
-from fairseq.models.roberta import RobertaModel, RobertaHubInterface
-from fairseq import hub_utils
-from fairseq.data.data_utils import collate_tokens
-from tqdm import tqdm
-import numpy as np
-from sklearn.preprocessing import MinMaxScaler
-
-
-EVAL_OFTEN = True
-EVAL_EVERY = 10000
-
-
-roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
-roberta.cuda()
-device='cuda'
-
-
-
-train_in = [l.rstrip('\n') for l in open('../train/in.tsv',newline='\n').readlines()] # shuffled
-dev_in = [l.rstrip('\n') for l in open('../dev-0/in.tsv',newline='\n').readlines()] # shuffled
-
-train_year = [float(l.rstrip('\n')) for l in open('../train/expected.tsv',newline='\n').readlines()]
-dev_year = [float(l.rstrip('\n')) for l in open('../dev-0/expected.tsv',newline='\n').readlines()]
-
-dev_in_not_shuffled = copy.deepcopy(dev_in) # not shuffled
-test_in = [l.rstrip('\n') for l in open('../test-A/in.tsv',newline='\n').readlines()] # not shuffled
-
-c = list(zip(train_in,train_year))
-random.shuffle(c)
-train_in, train_year = zip(*c)
-c = list(zip(dev_in,dev_year))
-random.shuffle(c)
-dev_in, dev_year = zip(*c)
-
-scaler = MinMaxScaler()
-
-train_year_scaled = scaler.fit_transform(np.array(train_year).reshape(-1,1))
-dev_year_scaled = scaler.transform(np.array(dev_year).reshape(-1,1))
-
-
-class RegressorHead(torch.nn.Module):
-    def __init__(self):
-        super(RegressorHead, self).__init__()
-        self.linear1 = torch.nn.Linear(768,300)
-        self.linear2 = torch.nn.Linear(300,1)
-        self.linearxxx = torch.nn.Linear(768,1)
-        self.dropout1 = torch.nn.Dropout(0.0)
-        self.dropout2 = torch.nn.Dropout(0.0)
-        self.m = torch.nn.LeakyReLU(0.1)
-    def forward(self,x):
-        #x = self.dropout1(x)
-        #x = self.linear1(x)
-        #x = self.dropout2(x)
-        x = self.linearxxx(x)
-        x = self.m(x)
-        x = -self.m(-x +1 ) +1
-        return x
-
-regressor_head = RegressorHead().to(device)
-
-optimizer = torch.optim.Adam(list(roberta.parameters()) + list(regressor_head.parameters()), lr = 1e-6)
-criterion = torch.nn.MSELoss(reduction='sum').to(device)
-
-BATCH_SIZE = 1
-def get_train_batch(dataset_in,dataset_y):
-    for i in tqdm(range(0,len(dataset_in), BATCH_SIZE)):
-        batch_of_text = dataset_in[i:i+BATCH_SIZE]
-
-        #batch = collate_tokens([roberta.encode(p)[:512] for p in batch_of_text], pad_idx=1)
-        batch = roberta.encode(batch_of_text[0])
-        output= None
-        for j in range(0,1,512): # only first 512 tokens instead of all
-            if output is None:
-                output = roberta.extract_features(batch[j:j+512])
-            else:
-                output_new = roberta.extract_features(batch[j:j+512])
-                output = torch.cat((output, output_new),1)
-        features = torch.mean(output,1)
-        years = torch.FloatTensor(dataset_y[i:i+BATCH_SIZE]).to(device).squeeze()
-
-        yield features, years
-
-
-def eval():
-    criterion_eval = torch.nn.MSELoss(reduction='sum')
-    roberta.eval()
-    regressor_head.eval()
-    loss = 0.0
-    loss_clipped = 0.0
-    loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in,dev_year_scaled)):
-
-        x = regressor_head(batch.to(device)).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
-        original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
-
-        loss_scaled += criterion_eval(x, year).item()
-        loss += criterion_eval(original_x, original_year).item()
-        loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print(' full valid loss scaled: ' + str(np.sqrt(loss_scaled/len(dev_year))))
-    print(' full valid loss: ' + str(np.sqrt(loss/len(dev_year))))
-    print(' full valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
-
-def eval_short():
-    criterion_eval = torch.nn.MSELoss(reduction='sum')
-    roberta.eval()
-    regressor_head.eval()
-    loss = 0.0
-    loss_clipped = 0.0
-    loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in[:1000],dev_year_scaled[:1000])):
-
-        x = regressor_head(batch.to(device)).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
-        original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
-
-        loss_scaled += criterion_eval(x, year).item()
-        loss += criterion_eval(original_x, original_year).item()
-        loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print('valid loss scaled: ' + str(np.sqrt(loss_scaled/1000)))
-    print('valid loss: ' + str(np.sqrt(loss/1000)))
-    print('valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
-
-
-def train_one_epoch():
-    roberta.train()
-    regressor_head.train()
-    loss_value=0.0
-    iteration = 0
-    for batch, year in get_train_batch(train_in,train_year_scaled):
-        iteration +=1
-        roberta.zero_grad()
-        regressor_head.zero_grad()
-        #import pdb; pdb.set_trace()
-
-        x = regressor_head(batch.to(device)).squeeze()
-
-        loss = criterion(x, year)
-        loss_value += loss.item()
-        loss.backward()
-        optimizer.step()
-
-        roberta.zero_grad()
-        regressor_head.zero_grad()
-
-
-        if EVAL_OFTEN and (iteration > 1) and (iteration % EVAL_EVERY == 1):
-            print('train loss: ' + str(np.sqrt(loss_value / EVAL_EVERY)))
-            eval_short()
-            roberta.train()
-            regressor_head.train()
-            loss_value = 0.0
-    #print('train loss: ' + str(loss_value/len(train_year)))
-
-
-def predict_dev():
-    roberta.eval()
-    regressor_head.eval()
-    f_out = open('../dev-0/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(dev_in_not_shuffled,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-        original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
-        for y in original_x_clipped[0]:
-            f_out.write(str(y) + '\n')
-    f_out.close()
-
-def predict_test():
-    roberta.eval()
-    regressor_head.eval()
-    f_out = open('../test-A/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(test_in,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-        original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
-        for y in original_x_clipped[0]:
-            f_out.write(str(y) + '\n')
-    f_out.close()
-
-
-roberta.load_state_dict(torch.load('checkpoints/roberta_to_regressor3.pt'))
-regressor_head.load_state_dict(torch.load('checkpoints/regressor_head3.pt'))
-predict_dev()
-predict_test()
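Note: the output activation used by RegressorHead in both files, x = m(x) followed by x = -m(-x + 1) + 1 with m = LeakyReLU(0.1), acts as a "soft clamp" of the prediction to the MinMaxScaler target range [0, 1]: it is the identity inside the interval and has slope 0.1 outside it, so out-of-range predictions are pulled back toward the range without killing the gradient the way a hard clamp would. A minimal standalone sketch (the function name soft_clamp and the sample inputs are illustrative, not part of the patch):

import torch

m = torch.nn.LeakyReLU(0.1)

def soft_clamp(x):
    # identity on [0, 1], slope 0.1 below 0 and above 1
    x = m(x)                # handles the region below 0
    return -m(-x + 1) + 1   # handles the region above 1

print(soft_clamp(torch.tensor([-2.0, 0.0, 0.5, 1.0, 3.0])))
# tensor([-0.2000,  0.0000,  0.5000,  1.0000,  1.2000])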
diff --git a/roberta_regressor_head/train.py b/roberta_regressor_head/train.py
index 1cdfe3e..651a7bb 100644
--- a/roberta_regressor_head/train.py
+++ b/roberta_regressor_head/train.py
@@ -11,15 +11,18 @@ from sklearn.preprocessing import MinMaxScaler
 
 
 EVAL_OFTEN = True
-EVAL_EVERY = 10000
+EVAL_EVERY = 50
+BATCH_SIZE = 3
+model_type = 'base' # base or large
 
-roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
+
+roberta = torch.hub.load('pytorch/fairseq', f'roberta.{model_type}')
 roberta.cuda()
 device='cuda'
 
-
+# LOAD DATA
 train_in = [l.rstrip('\n') for l in open('../train/in.tsv',newline='\n').readlines()] # shuffled
 dev_in = [l.rstrip('\n') for l in open('../dev-0/in.tsv',newline='\n').readlines()] # shuffled
 
@@ -29,6 +32,7 @@ dev_year = [float(l.rstrip('\n')) for l in open('../dev-0/expected.tsv',newline=
 dev_in_not_shuffled = copy.deepcopy(dev_in) # not shuffled
 test_in = [l.rstrip('\n') for l in open('../test-A/in.tsv',newline='\n').readlines()] # not shuffled
 
+# SHUFFLE DATA
 c = list(zip(train_in,train_year))
 random.shuffle(c)
 train_in, train_year = zip(*c)
@@ -36,8 +40,8 @@ c = list(zip(dev_in,dev_year))
 random.shuffle(c)
 dev_in, dev_year = zip(*c)
 
+# SCALE DATA
 scaler = MinMaxScaler()
-
 train_year_scaled = scaler.fit_transform(np.array(train_year).reshape(-1,1))
 dev_year_scaled = scaler.transform(np.array(dev_year).reshape(-1,1))
 
@@ -45,107 +49,75 @@ dev_year_scaled = scaler.transform(np.array(dev_year).reshape(-1,1))
 class RegressorHead(torch.nn.Module):
     def __init__(self):
         super(RegressorHead, self).__init__()
-        self.linear1 = torch.nn.Linear(768,300)
-        self.linear2 = torch.nn.Linear(300,1)
-        self.linearxxx = torch.nn.Linear(768,1)
-        self.dropout1 = torch.nn.Dropout(0.0)
-        self.dropout2 = torch.nn.Dropout(0.0)
-        self.m = torch.nn.LeakyReLU(0.1)
-    def forward(self,x):
-        #x = self.dropout1(x)
-        #x = self.linear1(x)
-        #x = self.dropout2(x)
-        x = self.linearxxx(x)
+        in_dim = 768 if model_type == 'base' else 1024
+        self.linear = torch.nn.Linear(in_dim, 1)
+        self.m = torch.nn.LeakyReLU(0.1)
+    def forward(self, x):
+        x = self.linear(x)
         x = self.m(x)
-        x = -self.m(-x +1 ) +1
+        x = - self.m(-x + 1 ) +1
         return x
 
-regressor_head = RegressorHead().to(device)
-
-optimizer = torch.optim.Adam(list(roberta.parameters()) + list(regressor_head.parameters()), lr=1e-6)
-criterion = torch.nn.MSELoss(reduction='sum').to(device)
-
-BATCH_SIZE = 1
-def get_train_batch(dataset_in,dataset_y):
+def get_features_and_year(dataset_in,dataset_y):
     for i in tqdm(range(0,len(dataset_in), BATCH_SIZE)):
         batch_of_text = dataset_in[i:i+BATCH_SIZE]
 
-        #batch = collate_tokens([roberta.encode(p)[:512] for p in batch_of_text], pad_idx=1)
-        batch = roberta.encode(batch_of_text[0])
-        output= None
-        for j in range(0,1,512): # only first 512 tokens instead of all
-            if output is None:
-                output = roberta.extract_features(batch[j:j+512])
-            else:
-                output_new = roberta.extract_features(batch[j:j+512])
-                output = torch.cat((output, output_new),1)
-        features = torch.mean(output,1)
-        years = torch.FloatTensor(dataset_y[i:i+BATCH_SIZE]).to(device).squeeze()
+        batch = collate_tokens([roberta.encode(p)[:512] for p in batch_of_text], pad_idx=1)
+        features = roberta.extract_features(batch).mean(1)
+        years = torch.FloatTensor(dataset_y[i:i+BATCH_SIZE]).to(device)
 
         yield features, years
 
-
-def eval():
+def eval_dev(short=False):
     criterion_eval = torch.nn.MSELoss(reduction='sum')
     roberta.eval()
     regressor_head.eval()
+
     loss = 0.0
     loss_clipped = 0.0
     loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in,dev_year_scaled)):
-        x = regressor_head(batch.to(device)).squeeze()
+    if short:
+        dataset_in = dev_in[:1000]
+        dataset_years = dev_year_scaled[:1000]
+    else:
+        dataset_in = dev_in
+        dataset_years = dev_year_scaled
+
+    predictions_sum = 0
+    for batch, year in tqdm(get_features_and_year(dataset_in, dataset_years)):
+
+        predictions_sum += year.shape[0]
+        x = regressor_head(batch.to(device))
         x_clipped = torch.clamp(x,0.0,1.0)
 
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
+        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
+        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
         original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
 
         loss_scaled += criterion_eval(x, year).item()
         loss += criterion_eval(original_x, original_year).item()
         loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print(' full valid loss scaled: ' + str(np.sqrt(loss_scaled/len(dev_year))))
-    print(' full valid loss: ' + str(np.sqrt(loss/len(dev_year))))
-    print(' full valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
 
-def eval_short():
-    criterion_eval = torch.nn.MSELoss(reduction='sum')
-    roberta.eval()
-    regressor_head.eval()
-    loss = 0.0
-    loss_clipped = 0.0
-    loss_scaled = 0.0
-    for batch, year in tqdm(get_train_batch(dev_in[:1000],dev_year_scaled[:1000])):
+    print('valid loss scaled: ' + str(np.sqrt(loss_scaled/predictions_sum)))
+    print('valid loss: ' + str(np.sqrt(loss/predictions_sum)))
+    print('valid loss clipped: ' + str(np.sqrt(loss_clipped/predictions_sum)))
 
-        x = regressor_head(batch.to(device)).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-
-        original_x = torch.FloatTensor(scaler.inverse_transform(x.detach().cpu().numpy().reshape(1,-1)))
-        original_x_clipped = torch.FloatTensor(scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1)))
-        original_year = torch.FloatTensor(scaler.inverse_transform(year.detach().cpu().numpy().reshape(1,-1)))
-
-        loss_scaled += criterion_eval(x, year).item()
-        loss += criterion_eval(original_x, original_year).item()
-        loss_clipped += criterion_eval(original_x_clipped, original_year).item()
-    print('valid loss scaled: ' + str(np.sqrt(loss_scaled/1000)))
-    print('valid loss: ' + str(np.sqrt(loss/1000)))
-    print('valid loss clipped: ' + str(np.sqrt(loss_clipped/len(dev_year))))
 
 def train_one_epoch():
     roberta.train()
     regressor_head.train()
     loss_value=0.0
-    iteration = 0
-    for batch, year in get_train_batch(train_in,train_year_scaled):
+    iteration = 0
+    for batch, year in get_features_and_year(train_in,train_year_scaled):
         iteration +=1
         roberta.zero_grad()
         regressor_head.zero_grad()
-        #import pdb; pdb.set_trace()
 
-        x = regressor_head(batch.to(device)).squeeze()
+        predictions = regressor_head(batch.to(device))
 
-        loss = criterion(x, year)
+        loss = criterion(predictions, year)
         loss_value += loss.item()
         loss.backward()
         optimizer.step()
@@ -155,48 +127,53 @@ def train_one_epoch():
 
 
         if EVAL_OFTEN and (iteration > 1) and (iteration % EVAL_EVERY == 1):
-            print('train loss: ' + str(np.sqrt(loss_value / EVAL_EVERY)))
-            eval_short()
+            print('train loss: ' + str(np.sqrt(loss_value / (EVAL_EVERY*BATCH_SIZE))))
+            eval_dev(True)
             roberta.train()
             regressor_head.train()
             loss_value = 0.0
-    #print('train loss: ' + str(loss_value/len(train_year)))
 
-def predict_dev():
+def predict(dataset='dev'):
+    if dataset=='dev':
+        f_out_path = '../dev-0/out.tsv'
+        dataset_in_not_shuffled = dev_in_not_shuffled
+    elif dataset=='test':
+        f_out_path = '../test-A/out.tsv'
+        dataset_in_not_shuffled = test_in
     roberta.eval()
     regressor_head.eval()
-    f_out = open('../dev-0/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(dev_in_not_shuffled,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
+    f_out = open(f_out_path,'w')
+    for batch, year in tqdm(get_features_and_year(dataset_in_not_shuffled, dev_year_scaled)):
+        x = regressor_head(batch)
         x_clipped = torch.clamp(x,0.0,1.0)
         original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
         for y in original_x_clipped[0]:
             f_out.write(str(y) + '\n')
     f_out.close()
 
-def predict_test():
-    roberta.eval()
-    regressor_head.eval()
-    f_out = open('../test-A/out.tsv','w')
-    for batch, year in tqdm(get_train_batch(test_in,dev_year_scaled)):
-        #batch_first = roberta.extract_features(batch)[:,0].to(device)
-        x = regressor_head(batch).squeeze()
-        x_clipped = torch.clamp(x,0.0,1.0)
-        original_x_clipped = scaler.inverse_transform(x_clipped.detach().cpu().numpy().reshape(1,-1))
-        for y in original_x_clipped[0]:
-            f_out.write(str(y) + '\n')
-    f_out.close()
+
+regressor_head = RegressorHead().to(device)
+
+optimizer = torch.optim.Adam(list(roberta.parameters()) + list(regressor_head.parameters()), lr=1e-6)
+criterion = torch.nn.MSELoss(reduction='sum').to(device)
 
 for i in range(100):
     print('epoch ' + str(i))
     train_one_epoch()
-    eval()
-    predict_dev()
-    predict_test()
+
+    print(f'epoch {i} done, EVALUATION ON FULL DEV:')
+    eval_dev()
+    print('evaluation done')
+    predict('dev')
+    predict('test')
+
     torch.save(roberta.state_dict(),'checkpoints/roberta_to_regressor' + str(i) + '.pt')
     torch.save(regressor_head.state_dict(),'checkpoints/regressor_head' + str(i) + '.pt')
-predict_dev()
-predict_test()
+
+
+roberta.load_state_dict(torch.load('checkpoints/roberta_to_regressor1.pt'))
+regressor_head.load_state_dict(torch.load('checkpoints/regressor_head1.pt'))
+predict('dev')
+predict('test')
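Note: the functional core of this patch is replacing the one-example-at-a-time feature extraction in get_train_batch with real mini-batching via fairseq's collate_tokens (BATCH_SIZE goes from 1 to 3), and collapsing the duplicated eval()/eval_short() and predict_dev()/predict_test() pairs into the parameterised eval_dev(short=...) and predict(dataset=...). A minimal standalone sketch of the new batching path, with made-up sentences standing in for rows of in.tsv:

import torch
from fairseq.data.data_utils import collate_tokens

roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()

sentences = ['An old newspaper article.',
             'Another, somewhat longer, newspaper article.']
# pad_idx=1 is RoBERTa's padding id; each text is truncated to 512 tokens
batch = collate_tokens([roberta.encode(s)[:512] for s in sentences], pad_idx=1)
with torch.no_grad():
    features = roberta.extract_features(batch).mean(1)  # (batch, 768) for roberta.base
print(features.shape)  # torch.Size([2, 768])

One caveat worth noting: the mean over dimension 1 also averages the embeddings of padding positions, so a short text pooled inside a batch comes out slightly different than it would alone.

The year targets are min-max scaled to [0, 1] before training, and predictions are clipped and mapped back to years before being written to out.tsv. A toy round trip of that scaling (the year values below are illustrative, not from the dataset):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(np.array([1900.0, 1950.0, 2000.0]).reshape(-1, 1))  # fit on train years
preds_scaled = np.array([[0.25], [1.10]])        # raw head outputs
preds_clipped = np.clip(preds_scaled, 0.0, 1.0)  # mirrors torch.clamp(x, 0.0, 1.0)
print(scaler.inverse_transform(preds_clipped))   # [[1925.] [2000.]]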