diff --git a/run_glue.py b/run_glue.py
index 6446f68..100beab 100644
--- a/run_glue.py
+++ b/run_glue.py
@@ -20,6 +20,7 @@ import logging
 import os
 import random
 import sys
+from collections import defaultdict
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -47,6 +48,17 @@ from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
+from roberta import RobertaForSequenceClassificationCustomSimple, RobertaForSequenceClassificationCustom, RobertaForSequenceClassificationCustomAlternative
+from gpt2 import GPT2ForSequenceClassificationCustomSimple, GPT2ForSequenceClassificationCustom
+
+MODEL_NAME_TO_CLASS = {
+    'roberta_simple': RobertaForSequenceClassificationCustomSimple,
+    'roberta_hidden': RobertaForSequenceClassificationCustom,
+    'roberta_hidden_v2': RobertaForSequenceClassificationCustomAlternative,
+    'gpt2_simple': GPT2ForSequenceClassificationCustomSimple,
+    'gpt2_hidden': GPT2ForSequenceClassificationCustom,
+}
+
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.23.0")
 
@@ -207,6 +219,13 @@ class ModelArguments:
         metadata={"help": "Freeze encoder weights"},
     )
 
+    custom_model: str = field(
+        default=None,
+        metadata={
+            "help": "Use custom implementation from available list",
+            "choices": list(MODEL_NAME_TO_CLASS.keys()),
+        },
+    )
 
 def freeze_model_weights(model: torch.nn.Module) -> None:
     count = 0
@@ -384,7 +403,25 @@ def main():
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
-    model = AutoModelForSequenceClassification.from_pretrained(
+
+    custom_model = model_args.custom_model
+    if custom_model is not None:
+        # Check model and implementation is the same
+        if 'roberta' in custom_model and 'roberta' not in model_args.model_name_or_path:
+            raise RuntimeError('Model and custom implementation should be the same type: RoBERTa')
+        elif 'gpt2' in custom_model and 'gpt2' not in model_args.model_name_or_path:
+            raise RuntimeError('Model and custom implementation should be the same type: GPT-2')
+
+        # Set custom configuration in model configuration
+        config.use_hidden_states = 'hidden' in custom_model
+        logger.info(f'Using hidden states in model: {config.use_hidden_states}')
+
+        # Get class to initialize model
+        model_cls = MODEL_NAME_TO_CLASS[custom_model]
+    else:
+        model_cls = AutoModelForSequenceClassification
+    logger.info(f'Using implementation from class: {model_cls.__name__}')
+    model = model_cls.from_pretrained(
         model_args.model_name_or_path,
         from_tf=bool(".ckpt" in model_args.model_name_or_path),
         config=config,
@@ -399,6 +436,11 @@
         freeze_model_weights(model.decoder)
 
+    if 'gpt2' in tokenizer.name_or_path and tokenizer.pad_token is None:
+        logger.info(f'Set PAD token to EOS: {tokenizer.eos_token}')
+        tokenizer._pad_token = tokenizer.eos_token
+        model.config.pad_token_id = model.config.eos_token_id
+
     # Preprocessing the raw_datasets
     if data_args.task_name is not None:
         sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
@@ -459,11 +501,6 @@ def main():
         args = (
             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
         )
-        if 'gpt2' in tokenizer.name_or_path and tokenizer.pad_token is None:
-            logger.info(f'Set PAD token to EOS: {tokenizer.eos_token}')
-            tokenizer._pad_token = tokenizer.eos_token
-            model.config.pad_token_id = model.config.eos_token_id
-
         result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
 
         # Map labels to IDs (not necessary for GLUE tasks)
@@ -492,7 +529,16 @@ def main():
         eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
         if data_args.max_eval_samples is not None:
             max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
-            eval_dataset = eval_dataset.select(range(max_eval_samples))
+            label_to_indexes = defaultdict(list)
+            for index, eval_sample in enumerate(eval_dataset):
+                label_to_indexes[eval_sample['label']].append(index)
+            max_samples_per_label = int(max_eval_samples / len(label_to_indexes))
+            eval_sample_indexes = []
+            for label, indexes in label_to_indexes.items():
+                eval_sample_indexes.extend(indexes[:max_samples_per_label])
+                logger.info(f"Set {max_samples_per_label} samples for {label}-class")
+            eval_sample_indexes.sort()
+            eval_dataset = eval_dataset.select(eval_sample_indexes)
 
     if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
         if "test" not in raw_datasets and "test_matched" not in raw_datasets:
@@ -549,13 +595,14 @@ def main():
     )
 
     # Training
+    ignore_keys_for_eval = ['hidden_states', 'attentions', 'past_key_values']
     if training_args.do_train:
         checkpoint = None
         if training_args.resume_from_checkpoint is not None:
             checkpoint = training_args.resume_from_checkpoint
         elif last_checkpoint is not None:
             checkpoint = last_checkpoint
-        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        train_result = trainer.train(resume_from_checkpoint=checkpoint, ignore_keys_for_eval=ignore_keys_for_eval)
         metrics = train_result.metrics
         max_train_samples = (
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
@@ -585,7 +632,7 @@ def main():
         combined = {}
 
         for eval_dataset, task in zip(eval_datasets, tasks):
-            metrics = trainer.evaluate(eval_dataset=eval_dataset)
+            metrics = trainer.evaluate(eval_dataset=eval_dataset, ignore_keys=ignore_keys_for_eval)
 
             max_eval_samples = (
                 data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
@@ -613,7 +660,7 @@ def main():
         for predict_dataset, task in zip(predict_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
             predict_dataset = predict_dataset.remove_columns("label")
-            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
+            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict", ignore_keys=ignore_keys_for_eval).predictions
             predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
 
             output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
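
Note on the GPT-2 padding change above: the pad-token setup is moved out of preprocess_function so it runs once, before the dataset is tokenized, instead of on every mapped batch. A standalone sketch of the same setup (illustrative only; it uses the stock "gpt2" checkpoint and the public pad_token setter, where the patch assigns the private _pad_token attribute):

from transformers import AutoConfig, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
config = AutoConfig.from_pretrained("gpt2")

# GPT-2 ships without a PAD token, so padded batches would fail without this.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS ('<|endoftext|>') as PAD
    config.pad_token_id = config.eos_token_id  # keep the model config consistent

print(tokenizer.pad_token, config.pad_token_id)  # <|endoftext|> 50256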
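
The balanced evaluation subsampling that replaces select(range(max_eval_samples)) can also be exercised in isolation. A sketch of the same index-selection logic (the helper name and toy label list are hypothetical, not part of the patch):

from collections import defaultdict

def balanced_subset_indexes(labels, max_samples):
    # Group sample indexes by label, then take an equal share from each class.
    label_to_indexes = defaultdict(list)
    for index, label in enumerate(labels):
        label_to_indexes[label].append(index)
    max_samples_per_label = max_samples // len(label_to_indexes)
    selected = []
    for label, indexes in label_to_indexes.items():
        selected.extend(indexes[:max_samples_per_label])
    return sorted(selected)

# Toy check: 6 samples over 2 classes -> 3 per class.
print(balanced_subset_indexes([0, 1, 0, 1, 1, 0, 1, 1], 6))  # [0, 1, 2, 3, 4, 5]

Note that the selected subset can end up smaller than max_eval_samples when a class has fewer examples than its per-label share; the patch trades total sample count for strictly equal class counts.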