commit e1b933bf4d64d283c5269b9430a73ae781fd7e33
Author: Jakub Kaczmarek <kuba300698@gmail.com>
Date:   Thu Feb 16 18:21:17 2023 +0100

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e79af7f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+data
+out/t5
+out/gpt2
+out/roberta
+.cache_training
\ No newline at end of file
diff --git a/__pycache__/gpt2.cpython-310.pyc b/__pycache__/gpt2.cpython-310.pyc
new file mode 100644
index 0000000..83547e4
Binary files /dev/null and b/__pycache__/gpt2.cpython-310.pyc differ
diff --git a/__pycache__/gpt2.cpython-39.pyc b/__pycache__/gpt2.cpython-39.pyc
new file mode 100644
index 0000000..466f07e
Binary files /dev/null and b/__pycache__/gpt2.cpython-39.pyc differ
diff --git a/__pycache__/roberta.cpython-310.pyc b/__pycache__/roberta.cpython-310.pyc
new file mode 100644
index 0000000..e4ff9b8
Binary files /dev/null and b/__pycache__/roberta.cpython-310.pyc differ
diff --git a/__pycache__/roberta.cpython-39.pyc b/__pycache__/roberta.cpython-39.pyc
new file mode 100644
index 0000000..87fd2f0
Binary files /dev/null and b/__pycache__/roberta.cpython-39.pyc differ
diff --git a/__pycache__/t5.cpython-310.pyc b/__pycache__/t5.cpython-310.pyc
new file mode 100644
index 0000000..a38ca25
Binary files /dev/null and b/__pycache__/t5.cpython-310.pyc differ
diff --git a/__pycache__/t5.cpython-39.pyc b/__pycache__/t5.cpython-39.pyc
new file mode 100644
index 0000000..1cce867
Binary files /dev/null and b/__pycache__/t5.cpython-39.pyc differ
diff --git a/bart.py b/bart.py
new file mode 100644
index 0000000..809e36d
--- /dev/null
+++ b/bart.py
@@ -0,0 +1,10 @@
+from transformers import BartConfig, BartForSequenceClassification, BartModel
+from torch import nn
+
+class BartForClassification(BartForSequenceClassification):
+    """BART sequence classifier; labels come from ``config.num_labels`` (4 originally)."""
+
+    def __init__(self, config: BartConfig):
+        # nn.Module forbids attaching sub-modules before its own __init__ has run,
+        # so defer to the parent, which builds `model` and `classification_head`.
+        super().__init__(config)
diff --git a/gpt2.py b/gpt2.py
new file mode 100644
index 0000000..342f5d9
--- /dev/null
+++ b/gpt2.py
@@ -0,0 +1,154 @@
+import torch
+from torch import nn
+from transformers import GPT2PreTrainedModel, GPT2Model
+from transformers.modeling_outputs import SequenceClassifierOutputWithPast
+
+class GPT2ForSequenceClassification(GPT2PreTrainedModel):
+    """GPT-2 backbone plus a bias-free linear scoring layer; defines no ``forward``."""
+
+    def __init__(self, config):
+        super().__init__(config)
+        n_labels = config.num_labels
+        self.num_labels = n_labels
+        self.transformer = GPT2Model(config)
+        self.score = nn.Linear(config.n_embd, n_labels, bias=False)
+        # Model-parallel bookkeeping: disabled, no device map.
+        self.model_parallel, self.device_map = False, None
+        # Initialize weights and apply final processing
+        self.post_init()
+
+
+class GPT2ClassificationHeadCustom(nn.Module):
+    """MLP head that scores features together with an optional second feature tensor.
+
+    ``x`` and ``hidden_states[-1]`` (zeros when absent or None) are each projected to
+    ``2 * n_embd``, concatenated on the last axis, passed through three ReLU layers
+    and projected without bias to ``config.num_labels`` logits.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        hidden_size = config.n_embd
+        self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size)
+        self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size)
+        self.dense_2 = nn.Linear(4 * hidden_size, 4 * hidden_size)
+        self.dense_3 = nn.Linear(4 * hidden_size, 4 * hidden_size)
+        self.dense_4 = nn.Linear(4 * hidden_size, hidden_size)
+        self.dropout = nn.Dropout(config.resid_pdrop)
+        self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False)
+
+    def forward(self, x, **kwargs):
+        """Return logits of shape ``x.shape[:-1] + (num_labels,)``.
+
+        Args:
+            x: Feature tensor whose last dimension is ``n_embd``.
+            **kwargs: Only ``hidden_states`` is read; its last element must match
+                the shape of ``x``. Any other keyword is ignored.
+        """
+        all_states = kwargs.get('hidden_states')
+        # Without per-layer states the second branch is fed zeros, i.e. only its bias.
+        hidden = torch.zeros_like(x) if all_states is None else all_states[-1]
+        x = self.dropout(torch.relu(self.dense_1_input(x)))
+        hidden = self.dropout(torch.relu(self.dense_1_hidden(hidden)))
+
+        # dim=-1 (rather than a fixed dim=2) also admits inputs without a sequence axis.
+        x = torch.cat((x, hidden), dim=-1)
+        x = torch.relu(self.dense_3(torch.relu(self.dense_2(x))))
+        x = self.dropout(torch.relu(self.dense_4(x)))
+        return self.out_proj(x)
+
+class GPT2ForSequenceClassificationCustom(GPT2ForSequenceClassification):
+    """GPT-2 classifier that scores every position with ``GPT2ClassificationHeadCustom``.
+
+    Logits at the last non-padding token (or final position if padding is unknown) are used.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPT2Model(config)
+        self.score = GPT2ClassificationHeadCustom(config)
+        self.init_weights()
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    def forward(
+        self,
+        input_ids=None,
+        past_key_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Run the backbone, score all positions with ``self.score`` and pool one logit row per sequence.
+
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns a :obj:`SequenceClassifierOutputWithPast`, or, when ``return_dict`` is false, the tuple
+        ``(pooled_logits, *remaining backbone outputs)`` prefixed with the loss if ``labels`` was given.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Force a ModelOutput internally: the head reads `.hidden_states`, which a tuple lacks by name.
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states, hidden_states=transformer_outputs.hidden_states)
+
+        batch_size = (input_ids if input_ids is not None else inputs_embeds).shape[0]
+        assert (
+            self.config.pad_token_id is not None or batch_size == 1
+        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None or input_ids is None:
+            # Padding cannot be located, so fall back to the final position.
+            sequence_lengths = -1
+        else:
+            # Count of non-pad tokens minus one: the last real token when padding is on the right.
+            sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
+
+        pooled_logits = logits[range(batch_size), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = nn.MSELoss()
+                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
+            else:
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
diff --git a/out/gpt2_results/README.md b/out/gpt2_results/README.md
new file mode 100644
index 0000000..856e902
--- /dev/null
+++ b/out/gpt2_results/README.md
@@ -0,0 +1,53 @@
+---
+tags:
+- generated_from_trainer
+model-index:
+- name: gpt2_results
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# gpt2_results
+
+This model is a fine-tuned version of the local `out/gpt2` checkpoint (a local directory, not a Hugging Face Hub model) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 0.3020
+- eval_accuracy: 0.9195
+- eval_runtime: 24.1139
+- eval_samples_per_second: 82.94
+- eval_steps_per_second: 10.367
+- step: 0
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 8
+- eval_batch_size: 8
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- training_steps: 2500
+
+### Framework versions
+
+- Transformers 4.26.1
+- Pytorch 1.13.1+cu117
+- Datasets 2.9.0
+- Tokenizers 0.13.2
diff --git a/out/gpt2_results/all_results.json b/out/gpt2_results/all_results.json
new file mode 100644
index 0000000..0a68305
--- /dev/null
+++ b/out/gpt2_results/all_results.json
@@ -0,0 +1,8 @@
+{
+    "eval_accuracy": 0.9194999933242798,
+    "eval_loss": 0.3020096719264984,
+    "eval_runtime": 24.1139,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 82.94,
+    "eval_steps_per_second": 10.367
+}
\ No newline at end of file
diff --git a/out/gpt2_results/eval_results.json b/out/gpt2_results/eval_results.json
new file mode 100644
index 0000000..0a68305
--- /dev/null
+++ b/out/gpt2_results/eval_results.json
@@ -0,0 +1,8 @@
+{
+    "eval_accuracy": 0.9194999933242798,
+    "eval_loss": 0.3020096719264984,
+    "eval_runtime": 24.1139,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 82.94,
+    "eval_steps_per_second": 10.367
+}
\ No newline at end of file
diff --git a/out/gpt2_results/predict_results_None.txt b/out/gpt2_results/predict_results_None.txt
new file mode 100644
index 0000000..c1f964f
--- /dev/null
+++ b/out/gpt2_results/predict_results_None.txt
@@ -0,0 +1,3801 @@
+index	prediction
+0	0
+1	0
+2	0
+3	0
+4	0
+5	0
+6	0
+7	0
+8	0
+9	0
+10	0
+11	1
+12	2
+13	0
+14	0
+15	0
+16	0
+17	0
+18	0
+19	3
+20	0
+21	0
+22	0
+23	0
+24	3
+25	0
+26	0
+27	2
+28	0
+29	0
+30	0
+31	0
+32	0
+33	0
+34	0
+35	0
+36	0
+37	0
+38	0
+39	0
+40	0
+41	0
+42	0
+43	3
+44	0
+45	0
+46	0
+47	0
+48	0
+49	2
+50	0
+51	0
+52	0
+53	1
+54	1
+55	0
+56	0
+57	0
+58	0
+59	0
+60	0
+61	0
+62	0
+63	0
+64	0
+65	0
+66	0
+67	0
+68	0
+69	0
+70	0
+71	0
+72	0
+73	0
+74	0
+75	0
+76	0
+77	0
+78	0
+79	0
+80	0
+81	0
+82	0
+83	0
+84	0
+85	0
+86	0
+87	0
+88	0
+89	1
+90	0
+91	0
+92	0
+93	0
+94	0
+95	0
+96	0
+97	0
+98	0
+99	0
+100	0
+101	0
+102	0
+103	0
+104	1
+105	0
+106	3
+107	0
+108	0
+109	0
+110	0
+111	0
+112	0
+113	0
+114	1
+115	0
+116	0
+117	0
+118	0
+119	0
+120	0
+121	0
+122	0
+123	2
+124	0
+125	0
+126	0
+127	0
+128	0
+129	0
+130	0
+131	0
+132	0
+133	0
+134	0
+135	0
+136	0
+137	0
+138	1
+139	0
+140	0
+141	0
+142	0
+143	0
+144	0
+145	0
+146	0
+147	0
+148	3
+149	0
+150	0
+151	0
+152	1
+153	0
+154	0
+155	0
+156	0
+157	2
+158	0
+159	0
+160	0
+161	0
+162	0
+163	0
+164	0
+165	0
+166	0
+167	0
+168	0
+169	0
+170	0
+171	0
+172	0
+173	0
+174	0
+175	0
+176	0
+177	0
+178	2
+179	0
+180	0
+181	0
+182	0
+183	0
+184	0
+185	0
+186	0
+187	0
+188	0
+189	0
+190	0
+191	0
+192	0
+193	0
+194	0
+195	0
+196	0
+197	0
+198	0
+199	0
+200	0
+201	0
+202	0
+203	0
+204	0
+205	0
+206	0
+207	0
+208	0
+209	3
+210	0
+211	0
+212	0
+213	0
+214	0
+215	0
+216	0
+217	0
+218	2
+219	0
+220	0
+221	0
+222	0
+223	0
+224	0
+225	1
+226	0
+227	0
+228	0
+229	2
+230	0
+231	0
+232	0
+233	0
+234	0
+235	0
+236	0
+237	0
+238	0
+239	0
+240	0
+241	0
+242	0
+243	0
+244	0
+245	0
+246	0
+247	0
+248	0
+249	0
+250	2
+251	0
+252	0
+253	0
+254	0
+255	0
+256	0
+257	0
+258	1
+259	0
+260	0
+261	3
+262	0
+263	0
+264	0
+265	0
+266	0
+267	0
+268	3
+269	0
+270	0
+271	0
+272	0
+273	0
+274	3
+275	3
+276	0
+277	1
+278	0
+279	0
+280	0
+281	0
+282	0
+283	0
+284	0
+285	0
+286	0
+287	0
+288	0
+289	0
+290	0
+291	0
+292	0
+293	0
+294	0
+295	0
+296	0
+297	0
+298	0
+299	0
+300	0
+301	0
+302	0
+303	2
+304	0
+305	0
+306	0
+307	0
+308	0
+309	0
+310	0
+311	0
+312	0
+313	0
+314	0
+315	0
+316	0
+317	0
+318	0
+319	0
+320	0
+321	0
+322	0
+323	0
+324	0
+325	0
+326	0
+327	0
+328	0
+329	0
+330	0
+331	0
+332	0
+333	0
+334	0
+335	2
+336	0
+337	0
+338	0
+339	0
+340	0
+341	0
+342	0
+343	0
+344	0
+345	0
+346	0
+347	0
+348	0
+349	0
+350	0
+351	0
+352	0
+353	0
+354	0
+355	3
+356	0
+357	0
+358	0
+359	2
+360	0
+361	1
+362	2
+363	0
+364	0
+365	0
+366	0
+367	0
+368	0
+369	0
+370	0
+371	0
+372	0
+373	0
+374	0
+375	0
+376	0
+377	0
+378	0
+379	0
+380	0
+381	0
+382	0
+383	0
+384	0
+385	0
+386	0
+387	0
+388	0
+389	0
+390	0
+391	0
+392	0
+393	0
+394	0
+395	0
+396	0
+397	0
+398	0
+399	0
+400	0
+401	0
+402	0
+403	0
+404	0
+405	0
+406	0
+407	0
+408	0
+409	0
+410	2
+411	0
+412	0
+413	0
+414	0
+415	0
+416	0
+417	0
+418	2
+419	0
+420	3
+421	0
+422	0
+423	0
+424	0
+425	0
+426	0
+427	0
+428	0
+429	0
+430	0
+431	0
+432	0
+433	0
+434	0
+435	0
+436	0
+437	0
+438	0
+439	0
+440	0
+441	0
+442	0
+443	0
+444	0
+445	0
+446	0
+447	0
+448	0
+449	0
+450	0
+451	0
+452	0
+453	0
+454	0
+455	0
+456	0
+457	0
+458	2
+459	0
+460	0
+461	0
+462	2
+463	0
+464	0
+465	0
+466	0
+467	0
+468	0
+469	0
+470	0
+471	0
+472	0
+473	0
+474	0
+475	0
+476	0
+477	0
+478	0
+479	0
+480	0
+481	0
+482	0
+483	0
+484	0
+485	2
+486	0
+487	0
+488	0
+489	0
+490	0
+491	0
+492	0
+493	0
+494	0
+495	0
+496	0
+497	0
+498	0
+499	2
+500	0
+501	0
+502	0
+503	0
+504	0
+505	0
+506	0
+507	0
+508	0
+509	3
+510	0
+511	0
+512	0
+513	0
+514	0
+515	0
+516	0
+517	0
+518	0
+519	3
+520	0
+521	0
+522	0
+523	0
+524	0
+525	0
+526	0
+527	0
+528	0
+529	2
+530	0
+531	2
+532	0
+533	0
+534	0
+535	1
+536	0
+537	0
+538	0
+539	0
+540	0
+541	2
+542	0
+543	0
+544	0
+545	0
+546	0
+547	3
+548	0
+549	0
+550	1
+551	0
+552	0
+553	0
+554	2
+555	0
+556	0
+557	0
+558	0
+559	0
+560	0
+561	3
+562	0
+563	0
+564	0
+565	0
+566	0
+567	0
+568	0
+569	0
+570	0
+571	0
+572	0
+573	0
+574	0
+575	0
+576	0
+577	0
+578	2
+579	0
+580	0
+581	0
+582	1
+583	1
+584	0
+585	0
+586	0
+587	2
+588	0
+589	0
+590	0
+591	0
+592	0
+593	3
+594	0
+595	0
+596	0
+597	0
+598	0
+599	1
+600	0
+601	0
+602	0
+603	0
+604	0
+605	0
+606	0
+607	0
+608	0
+609	0
+610	0
+611	0
+612	0
+613	0
+614	2
+615	0
+616	0
+617	0
+618	0
+619	0
+620	2
+621	0
+622	0
+623	0
+624	0
+625	0
+626	3
+627	0
+628	0
+629	0
+630	0
+631	2
+632	0
+633	0
+634	0
+635	0
+636	0
+637	0
+638	0
+639	0
+640	3
+641	0
+642	0
+643	0
+644	0
+645	0
+646	0
+647	0
+648	0
+649	0
+650	0
+651	0
+652	0
+653	0
+654	0
+655	0
+656	0
+657	0
+658	0
+659	0
+660	0
+661	0
+662	0
+663	0
+664	0
+665	0
+666	0
+667	0
+668	0
+669	0
+670	0
+671	0
+672	0
+673	0
+674	0
+675	0
+676	0
+677	0
+678	0
+679	0
+680	0
+681	0
+682	0
+683	0
+684	0
+685	0
+686	0
+687	0
+688	0
+689	0
+690	2
+691	3
+692	0
+693	3
+694	0
+695	0
+696	0
+697	0
+698	0
+699	0
+700	0
+701	0
+702	0
+703	0
+704	0
+705	0
+706	3
+707	0
+708	0
+709	0
+710	3
+711	0
+712	0
+713	0
+714	0
+715	0
+716	0
+717	0
+718	0
+719	0
+720	0
+721	0
+722	0
+723	0
+724	2
+725	0
+726	0
+727	0
+728	0
+729	0
+730	2
+731	0
+732	3
+733	0
+734	0
+735	0
+736	0
+737	0
+738	0
+739	3
+740	0
+741	0
+742	3
+743	0
+744	2
+745	0
+746	0
+747	0
+748	0
+749	3
+750	2
+751	0
+752	0
+753	0
+754	0
+755	0
+756	2
+757	0
+758	0
+759	0
+760	0
+761	0
+762	0
+763	0
+764	0
+765	0
+766	0
+767	0
+768	0
+769	0
+770	0
+771	0
+772	0
+773	0
+774	0
+775	0
+776	0
+777	0
+778	0
+779	0
+780	0
+781	0
+782	0
+783	0
+784	0
+785	2
+786	3
+787	0
+788	0
+789	0
+790	0
+791	0
+792	0
+793	0
+794	0
+795	0
+796	0
+797	0
+798	2
+799	0
+800	2
+801	0
+802	0
+803	2
+804	3
+805	0
+806	0
+807	3
+808	0
+809	0
+810	0
+811	0
+812	0
+813	0
+814	0
+815	0
+816	0
+817	0
+818	0
+819	0
+820	0
+821	2
+822	0
+823	2
+824	0
+825	1
+826	0
+827	0
+828	0
+829	0
+830	0
+831	0
+832	0
+833	0
+834	3
+835	0
+836	0
+837	0
+838	0
+839	0
+840	0
+841	0
+842	0
+843	0
+844	3
+845	0
+846	0
+847	0
+848	0
+849	0
+850	0
+851	0
+852	0
+853	0
+854	0
+855	0
+856	0
+857	0
+858	0
+859	0
+860	0
+861	0
+862	0
+863	0
+864	1
+865	3
+866	0
+867	0
+868	0
+869	0
+870	0
+871	0
+872	0
+873	0
+874	0
+875	0
+876	0
+877	0
+878	0
+879	0
+880	0
+881	0
+882	0
+883	0
+884	0
+885	0
+886	0
+887	0
+888	0
+889	0
+890	2
+891	0
+892	0
+893	0
+894	0
+895	0
+896	0
+897	0
+898	0
+899	0
+900	0
+901	0
+902	0
+903	0
+904	0
+905	0
+906	0
+907	0
+908	0
+909	0
+910	0
+911	0
+912	0
+913	0
+914	1
+915	0
+916	0
+917	0
+918	0
+919	0
+920	0
+921	0
+922	0
+923	0
+924	3
+925	0
+926	0
+927	0
+928	0
+929	0
+930	0
+931	0
+932	0
+933	0
+934	0
+935	0
+936	3
+937	0
+938	0
+939	0
+940	0
+941	0
+942	0
+943	0
+944	0
+945	2
+946	0
+947	0
+948	0
+949	0
+950	1
+951	1
+952	1
+953	1
+954	1
+955	1
+956	1
+957	1
+958	1
+959	1
+960	1
+961	1
+962	1
+963	1
+964	1
+965	1
+966	1
+967	1
+968	1
+969	1
+970	1
+971	1
+972	1
+973	1
+974	1
+975	1
+976	1
+977	1
+978	1
+979	1
+980	1
+981	1
+982	1
+983	1
+984	1
+985	1
+986	1
+987	1
+988	1
+989	1
+990	1
+991	1
+992	1
+993	1
+994	1
+995	1
+996	1
+997	1
+998	1
+999	1
+1000	1
+1001	1
+1002	1
+1003	1
+1004	1
+1005	1
+1006	1
+1007	1
+1008	1
+1009	1
+1010	1
+1011	1
+1012	1
+1013	1
+1014	1
+1015	1
+1016	1
+1017	1
+1018	1
+1019	1
+1020	1
+1021	1
+1022	1
+1023	1
+1024	1
+1025	1
+1026	1
+1027	1
+1028	1
+1029	1
+1030	1
+1031	1
+1032	1
+1033	1
+1034	1
+1035	1
+1036	1
+1037	1
+1038	1
+1039	1
+1040	1
+1041	1
+1042	1
+1043	1
+1044	1
+1045	1
+1046	1
+1047	1
+1048	1
+1049	1
+1050	1
+1051	1
+1052	1
+1053	1
+1054	1
+1055	1
+1056	1
+1057	1
+1058	1
+1059	1
+1060	1
+1061	1
+1062	1
+1063	1
+1064	1
+1065	1
+1066	1
+1067	1
+1068	1
+1069	1
+1070	2
+1071	1
+1072	1
+1073	1
+1074	1
+1075	1
+1076	1
+1077	1
+1078	1
+1079	1
+1080	1
+1081	1
+1082	1
+1083	1
+1084	1
+1085	1
+1086	1
+1087	1
+1088	1
+1089	1
+1090	1
+1091	1
+1092	1
+1093	1
+1094	1
+1095	1
+1096	1
+1097	1
+1098	1
+1099	1
+1100	1
+1101	1
+1102	1
+1103	1
+1104	1
+1105	1
+1106	1
+1107	1
+1108	1
+1109	1
+1110	1
+1111	1
+1112	1
+1113	1
+1114	1
+1115	1
+1116	1
+1117	1
+1118	1
+1119	1
+1120	1
+1121	1
+1122	1
+1123	1
+1124	1
+1125	1
+1126	1
+1127	1
+1128	1
+1129	1
+1130	1
+1131	1
+1132	1
+1133	1
+1134	1
+1135	1
+1136	1
+1137	1
+1138	1
+1139	1
+1140	1
+1141	1
+1142	1
+1143	1
+1144	1
+1145	1
+1146	1
+1147	1
+1148	1
+1149	1
+1150	1
+1151	1
+1152	1
+1153	1
+1154	0
+1155	1
+1156	1
+1157	1
+1158	1
+1159	1
+1160	1
+1161	1
+1162	1
+1163	1
+1164	1
+1165	1
+1166	0
+1167	1
+1168	1
+1169	1
+1170	1
+1171	1
+1172	1
+1173	1
+1174	1
+1175	1
+1176	1
+1177	1
+1178	1
+1179	1
+1180	1
+1181	1
+1182	1
+1183	1
+1184	1
+1185	1
+1186	1
+1187	1
+1188	1
+1189	1
+1190	1
+1191	1
+1192	1
+1193	1
+1194	1
+1195	1
+1196	1
+1197	1
+1198	1
+1199	1
+1200	1
+1201	1
+1202	1
+1203	1
+1204	1
+1205	1
+1206	1
+1207	1
+1208	1
+1209	1
+1210	1
+1211	1
+1212	1
+1213	1
+1214	1
+1215	1
+1216	1
+1217	1
+1218	1
+1219	1
+1220	1
+1221	1
+1222	1
+1223	2
+1224	1
+1225	0
+1226	1
+1227	1
+1228	1
+1229	1
+1230	1
+1231	1
+1232	1
+1233	1
+1234	1
+1235	1
+1236	1
+1237	1
+1238	1
+1239	1
+1240	1
+1241	1
+1242	1
+1243	1
+1244	1
+1245	1
+1246	1
+1247	1
+1248	1
+1249	1
+1250	1
+1251	1
+1252	1
+1253	1
+1254	1
+1255	1
+1256	1
+1257	1
+1258	1
+1259	1
+1260	1
+1261	1
+1262	1
+1263	1
+1264	1
+1265	1
+1266	1
+1267	1
+1268	1
+1269	1
+1270	1
+1271	1
+1272	1
+1273	1
+1274	1
+1275	1
+1276	1
+1277	1
+1278	0
+1279	1
+1280	1
+1281	1
+1282	1
+1283	1
+1284	1
+1285	1
+1286	1
+1287	1
+1288	1
+1289	1
+1290	1
+1291	1
+1292	1
+1293	1
+1294	1
+1295	1
+1296	1
+1297	1
+1298	1
+1299	1
+1300	1
+1301	1
+1302	1
+1303	1
+1304	1
+1305	1
+1306	1
+1307	1
+1308	1
+1309	1
+1310	1
+1311	1
+1312	1
+1313	1
+1314	1
+1315	1
+1316	1
+1317	1
+1318	1
+1319	1
+1320	1
+1321	1
+1322	1
+1323	1
+1324	1
+1325	1
+1326	1
+1327	1
+1328	1
+1329	1
+1330	1
+1331	1
+1332	1
+1333	1
+1334	1
+1335	1
+1336	1
+1337	1
+1338	1
+1339	1
+1340	1
+1341	1
+1342	1
+1343	1
+1344	1
+1345	1
+1346	1
+1347	1
+1348	1
+1349	1
+1350	1
+1351	1
+1352	1
+1353	1
+1354	1
+1355	1
+1356	1
+1357	1
+1358	1
+1359	1
+1360	1
+1361	1
+1362	1
+1363	1
+1364	1
+1365	1
+1366	1
+1367	1
+1368	1
+1369	1
+1370	1
+1371	1
+1372	1
+1373	1
+1374	1
+1375	1
+1376	1
+1377	1
+1378	1
+1379	1
+1380	1
+1381	1
+1382	1
+1383	1
+1384	1
+1385	1
+1386	1
+1387	1
+1388	1
+1389	1
+1390	1
+1391	1
+1392	1
+1393	1
+1394	1
+1395	1
+1396	1
+1397	1
+1398	1
+1399	1
+1400	1
+1401	1
+1402	1
+1403	1
+1404	1
+1405	1
+1406	1
+1407	1
+1408	1
+1409	1
+1410	1
+1411	1
+1412	1
+1413	1
+1414	1
+1415	1
+1416	1
+1417	1
+1418	1
+1419	1
+1420	1
+1421	1
+1422	1
+1423	1
+1424	2
+1425	1
+1426	1
+1427	1
+1428	1
+1429	1
+1430	1
+1431	1
+1432	1
+1433	1
+1434	1
+1435	1
+1436	1
+1437	1
+1438	1
+1439	1
+1440	1
+1441	1
+1442	1
+1443	1
+1444	1
+1445	1
+1446	1
+1447	1
+1448	1
+1449	1
+1450	1
+1451	1
+1452	1
+1453	1
+1454	1
+1455	1
+1456	1
+1457	1
+1458	1
+1459	1
+1460	1
+1461	1
+1462	1
+1463	1
+1464	2
+1465	1
+1466	1
+1467	1
+1468	1
+1469	1
+1470	1
+1471	1
+1472	1
+1473	1
+1474	1
+1475	1
+1476	1
+1477	1
+1478	1
+1479	1
+1480	1
+1481	1
+1482	1
+1483	1
+1484	1
+1485	1
+1486	1
+1487	1
+1488	1
+1489	1
+1490	1
+1491	1
+1492	1
+1493	1
+1494	1
+1495	1
+1496	1
+1497	1
+1498	1
+1499	1
+1500	1
+1501	1
+1502	1
+1503	1
+1504	1
+1505	1
+1506	1
+1507	1
+1508	1
+1509	1
+1510	1
+1511	1
+1512	1
+1513	1
+1514	1
+1515	1
+1516	1
+1517	1
+1518	1
+1519	1
+1520	1
+1521	1
+1522	1
+1523	1
+1524	1
+1525	1
+1526	1
+1527	1
+1528	1
+1529	1
+1530	1
+1531	1
+1532	1
+1533	1
+1534	1
+1535	1
+1536	1
+1537	1
+1538	1
+1539	1
+1540	1
+1541	1
+1542	1
+1543	1
+1544	1
+1545	1
+1546	1
+1547	1
+1548	1
+1549	1
+1550	1
+1551	1
+1552	1
+1553	1
+1554	1
+1555	1
+1556	1
+1557	1
+1558	1
+1559	1
+1560	1
+1561	1
+1562	1
+1563	1
+1564	1
+1565	1
+1566	1
+1567	1
+1568	1
+1569	3
+1570	1
+1571	1
+1572	1
+1573	1
+1574	1
+1575	1
+1576	1
+1577	1
+1578	1
+1579	1
+1580	1
+1581	1
+1582	1
+1583	1
+1584	1
+1585	1
+1586	1
+1587	1
+1588	1
+1589	1
+1590	1
+1591	1
+1592	0
+1593	1
+1594	1
+1595	1
+1596	1
+1597	1
+1598	1
+1599	1
+1600	1
+1601	1
+1602	1
+1603	1
+1604	1
+1605	1
+1606	1
+1607	1
+1608	1
+1609	1
+1610	1
+1611	1
+1612	1
+1613	1
+1614	1
+1615	1
+1616	1
+1617	1
+1618	1
+1619	1
+1620	1
+1621	1
+1622	1
+1623	1
+1624	1
+1625	1
+1626	1
+1627	1
+1628	1
+1629	1
+1630	1
+1631	1
+1632	1
+1633	1
+1634	1
+1635	1
+1636	1
+1637	1
+1638	1
+1639	1
+1640	1
+1641	1
+1642	1
+1643	1
+1644	1
+1645	1
+1646	1
+1647	1
+1648	1
+1649	1
+1650	1
+1651	1
+1652	1
+1653	1
+1654	1
+1655	1
+1656	1
+1657	1
+1658	1
+1659	1
+1660	1
+1661	1
+1662	1
+1663	1
+1664	1
+1665	1
+1666	1
+1667	1
+1668	1
+1669	1
+1670	1
+1671	1
+1672	1
+1673	1
+1674	1
+1675	1
+1676	1
+1677	1
+1678	1
+1679	1
+1680	1
+1681	3
+1682	1
+1683	1
+1684	1
+1685	1
+1686	1
+1687	1
+1688	1
+1689	1
+1690	1
+1691	1
+1692	1
+1693	1
+1694	1
+1695	1
+1696	1
+1697	1
+1698	1
+1699	1
+1700	1
+1701	1
+1702	1
+1703	1
+1704	1
+1705	1
+1706	1
+1707	1
+1708	1
+1709	1
+1710	1
+1711	1
+1712	1
+1713	1
+1714	1
+1715	1
+1716	1
+1717	1
+1718	1
+1719	1
+1720	1
+1721	1
+1722	1
+1723	1
+1724	1
+1725	1
+1726	1
+1727	1
+1728	1
+1729	1
+1730	1
+1731	1
+1732	1
+1733	1
+1734	1
+1735	1
+1736	1
+1737	1
+1738	1
+1739	1
+1740	1
+1741	1
+1742	1
+1743	1
+1744	1
+1745	1
+1746	1
+1747	1
+1748	1
+1749	1
+1750	1
+1751	1
+1752	1
+1753	1
+1754	0
+1755	1
+1756	1
+1757	1
+1758	1
+1759	1
+1760	1
+1761	1
+1762	1
+1763	1
+1764	1
+1765	1
+1766	1
+1767	1
+1768	1
+1769	1
+1770	1
+1771	1
+1772	1
+1773	1
+1774	1
+1775	1
+1776	1
+1777	1
+1778	1
+1779	1
+1780	1
+1781	1
+1782	1
+1783	1
+1784	1
+1785	1
+1786	1
+1787	1
+1788	1
+1789	1
+1790	1
+1791	1
+1792	1
+1793	1
+1794	1
+1795	1
+1796	1
+1797	1
+1798	1
+1799	1
+1800	1
+1801	1
+1802	1
+1803	1
+1804	1
+1805	1
+1806	1
+1807	1
+1808	1
+1809	1
+1810	1
+1811	1
+1812	1
+1813	1
+1814	1
+1815	1
+1816	1
+1817	1
+1818	1
+1819	1
+1820	1
+1821	1
+1822	1
+1823	1
+1824	1
+1825	1
+1826	1
+1827	1
+1828	1
+1829	1
+1830	1
+1831	1
+1832	1
+1833	1
+1834	3
+1835	1
+1836	1
+1837	1
+1838	1
+1839	1
+1840	1
+1841	1
+1842	1
+1843	1
+1844	1
+1845	1
+1846	1
+1847	1
+1848	1
+1849	1
+1850	1
+1851	1
+1852	1
+1853	1
+1854	1
+1855	1
+1856	1
+1857	1
+1858	1
+1859	1
+1860	1
+1861	1
+1862	1
+1863	1
+1864	1
+1865	1
+1866	1
+1867	1
+1868	1
+1869	1
+1870	1
+1871	1
+1872	1
+1873	1
+1874	1
+1875	1
+1876	1
+1877	1
+1878	1
+1879	1
+1880	1
+1881	1
+1882	1
+1883	1
+1884	1
+1885	1
+1886	1
+1887	1
+1888	1
+1889	1
+1890	1
+1891	1
+1892	1
+1893	1
+1894	1
+1895	1
+1896	1
+1897	1
+1898	1
+1899	1
+1900	2
+1901	2
+1902	2
+1903	2
+1904	2
+1905	2
+1906	2
+1907	2
+1908	2
+1909	2
+1910	2
+1911	2
+1912	2
+1913	2
+1914	2
+1915	2
+1916	2
+1917	2
+1918	2
+1919	2
+1920	2
+1921	2
+1922	2
+1923	2
+1924	2
+1925	2
+1926	2
+1927	2
+1928	2
+1929	2
+1930	2
+1931	2
+1932	2
+1933	2
+1934	2
+1935	2
+1936	2
+1937	2
+1938	2
+1939	2
+1940	3
+1941	2
+1942	2
+1943	2
+1944	2
+1945	2
+1946	2
+1947	2
+1948	2
+1949	2
+1950	2
+1951	2
+1952	2
+1953	2
+1954	2
+1955	2
+1956	2
+1957	2
+1958	2
+1959	2
+1960	2
+1961	2
+1962	2
+1963	2
+1964	2
+1965	2
+1966	2
+1967	2
+1968	2
+1969	2
+1970	2
+1971	2
+1972	2
+1973	2
+1974	2
+1975	2
+1976	2
+1977	0
+1978	3
+1979	2
+1980	2
+1981	2
+1982	2
+1983	2
+1984	2
+1985	2
+1986	2
+1987	2
+1988	2
+1989	2
+1990	2
+1991	2
+1992	2
+1993	2
+1994	2
+1995	2
+1996	2
+1997	2
+1998	2
+1999	3
+2000	2
+2001	2
+2002	2
+2003	2
+2004	2
+2005	2
+2006	2
+2007	2
+2008	2
+2009	2
+2010	2
+2011	2
+2012	2
+2013	2
+2014	2
+2015	2
+2016	2
+2017	2
+2018	2
+2019	2
+2020	2
+2021	2
+2022	2
+2023	2
+2024	2
+2025	2
+2026	2
+2027	2
+2028	2
+2029	2
+2030	3
+2031	2
+2032	2
+2033	2
+2034	2
+2035	2
+2036	2
+2037	2
+2038	2
+2039	2
+2040	2
+2041	2
+2042	2
+2043	2
+2044	2
+2045	2
+2046	2
+2047	2
+2048	3
+2049	2
+2050	2
+2051	3
+2052	2
+2053	2
+2054	2
+2055	2
+2056	3
+2057	2
+2058	2
+2059	2
+2060	2
+2061	2
+2062	2
+2063	2
+2064	2
+2065	2
+2066	2
+2067	2
+2068	2
+2069	2
+2070	2
+2071	2
+2072	2
+2073	3
+2074	2
+2075	2
+2076	2
+2077	2
+2078	2
+2079	3
+2080	3
+2081	2
+2082	2
+2083	2
+2084	2
+2085	2
+2086	2
+2087	2
+2088	2
+2089	2
+2090	2
+2091	2
+2092	2
+2093	2
+2094	2
+2095	2
+2096	2
+2097	2
+2098	2
+2099	2
+2100	3
+2101	1
+2102	3
+2103	2
+2104	2
+2105	2
+2106	2
+2107	2
+2108	2
+2109	1
+2110	2
+2111	2
+2112	2
+2113	2
+2114	2
+2115	2
+2116	2
+2117	2
+2118	2
+2119	3
+2120	2
+2121	2
+2122	2
+2123	2
+2124	2
+2125	2
+2126	1
+2127	2
+2128	2
+2129	1
+2130	2
+2131	2
+2132	2
+2133	2
+2134	2
+2135	2
+2136	2
+2137	2
+2138	2
+2139	2
+2140	2
+2141	2
+2142	2
+2143	2
+2144	2
+2145	2
+2146	2
+2147	2
+2148	2
+2149	2
+2150	2
+2151	2
+2152	2
+2153	2
+2154	2
+2155	2
+2156	2
+2157	3
+2158	2
+2159	2
+2160	3
+2161	2
+2162	2
+2163	2
+2164	2
+2165	2
+2166	2
+2167	0
+2168	2
+2169	2
+2170	2
+2171	3
+2172	2
+2173	2
+2174	2
+2175	2
+2176	2
+2177	0
+2178	2
+2179	2
+2180	3
+2181	2
+2182	2
+2183	2
+2184	2
+2185	2
+2186	2
+2187	2
+2188	2
+2189	2
+2190	2
+2191	2
+2192	2
+2193	2
+2194	2
+2195	2
+2196	2
+2197	3
+2198	2
+2199	2
+2200	2
+2201	2
+2202	2
+2203	2
+2204	2
+2205	2
+2206	2
+2207	2
+2208	2
+2209	2
+2210	2
+2211	2
+2212	2
+2213	2
+2214	2
+2215	2
+2216	0
+2217	2
+2218	2
+2219	2
+2220	2
+2221	2
+2222	0
+2223	2
+2224	2
+2225	2
+2226	2
+2227	2
+2228	2
+2229	2
+2230	2
+2231	2
+2232	2
+2233	3
+2234	3
+2235	2
+2236	0
+2237	2
+2238	2
+2239	2
+2240	2
+2241	2
+2242	2
+2243	2
+2244	2
+2245	2
+2246	2
+2247	2
+2248	3
+2249	2
+2250	3
+2251	2
+2252	2
+2253	2
+2254	2
+2255	1
+2256	2
+2257	2
+2258	2
+2259	2
+2260	2
+2261	2
+2262	2
+2263	2
+2264	2
+2265	2
+2266	2
+2267	3
+2268	2
+2269	2
+2270	2
+2271	2
+2272	2
+2273	2
+2274	2
+2275	2
+2276	2
+2277	2
+2278	3
+2279	2
+2280	3
+2281	2
+2282	2
+2283	2
+2284	2
+2285	3
+2286	2
+2287	2
+2288	2
+2289	2
+2290	2
+2291	3
+2292	2
+2293	2
+2294	2
+2295	2
+2296	2
+2297	2
+2298	2
+2299	2
+2300	2
+2301	2
+2302	2
+2303	2
+2304	2
+2305	2
+2306	2
+2307	2
+2308	2
+2309	2
+2310	3
+2311	2
+2312	2
+2313	2
+2314	2
+2315	2
+2316	2
+2317	3
+2318	3
+2319	2
+2320	2
+2321	2
+2322	2
+2323	3
+2324	2
+2325	2
+2326	2
+2327	2
+2328	2
+2329	3
+2330	0
+2331	2
+2332	2
+2333	2
+2334	2
+2335	2
+2336	2
+2337	3
+2338	2
+2339	2
+2340	2
+2341	0
+2342	2
+2343	3
+2344	2
+2345	2
+2346	2
+2347	2
+2348	2
+2349	3
+2350	2
+2351	2
+2352	2
+2353	2
+2354	3
+2355	2
+2356	2
+2357	2
+2358	2
+2359	3
+2360	2
+2361	2
+2362	2
+2363	2
+2364	2
+2365	2
+2366	2
+2367	3
+2368	2
+2369	2
+2370	3
+2371	2
+2372	2
+2373	2
+2374	2
+2375	2
+2376	2
+2377	2
+2378	2
+2379	2
+2380	3
+2381	2
+2382	3
+2383	2
+2384	2
+2385	2
+2386	3
+2387	2
+2388	2
+2389	0
+2390	3
+2391	2
+2392	2
+2393	2
+2394	2
+2395	2
+2396	2
+2397	2
+2398	2
+2399	2
+2400	3
+2401	0
+2402	2
+2403	2
+2404	2
+2405	2
+2406	2
+2407	2
+2408	3
+2409	2
+2410	2
+2411	2
+2412	2
+2413	2
+2414	2
+2415	2
+2416	2
+2417	2
+2418	2
+2419	2
+2420	2
+2421	3
+2422	2
+2423	0
+2424	2
+2425	2
+2426	2
+2427	2
+2428	3
+2429	2
+2430	2
+2431	2
+2432	2
+2433	3
+2434	3
+2435	2
+2436	3
+2437	2
+2438	2
+2439	2
+2440	2
+2441	2
+2442	2
+2443	2
+2444	0
+2445	2
+2446	2
+2447	2
+2448	2
+2449	2
+2450	2
+2451	3
+2452	2
+2453	2
+2454	3
+2455	2
+2456	3
+2457	2
+2458	2
+2459	2
+2460	2
+2461	2
+2462	2
+2463	2
+2464	2
+2465	2
+2466	3
+2467	3
+2468	2
+2469	2
+2470	2
+2471	2
+2472	2
+2473	2
+2474	2
+2475	2
+2476	1
+2477	2
+2478	2
+2479	2
+2480	2
+2481	2
+2482	3
+2483	2
+2484	2
+2485	2
+2486	3
+2487	2
+2488	2
+2489	2
+2490	2
+2491	2
+2492	2
+2493	2
+2494	2
+2495	2
+2496	2
+2497	2
+2498	2
+2499	2
+2500	2
+2501	1
+2502	2
+2503	2
+2504	2
+2505	3
+2506	2
+2507	2
+2508	2
+2509	2
+2510	2
+2511	2
+2512	2
+2513	2
+2514	2
+2515	2
+2516	2
+2517	2
+2518	3
+2519	2
+2520	3
+2521	2
+2522	3
+2523	2
+2524	2
+2525	2
+2526	2
+2527	2
+2528	3
+2529	2
+2530	2
+2531	2
+2532	2
+2533	2
+2534	2
+2535	2
+2536	2
+2537	0
+2538	2
+2539	2
+2540	1
+2541	2
+2542	2
+2543	2
+2544	2
+2545	2
+2546	2
+2547	2
+2548	2
+2549	2
+2550	2
+2551	2
+2552	2
+2553	2
+2554	2
+2555	2
+2556	2
+2557	2
+2558	2
+2559	2
+2560	2
+2561	2
+2562	2
+2563	2
+2564	2
+2565	2
+2566	2
+2567	2
+2568	2
+2569	2
+2570	3
+2571	2
+2572	2
+2573	2
+2574	3
+2575	2
+2576	3
+2577	3
+2578	2
+2579	2
+2580	2
+2581	2
+2582	2
+2583	3
+2584	2
+2585	2
+2586	2
+2587	2
+2588	2
+2589	2
+2590	2
+2591	2
+2592	2
+2593	2
+2594	2
+2595	2
+2596	2
+2597	2
+2598	2
+2599	2
+2600	2
+2601	2
+2602	2
+2603	2
+2604	2
+2605	2
+2606	3
+2607	2
+2608	2
+2609	2
+2610	2
+2611	2
+2612	2
+2613	2
+2614	2
+2615	2
+2616	2
+2617	2
+2618	3
+2619	3
+2620	2
+2621	2
+2622	2
+2623	2
+2624	2
+2625	2
+2626	2
+2627	2
+2628	3
+2629	2
+2630	3
+2631	2
+2632	2
+2633	2
+2634	2
+2635	2
+2636	2
+2637	2
+2638	2
+2639	2
+2640	2
+2641	2
+2642	2
+2643	2
+2644	2
+2645	2
+2646	2
+2647	2
+2648	2
+2649	2
+2650	2
+2651	2
+2652	3
+2653	2
+2654	3
+2655	2
+2656	2
+2657	2
+2658	2
+2659	2
+2660	2
+2661	3
+2662	2
+2663	3
+2664	2
+2665	2
+2666	3
+2667	2
+2668	2
+2669	2
+2670	3
+2671	2
+2672	2
+2673	3
+2674	2
+2675	2
+2676	2
+2677	2
+2678	2
+2679	2
+2680	2
+2681	2
+2682	2
+2683	2
+2684	2
+2685	2
+2686	2
+2687	2
+2688	3
+2689	2
+2690	2
+2691	2
+2692	2
+2693	2
+2694	2
+2695	3
+2696	2
+2697	0
+2698	3
+2699	2
+2700	2
+2701	2
+2702	1
+2703	2
+2704	2
+2705	2
+2706	2
+2707	2
+2708	2
+2709	2
+2710	3
+2711	2
+2712	2
+2713	2
+2714	2
+2715	2
+2716	2
+2717	2
+2718	3
+2719	2
+2720	2
+2721	2
+2722	2
+2723	2
+2724	2
+2725	2
+2726	2
+2727	2
+2728	3
+2729	2
+2730	2
+2731	2
+2732	2
+2733	2
+2734	2
+2735	2
+2736	2
+2737	2
+2738	3
+2739	2
+2740	2
+2741	2
+2742	2
+2743	2
+2744	3
+2745	3
+2746	2
+2747	2
+2748	2
+2749	2
+2750	2
+2751	0
+2752	2
+2753	2
+2754	2
+2755	2
+2756	3
+2757	2
+2758	2
+2759	2
+2760	2
+2761	2
+2762	2
+2763	2
+2764	2
+2765	2
+2766	2
+2767	2
+2768	3
+2769	2
+2770	2
+2771	2
+2772	2
+2773	2
+2774	2
+2775	2
+2776	2
+2777	2
+2778	2
+2779	2
+2780	3
+2781	2
+2782	2
+2783	2
+2784	2
+2785	2
+2786	0
+2787	2
+2788	2
+2789	2
+2790	2
+2791	3
+2792	3
+2793	2
+2794	3
+2795	2
+2796	0
+2797	2
+2798	2
+2799	2
+2800	2
+2801	2
+2802	2
+2803	2
+2804	2
+2805	2
+2806	0
+2807	2
+2808	2
+2809	2
+2810	2
+2811	2
+2812	2
+2813	2
+2814	3
+2815	2
+2816	2
+2817	3
+2818	3
+2819	2
+2820	2
+2821	3
+2822	2
+2823	2
+2824	2
+2825	2
+2826	2
+2827	2
+2828	2
+2829	2
+2830	2
+2831	2
+2832	2
+2833	2
+2834	2
+2835	2
+2836	2
+2837	2
+2838	2
+2839	2
+2840	2
+2841	2
+2842	2
+2843	2
+2844	2
+2845	2
+2846	2
+2847	2
+2848	2
+2849	2
+2850	2
+2851	3
+2852	3
+2853	2
+2854	3
+2855	0
+2856	3
+2857	3
+2858	3
+2859	3
+2860	3
+2861	3
+2862	3
+2863	3
+2864	3
+2865	3
+2866	3
+2867	3
+2868	3
+2869	3
+2870	3
+2871	2
+2872	3
+2873	3
+2874	3
+2875	3
+2876	3
+2877	3
+2878	3
+2879	3
+2880	3
+2881	2
+2882	3
+2883	3
+2884	3
+2885	3
+2886	3
+2887	2
+2888	3
+2889	1
+2890	3
+2891	3
+2892	3
+2893	3
+2894	3
+2895	3
+2896	3
+2897	3
+2898	3
+2899	3
+2900	3
+2901	3
+2902	0
+2903	3
+2904	3
+2905	3
+2906	3
+2907	3
+2908	3
+2909	3
+2910	3
+2911	2
+2912	3
+2913	3
+2914	3
+2915	3
+2916	2
+2917	3
+2918	3
+2919	3
+2920	3
+2921	3
+2922	3
+2923	3
+2924	3
+2925	3
+2926	3
+2927	3
+2928	3
+2929	2
+2930	3
+2931	3
+2932	3
+2933	3
+2934	3
+2935	3
+2936	3
+2937	3
+2938	3
+2939	3
+2940	3
+2941	3
+2942	3
+2943	2
+2944	3
+2945	3
+2946	3
+2947	3
+2948	3
+2949	3
+2950	3
+2951	3
+2952	3
+2953	3
+2954	3
+2955	3
+2956	3
+2957	3
+2958	3
+2959	3
+2960	3
+2961	2
+2962	3
+2963	3
+2964	3
+2965	3
+2966	3
+2967	3
+2968	3
+2969	3
+2970	3
+2971	3
+2972	0
+2973	3
+2974	3
+2975	3
+2976	3
+2977	3
+2978	3
+2979	3
+2980	3
+2981	3
+2982	3
+2983	3
+2984	3
+2985	3
+2986	3
+2987	3
+2988	3
+2989	3
+2990	3
+2991	3
+2992	2
+2993	3
+2994	3
+2995	3
+2996	3
+2997	3
+2998	3
+2999	3
+3000	3
+3001	3
+3002	2
+3003	3
+3004	2
+3005	3
+3006	3
+3007	3
+3008	3
+3009	3
+3010	3
+3011	3
+3012	3
+3013	3
+3014	3
+3015	3
+3016	3
+3017	3
+3018	3
+3019	3
+3020	3
+3021	3
+3022	3
+3023	3
+3024	3
+3025	3
+3026	3
+3027	3
+3028	2
+3029	2
+3030	3
+3031	3
+3032	3
+3033	3
+3034	2
+3035	0
+3036	3
+3037	3
+3038	2
+3039	3
+3040	3
+3041	3
+3042	3
+3043	3
+3044	3
+3045	3
+3046	3
+3047	3
+3048	3
+3049	3
+3050	3
+3051	3
+3052	3
+3053	3
+3054	3
+3055	3
+3056	3
+3057	3
+3058	3
+3059	3
+3060	3
+3061	3
+3062	3
+3063	3
+3064	3
+3065	3
+3066	3
+3067	3
+3068	3
+3069	3
+3070	3
+3071	3
+3072	3
+3073	3
+3074	3
+3075	3
+3076	3
+3077	2
+3078	3
+3079	3
+3080	3
+3081	3
+3082	3
+3083	3
+3084	3
+3085	3
+3086	3
+3087	3
+3088	3
+3089	3
+3090	3
+3091	2
+3092	3
+3093	3
+3094	3
+3095	3
+3096	3
+3097	3
+3098	3
+3099	1
+3100	3
+3101	3
+3102	3
+3103	0
+3104	3
+3105	3
+3106	3
+3107	3
+3108	3
+3109	3
+3110	3
+3111	3
+3112	3
+3113	3
+3114	3
+3115	3
+3116	2
+3117	3
+3118	3
+3119	3
+3120	3
+3121	3
+3122	3
+3123	3
+3124	3
+3125	0
+3126	3
+3127	3
+3128	3
+3129	3
+3130	3
+3131	3
+3132	3
+3133	3
+3134	0
+3135	3
+3136	3
+3137	3
+3138	3
+3139	3
+3140	3
+3141	3
+3142	3
+3143	3
+3144	3
+3145	3
+3146	3
+3147	2
+3148	3
+3149	3
+3150	3
+3151	3
+3152	3
+3153	3
+3154	3
+3155	3
+3156	3
+3157	3
+3158	3
+3159	3
+3160	3
+3161	3
+3162	3
+3163	3
+3164	3
+3165	3
+3166	3
+3167	0
+3168	3
+3169	3
+3170	3
+3171	3
+3172	3
+3173	3
+3174	3
+3175	3
+3176	3
+3177	3
+3178	3
+3179	0
+3180	3
+3181	3
+3182	2
+3183	3
+3184	3
+3185	3
+3186	3
+3187	3
+3188	3
+3189	3
+3190	3
+3191	3
+3192	3
+3193	0
+3194	3
+3195	3
+3196	3
+3197	3
+3198	3
+3199	3
+3200	3
+3201	3
+3202	3
+3203	2
+3204	3
+3205	3
+3206	3
+3207	2
+3208	3
+3209	2
+3210	3
+3211	3
+3212	3
+3213	3
+3214	3
+3215	3
+3216	3
+3217	3
+3218	3
+3219	3
+3220	3
+3221	3
+3222	3
+3223	3
+3224	3
+3225	2
+3226	3
+3227	3
+3228	3
+3229	3
+3230	3
+3231	3
+3232	3
+3233	3
+3234	3
+3235	3
+3236	3
+3237	3
+3238	0
+3239	3
+3240	3
+3241	3
+3242	3
+3243	2
+3244	3
+3245	3
+3246	3
+3247	3
+3248	3
+3249	3
+3250	3
+3251	3
+3252	3
+3253	3
+3254	3
+3255	3
+3256	3
+3257	3
+3258	3
+3259	2
+3260	3
+3261	3
+3262	3
+3263	3
+3264	3
+3265	3
+3266	3
+3267	3
+3268	2
+3269	3
+3270	3
+3271	3
+3272	3
+3273	3
+3274	3
+3275	3
+3276	3
+3277	3
+3278	3
+3279	3
+3280	3
+3281	3
+3282	3
+3283	3
+3284	3
+3285	3
+3286	3
+3287	3
+3288	3
+3289	3
+3290	3
+3291	3
+3292	3
+3293	3
+3294	3
+3295	3
+3296	3
+3297	3
+3298	3
+3299	3
+3300	3
+3301	3
+3302	3
+3303	3
+3304	3
+3305	3
+3306	3
+3307	3
+3308	3
+3309	3
+3310	3
+3311	3
+3312	3
+3313	3
+3314	3
+3315	3
+3316	3
+3317	3
+3318	3
+3319	3
+3320	3
+3321	3
+3322	3
+3323	3
+3324	0
+3325	3
+3326	3
+3327	3
+3328	3
+3329	3
+3330	3
+3331	3
+3332	3
+3333	3
+3334	1
+3335	3
+3336	3
+3337	3
+3338	3
+3339	3
+3340	3
+3341	3
+3342	3
+3343	3
+3344	3
+3345	3
+3346	3
+3347	3
+3348	3
+3349	3
+3350	3
+3351	3
+3352	3
+3353	3
+3354	3
+3355	3
+3356	3
+3357	3
+3358	2
+3359	3
+3360	3
+3361	3
+3362	3
+3363	3
+3364	3
+3365	3
+3366	3
+3367	3
+3368	3
+3369	3
+3370	3
+3371	3
+3372	3
+3373	3
+3374	3
+3375	3
+3376	3
+3377	3
+3378	3
+3379	3
+3380	3
+3381	3
+3382	3
+3383	3
+3384	3
+3385	3
+3386	3
+3387	3
+3388	3
+3389	3
+3390	3
+3391	3
+3392	3
+3393	3
+3394	3
+3395	3
+3396	3
+3397	3
+3398	3
+3399	3
+3400	1
+3401	1
+3402	3
+3403	3
+3404	3
+3405	3
+3406	3
+3407	3
+3408	3
+3409	3
+3410	3
+3411	3
+3412	3
+3413	3
+3414	3
+3415	3
+3416	3
+3417	3
+3418	3
+3419	3
+3420	3
+3421	3
+3422	3
+3423	3
+3424	3
+3425	3
+3426	3
+3427	3
+3428	3
+3429	3
+3430	3
+3431	3
+3432	3
+3433	3
+3434	3
+3435	3
+3436	3
+3437	3
+3438	2
+3439	3
+3440	3
+3441	3
+3442	3
+3443	3
+3444	3
+3445	3
+3446	2
+3447	2
+3448	3
+3449	3
+3450	3
+3451	3
+3452	3
+3453	3
+3454	3
+3455	3
+3456	3
+3457	3
+3458	3
+3459	3
+3460	3
+3461	3
+3462	3
+3463	3
+3464	3
+3465	3
+3466	3
+3467	3
+3468	3
+3469	3
+3470	2
+3471	3
+3472	3
+3473	3
+3474	3
+3475	3
+3476	3
+3477	3
+3478	3
+3479	3
+3480	2
+3481	2
+3482	3
+3483	3
+3484	3
+3485	3
+3486	3
+3487	3
+3488	3
+3489	3
+3490	3
+3491	3
+3492	3
+3493	3
+3494	3
+3495	3
+3496	3
+3497	2
+3498	3
+3499	3
+3500	0
+3501	3
+3502	3
+3503	3
+3504	3
+3505	3
+3506	3
+3507	3
+3508	3
+3509	3
+3510	3
+3511	3
+3512	3
+3513	3
+3514	3
+3515	3
+3516	3
+3517	3
+3518	3
+3519	2
+3520	3
+3521	3
+3522	3
+3523	3
+3524	3
+3525	3
+3526	3
+3527	3
+3528	3
+3529	3
+3530	3
+3531	3
+3532	3
+3533	3
+3534	3
+3535	3
+3536	3
+3537	3
+3538	3
+3539	3
+3540	3
+3541	3
+3542	3
+3543	2
+3544	3
+3545	3
+3546	3
+3547	3
+3548	3
+3549	3
+3550	3
+3551	3
+3552	3
+3553	3
+3554	3
+3555	3
+3556	3
+3557	3
+3558	3
+3559	3
+3560	3
+3561	3
+3562	3
+3563	3
+3564	3
+3565	3
+3566	3
+3567	3
+3568	3
+3569	3
+3570	3
+3571	3
+3572	3
+3573	3
+3574	3
+3575	3
+3576	3
+3577	3
+3578	3
+3579	3
+3580	3
+3581	3
+3582	3
+3583	3
+3584	3
+3585	3
+3586	3
+3587	3
+3588	3
+3589	3
+3590	3
+3591	3
+3592	3
+3593	3
+3594	3
+3595	3
+3596	3
+3597	3
+3598	3
+3599	3
+3600	3
+3601	3
+3602	3
+3603	3
+3604	3
+3605	3
+3606	3
+3607	3
+3608	3
+3609	3
+3610	3
+3611	2
+3612	3
+3613	3
+3614	3
+3615	3
+3616	3
+3617	3
+3618	3
+3619	0
+3620	3
+3621	3
+3622	3
+3623	3
+3624	3
+3625	3
+3626	3
+3627	3
+3628	3
+3629	3
+3630	3
+3631	3
+3632	3
+3633	3
+3634	3
+3635	3
+3636	3
+3637	3
+3638	3
+3639	3
+3640	3
+3641	3
+3642	3
+3643	3
+3644	3
+3645	3
+3646	3
+3647	3
+3648	3
+3649	3
+3650	3
+3651	3
+3652	3
+3653	3
+3654	3
+3655	3
+3656	3
+3657	3
+3658	3
+3659	3
+3660	3
+3661	3
+3662	3
+3663	3
+3664	3
+3665	3
+3666	3
+3667	3
+3668	3
+3669	3
+3670	3
+3671	3
+3672	3
+3673	2
+3674	3
+3675	3
+3676	3
+3677	3
+3678	3
+3679	3
+3680	3
+3681	3
+3682	3
+3683	3
+3684	3
+3685	3
+3686	3
+3687	3
+3688	3
+3689	3
+3690	3
+3691	3
+3692	2
+3693	3
+3694	3
+3695	3
+3696	3
+3697	3
+3698	3
+3699	3
+3700	3
+3701	0
+3702	3
+3703	3
+3704	3
+3705	3
+3706	3
+3707	0
+3708	3
+3709	3
+3710	3
+3711	3
+3712	3
+3713	3
+3714	3
+3715	3
+3716	3
+3717	3
+3718	3
+3719	3
+3720	3
+3721	2
+3722	3
+3723	3
+3724	3
+3725	2
+3726	3
+3727	3
+3728	3
+3729	3
+3730	3
+3731	3
+3732	3
+3733	3
+3734	3
+3735	3
+3736	3
+3737	3
+3738	3
+3739	3
+3740	3
+3741	3
+3742	3
+3743	3
+3744	3
+3745	3
+3746	3
+3747	3
+3748	3
+3749	3
+3750	3
+3751	3
+3752	3
+3753	3
+3754	3
+3755	3
+3756	2
+3757	3
+3758	0
+3759	3
+3760	3
+3761	3
+3762	3
+3763	2
+3764	3
+3765	3
+3766	3
+3767	3
+3768	3
+3769	3
+3770	3
+3771	3
+3772	3
+3773	3
+3774	3
+3775	3
+3776	3
+3777	3
+3778	3
+3779	3
+3780	3
+3781	3
+3782	3
+3783	3
+3784	3
+3785	3
+3786	3
+3787	3
+3788	2
+3789	3
+3790	3
+3791	3
+3792	2
+3793	2
+3794	3
+3795	3
+3796	3
+3797	3
+3798	3
+3799	3
diff --git a/out/roberta_results/README.md b/out/roberta_results/README.md
new file mode 100644
index 0000000..efb30d6
--- /dev/null
+++ b/out/roberta_results/README.md
@@ -0,0 +1,53 @@
+---
+tags:
+- generated_from_trainer
+model-index:
+- name: roberta_results
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# roberta_results
+
+This model is a fine-tuned version of the local checkpoint `out/roberta` (a local directory, not a Hugging Face Hub repository) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 0.2960
+- eval_accuracy: 0.9230
+- eval_runtime: 17.8166
+- eval_samples_per_second: 112.255
+- eval_steps_per_second: 14.032
+- step: 0
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 8
+- eval_batch_size: 8
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- training_steps: 2500
+
+### Framework versions
+
+- Transformers 4.26.1
+- Pytorch 1.13.1+cu117
+- Datasets 2.9.0
+- Tokenizers 0.13.2
diff --git a/out/roberta_results/all_results.json b/out/roberta_results/all_results.json
new file mode 100644
index 0000000..3fe0a29
--- /dev/null
+++ b/out/roberta_results/all_results.json
@@ -0,0 +1,8 @@
+{
+    "eval_accuracy": 0.9229999780654907,
+    "eval_loss": 0.29598742723464966,
+    "eval_runtime": 17.8166,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 112.255,
+    "eval_steps_per_second": 14.032
+}
\ No newline at end of file
diff --git a/out/roberta_results/eval_results.json b/out/roberta_results/eval_results.json
new file mode 100644
index 0000000..3fe0a29
--- /dev/null
+++ b/out/roberta_results/eval_results.json
@@ -0,0 +1,8 @@
+{
+    "eval_accuracy": 0.9229999780654907,
+    "eval_loss": 0.29598742723464966,
+    "eval_runtime": 17.8166,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 112.255,
+    "eval_steps_per_second": 14.032
+}
\ No newline at end of file
diff --git a/out/roberta_results/predict_results_None.txt b/out/roberta_results/predict_results_None.txt
new file mode 100644
index 0000000..f6c33da
--- /dev/null
+++ b/out/roberta_results/predict_results_None.txt
@@ -0,0 +1,3801 @@
+index	prediction
+0	0
+1	0
+2	0
+3	0
+4	0
+5	0
+6	0
+7	0
+8	0
+9	0
+10	0
+11	1
+12	2
+13	0
+14	0
+15	0
+16	0
+17	0
+18	0
+19	3
+20	0
+21	0
+22	0
+23	0
+24	0
+25	0
+26	0
+27	2
+28	0
+29	1
+30	0
+31	0
+32	0
+33	0
+34	0
+35	0
+36	0
+37	0
+38	0
+39	0
+40	0
+41	2
+42	0
+43	3
+44	0
+45	0
+46	0
+47	0
+48	0
+49	2
+50	0
+51	0
+52	0
+53	1
+54	1
+55	0
+56	0
+57	0
+58	0
+59	0
+60	0
+61	0
+62	0
+63	0
+64	0
+65	0
+66	0
+67	0
+68	0
+69	0
+70	0
+71	0
+72	0
+73	0
+74	0
+75	0
+76	0
+77	0
+78	0
+79	0
+80	0
+81	0
+82	0
+83	0
+84	0
+85	0
+86	0
+87	0
+88	0
+89	1
+90	2
+91	1
+92	0
+93	0
+94	0
+95	0
+96	0
+97	0
+98	0
+99	0
+100	0
+101	0
+102	0
+103	0
+104	1
+105	0
+106	3
+107	0
+108	0
+109	0
+110	0
+111	0
+112	0
+113	0
+114	1
+115	0
+116	0
+117	0
+118	0
+119	0
+120	0
+121	0
+122	0
+123	2
+124	0
+125	0
+126	0
+127	0
+128	0
+129	0
+130	0
+131	0
+132	0
+133	0
+134	0
+135	0
+136	0
+137	0
+138	1
+139	0
+140	0
+141	0
+142	0
+143	0
+144	0
+145	0
+146	0
+147	0
+148	0
+149	0
+150	0
+151	0
+152	1
+153	0
+154	0
+155	0
+156	0
+157	3
+158	0
+159	0
+160	0
+161	0
+162	0
+163	0
+164	0
+165	0
+166	0
+167	0
+168	0
+169	0
+170	0
+171	0
+172	0
+173	0
+174	0
+175	0
+176	0
+177	0
+178	2
+179	0
+180	0
+181	0
+182	0
+183	0
+184	0
+185	0
+186	0
+187	0
+188	0
+189	0
+190	0
+191	0
+192	0
+193	0
+194	0
+195	0
+196	0
+197	0
+198	0
+199	0
+200	0
+201	0
+202	0
+203	0
+204	0
+205	0
+206	0
+207	0
+208	0
+209	3
+210	0
+211	0
+212	0
+213	0
+214	0
+215	0
+216	0
+217	0
+218	2
+219	0
+220	0
+221	0
+222	0
+223	0
+224	0
+225	1
+226	0
+227	0
+228	0
+229	2
+230	0
+231	0
+232	0
+233	0
+234	0
+235	0
+236	0
+237	0
+238	0
+239	0
+240	0
+241	0
+242	0
+243	0
+244	0
+245	0
+246	0
+247	0
+248	0
+249	0
+250	2
+251	0
+252	0
+253	0
+254	0
+255	0
+256	0
+257	0
+258	1
+259	0
+260	0
+261	3
+262	0
+263	0
+264	0
+265	0
+266	0
+267	0
+268	0
+269	0
+270	0
+271	0
+272	0
+273	0
+274	0
+275	3
+276	0
+277	0
+278	0
+279	0
+280	0
+281	0
+282	0
+283	0
+284	0
+285	0
+286	0
+287	0
+288	0
+289	0
+290	0
+291	0
+292	0
+293	0
+294	0
+295	0
+296	0
+297	0
+298	0
+299	0
+300	0
+301	0
+302	0
+303	3
+304	0
+305	0
+306	0
+307	0
+308	0
+309	0
+310	0
+311	0
+312	0
+313	0
+314	0
+315	0
+316	0
+317	0
+318	1
+319	0
+320	0
+321	0
+322	0
+323	0
+324	0
+325	0
+326	0
+327	0
+328	0
+329	0
+330	0
+331	0
+332	0
+333	0
+334	0
+335	2
+336	0
+337	0
+338	0
+339	0
+340	0
+341	0
+342	0
+343	0
+344	0
+345	0
+346	0
+347	0
+348	0
+349	0
+350	0
+351	0
+352	0
+353	0
+354	0
+355	3
+356	0
+357	0
+358	0
+359	2
+360	0
+361	1
+362	2
+363	0
+364	0
+365	0
+366	0
+367	3
+368	0
+369	0
+370	0
+371	0
+372	0
+373	0
+374	0
+375	0
+376	0
+377	0
+378	0
+379	0
+380	0
+381	0
+382	0
+383	0
+384	0
+385	0
+386	0
+387	0
+388	0
+389	0
+390	0
+391	0
+392	0
+393	0
+394	0
+395	0
+396	0
+397	0
+398	0
+399	0
+400	0
+401	0
+402	0
+403	0
+404	0
+405	0
+406	0
+407	0
+408	0
+409	0
+410	2
+411	0
+412	0
+413	0
+414	0
+415	0
+416	0
+417	0
+418	2
+419	0
+420	3
+421	0
+422	0
+423	0
+424	0
+425	0
+426	0
+427	0
+428	0
+429	0
+430	0
+431	0
+432	0
+433	0
+434	0
+435	0
+436	0
+437	0
+438	0
+439	0
+440	0
+441	0
+442	0
+443	0
+444	0
+445	0
+446	0
+447	0
+448	0
+449	0
+450	0
+451	0
+452	0
+453	0
+454	0
+455	0
+456	0
+457	0
+458	2
+459	0
+460	0
+461	0
+462	0
+463	0
+464	0
+465	0
+466	0
+467	0
+468	0
+469	0
+470	0
+471	0
+472	0
+473	0
+474	0
+475	0
+476	0
+477	0
+478	0
+479	0
+480	0
+481	0
+482	0
+483	0
+484	0
+485	2
+486	0
+487	0
+488	0
+489	0
+490	0
+491	0
+492	0
+493	0
+494	0
+495	0
+496	0
+497	0
+498	0
+499	2
+500	0
+501	0
+502	0
+503	0
+504	0
+505	0
+506	0
+507	0
+508	0
+509	3
+510	0
+511	0
+512	0
+513	0
+514	0
+515	0
+516	0
+517	0
+518	0
+519	0
+520	0
+521	0
+522	0
+523	0
+524	0
+525	0
+526	0
+527	0
+528	0
+529	2
+530	0
+531	2
+532	0
+533	0
+534	0
+535	0
+536	0
+537	0
+538	0
+539	0
+540	0
+541	2
+542	0
+543	0
+544	0
+545	2
+546	0
+547	3
+548	0
+549	0
+550	1
+551	0
+552	0
+553	0
+554	2
+555	0
+556	0
+557	0
+558	0
+559	3
+560	0
+561	0
+562	0
+563	0
+564	0
+565	0
+566	0
+567	0
+568	0
+569	0
+570	0
+571	0
+572	0
+573	0
+574	0
+575	0
+576	0
+577	0
+578	2
+579	0
+580	0
+581	0
+582	1
+583	1
+584	0
+585	0
+586	0
+587	2
+588	0
+589	0
+590	0
+591	0
+592	0
+593	0
+594	0
+595	0
+596	0
+597	0
+598	0
+599	1
+600	0
+601	0
+602	0
+603	0
+604	3
+605	0
+606	0
+607	0
+608	0
+609	0
+610	0
+611	0
+612	2
+613	0
+614	2
+615	0
+616	0
+617	0
+618	0
+619	0
+620	2
+621	0
+622	0
+623	0
+624	0
+625	0
+626	0
+627	0
+628	0
+629	0
+630	0
+631	0
+632	0
+633	0
+634	0
+635	0
+636	0
+637	0
+638	0
+639	0
+640	3
+641	0
+642	0
+643	0
+644	0
+645	0
+646	0
+647	0
+648	0
+649	0
+650	0
+651	0
+652	0
+653	0
+654	0
+655	0
+656	0
+657	0
+658	0
+659	0
+660	0
+661	0
+662	0
+663	0
+664	0
+665	0
+666	0
+667	0
+668	0
+669	0
+670	0
+671	0
+672	0
+673	0
+674	0
+675	0
+676	0
+677	0
+678	0
+679	0
+680	0
+681	0
+682	0
+683	0
+684	0
+685	0
+686	0
+687	0
+688	0
+689	0
+690	0
+691	3
+692	0
+693	3
+694	0
+695	0
+696	0
+697	0
+698	3
+699	0
+700	0
+701	0
+702	0
+703	0
+704	0
+705	0
+706	3
+707	0
+708	0
+709	0
+710	3
+711	0
+712	0
+713	0
+714	0
+715	0
+716	0
+717	0
+718	0
+719	0
+720	0
+721	0
+722	0
+723	0
+724	2
+725	0
+726	0
+727	0
+728	2
+729	0
+730	2
+731	0
+732	3
+733	0
+734	0
+735	1
+736	0
+737	0
+738	0
+739	3
+740	0
+741	0
+742	3
+743	0
+744	2
+745	0
+746	0
+747	0
+748	0
+749	3
+750	2
+751	0
+752	0
+753	0
+754	0
+755	0
+756	0
+757	0
+758	0
+759	0
+760	0
+761	0
+762	0
+763	0
+764	0
+765	0
+766	0
+767	0
+768	0
+769	0
+770	0
+771	0
+772	0
+773	0
+774	0
+775	0
+776	0
+777	0
+778	0
+779	0
+780	0
+781	0
+782	0
+783	0
+784	0
+785	0
+786	3
+787	0
+788	0
+789	0
+790	0
+791	0
+792	0
+793	0
+794	0
+795	0
+796	0
+797	0
+798	2
+799	0
+800	2
+801	0
+802	0
+803	0
+804	3
+805	0
+806	0
+807	3
+808	0
+809	0
+810	0
+811	0
+812	0
+813	0
+814	0
+815	0
+816	0
+817	0
+818	0
+819	0
+820	0
+821	0
+822	0
+823	0
+824	0
+825	0
+826	0
+827	0
+828	0
+829	0
+830	0
+831	0
+832	0
+833	0
+834	3
+835	0
+836	0
+837	0
+838	0
+839	0
+840	0
+841	3
+842	0
+843	0
+844	3
+845	0
+846	0
+847	0
+848	0
+849	0
+850	0
+851	0
+852	0
+853	0
+854	0
+855	0
+856	0
+857	0
+858	0
+859	0
+860	0
+861	0
+862	0
+863	0
+864	1
+865	3
+866	0
+867	0
+868	0
+869	0
+870	0
+871	0
+872	0
+873	0
+874	0
+875	0
+876	0
+877	0
+878	2
+879	0
+880	0
+881	0
+882	0
+883	0
+884	0
+885	0
+886	0
+887	0
+888	0
+889	0
+890	2
+891	0
+892	0
+893	0
+894	0
+895	0
+896	0
+897	0
+898	0
+899	0
+900	0
+901	0
+902	0
+903	0
+904	0
+905	0
+906	0
+907	0
+908	0
+909	0
+910	0
+911	0
+912	0
+913	1
+914	1
+915	0
+916	0
+917	0
+918	0
+919	0
+920	0
+921	0
+922	0
+923	0
+924	3
+925	0
+926	0
+927	0
+928	0
+929	0
+930	0
+931	0
+932	0
+933	0
+934	0
+935	0
+936	3
+937	0
+938	0
+939	0
+940	0
+941	0
+942	0
+943	0
+944	0
+945	2
+946	0
+947	0
+948	0
+949	0
+950	1
+951	1
+952	1
+953	1
+954	1
+955	1
+956	1
+957	1
+958	1
+959	1
+960	1
+961	1
+962	1
+963	1
+964	1
+965	1
+966	1
+967	1
+968	1
+969	1
+970	1
+971	1
+972	1
+973	1
+974	1
+975	1
+976	1
+977	1
+978	1
+979	1
+980	1
+981	1
+982	1
+983	1
+984	1
+985	1
+986	1
+987	1
+988	1
+989	1
+990	1
+991	1
+992	1
+993	1
+994	1
+995	1
+996	1
+997	1
+998	1
+999	1
+1000	1
+1001	1
+1002	1
+1003	1
+1004	1
+1005	1
+1006	1
+1007	1
+1008	1
+1009	1
+1010	1
+1011	1
+1012	1
+1013	1
+1014	1
+1015	1
+1016	1
+1017	1
+1018	1
+1019	1
+1020	1
+1021	1
+1022	1
+1023	1
+1024	1
+1025	1
+1026	1
+1027	1
+1028	1
+1029	1
+1030	1
+1031	1
+1032	1
+1033	1
+1034	1
+1035	1
+1036	1
+1037	1
+1038	1
+1039	1
+1040	1
+1041	1
+1042	1
+1043	1
+1044	1
+1045	1
+1046	1
+1047	1
+1048	1
+1049	1
+1050	1
+1051	1
+1052	1
+1053	1
+1054	1
+1055	1
+1056	1
+1057	1
+1058	1
+1059	1
+1060	1
+1061	1
+1062	1
+1063	1
+1064	1
+1065	1
+1066	1
+1067	1
+1068	1
+1069	1
+1070	2
+1071	1
+1072	1
+1073	1
+1074	1
+1075	1
+1076	1
+1077	1
+1078	1
+1079	1
+1080	1
+1081	1
+1082	1
+1083	1
+1084	1
+1085	1
+1086	1
+1087	1
+1088	1
+1089	1
+1090	1
+1091	1
+1092	1
+1093	1
+1094	1
+1095	1
+1096	1
+1097	1
+1098	1
+1099	1
+1100	1
+1101	1
+1102	1
+1103	1
+1104	1
+1105	1
+1106	1
+1107	1
+1108	1
+1109	1
+1110	1
+1111	1
+1112	1
+1113	1
+1114	1
+1115	1
+1116	1
+1117	1
+1118	1
+1119	1
+1120	1
+1121	1
+1122	1
+1123	1
+1124	1
+1125	1
+1126	1
+1127	1
+1128	1
+1129	1
+1130	1
+1131	1
+1132	1
+1133	1
+1134	1
+1135	1
+1136	1
+1137	1
+1138	1
+1139	1
+1140	1
+1141	1
+1142	1
+1143	1
+1144	1
+1145	1
+1146	1
+1147	1
+1148	1
+1149	1
+1150	1
+1151	1
+1152	1
+1153	1
+1154	0
+1155	1
+1156	1
+1157	1
+1158	1
+1159	1
+1160	1
+1161	1
+1162	1
+1163	1
+1164	1
+1165	1
+1166	0
+1167	1
+1168	1
+1169	1
+1170	1
+1171	1
+1172	1
+1173	1
+1174	1
+1175	1
+1176	1
+1177	1
+1178	1
+1179	1
+1180	1
+1181	1
+1182	1
+1183	1
+1184	1
+1185	1
+1186	1
+1187	1
+1188	1
+1189	1
+1190	1
+1191	1
+1192	1
+1193	1
+1194	1
+1195	1
+1196	1
+1197	1
+1198	1
+1199	1
+1200	1
+1201	1
+1202	1
+1203	1
+1204	1
+1205	1
+1206	1
+1207	1
+1208	1
+1209	1
+1210	1
+1211	1
+1212	1
+1213	1
+1214	1
+1215	1
+1216	1
+1217	1
+1218	1
+1219	1
+1220	1
+1221	1
+1222	1
+1223	2
+1224	1
+1225	1
+1226	1
+1227	1
+1228	1
+1229	1
+1230	1
+1231	1
+1232	1
+1233	1
+1234	1
+1235	1
+1236	1
+1237	1
+1238	1
+1239	1
+1240	1
+1241	1
+1242	1
+1243	1
+1244	1
+1245	1
+1246	1
+1247	1
+1248	1
+1249	1
+1250	1
+1251	1
+1252	1
+1253	1
+1254	1
+1255	1
+1256	1
+1257	1
+1258	1
+1259	1
+1260	1
+1261	1
+1262	1
+1263	1
+1264	1
+1265	1
+1266	1
+1267	1
+1268	1
+1269	1
+1270	1
+1271	1
+1272	1
+1273	1
+1274	1
+1275	1
+1276	1
+1277	1
+1278	1
+1279	1
+1280	1
+1281	1
+1282	1
+1283	1
+1284	1
+1285	1
+1286	1
+1287	1
+1288	1
+1289	1
+1290	1
+1291	1
+1292	1
+1293	1
+1294	1
+1295	1
+1296	1
+1297	1
+1298	1
+1299	1
+1300	1
+1301	1
+1302	1
+1303	1
+1304	1
+1305	1
+1306	1
+1307	1
+1308	1
+1309	1
+1310	1
+1311	1
+1312	1
+1313	1
+1314	1
+1315	1
+1316	1
+1317	1
+1318	1
+1319	1
+1320	1
+1321	1
+1322	1
+1323	1
+1324	1
+1325	1
+1326	1
+1327	1
+1328	1
+1329	1
+1330	1
+1331	1
+1332	1
+1333	1
+1334	1
+1335	1
+1336	1
+1337	1
+1338	1
+1339	1
+1340	1
+1341	1
+1342	1
+1343	1
+1344	1
+1345	1
+1346	1
+1347	1
+1348	1
+1349	1
+1350	1
+1351	1
+1352	1
+1353	1
+1354	1
+1355	1
+1356	1
+1357	1
+1358	1
+1359	1
+1360	1
+1361	1
+1362	1
+1363	1
+1364	1
+1365	1
+1366	1
+1367	1
+1368	1
+1369	1
+1370	1
+1371	1
+1372	1
+1373	1
+1374	1
+1375	1
+1376	1
+1377	1
+1378	1
+1379	1
+1380	1
+1381	1
+1382	1
+1383	1
+1384	1
+1385	1
+1386	1
+1387	1
+1388	1
+1389	1
+1390	1
+1391	1
+1392	1
+1393	1
+1394	1
+1395	1
+1396	1
+1397	1
+1398	1
+1399	1
+1400	1
+1401	1
+1402	1
+1403	1
+1404	1
+1405	1
+1406	1
+1407	2
+1408	1
+1409	1
+1410	1
+1411	1
+1412	1
+1413	1
+1414	1
+1415	1
+1416	1
+1417	1
+1418	1
+1419	1
+1420	1
+1421	1
+1422	1
+1423	1
+1424	0
+1425	1
+1426	1
+1427	1
+1428	1
+1429	1
+1430	1
+1431	1
+1432	1
+1433	1
+1434	1
+1435	1
+1436	1
+1437	1
+1438	1
+1439	1
+1440	1
+1441	1
+1442	1
+1443	1
+1444	1
+1445	1
+1446	1
+1447	1
+1448	1
+1449	1
+1450	1
+1451	1
+1452	1
+1453	1
+1454	1
+1455	1
+1456	1
+1457	1
+1458	1
+1459	1
+1460	1
+1461	1
+1462	1
+1463	1
+1464	2
+1465	1
+1466	1
+1467	1
+1468	1
+1469	1
+1470	1
+1471	1
+1472	1
+1473	1
+1474	1
+1475	1
+1476	1
+1477	1
+1478	1
+1479	1
+1480	1
+1481	1
+1482	1
+1483	1
+1484	1
+1485	1
+1486	1
+1487	1
+1488	1
+1489	1
+1490	1
+1491	1
+1492	1
+1493	1
+1494	1
+1495	1
+1496	1
+1497	1
+1498	1
+1499	1
+1500	1
+1501	1
+1502	1
+1503	1
+1504	1
+1505	1
+1506	1
+1507	1
+1508	1
+1509	1
+1510	1
+1511	1
+1512	1
+1513	1
+1514	1
+1515	1
+1516	1
+1517	1
+1518	1
+1519	1
+1520	1
+1521	1
+1522	1
+1523	1
+1524	1
+1525	1
+1526	1
+1527	1
+1528	1
+1529	1
+1530	1
+1531	1
+1532	1
+1533	1
+1534	1
+1535	1
+1536	1
+1537	1
+1538	1
+1539	1
+1540	1
+1541	1
+1542	1
+1543	1
+1544	1
+1545	1
+1546	1
+1547	1
+1548	1
+1549	1
+1550	1
+1551	1
+1552	1
+1553	1
+1554	1
+1555	1
+1556	1
+1557	1
+1558	1
+1559	1
+1560	1
+1561	1
+1562	1
+1563	1
+1564	1
+1565	1
+1566	1
+1567	1
+1568	1
+1569	3
+1570	1
+1571	1
+1572	1
+1573	1
+1574	1
+1575	1
+1576	1
+1577	1
+1578	1
+1579	1
+1580	1
+1581	1
+1582	1
+1583	1
+1584	1
+1585	1
+1586	1
+1587	1
+1588	1
+1589	1
+1590	1
+1591	1
+1592	1
+1593	1
+1594	1
+1595	1
+1596	1
+1597	1
+1598	1
+1599	1
+1600	1
+1601	1
+1602	1
+1603	1
+1604	1
+1605	1
+1606	1
+1607	1
+1608	1
+1609	1
+1610	1
+1611	1
+1612	1
+1613	1
+1614	1
+1615	1
+1616	1
+1617	1
+1618	1
+1619	1
+1620	1
+1621	1
+1622	1
+1623	1
+1624	1
+1625	1
+1626	1
+1627	1
+1628	1
+1629	1
+1630	1
+1631	1
+1632	1
+1633	1
+1634	1
+1635	1
+1636	1
+1637	1
+1638	1
+1639	1
+1640	1
+1641	1
+1642	1
+1643	1
+1644	1
+1645	1
+1646	1
+1647	1
+1648	1
+1649	1
+1650	1
+1651	1
+1652	1
+1653	1
+1654	1
+1655	1
+1656	1
+1657	1
+1658	1
+1659	1
+1660	1
+1661	1
+1662	1
+1663	1
+1664	1
+1665	1
+1666	1
+1667	1
+1668	1
+1669	1
+1670	1
+1671	1
+1672	1
+1673	1
+1674	1
+1675	1
+1676	1
+1677	1
+1678	1
+1679	1
+1680	1
+1681	3
+1682	1
+1683	1
+1684	1
+1685	1
+1686	1
+1687	1
+1688	1
+1689	1
+1690	1
+1691	1
+1692	1
+1693	1
+1694	1
+1695	1
+1696	1
+1697	1
+1698	1
+1699	1
+1700	1
+1701	1
+1702	1
+1703	0
+1704	1
+1705	1
+1706	1
+1707	1
+1708	1
+1709	1
+1710	1
+1711	1
+1712	1
+1713	1
+1714	1
+1715	1
+1716	1
+1717	1
+1718	1
+1719	1
+1720	1
+1721	1
+1722	1
+1723	1
+1724	1
+1725	1
+1726	1
+1727	1
+1728	1
+1729	1
+1730	1
+1731	1
+1732	1
+1733	1
+1734	1
+1735	1
+1736	1
+1737	1
+1738	1
+1739	1
+1740	1
+1741	1
+1742	1
+1743	1
+1744	1
+1745	1
+1746	1
+1747	1
+1748	1
+1749	1
+1750	1
+1751	1
+1752	1
+1753	1
+1754	2
+1755	1
+1756	1
+1757	1
+1758	1
+1759	1
+1760	1
+1761	1
+1762	1
+1763	1
+1764	1
+1765	1
+1766	1
+1767	1
+1768	1
+1769	1
+1770	1
+1771	1
+1772	1
+1773	1
+1774	1
+1775	1
+1776	1
+1777	1
+1778	1
+1779	1
+1780	1
+1781	1
+1782	1
+1783	1
+1784	1
+1785	1
+1786	1
+1787	1
+1788	1
+1789	1
+1790	1
+1791	1
+1792	1
+1793	1
+1794	1
+1795	1
+1796	1
+1797	1
+1798	1
+1799	1
+1800	1
+1801	1
+1802	1
+1803	1
+1804	1
+1805	1
+1806	1
+1807	1
+1808	1
+1809	1
+1810	1
+1811	1
+1812	1
+1813	1
+1814	1
+1815	1
+1816	1
+1817	1
+1818	1
+1819	1
+1820	0
+1821	1
+1822	1
+1823	1
+1824	1
+1825	1
+1826	1
+1827	1
+1828	1
+1829	1
+1830	1
+1831	1
+1832	1
+1833	1
+1834	3
+1835	1
+1836	1
+1837	1
+1838	1
+1839	1
+1840	1
+1841	1
+1842	1
+1843	1
+1844	1
+1845	1
+1846	1
+1847	1
+1848	1
+1849	1
+1850	1
+1851	1
+1852	1
+1853	1
+1854	1
+1855	1
+1856	1
+1857	1
+1858	1
+1859	1
+1860	1
+1861	1
+1862	1
+1863	1
+1864	1
+1865	1
+1866	1
+1867	1
+1868	1
+1869	1
+1870	1
+1871	1
+1872	1
+1873	1
+1874	1
+1875	1
+1876	1
+1877	1
+1878	1
+1879	1
+1880	1
+1881	1
+1882	1
+1883	1
+1884	1
+1885	1
+1886	1
+1887	1
+1888	1
+1889	1
+1890	1
+1891	1
+1892	1
+1893	1
+1894	1
+1895	1
+1896	1
+1897	1
+1898	1
+1899	1
+1900	2
+1901	2
+1902	2
+1903	2
+1904	2
+1905	2
+1906	2
+1907	2
+1908	2
+1909	2
+1910	2
+1911	2
+1912	2
+1913	2
+1914	2
+1915	2
+1916	2
+1917	2
+1918	2
+1919	2
+1920	2
+1921	2
+1922	2
+1923	2
+1924	2
+1925	2
+1926	2
+1927	2
+1928	2
+1929	2
+1930	2
+1931	2
+1932	2
+1933	2
+1934	2
+1935	2
+1936	2
+1937	2
+1938	2
+1939	2
+1940	3
+1941	2
+1942	2
+1943	2
+1944	2
+1945	2
+1946	2
+1947	2
+1948	2
+1949	2
+1950	2
+1951	2
+1952	2
+1953	2
+1954	2
+1955	2
+1956	2
+1957	2
+1958	2
+1959	2
+1960	2
+1961	2
+1962	2
+1963	2
+1964	2
+1965	2
+1966	2
+1967	2
+1968	2
+1969	2
+1970	2
+1971	2
+1972	2
+1973	2
+1974	2
+1975	2
+1976	2
+1977	0
+1978	3
+1979	2
+1980	2
+1981	2
+1982	2
+1983	2
+1984	2
+1985	2
+1986	2
+1987	2
+1988	2
+1989	2
+1990	2
+1991	2
+1992	2
+1993	2
+1994	2
+1995	2
+1996	2
+1997	2
+1998	2
+1999	3
+2000	2
+2001	2
+2002	2
+2003	2
+2004	2
+2005	2
+2006	2
+2007	2
+2008	2
+2009	2
+2010	2
+2011	2
+2012	2
+2013	2
+2014	2
+2015	2
+2016	2
+2017	2
+2018	2
+2019	2
+2020	2
+2021	2
+2022	2
+2023	2
+2024	2
+2025	2
+2026	2
+2027	2
+2028	2
+2029	2
+2030	3
+2031	2
+2032	2
+2033	2
+2034	2
+2035	2
+2036	2
+2037	2
+2038	2
+2039	2
+2040	2
+2041	2
+2042	2
+2043	2
+2044	2
+2045	2
+2046	2
+2047	2
+2048	3
+2049	2
+2050	2
+2051	3
+2052	2
+2053	2
+2054	2
+2055	2
+2056	3
+2057	2
+2058	2
+2059	2
+2060	2
+2061	2
+2062	2
+2063	2
+2064	2
+2065	2
+2066	2
+2067	2
+2068	2
+2069	2
+2070	2
+2071	2
+2072	2
+2073	3
+2074	2
+2075	2
+2076	2
+2077	3
+2078	2
+2079	3
+2080	3
+2081	2
+2082	2
+2083	2
+2084	2
+2085	2
+2086	2
+2087	2
+2088	2
+2089	2
+2090	2
+2091	2
+2092	2
+2093	2
+2094	2
+2095	2
+2096	2
+2097	2
+2098	2
+2099	2
+2100	3
+2101	1
+2102	3
+2103	2
+2104	2
+2105	2
+2106	2
+2107	2
+2108	2
+2109	2
+2110	2
+2111	2
+2112	2
+2113	2
+2114	2
+2115	2
+2116	2
+2117	2
+2118	2
+2119	3
+2120	2
+2121	2
+2122	2
+2123	2
+2124	2
+2125	2
+2126	2
+2127	2
+2128	2
+2129	2
+2130	2
+2131	2
+2132	2
+2133	2
+2134	2
+2135	2
+2136	2
+2137	2
+2138	2
+2139	2
+2140	2
+2141	2
+2142	2
+2143	2
+2144	2
+2145	2
+2146	2
+2147	2
+2148	2
+2149	2
+2150	2
+2151	2
+2152	2
+2153	2
+2154	2
+2155	2
+2156	2
+2157	2
+2158	2
+2159	2
+2160	3
+2161	2
+2162	2
+2163	2
+2164	2
+2165	2
+2166	2
+2167	2
+2168	2
+2169	2
+2170	2
+2171	2
+2172	2
+2173	2
+2174	2
+2175	2
+2176	3
+2177	0
+2178	2
+2179	2
+2180	2
+2181	2
+2182	2
+2183	2
+2184	2
+2185	2
+2186	2
+2187	2
+2188	2
+2189	2
+2190	2
+2191	2
+2192	2
+2193	2
+2194	2
+2195	2
+2196	2
+2197	3
+2198	2
+2199	2
+2200	2
+2201	2
+2202	2
+2203	2
+2204	2
+2205	2
+2206	2
+2207	2
+2208	2
+2209	2
+2210	2
+2211	2
+2212	2
+2213	2
+2214	2
+2215	2
+2216	0
+2217	2
+2218	2
+2219	2
+2220	2
+2221	2
+2222	0
+2223	2
+2224	2
+2225	2
+2226	2
+2227	2
+2228	2
+2229	2
+2230	2
+2231	2
+2232	2
+2233	3
+2234	3
+2235	2
+2236	0
+2237	2
+2238	2
+2239	2
+2240	2
+2241	2
+2242	3
+2243	2
+2244	2
+2245	2
+2246	2
+2247	3
+2248	3
+2249	2
+2250	2
+2251	2
+2252	2
+2253	2
+2254	2
+2255	1
+2256	2
+2257	2
+2258	2
+2259	2
+2260	2
+2261	2
+2262	2
+2263	2
+2264	2
+2265	2
+2266	2
+2267	3
+2268	2
+2269	2
+2270	2
+2271	2
+2272	2
+2273	2
+2274	2
+2275	2
+2276	2
+2277	2
+2278	3
+2279	2
+2280	3
+2281	2
+2282	2
+2283	2
+2284	2
+2285	3
+2286	2
+2287	2
+2288	2
+2289	2
+2290	2
+2291	3
+2292	2
+2293	2
+2294	2
+2295	2
+2296	2
+2297	2
+2298	2
+2299	2
+2300	2
+2301	2
+2302	2
+2303	2
+2304	2
+2305	2
+2306	2
+2307	2
+2308	2
+2309	2
+2310	2
+2311	2
+2312	2
+2313	2
+2314	2
+2315	2
+2316	2
+2317	3
+2318	3
+2319	2
+2320	2
+2321	2
+2322	2
+2323	2
+2324	2
+2325	2
+2326	2
+2327	2
+2328	2
+2329	3
+2330	0
+2331	2
+2332	2
+2333	2
+2334	2
+2335	2
+2336	2
+2337	2
+2338	2
+2339	2
+2340	2
+2341	0
+2342	2
+2343	3
+2344	2
+2345	2
+2346	2
+2347	2
+2348	2
+2349	3
+2350	2
+2351	2
+2352	2
+2353	2
+2354	3
+2355	2
+2356	2
+2357	2
+2358	2
+2359	3
+2360	2
+2361	2
+2362	2
+2363	2
+2364	2
+2365	2
+2366	2
+2367	3
+2368	2
+2369	2
+2370	3
+2371	2
+2372	2
+2373	2
+2374	2
+2375	2
+2376	2
+2377	2
+2378	2
+2379	2
+2380	3
+2381	2
+2382	3
+2383	2
+2384	2
+2385	2
+2386	3
+2387	2
+2388	2
+2389	0
+2390	3
+2391	2
+2392	2
+2393	2
+2394	2
+2395	2
+2396	2
+2397	2
+2398	2
+2399	2
+2400	2
+2401	2
+2402	2
+2403	2
+2404	2
+2405	2
+2406	2
+2407	2
+2408	2
+2409	2
+2410	2
+2411	2
+2412	2
+2413	2
+2414	2
+2415	2
+2416	2
+2417	2
+2418	2
+2419	2
+2420	2
+2421	3
+2422	2
+2423	0
+2424	2
+2425	2
+2426	2
+2427	2
+2428	3
+2429	2
+2430	2
+2431	2
+2432	2
+2433	3
+2434	3
+2435	2
+2436	3
+2437	2
+2438	2
+2439	2
+2440	2
+2441	2
+2442	2
+2443	2
+2444	2
+2445	2
+2446	2
+2447	2
+2448	2
+2449	2
+2450	2
+2451	3
+2452	2
+2453	2
+2454	3
+2455	2
+2456	2
+2457	2
+2458	2
+2459	2
+2460	2
+2461	2
+2462	2
+2463	2
+2464	2
+2465	2
+2466	3
+2467	2
+2468	2
+2469	2
+2470	2
+2471	2
+2472	2
+2473	2
+2474	2
+2475	2
+2476	1
+2477	2
+2478	2
+2479	2
+2480	2
+2481	2
+2482	3
+2483	2
+2484	2
+2485	2
+2486	3
+2487	2
+2488	2
+2489	2
+2490	2
+2491	2
+2492	2
+2493	2
+2494	2
+2495	2
+2496	2
+2497	2
+2498	2
+2499	2
+2500	2
+2501	1
+2502	2
+2503	2
+2504	2
+2505	3
+2506	2
+2507	2
+2508	2
+2509	2
+2510	2
+2511	2
+2512	2
+2513	2
+2514	2
+2515	2
+2516	2
+2517	2
+2518	3
+2519	2
+2520	2
+2521	2
+2522	3
+2523	2
+2524	2
+2525	2
+2526	2
+2527	2
+2528	2
+2529	2
+2530	2
+2531	2
+2532	2
+2533	2
+2534	2
+2535	2
+2536	2
+2537	2
+2538	2
+2539	2
+2540	2
+2541	2
+2542	2
+2543	3
+2544	2
+2545	2
+2546	2
+2547	2
+2548	2
+2549	2
+2550	2
+2551	2
+2552	2
+2553	2
+2554	2
+2555	2
+2556	2
+2557	2
+2558	2
+2559	2
+2560	2
+2561	2
+2562	2
+2563	2
+2564	2
+2565	2
+2566	2
+2567	2
+2568	2
+2569	2
+2570	3
+2571	2
+2572	2
+2573	2
+2574	2
+2575	2
+2576	3
+2577	3
+2578	2
+2579	2
+2580	2
+2581	2
+2582	2
+2583	3
+2584	2
+2585	2
+2586	2
+2587	2
+2588	2
+2589	2
+2590	2
+2591	2
+2592	2
+2593	2
+2594	2
+2595	2
+2596	2
+2597	2
+2598	2
+2599	2
+2600	2
+2601	2
+2602	2
+2603	2
+2604	2
+2605	2
+2606	2
+2607	2
+2608	2
+2609	2
+2610	2
+2611	2
+2612	2
+2613	2
+2614	2
+2615	2
+2616	2
+2617	2
+2618	3
+2619	3
+2620	2
+2621	2
+2622	2
+2623	2
+2624	2
+2625	2
+2626	2
+2627	2
+2628	3
+2629	2
+2630	3
+2631	2
+2632	2
+2633	2
+2634	2
+2635	2
+2636	2
+2637	2
+2638	2
+2639	2
+2640	2
+2641	2
+2642	2
+2643	2
+2644	2
+2645	2
+2646	2
+2647	2
+2648	2
+2649	2
+2650	2
+2651	2
+2652	3
+2653	2
+2654	3
+2655	2
+2656	2
+2657	2
+2658	2
+2659	2
+2660	2
+2661	3
+2662	2
+2663	3
+2664	0
+2665	2
+2666	3
+2667	2
+2668	2
+2669	2
+2670	2
+2671	2
+2672	2
+2673	3
+2674	2
+2675	2
+2676	2
+2677	2
+2678	2
+2679	2
+2680	2
+2681	2
+2682	2
+2683	2
+2684	2
+2685	2
+2686	2
+2687	0
+2688	3
+2689	2
+2690	2
+2691	2
+2692	2
+2693	2
+2694	2
+2695	2
+2696	2
+2697	0
+2698	3
+2699	2
+2700	2
+2701	2
+2702	0
+2703	2
+2704	2
+2705	2
+2706	2
+2707	2
+2708	2
+2709	2
+2710	3
+2711	2
+2712	2
+2713	2
+2714	2
+2715	2
+2716	2
+2717	3
+2718	3
+2719	2
+2720	2
+2721	2
+2722	2
+2723	2
+2724	2
+2725	2
+2726	2
+2727	2
+2728	3
+2729	2
+2730	2
+2731	2
+2732	2
+2733	2
+2734	2
+2735	2
+2736	2
+2737	2
+2738	3
+2739	2
+2740	2
+2741	2
+2742	2
+2743	2
+2744	3
+2745	3
+2746	2
+2747	2
+2748	2
+2749	2
+2750	2
+2751	0
+2752	2
+2753	2
+2754	2
+2755	2
+2756	2
+2757	2
+2758	2
+2759	2
+2760	2
+2761	2
+2762	2
+2763	2
+2764	2
+2765	2
+2766	2
+2767	2
+2768	3
+2769	2
+2770	2
+2771	2
+2772	2
+2773	2
+2774	2
+2775	2
+2776	2
+2777	2
+2778	2
+2779	2
+2780	3
+2781	2
+2782	2
+2783	2
+2784	2
+2785	2
+2786	2
+2787	2
+2788	2
+2789	2
+2790	2
+2791	2
+2792	2
+2793	2
+2794	3
+2795	2
+2796	0
+2797	2
+2798	2
+2799	2
+2800	2
+2801	2
+2802	2
+2803	2
+2804	2
+2805	2
+2806	0
+2807	2
+2808	2
+2809	2
+2810	2
+2811	2
+2812	3
+2813	2
+2814	3
+2815	2
+2816	2
+2817	2
+2818	3
+2819	2
+2820	2
+2821	3
+2822	2
+2823	2
+2824	2
+2825	2
+2826	2
+2827	2
+2828	2
+2829	2
+2830	2
+2831	2
+2832	2
+2833	2
+2834	2
+2835	2
+2836	2
+2837	2
+2838	2
+2839	2
+2840	2
+2841	2
+2842	2
+2843	2
+2844	2
+2845	2
+2846	2
+2847	2
+2848	2
+2849	2
+2850	3
+2851	3
+2852	3
+2853	2
+2854	3
+2855	0
+2856	3
+2857	3
+2858	3
+2859	3
+2860	3
+2861	3
+2862	3
+2863	3
+2864	3
+2865	3
+2866	3
+2867	3
+2868	3
+2869	3
+2870	3
+2871	3
+2872	3
+2873	3
+2874	3
+2875	3
+2876	3
+2877	3
+2878	3
+2879	3
+2880	3
+2881	2
+2882	3
+2883	3
+2884	0
+2885	3
+2886	3
+2887	2
+2888	3
+2889	1
+2890	3
+2891	3
+2892	3
+2893	3
+2894	3
+2895	3
+2896	3
+2897	3
+2898	3
+2899	3
+2900	3
+2901	3
+2902	3
+2903	3
+2904	3
+2905	3
+2906	3
+2907	3
+2908	3
+2909	3
+2910	3
+2911	2
+2912	3
+2913	3
+2914	3
+2915	3
+2916	2
+2917	3
+2918	3
+2919	3
+2920	3
+2921	3
+2922	3
+2923	3
+2924	3
+2925	3
+2926	3
+2927	3
+2928	3
+2929	2
+2930	3
+2931	3
+2932	3
+2933	3
+2934	3
+2935	3
+2936	3
+2937	3
+2938	3
+2939	3
+2940	3
+2941	3
+2942	3
+2943	2
+2944	3
+2945	3
+2946	3
+2947	3
+2948	3
+2949	3
+2950	3
+2951	3
+2952	3
+2953	3
+2954	3
+2955	3
+2956	3
+2957	3
+2958	3
+2959	3
+2960	3
+2961	2
+2962	3
+2963	3
+2964	3
+2965	3
+2966	3
+2967	3
+2968	3
+2969	3
+2970	3
+2971	3
+2972	3
+2973	3
+2974	3
+2975	3
+2976	3
+2977	3
+2978	3
+2979	3
+2980	3
+2981	3
+2982	3
+2983	3
+2984	3
+2985	3
+2986	3
+2987	3
+2988	3
+2989	3
+2990	3
+2991	3
+2992	2
+2993	3
+2994	3
+2995	3
+2996	3
+2997	3
+2998	3
+2999	3
+3000	3
+3001	3
+3002	2
+3003	3
+3004	2
+3005	3
+3006	3
+3007	3
+3008	3
+3009	3
+3010	3
+3011	3
+3012	3
+3013	3
+3014	3
+3015	3
+3016	3
+3017	3
+3018	3
+3019	3
+3020	3
+3021	3
+3022	3
+3023	3
+3024	3
+3025	3
+3026	3
+3027	3
+3028	2
+3029	2
+3030	3
+3031	3
+3032	3
+3033	3
+3034	2
+3035	3
+3036	3
+3037	3
+3038	2
+3039	3
+3040	3
+3041	3
+3042	3
+3043	3
+3044	3
+3045	3
+3046	3
+3047	3
+3048	3
+3049	3
+3050	3
+3051	3
+3052	3
+3053	3
+3054	3
+3055	3
+3056	3
+3057	3
+3058	3
+3059	3
+3060	3
+3061	3
+3062	3
+3063	3
+3064	3
+3065	3
+3066	3
+3067	3
+3068	3
+3069	3
+3070	3
+3071	3
+3072	3
+3073	3
+3074	3
+3075	3
+3076	3
+3077	2
+3078	3
+3079	3
+3080	3
+3081	3
+3082	3
+3083	3
+3084	3
+3085	3
+3086	3
+3087	3
+3088	3
+3089	3
+3090	3
+3091	2
+3092	3
+3093	3
+3094	3
+3095	3
+3096	3
+3097	3
+3098	3
+3099	3
+3100	3
+3101	3
+3102	3
+3103	3
+3104	3
+3105	3
+3106	2
+3107	3
+3108	3
+3109	3
+3110	3
+3111	3
+3112	3
+3113	3
+3114	3
+3115	3
+3116	3
+3117	3
+3118	3
+3119	3
+3120	3
+3121	3
+3122	3
+3123	3
+3124	3
+3125	3
+3126	3
+3127	3
+3128	3
+3129	3
+3130	3
+3131	3
+3132	3
+3133	3
+3134	2
+3135	3
+3136	3
+3137	3
+3138	3
+3139	3
+3140	3
+3141	3
+3142	3
+3143	3
+3144	3
+3145	3
+3146	3
+3147	2
+3148	3
+3149	3
+3150	3
+3151	3
+3152	3
+3153	3
+3154	3
+3155	3
+3156	3
+3157	3
+3158	3
+3159	3
+3160	3
+3161	3
+3162	3
+3163	3
+3164	3
+3165	3
+3166	3
+3167	0
+3168	3
+3169	3
+3170	3
+3171	3
+3172	3
+3173	2
+3174	3
+3175	3
+3176	3
+3177	3
+3178	3
+3179	0
+3180	3
+3181	3
+3182	2
+3183	0
+3184	3
+3185	3
+3186	3
+3187	3
+3188	3
+3189	3
+3190	3
+3191	3
+3192	3
+3193	0
+3194	3
+3195	3
+3196	3
+3197	3
+3198	3
+3199	3
+3200	3
+3201	3
+3202	3
+3203	2
+3204	3
+3205	3
+3206	3
+3207	2
+3208	3
+3209	3
+3210	3
+3211	3
+3212	3
+3213	3
+3214	3
+3215	3
+3216	3
+3217	3
+3218	3
+3219	3
+3220	3
+3221	3
+3222	3
+3223	3
+3224	3
+3225	2
+3226	3
+3227	3
+3228	3
+3229	3
+3230	3
+3231	3
+3232	3
+3233	3
+3234	3
+3235	3
+3236	3
+3237	3
+3238	0
+3239	3
+3240	3
+3241	3
+3242	3
+3243	3
+3244	3
+3245	3
+3246	3
+3247	3
+3248	3
+3249	3
+3250	3
+3251	3
+3252	3
+3253	3
+3254	3
+3255	3
+3256	3
+3257	3
+3258	3
+3259	2
+3260	3
+3261	3
+3262	3
+3263	3
+3264	3
+3265	3
+3266	3
+3267	3
+3268	2
+3269	3
+3270	3
+3271	3
+3272	2
+3273	3
+3274	3
+3275	3
+3276	3
+3277	3
+3278	3
+3279	3
+3280	3
+3281	3
+3282	3
+3283	3
+3284	3
+3285	3
+3286	3
+3287	3
+3288	3
+3289	3
+3290	3
+3291	3
+3292	3
+3293	3
+3294	3
+3295	3
+3296	3
+3297	3
+3298	2
+3299	3
+3300	3
+3301	3
+3302	3
+3303	3
+3304	3
+3305	3
+3306	3
+3307	3
+3308	3
+3309	3
+3310	3
+3311	3
+3312	3
+3313	3
+3314	3
+3315	3
+3316	3
+3317	3
+3318	3
+3319	3
+3320	3
+3321	3
+3322	3
+3323	3
+3324	1
+3325	3
+3326	3
+3327	3
+3328	1
+3329	3
+3330	3
+3331	3
+3332	3
+3333	3
+3334	1
+3335	3
+3336	3
+3337	3
+3338	3
+3339	3
+3340	3
+3341	3
+3342	3
+3343	3
+3344	3
+3345	3
+3346	3
+3347	3
+3348	3
+3349	3
+3350	3
+3351	3
+3352	3
+3353	3
+3354	3
+3355	3
+3356	3
+3357	3
+3358	2
+3359	3
+3360	3
+3361	3
+3362	3
+3363	3
+3364	3
+3365	3
+3366	3
+3367	3
+3368	3
+3369	3
+3370	3
+3371	3
+3372	3
+3373	3
+3374	3
+3375	3
+3376	3
+3377	3
+3378	3
+3379	3
+3380	3
+3381	3
+3382	3
+3383	3
+3384	3
+3385	3
+3386	3
+3387	3
+3388	3
+3389	3
+3390	3
+3391	3
+3392	3
+3393	3
+3394	3
+3395	3
+3396	3
+3397	3
+3398	3
+3399	3
+3400	2
+3401	3
+3402	3
+3403	3
+3404	3
+3405	3
+3406	3
+3407	3
+3408	3
+3409	3
+3410	3
+3411	3
+3412	3
+3413	3
+3414	3
+3415	3
+3416	3
+3417	3
+3418	3
+3419	3
+3420	3
+3421	3
+3422	3
+3423	3
+3424	3
+3425	3
+3426	3
+3427	3
+3428	2
+3429	3
+3430	3
+3431	3
+3432	3
+3433	3
+3434	3
+3435	3
+3436	3
+3437	3
+3438	2
+3439	3
+3440	3
+3441	3
+3442	3
+3443	3
+3444	3
+3445	3
+3446	2
+3447	2
+3448	3
+3449	3
+3450	3
+3451	3
+3452	3
+3453	3
+3454	3
+3455	3
+3456	3
+3457	3
+3458	3
+3459	3
+3460	3
+3461	3
+3462	3
+3463	3
+3464	3
+3465	3
+3466	3
+3467	3
+3468	3
+3469	3
+3470	2
+3471	3
+3472	3
+3473	3
+3474	3
+3475	3
+3476	3
+3477	3
+3478	3
+3479	3
+3480	3
+3481	2
+3482	3
+3483	3
+3484	3
+3485	3
+3486	3
+3487	3
+3488	3
+3489	3
+3490	3
+3491	3
+3492	2
+3493	3
+3494	3
+3495	3
+3496	3
+3497	2
+3498	3
+3499	3
+3500	0
+3501	3
+3502	3
+3503	3
+3504	3
+3505	3
+3506	3
+3507	3
+3508	3
+3509	3
+3510	3
+3511	3
+3512	3
+3513	3
+3514	3
+3515	3
+3516	3
+3517	3
+3518	3
+3519	2
+3520	3
+3521	3
+3522	3
+3523	3
+3524	3
+3525	3
+3526	3
+3527	3
+3528	3
+3529	3
+3530	3
+3531	3
+3532	3
+3533	3
+3534	3
+3535	3
+3536	3
+3537	3
+3538	3
+3539	3
+3540	3
+3541	3
+3542	3
+3543	2
+3544	3
+3545	2
+3546	3
+3547	3
+3548	3
+3549	3
+3550	3
+3551	3
+3552	3
+3553	3
+3554	3
+3555	3
+3556	3
+3557	3
+3558	3
+3559	3
+3560	3
+3561	3
+3562	3
+3563	3
+3564	3
+3565	3
+3566	3
+3567	3
+3568	3
+3569	3
+3570	3
+3571	3
+3572	3
+3573	3
+3574	3
+3575	3
+3576	3
+3577	3
+3578	3
+3579	3
+3580	3
+3581	3
+3582	3
+3583	3
+3584	3
+3585	3
+3586	3
+3587	3
+3588	3
+3589	3
+3590	3
+3591	3
+3592	3
+3593	3
+3594	3
+3595	3
+3596	3
+3597	3
+3598	3
+3599	3
+3600	3
+3601	3
+3602	3
+3603	3
+3604	3
+3605	3
+3606	3
+3607	3
+3608	3
+3609	3
+3610	2
+3611	3
+3612	3
+3613	3
+3614	3
+3615	3
+3616	3
+3617	3
+3618	3
+3619	3
+3620	3
+3621	3
+3622	3
+3623	3
+3624	3
+3625	3
+3626	3
+3627	3
+3628	3
+3629	3
+3630	3
+3631	2
+3632	3
+3633	3
+3634	3
+3635	3
+3636	3
+3637	3
+3638	3
+3639	3
+3640	3
+3641	3
+3642	3
+3643	3
+3644	3
+3645	3
+3646	3
+3647	3
+3648	2
+3649	3
+3650	3
+3651	3
+3652	3
+3653	3
+3654	3
+3655	3
+3656	3
+3657	3
+3658	3
+3659	3
+3660	3
+3661	3
+3662	3
+3663	3
+3664	3
+3665	3
+3666	3
+3667	3
+3668	3
+3669	3
+3670	3
+3671	3
+3672	3
+3673	2
+3674	3
+3675	3
+3676	3
+3677	3
+3678	3
+3679	3
+3680	3
+3681	3
+3682	3
+3683	3
+3684	3
+3685	3
+3686	3
+3687	3
+3688	3
+3689	3
+3690	3
+3691	3
+3692	2
+3693	3
+3694	3
+3695	3
+3696	3
+3697	3
+3698	3
+3699	3
+3700	3
+3701	0
+3702	3
+3703	3
+3704	3
+3705	3
+3706	3
+3707	0
+3708	3
+3709	3
+3710	3
+3711	3
+3712	3
+3713	3
+3714	3
+3715	3
+3716	3
+3717	3
+3718	3
+3719	3
+3720	3
+3721	2
+3722	3
+3723	3
+3724	3
+3725	2
+3726	3
+3727	3
+3728	3
+3729	3
+3730	3
+3731	3
+3732	3
+3733	3
+3734	3
+3735	3
+3736	3
+3737	3
+3738	3
+3739	3
+3740	3
+3741	3
+3742	3
+3743	3
+3744	3
+3745	3
+3746	3
+3747	3
+3748	3
+3749	3
+3750	3
+3751	3
+3752	3
+3753	3
+3754	3
+3755	3
+3756	2
+3757	3
+3758	0
+3759	3
+3760	3
+3761	3
+3762	3
+3763	2
+3764	3
+3765	2
+3766	3
+3767	3
+3768	3
+3769	3
+3770	3
+3771	3
+3772	3
+3773	3
+3774	3
+3775	3
+3776	3
+3777	3
+3778	3
+3779	3
+3780	3
+3781	3
+3782	3
+3783	3
+3784	3
+3785	3
+3786	3
+3787	3
+3788	2
+3789	3
+3790	3
+3791	3
+3792	2
+3793	2
+3794	3
+3795	3
+3796	3
+3797	3
+3798	3
+3799	3
diff --git a/out/t5_results/README.md b/out/t5_results/README.md
new file mode 100644
index 0000000..e0f5354
--- /dev/null
+++ b/out/t5_results/README.md
@@ -0,0 +1,53 @@
+---
+tags:
+- generated_from_trainer
+model-index:
+- name: t5_results
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# t5_results
+
+This model is a fine-tuned version of the local checkpoint `out/t5` on an unknown dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 1.2139
+- eval_accuracy: 0.4675
+- eval_runtime: 40.5651
+- eval_samples_per_second: 49.303
+- eval_steps_per_second: 6.163
+- step: 0
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 8
+- eval_batch_size: 8
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- training_steps: 2500
+
+### Framework versions
+
+- Transformers 4.26.1
+- Pytorch 1.13.1+cu117
+- Datasets 2.9.0
+- Tokenizers 0.13.2
diff --git a/out/t5_results/all_results.json b/out/t5_results/all_results.json
new file mode 100644
index 0000000..e900025
--- /dev/null
+++ b/out/t5_results/all_results.json
@@ -0,0 +1,8 @@
+{
+    "eval_accuracy": 0.4675000011920929,
+    "eval_loss": 1.213880181312561,
+    "eval_runtime": 40.5651,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 49.303,
+    "eval_steps_per_second": 6.163
+}
\ No newline at end of file
diff --git a/out/t5_results/eval_results.json b/out/t5_results/eval_results.json
new file mode 100644
index 0000000..e900025
--- /dev/null
+++ b/out/t5_results/eval_results.json
@@ -0,0 +1,8 @@
+{
+    "eval_accuracy": 0.4675000011920929,
+    "eval_loss": 1.213880181312561,
+    "eval_runtime": 40.5651,
+    "eval_samples": 2000,
+    "eval_samples_per_second": 49.303,
+    "eval_steps_per_second": 6.163
+}
\ No newline at end of file
diff --git a/out/t5_results/predict_results_None.txt b/out/t5_results/predict_results_None.txt
new file mode 100644
index 0000000..01cd7c7
--- /dev/null
+++ b/out/t5_results/predict_results_None.txt
@@ -0,0 +1,3801 @@
+index	prediction
+0	0
+1	2
+2	0
+3	0
+4	0
+5	1
+6	0
+7	1
+8	1
+9	2
+10	0
+11	1
+12	0
+13	0
+14	0
+15	0
+16	0
+17	0
+18	0
+19	0
+20	0
+21	0
+22	0
+23	0
+24	2
+25	3
+26	1
+27	2
+28	3
+29	0
+30	1
+31	2
+32	0
+33	0
+34	0
+35	0
+36	0
+37	1
+38	0
+39	0
+40	0
+41	0
+42	3
+43	0
+44	2
+45	0
+46	0
+47	0
+48	0
+49	1
+50	1
+51	1
+52	0
+53	0
+54	1
+55	0
+56	0
+57	0
+58	3
+59	0
+60	0
+61	1
+62	0
+63	0
+64	0
+65	0
+66	0
+67	0
+68	0
+69	0
+70	0
+71	0
+72	0
+73	3
+74	0
+75	0
+76	0
+77	0
+78	0
+79	0
+80	0
+81	1
+82	2
+83	2
+84	2
+85	2
+86	1
+87	0
+88	0
+89	1
+90	2
+91	0
+92	0
+93	0
+94	0
+95	0
+96	0
+97	1
+98	3
+99	1
+100	0
+101	1
+102	0
+103	0
+104	1
+105	1
+106	0
+107	1
+108	0
+109	1
+110	2
+111	3
+112	0
+113	0
+114	2
+115	1
+116	3
+117	1
+118	1
+119	0
+120	0
+121	0
+122	0
+123	2
+124	1
+125	0
+126	0
+127	1
+128	0
+129	2
+130	1
+131	0
+132	2
+133	0
+134	0
+135	0
+136	2
+137	0
+138	1
+139	1
+140	0
+141	0
+142	0
+143	0
+144	0
+145	0
+146	0
+147	0
+148	0
+149	0
+150	2
+151	2
+152	0
+153	3
+154	2
+155	0
+156	3
+157	0
+158	0
+159	1
+160	0
+161	1
+162	1
+163	0
+164	1
+165	0
+166	0
+167	0
+168	0
+169	0
+170	0
+171	2
+172	0
+173	0
+174	0
+175	0
+176	0
+177	1
+178	1
+179	0
+180	0
+181	0
+182	0
+183	2
+184	0
+185	0
+186	0
+187	0
+188	1
+189	0
+190	0
+191	0
+192	0
+193	1
+194	1
+195	1
+196	1
+197	0
+198	1
+199	0
+200	0
+201	0
+202	0
+203	0
+204	0
+205	0
+206	0
+207	0
+208	0
+209	1
+210	0
+211	0
+212	0
+213	2
+214	0
+215	0
+216	3
+217	0
+218	1
+219	0
+220	0
+221	0
+222	0
+223	0
+224	2
+225	0
+226	2
+227	3
+228	0
+229	0
+230	0
+231	0
+232	2
+233	1
+234	0
+235	0
+236	0
+237	0
+238	1
+239	3
+240	0
+241	0
+242	0
+243	0
+244	0
+245	1
+246	0
+247	3
+248	0
+249	0
+250	1
+251	1
+252	0
+253	0
+254	1
+255	0
+256	0
+257	1
+258	1
+259	0
+260	1
+261	1
+262	1
+263	0
+264	0
+265	1
+266	0
+267	2
+268	0
+269	0
+270	1
+271	0
+272	1
+273	0
+274	1
+275	1
+276	0
+277	0
+278	0
+279	2
+280	1
+281	0
+282	0
+283	1
+284	0
+285	1
+286	0
+287	0
+288	0
+289	2
+290	0
+291	0
+292	1
+293	0
+294	0
+295	0
+296	1
+297	1
+298	0
+299	0
+300	1
+301	0
+302	1
+303	0
+304	0
+305	0
+306	1
+307	0
+308	0
+309	3
+310	0
+311	0
+312	0
+313	2
+314	0
+315	0
+316	1
+317	0
+318	3
+319	1
+320	0
+321	0
+322	0
+323	1
+324	0
+325	1
+326	0
+327	0
+328	3
+329	0
+330	1
+331	0
+332	1
+333	0
+334	1
+335	2
+336	0
+337	0
+338	2
+339	1
+340	2
+341	0
+342	1
+343	0
+344	1
+345	1
+346	0
+347	0
+348	0
+349	0
+350	1
+351	0
+352	0
+353	2
+354	0
+355	0
+356	0
+357	2
+358	0
+359	1
+360	0
+361	0
+362	0
+363	0
+364	1
+365	0
+366	0
+367	3
+368	0
+369	0
+370	0
+371	1
+372	0
+373	0
+374	0
+375	0
+376	0
+377	0
+378	0
+379	3
+380	0
+381	0
+382	0
+383	0
+384	0
+385	0
+386	0
+387	1
+388	0
+389	1
+390	1
+391	0
+392	1
+393	0
+394	1
+395	0
+396	1
+397	2
+398	0
+399	0
+400	0
+401	0
+402	3
+403	0
+404	1
+405	3
+406	0
+407	2
+408	1
+409	1
+410	0
+411	0
+412	2
+413	0
+414	0
+415	0
+416	0
+417	0
+418	1
+419	0
+420	1
+421	0
+422	1
+423	0
+424	1
+425	0
+426	0
+427	0
+428	0
+429	0
+430	0
+431	0
+432	0
+433	1
+434	0
+435	1
+436	0
+437	2
+438	2
+439	0
+440	1
+441	1
+442	0
+443	2
+444	0
+445	0
+446	0
+447	0
+448	0
+449	0
+450	0
+451	1
+452	0
+453	0
+454	3
+455	1
+456	0
+457	0
+458	0
+459	1
+460	0
+461	0
+462	0
+463	0
+464	0
+465	0
+466	0
+467	0
+468	0
+469	1
+470	1
+471	1
+472	0
+473	0
+474	2
+475	1
+476	2
+477	0
+478	0
+479	0
+480	0
+481	0
+482	1
+483	0
+484	3
+485	2
+486	1
+487	0
+488	1
+489	0
+490	0
+491	0
+492	0
+493	0
+494	1
+495	0
+496	1
+497	1
+498	0
+499	3
+500	0
+501	1
+502	0
+503	0
+504	0
+505	1
+506	0
+507	1
+508	0
+509	3
+510	1
+511	0
+512	0
+513	0
+514	0
+515	0
+516	1
+517	0
+518	0
+519	0
+520	0
+521	0
+522	1
+523	0
+524	0
+525	0
+526	1
+527	0
+528	0
+529	0
+530	1
+531	2
+532	0
+533	0
+534	1
+535	1
+536	1
+537	0
+538	1
+539	1
+540	0
+541	1
+542	0
+543	2
+544	0
+545	1
+546	1
+547	1
+548	0
+549	2
+550	1
+551	0
+552	0
+553	1
+554	0
+555	0
+556	0
+557	3
+558	0
+559	1
+560	0
+561	0
+562	0
+563	0
+564	3
+565	2
+566	0
+567	1
+568	0
+569	2
+570	2
+571	0
+572	1
+573	0
+574	1
+575	2
+576	2
+577	0
+578	1
+579	0
+580	0
+581	2
+582	1
+583	1
+584	1
+585	0
+586	0
+587	0
+588	0
+589	1
+590	1
+591	1
+592	0
+593	3
+594	2
+595	0
+596	0
+597	0
+598	1
+599	0
+600	0
+601	0
+602	1
+603	0
+604	1
+605	2
+606	2
+607	0
+608	0
+609	2
+610	0
+611	0
+612	1
+613	0
+614	0
+615	0
+616	3
+617	0
+618	0
+619	3
+620	0
+621	0
+622	0
+623	0
+624	1
+625	0
+626	0
+627	0
+628	0
+629	1
+630	0
+631	1
+632	1
+633	0
+634	0
+635	0
+636	0
+637	0
+638	1
+639	1
+640	3
+641	0
+642	0
+643	0
+644	0
+645	1
+646	2
+647	0
+648	1
+649	1
+650	1
+651	1
+652	0
+653	0
+654	3
+655	0
+656	0
+657	0
+658	0
+659	0
+660	0
+661	1
+662	0
+663	0
+664	1
+665	1
+666	1
+667	0
+668	0
+669	1
+670	0
+671	0
+672	1
+673	2
+674	0
+675	1
+676	1
+677	3
+678	0
+679	0
+680	0
+681	2
+682	0
+683	0
+684	2
+685	0
+686	1
+687	0
+688	0
+689	0
+690	0
+691	2
+692	0
+693	3
+694	1
+695	0
+696	0
+697	1
+698	1
+699	0
+700	0
+701	0
+702	0
+703	2
+704	0
+705	0
+706	3
+707	2
+708	0
+709	0
+710	1
+711	0
+712	0
+713	0
+714	0
+715	0
+716	0
+717	1
+718	0
+719	1
+720	0
+721	1
+722	0
+723	1
+724	1
+725	0
+726	2
+727	0
+728	1
+729	1
+730	2
+731	0
+732	0
+733	3
+734	0
+735	2
+736	3
+737	1
+738	0
+739	0
+740	0
+741	1
+742	1
+743	0
+744	0
+745	1
+746	0
+747	0
+748	0
+749	3
+750	0
+751	0
+752	0
+753	1
+754	0
+755	1
+756	1
+757	2
+758	1
+759	0
+760	2
+761	1
+762	0
+763	0
+764	0
+765	0
+766	0
+767	0
+768	3
+769	0
+770	0
+771	0
+772	0
+773	0
+774	1
+775	3
+776	3
+777	1
+778	1
+779	0
+780	0
+781	0
+782	2
+783	0
+784	0
+785	3
+786	2
+787	0
+788	0
+789	0
+790	0
+791	0
+792	1
+793	1
+794	0
+795	0
+796	1
+797	0
+798	2
+799	0
+800	1
+801	0
+802	0
+803	3
+804	3
+805	1
+806	0
+807	2
+808	1
+809	1
+810	3
+811	0
+812	1
+813	1
+814	0
+815	0
+816	2
+817	2
+818	1
+819	1
+820	0
+821	2
+822	1
+823	1
+824	3
+825	0
+826	0
+827	0
+828	1
+829	0
+830	2
+831	0
+832	3
+833	1
+834	3
+835	0
+836	0
+837	0
+838	1
+839	0
+840	1
+841	0
+842	0
+843	0
+844	0
+845	0
+846	0
+847	2
+848	1
+849	0
+850	0
+851	0
+852	1
+853	2
+854	0
+855	0
+856	0
+857	0
+858	1
+859	0
+860	0
+861	1
+862	0
+863	1
+864	0
+865	1
+866	0
+867	0
+868	0
+869	0
+870	0
+871	0
+872	1
+873	1
+874	1
+875	1
+876	1
+877	0
+878	0
+879	0
+880	0
+881	0
+882	1
+883	0
+884	0
+885	0
+886	0
+887	0
+888	0
+889	1
+890	2
+891	0
+892	0
+893	1
+894	0
+895	1
+896	0
+897	1
+898	0
+899	0
+900	0
+901	0
+902	0
+903	0
+904	0
+905	0
+906	0
+907	0
+908	0
+909	0
+910	3
+911	2
+912	0
+913	2
+914	0
+915	0
+916	0
+917	1
+918	0
+919	1
+920	2
+921	1
+922	0
+923	0
+924	1
+925	0
+926	0
+927	0
+928	0
+929	0
+930	0
+931	1
+932	0
+933	0
+934	2
+935	0
+936	1
+937	0
+938	0
+939	0
+940	0
+941	0
+942	0
+943	0
+944	0
+945	2
+946	0
+947	2
+948	0
+949	1
+950	1
+951	1
+952	1
+953	2
+954	1
+955	1
+956	2
+957	1
+958	0
+959	3
+960	0
+961	1
+962	1
+963	1
+964	0
+965	1
+966	3
+967	1
+968	1
+969	0
+970	0
+971	1
+972	0
+973	1
+974	1
+975	1
+976	1
+977	1
+978	0
+979	1
+980	0
+981	1
+982	1
+983	1
+984	3
+985	1
+986	1
+987	1
+988	2
+989	1
+990	1
+991	2
+992	0
+993	1
+994	0
+995	1
+996	1
+997	3
+998	0
+999	1
+1000	1
+1001	1
+1002	1
+1003	0
+1004	1
+1005	1
+1006	1
+1007	1
+1008	0
+1009	1
+1010	1
+1011	1
+1012	1
+1013	0
+1014	1
+1015	1
+1016	1
+1017	3
+1018	1
+1019	1
+1020	0
+1021	1
+1022	2
+1023	0
+1024	0
+1025	1
+1026	1
+1027	1
+1028	1
+1029	2
+1030	0
+1031	0
+1032	0
+1033	1
+1034	0
+1035	1
+1036	1
+1037	1
+1038	0
+1039	0
+1040	2
+1041	2
+1042	1
+1043	1
+1044	1
+1045	0
+1046	1
+1047	1
+1048	2
+1049	1
+1050	1
+1051	0
+1052	1
+1053	0
+1054	0
+1055	1
+1056	1
+1057	2
+1058	1
+1059	1
+1060	1
+1061	1
+1062	0
+1063	1
+1064	1
+1065	2
+1066	0
+1067	1
+1068	1
+1069	0
+1070	0
+1071	1
+1072	1
+1073	1
+1074	1
+1075	0
+1076	1
+1077	1
+1078	1
+1079	2
+1080	1
+1081	1
+1082	1
+1083	1
+1084	3
+1085	1
+1086	0
+1087	1
+1088	1
+1089	2
+1090	1
+1091	1
+1092	0
+1093	1
+1094	1
+1095	1
+1096	1
+1097	3
+1098	3
+1099	1
+1100	1
+1101	1
+1102	3
+1103	1
+1104	0
+1105	1
+1106	3
+1107	1
+1108	3
+1109	0
+1110	1
+1111	0
+1112	0
+1113	1
+1114	0
+1115	0
+1116	1
+1117	0
+1118	2
+1119	1
+1120	1
+1121	2
+1122	0
+1123	0
+1124	1
+1125	1
+1126	1
+1127	1
+1128	3
+1129	1
+1130	0
+1131	0
+1132	1
+1133	1
+1134	0
+1135	2
+1136	1
+1137	0
+1138	1
+1139	0
+1140	1
+1141	1
+1142	1
+1143	0
+1144	1
+1145	1
+1146	1
+1147	0
+1148	1
+1149	3
+1150	3
+1151	2
+1152	1
+1153	0
+1154	0
+1155	1
+1156	0
+1157	1
+1158	1
+1159	0
+1160	1
+1161	0
+1162	3
+1163	1
+1164	1
+1165	1
+1166	0
+1167	0
+1168	1
+1169	1
+1170	1
+1171	0
+1172	1
+1173	1
+1174	1
+1175	3
+1176	1
+1177	1
+1178	3
+1179	1
+1180	1
+1181	2
+1182	1
+1183	1
+1184	0
+1185	0
+1186	2
+1187	2
+1188	1
+1189	1
+1190	2
+1191	0
+1192	1
+1193	1
+1194	1
+1195	1
+1196	1
+1197	1
+1198	0
+1199	3
+1200	1
+1201	1
+1202	1
+1203	0
+1204	2
+1205	1
+1206	0
+1207	1
+1208	1
+1209	0
+1210	1
+1211	1
+1212	0
+1213	1
+1214	2
+1215	1
+1216	0
+1217	3
+1218	1
+1219	1
+1220	0
+1221	2
+1222	0
+1223	3
+1224	0
+1225	1
+1226	1
+1227	3
+1228	0
+1229	1
+1230	1
+1231	1
+1232	1
+1233	0
+1234	1
+1235	1
+1236	0
+1237	0
+1238	2
+1239	0
+1240	2
+1241	1
+1242	2
+1243	0
+1244	1
+1245	1
+1246	1
+1247	1
+1248	1
+1249	1
+1250	0
+1251	1
+1252	1
+1253	3
+1254	1
+1255	0
+1256	1
+1257	1
+1258	2
+1259	1
+1260	1
+1261	2
+1262	1
+1263	1
+1264	3
+1265	2
+1266	1
+1267	0
+1268	0
+1269	0
+1270	1
+1271	1
+1272	0
+1273	1
+1274	0
+1275	1
+1276	1
+1277	0
+1278	2
+1279	2
+1280	3
+1281	2
+1282	1
+1283	1
+1284	1
+1285	1
+1286	3
+1287	1
+1288	0
+1289	2
+1290	1
+1291	1
+1292	0
+1293	1
+1294	1
+1295	1
+1296	1
+1297	3
+1298	1
+1299	0
+1300	0
+1301	1
+1302	1
+1303	3
+1304	1
+1305	0
+1306	0
+1307	3
+1308	1
+1309	1
+1310	1
+1311	0
+1312	1
+1313	0
+1314	2
+1315	2
+1316	1
+1317	1
+1318	1
+1319	1
+1320	0
+1321	1
+1322	2
+1323	1
+1324	0
+1325	1
+1326	1
+1327	1
+1328	1
+1329	1
+1330	1
+1331	1
+1332	0
+1333	2
+1334	1
+1335	2
+1336	0
+1337	1
+1338	1
+1339	3
+1340	3
+1341	3
+1342	1
+1343	0
+1344	0
+1345	1
+1346	0
+1347	1
+1348	2
+1349	1
+1350	3
+1351	3
+1352	1
+1353	1
+1354	2
+1355	1
+1356	1
+1357	1
+1358	0
+1359	1
+1360	0
+1361	1
+1362	1
+1363	1
+1364	3
+1365	1
+1366	1
+1367	1
+1368	1
+1369	2
+1370	3
+1371	1
+1372	2
+1373	2
+1374	2
+1375	0
+1376	0
+1377	1
+1378	3
+1379	0
+1380	1
+1381	2
+1382	1
+1383	1
+1384	0
+1385	1
+1386	1
+1387	0
+1388	0
+1389	1
+1390	1
+1391	1
+1392	3
+1393	2
+1394	3
+1395	1
+1396	1
+1397	2
+1398	3
+1399	1
+1400	0
+1401	0
+1402	1
+1403	1
+1404	0
+1405	1
+1406	0
+1407	1
+1408	2
+1409	1
+1410	1
+1411	1
+1412	1
+1413	1
+1414	1
+1415	2
+1416	1
+1417	1
+1418	1
+1419	1
+1420	2
+1421	1
+1422	0
+1423	0
+1424	2
+1425	1
+1426	1
+1427	1
+1428	1
+1429	1
+1430	1
+1431	1
+1432	3
+1433	0
+1434	0
+1435	1
+1436	1
+1437	0
+1438	0
+1439	3
+1440	0
+1441	0
+1442	0
+1443	3
+1444	1
+1445	1
+1446	1
+1447	1
+1448	1
+1449	1
+1450	1
+1451	3
+1452	1
+1453	2
+1454	1
+1455	1
+1456	1
+1457	1
+1458	1
+1459	1
+1460	1
+1461	3
+1462	2
+1463	0
+1464	0
+1465	1
+1466	0
+1467	0
+1468	2
+1469	0
+1470	2
+1471	1
+1472	0
+1473	2
+1474	1
+1475	1
+1476	1
+1477	1
+1478	1
+1479	1
+1480	1
+1481	1
+1482	1
+1483	1
+1484	2
+1485	0
+1486	2
+1487	0
+1488	0
+1489	1
+1490	1
+1491	1
+1492	0
+1493	1
+1494	3
+1495	2
+1496	0
+1497	3
+1498	0
+1499	1
+1500	3
+1501	0
+1502	0
+1503	1
+1504	1
+1505	1
+1506	1
+1507	1
+1508	1
+1509	0
+1510	1
+1511	1
+1512	1
+1513	0
+1514	1
+1515	2
+1516	1
+1517	2
+1518	1
+1519	1
+1520	2
+1521	0
+1522	1
+1523	1
+1524	0
+1525	1
+1526	1
+1527	0
+1528	1
+1529	1
+1530	1
+1531	1
+1532	1
+1533	1
+1534	1
+1535	1
+1536	1
+1537	1
+1538	1
+1539	0
+1540	0
+1541	0
+1542	0
+1543	3
+1544	1
+1545	0
+1546	0
+1547	1
+1548	1
+1549	1
+1550	1
+1551	2
+1552	0
+1553	0
+1554	1
+1555	3
+1556	2
+1557	1
+1558	0
+1559	1
+1560	2
+1561	1
+1562	1
+1563	2
+1564	1
+1565	1
+1566	1
+1567	3
+1568	1
+1569	1
+1570	1
+1571	1
+1572	1
+1573	1
+1574	1
+1575	1
+1576	1
+1577	0
+1578	0
+1579	1
+1580	1
+1581	1
+1582	1
+1583	1
+1584	2
+1585	1
+1586	0
+1587	1
+1588	1
+1589	1
+1590	3
+1591	2
+1592	0
+1593	1
+1594	1
+1595	1
+1596	1
+1597	1
+1598	1
+1599	1
+1600	1
+1601	0
+1602	2
+1603	1
+1604	0
+1605	1
+1606	1
+1607	1
+1608	1
+1609	1
+1610	1
+1611	1
+1612	1
+1613	1
+1614	1
+1615	1
+1616	0
+1617	1
+1618	0
+1619	1
+1620	1
+1621	1
+1622	1
+1623	1
+1624	0
+1625	1
+1626	1
+1627	1
+1628	1
+1629	2
+1630	1
+1631	1
+1632	2
+1633	1
+1634	3
+1635	2
+1636	1
+1637	1
+1638	1
+1639	1
+1640	1
+1641	2
+1642	1
+1643	2
+1644	0
+1645	1
+1646	1
+1647	1
+1648	2
+1649	2
+1650	1
+1651	1
+1652	1
+1653	0
+1654	0
+1655	1
+1656	1
+1657	1
+1658	1
+1659	1
+1660	1
+1661	1
+1662	0
+1663	1
+1664	0
+1665	1
+1666	1
+1667	1
+1668	0
+1669	0
+1670	0
+1671	1
+1672	1
+1673	1
+1674	0
+1675	0
+1676	1
+1677	1
+1678	1
+1679	1
+1680	1
+1681	1
+1682	0
+1683	2
+1684	1
+1685	1
+1686	0
+1687	1
+1688	2
+1689	0
+1690	0
+1691	1
+1692	0
+1693	1
+1694	0
+1695	1
+1696	1
+1697	0
+1698	1
+1699	1
+1700	0
+1701	1
+1702	0
+1703	0
+1704	1
+1705	1
+1706	1
+1707	1
+1708	2
+1709	1
+1710	3
+1711	1
+1712	1
+1713	1
+1714	1
+1715	1
+1716	2
+1717	0
+1718	0
+1719	1
+1720	1
+1721	1
+1722	1
+1723	1
+1724	1
+1725	2
+1726	0
+1727	3
+1728	1
+1729	1
+1730	1
+1731	1
+1732	1
+1733	0
+1734	1
+1735	1
+1736	1
+1737	0
+1738	1
+1739	1
+1740	0
+1741	0
+1742	1
+1743	0
+1744	1
+1745	1
+1746	0
+1747	1
+1748	0
+1749	1
+1750	1
+1751	1
+1752	0
+1753	3
+1754	1
+1755	1
+1756	1
+1757	1
+1758	1
+1759	3
+1760	1
+1761	0
+1762	1
+1763	1
+1764	0
+1765	1
+1766	0
+1767	0
+1768	2
+1769	0
+1770	1
+1771	3
+1772	3
+1773	2
+1774	0
+1775	1
+1776	1
+1777	1
+1778	2
+1779	2
+1780	0
+1781	0
+1782	1
+1783	1
+1784	1
+1785	1
+1786	1
+1787	3
+1788	0
+1789	1
+1790	1
+1791	3
+1792	1
+1793	1
+1794	1
+1795	1
+1796	1
+1797	1
+1798	1
+1799	1
+1800	1
+1801	1
+1802	2
+1803	1
+1804	1
+1805	1
+1806	1
+1807	3
+1808	1
+1809	1
+1810	2
+1811	0
+1812	1
+1813	2
+1814	0
+1815	1
+1816	1
+1817	2
+1818	2
+1819	0
+1820	0
+1821	2
+1822	0
+1823	0
+1824	1
+1825	1
+1826	1
+1827	1
+1828	1
+1829	1
+1830	0
+1831	1
+1832	1
+1833	2
+1834	2
+1835	1
+1836	1
+1837	1
+1838	1
+1839	1
+1840	1
+1841	1
+1842	0
+1843	1
+1844	1
+1845	0
+1846	2
+1847	1
+1848	1
+1849	2
+1850	2
+1851	1
+1852	1
+1853	0
+1854	0
+1855	1
+1856	1
+1857	0
+1858	1
+1859	1
+1860	1
+1861	1
+1862	1
+1863	0
+1864	3
+1865	3
+1866	1
+1867	1
+1868	1
+1869	2
+1870	1
+1871	0
+1872	1
+1873	0
+1874	2
+1875	2
+1876	1
+1877	2
+1878	2
+1879	0
+1880	1
+1881	0
+1882	1
+1883	1
+1884	1
+1885	1
+1886	1
+1887	1
+1888	0
+1889	2
+1890	0
+1891	0
+1892	1
+1893	2
+1894	1
+1895	1
+1896	0
+1897	1
+1898	1
+1899	0
+1900	3
+1901	2
+1902	0
+1903	2
+1904	0
+1905	0
+1906	2
+1907	2
+1908	0
+1909	3
+1910	0
+1911	1
+1912	3
+1913	1
+1914	2
+1915	2
+1916	0
+1917	0
+1918	2
+1919	1
+1920	2
+1921	0
+1922	1
+1923	1
+1924	0
+1925	2
+1926	0
+1927	3
+1928	1
+1929	1
+1930	1
+1931	3
+1932	0
+1933	2
+1934	2
+1935	0
+1936	2
+1937	1
+1938	2
+1939	2
+1940	3
+1941	2
+1942	1
+1943	0
+1944	1
+1945	1
+1946	0
+1947	1
+1948	1
+1949	0
+1950	0
+1951	2
+1952	0
+1953	2
+1954	0
+1955	1
+1956	0
+1957	0
+1958	1
+1959	0
+1960	1
+1961	1
+1962	3
+1963	2
+1964	3
+1965	2
+1966	1
+1967	1
+1968	0
+1969	3
+1970	1
+1971	1
+1972	1
+1973	0
+1974	1
+1975	3
+1976	1
+1977	0
+1978	3
+1979	0
+1980	1
+1981	1
+1982	2
+1983	2
+1984	2
+1985	0
+1986	2
+1987	2
+1988	0
+1989	1
+1990	1
+1991	1
+1992	3
+1993	1
+1994	0
+1995	0
+1996	0
+1997	2
+1998	2
+1999	3
+2000	1
+2001	1
+2002	1
+2003	1
+2004	2
+2005	3
+2006	2
+2007	1
+2008	1
+2009	1
+2010	1
+2011	2
+2012	2
+2013	0
+2014	1
+2015	0
+2016	1
+2017	2
+2018	1
+2019	2
+2020	1
+2021	1
+2022	1
+2023	0
+2024	3
+2025	1
+2026	2
+2027	0
+2028	3
+2029	0
+2030	1
+2031	2
+2032	3
+2033	0
+2034	0
+2035	1
+2036	0
+2037	3
+2038	1
+2039	0
+2040	1
+2041	2
+2042	0
+2043	1
+2044	3
+2045	0
+2046	2
+2047	1
+2048	3
+2049	1
+2050	3
+2051	2
+2052	3
+2053	1
+2054	1
+2055	1
+2056	2
+2057	3
+2058	3
+2059	1
+2060	1
+2061	0
+2062	1
+2063	2
+2064	0
+2065	0
+2066	2
+2067	1
+2068	2
+2069	0
+2070	2
+2071	2
+2072	1
+2073	2
+2074	2
+2075	0
+2076	0
+2077	1
+2078	0
+2079	3
+2080	3
+2081	1
+2082	1
+2083	1
+2084	0
+2085	1
+2086	2
+2087	3
+2088	1
+2089	2
+2090	0
+2091	3
+2092	1
+2093	2
+2094	0
+2095	2
+2096	3
+2097	2
+2098	1
+2099	2
+2100	1
+2101	1
+2102	1
+2103	3
+2104	3
+2105	0
+2106	2
+2107	2
+2108	2
+2109	2
+2110	0
+2111	1
+2112	1
+2113	1
+2114	1
+2115	0
+2116	2
+2117	2
+2118	0
+2119	1
+2120	3
+2121	3
+2122	2
+2123	2
+2124	3
+2125	0
+2126	1
+2127	1
+2128	3
+2129	2
+2130	0
+2131	1
+2132	3
+2133	0
+2134	3
+2135	3
+2136	3
+2137	1
+2138	1
+2139	1
+2140	1
+2141	0
+2142	0
+2143	2
+2144	3
+2145	1
+2146	3
+2147	0
+2148	3
+2149	0
+2150	2
+2151	1
+2152	2
+2153	2
+2154	3
+2155	1
+2156	2
+2157	1
+2158	2
+2159	1
+2160	1
+2161	2
+2162	1
+2163	3
+2164	2
+2165	2
+2166	2
+2167	1
+2168	1
+2169	2
+2170	3
+2171	2
+2172	3
+2173	0
+2174	2
+2175	1
+2176	0
+2177	3
+2178	1
+2179	3
+2180	3
+2181	3
+2182	0
+2183	1
+2184	3
+2185	2
+2186	2
+2187	2
+2188	3
+2189	3
+2190	1
+2191	1
+2192	2
+2193	0
+2194	2
+2195	1
+2196	1
+2197	3
+2198	2
+2199	2
+2200	2
+2201	0
+2202	1
+2203	2
+2204	0
+2205	3
+2206	2
+2207	1
+2208	3
+2209	0
+2210	3
+2211	1
+2212	1
+2213	2
+2214	2
+2215	3
+2216	1
+2217	1
+2218	0
+2219	1
+2220	0
+2221	0
+2222	0
+2223	0
+2224	0
+2225	1
+2226	1
+2227	2
+2228	3
+2229	1
+2230	2
+2231	1
+2232	1
+2233	0
+2234	3
+2235	0
+2236	3
+2237	0
+2238	2
+2239	2
+2240	3
+2241	2
+2242	2
+2243	3
+2244	0
+2245	3
+2246	2
+2247	2
+2248	3
+2249	3
+2250	0
+2251	1
+2252	1
+2253	3
+2254	2
+2255	1
+2256	0
+2257	3
+2258	2
+2259	2
+2260	3
+2261	3
+2262	2
+2263	1
+2264	2
+2265	1
+2266	0
+2267	3
+2268	2
+2269	1
+2270	2
+2271	1
+2272	1
+2273	3
+2274	0
+2275	3
+2276	0
+2277	2
+2278	3
+2279	3
+2280	2
+2281	0
+2282	2
+2283	2
+2284	1
+2285	1
+2286	0
+2287	1
+2288	1
+2289	3
+2290	0
+2291	1
+2292	2
+2293	2
+2294	2
+2295	2
+2296	3
+2297	2
+2298	3
+2299	2
+2300	1
+2301	0
+2302	2
+2303	3
+2304	1
+2305	0
+2306	1
+2307	1
+2308	1
+2309	2
+2310	2
+2311	2
+2312	2
+2313	1
+2314	3
+2315	2
+2316	2
+2317	3
+2318	3
+2319	0
+2320	1
+2321	2
+2322	0
+2323	2
+2324	0
+2325	1
+2326	0
+2327	0
+2328	2
+2329	2
+2330	1
+2331	0
+2332	3
+2333	1
+2334	0
+2335	2
+2336	2
+2337	0
+2338	0
+2339	3
+2340	0
+2341	1
+2342	2
+2343	1
+2344	1
+2345	0
+2346	0
+2347	1
+2348	2
+2349	3
+2350	1
+2351	2
+2352	2
+2353	2
+2354	3
+2355	1
+2356	3
+2357	3
+2358	2
+2359	3
+2360	3
+2361	1
+2362	3
+2363	2
+2364	3
+2365	3
+2366	1
+2367	3
+2368	0
+2369	1
+2370	3
+2371	1
+2372	0
+2373	0
+2374	2
+2375	3
+2376	1
+2377	2
+2378	1
+2379	0
+2380	3
+2381	1
+2382	1
+2383	1
+2384	3
+2385	3
+2386	3
+2387	1
+2388	1
+2389	0
+2390	0
+2391	3
+2392	2
+2393	2
+2394	3
+2395	1
+2396	0
+2397	2
+2398	2
+2399	3
+2400	3
+2401	3
+2402	1
+2403	1
+2404	1
+2405	0
+2406	0
+2407	2
+2408	3
+2409	1
+2410	2
+2411	1
+2412	0
+2413	0
+2414	1
+2415	0
+2416	0
+2417	2
+2418	0
+2419	1
+2420	0
+2421	3
+2422	2
+2423	0
+2424	2
+2425	3
+2426	2
+2427	0
+2428	3
+2429	2
+2430	3
+2431	1
+2432	1
+2433	0
+2434	3
+2435	0
+2436	0
+2437	3
+2438	3
+2439	1
+2440	3
+2441	3
+2442	3
+2443	1
+2444	1
+2445	0
+2446	1
+2447	3
+2448	0
+2449	3
+2450	1
+2451	3
+2452	3
+2453	2
+2454	1
+2455	3
+2456	1
+2457	2
+2458	2
+2459	2
+2460	2
+2461	1
+2462	1
+2463	2
+2464	1
+2465	0
+2466	1
+2467	1
+2468	2
+2469	0
+2470	3
+2471	0
+2472	0
+2473	3
+2474	2
+2475	1
+2476	1
+2477	2
+2478	3
+2479	2
+2480	1
+2481	3
+2482	3
+2483	1
+2484	1
+2485	0
+2486	3
+2487	1
+2488	2
+2489	0
+2490	0
+2491	0
+2492	0
+2493	0
+2494	0
+2495	2
+2496	3
+2497	1
+2498	1
+2499	1
+2500	1
+2501	2
+2502	3
+2503	2
+2504	1
+2505	2
+2506	0
+2507	0
+2508	1
+2509	0
+2510	2
+2511	1
+2512	1
+2513	1
+2514	1
+2515	1
+2516	1
+2517	0
+2518	3
+2519	0
+2520	1
+2521	1
+2522	0
+2523	2
+2524	0
+2525	0
+2526	0
+2527	2
+2528	2
+2529	2
+2530	2
+2531	0
+2532	2
+2533	1
+2534	2
+2535	0
+2536	2
+2537	1
+2538	3
+2539	0
+2540	2
+2541	0
+2542	0
+2543	1
+2544	0
+2545	1
+2546	1
+2547	2
+2548	1
+2549	2
+2550	1
+2551	1
+2552	1
+2553	0
+2554	2
+2555	1
+2556	3
+2557	2
+2558	0
+2559	1
+2560	1
+2561	2
+2562	0
+2563	2
+2564	2
+2565	2
+2566	2
+2567	2
+2568	1
+2569	2
+2570	3
+2571	2
+2572	0
+2573	2
+2574	3
+2575	1
+2576	2
+2577	1
+2578	2
+2579	0
+2580	2
+2581	1
+2582	0
+2583	3
+2584	0
+2585	0
+2586	1
+2587	0
+2588	3
+2589	3
+2590	1
+2591	3
+2592	2
+2593	2
+2594	0
+2595	1
+2596	0
+2597	3
+2598	0
+2599	1
+2600	0
+2601	1
+2602	1
+2603	2
+2604	1
+2605	2
+2606	1
+2607	1
+2608	1
+2609	1
+2610	3
+2611	0
+2612	3
+2613	0
+2614	2
+2615	0
+2616	0
+2617	2
+2618	1
+2619	1
+2620	3
+2621	0
+2622	3
+2623	2
+2624	1
+2625	1
+2626	0
+2627	3
+2628	1
+2629	1
+2630	3
+2631	1
+2632	1
+2633	1
+2634	0
+2635	0
+2636	3
+2637	1
+2638	0
+2639	1
+2640	1
+2641	0
+2642	3
+2643	3
+2644	1
+2645	2
+2646	3
+2647	1
+2648	2
+2649	0
+2650	1
+2651	0
+2652	3
+2653	1
+2654	1
+2655	2
+2656	1
+2657	1
+2658	0
+2659	0
+2660	2
+2661	3
+2662	1
+2663	3
+2664	0
+2665	2
+2666	3
+2667	0
+2668	1
+2669	1
+2670	1
+2671	2
+2672	2
+2673	1
+2674	0
+2675	3
+2676	3
+2677	2
+2678	3
+2679	2
+2680	2
+2681	1
+2682	2
+2683	0
+2684	0
+2685	1
+2686	1
+2687	1
+2688	3
+2689	2
+2690	1
+2691	3
+2692	2
+2693	0
+2694	2
+2695	2
+2696	2
+2697	1
+2698	1
+2699	1
+2700	1
+2701	3
+2702	1
+2703	0
+2704	2
+2705	0
+2706	2
+2707	0
+2708	3
+2709	0
+2710	3
+2711	2
+2712	0
+2713	1
+2714	1
+2715	0
+2716	2
+2717	1
+2718	0
+2719	2
+2720	2
+2721	3
+2722	1
+2723	2
+2724	0
+2725	1
+2726	0
+2727	0
+2728	3
+2729	0
+2730	2
+2731	3
+2732	3
+2733	2
+2734	2
+2735	2
+2736	1
+2737	1
+2738	3
+2739	2
+2740	3
+2741	1
+2742	3
+2743	0
+2744	0
+2745	1
+2746	2
+2747	3
+2748	2
+2749	3
+2750	2
+2751	0
+2752	1
+2753	1
+2754	1
+2755	1
+2756	3
+2757	3
+2758	0
+2759	0
+2760	0
+2761	1
+2762	0
+2763	0
+2764	0
+2765	0
+2766	0
+2767	0
+2768	3
+2769	2
+2770	3
+2771	3
+2772	1
+2773	3
+2774	1
+2775	2
+2776	1
+2777	3
+2778	2
+2779	2
+2780	1
+2781	2
+2782	2
+2783	2
+2784	2
+2785	3
+2786	0
+2787	1
+2788	0
+2789	0
+2790	3
+2791	2
+2792	3
+2793	3
+2794	0
+2795	0
+2796	0
+2797	3
+2798	1
+2799	3
+2800	1
+2801	2
+2802	0
+2803	2
+2804	0
+2805	3
+2806	0
+2807	2
+2808	0
+2809	0
+2810	0
+2811	2
+2812	0
+2813	0
+2814	1
+2815	1
+2816	2
+2817	3
+2818	3
+2819	2
+2820	2
+2821	2
+2822	2
+2823	1
+2824	0
+2825	1
+2826	1
+2827	1
+2828	0
+2829	1
+2830	3
+2831	1
+2832	2
+2833	3
+2834	3
+2835	2
+2836	1
+2837	3
+2838	0
+2839	0
+2840	3
+2841	0
+2842	0
+2843	1
+2844	2
+2845	0
+2846	1
+2847	0
+2848	2
+2849	0
+2850	3
+2851	3
+2852	3
+2853	1
+2854	3
+2855	0
+2856	0
+2857	2
+2858	3
+2859	1
+2860	3
+2861	3
+2862	2
+2863	3
+2864	1
+2865	3
+2866	1
+2867	3
+2868	0
+2869	1
+2870	3
+2871	3
+2872	3
+2873	3
+2874	2
+2875	3
+2876	3
+2877	0
+2878	3
+2879	3
+2880	3
+2881	3
+2882	1
+2883	3
+2884	1
+2885	3
+2886	1
+2887	3
+2888	3
+2889	2
+2890	3
+2891	1
+2892	2
+2893	3
+2894	1
+2895	1
+2896	3
+2897	2
+2898	1
+2899	3
+2900	3
+2901	1
+2902	0
+2903	0
+2904	2
+2905	3
+2906	1
+2907	1
+2908	1
+2909	3
+2910	3
+2911	3
+2912	3
+2913	0
+2914	0
+2915	2
+2916	1
+2917	3
+2918	1
+2919	3
+2920	2
+2921	3
+2922	3
+2923	3
+2924	3
+2925	3
+2926	3
+2927	2
+2928	0
+2929	2
+2930	0
+2931	3
+2932	3
+2933	3
+2934	2
+2935	1
+2936	2
+2937	1
+2938	3
+2939	3
+2940	1
+2941	0
+2942	3
+2943	3
+2944	1
+2945	3
+2946	2
+2947	3
+2948	2
+2949	2
+2950	1
+2951	3
+2952	2
+2953	1
+2954	2
+2955	3
+2956	3
+2957	2
+2958	3
+2959	2
+2960	2
+2961	0
+2962	1
+2963	2
+2964	1
+2965	3
+2966	0
+2967	0
+2968	3
+2969	2
+2970	1
+2971	1
+2972	2
+2973	3
+2974	3
+2975	3
+2976	0
+2977	0
+2978	3
+2979	0
+2980	1
+2981	3
+2982	0
+2983	1
+2984	1
+2985	3
+2986	3
+2987	3
+2988	3
+2989	3
+2990	3
+2991	3
+2992	3
+2993	1
+2994	1
+2995	3
+2996	1
+2997	0
+2998	0
+2999	3
+3000	0
+3001	3
+3002	2
+3003	2
+3004	1
+3005	1
+3006	0
+3007	3
+3008	3
+3009	3
+3010	1
+3011	3
+3012	3
+3013	2
+3014	1
+3015	3
+3016	1
+3017	3
+3018	3
+3019	3
+3020	1
+3021	3
+3022	3
+3023	3
+3024	3
+3025	3
+3026	1
+3027	3
+3028	1
+3029	1
+3030	3
+3031	3
+3032	3
+3033	3
+3034	2
+3035	0
+3036	1
+3037	1
+3038	3
+3039	1
+3040	3
+3041	1
+3042	1
+3043	3
+3044	1
+3045	2
+3046	3
+3047	1
+3048	1
+3049	0
+3050	3
+3051	3
+3052	0
+3053	0
+3054	3
+3055	3
+3056	0
+3057	3
+3058	3
+3059	1
+3060	1
+3061	1
+3062	3
+3063	3
+3064	2
+3065	3
+3066	3
+3067	1
+3068	2
+3069	3
+3070	3
+3071	0
+3072	0
+3073	0
+3074	3
+3075	3
+3076	1
+3077	1
+3078	3
+3079	3
+3080	3
+3081	3
+3082	1
+3083	3
+3084	3
+3085	3
+3086	2
+3087	1
+3088	3
+3089	1
+3090	0
+3091	3
+3092	1
+3093	3
+3094	2
+3095	3
+3096	1
+3097	0
+3098	3
+3099	1
+3100	1
+3101	1
+3102	1
+3103	2
+3104	1
+3105	1
+3106	0
+3107	3
+3108	3
+3109	1
+3110	3
+3111	0
+3112	2
+3113	1
+3114	0
+3115	1
+3116	3
+3117	1
+3118	3
+3119	2
+3120	2
+3121	1
+3122	0
+3123	1
+3124	2
+3125	1
+3126	0
+3127	0
+3128	1
+3129	3
+3130	1
+3131	0
+3132	3
+3133	0
+3134	0
+3135	1
+3136	1
+3137	3
+3138	3
+3139	0
+3140	3
+3141	2
+3142	1
+3143	1
+3144	3
+3145	3
+3146	3
+3147	3
+3148	1
+3149	3
+3150	3
+3151	3
+3152	3
+3153	3
+3154	3
+3155	3
+3156	3
+3157	3
+3158	3
+3159	3
+3160	3
+3161	3
+3162	3
+3163	1
+3164	1
+3165	2
+3166	3
+3167	1
+3168	0
+3169	3
+3170	0
+3171	2
+3172	3
+3173	1
+3174	1
+3175	1
+3176	3
+3177	3
+3178	3
+3179	3
+3180	0
+3181	3
+3182	0
+3183	1
+3184	1
+3185	3
+3186	1
+3187	0
+3188	3
+3189	3
+3190	0
+3191	1
+3192	1
+3193	2
+3194	3
+3195	3
+3196	1
+3197	1
+3198	0
+3199	1
+3200	3
+3201	3
+3202	3
+3203	2
+3204	1
+3205	3
+3206	1
+3207	1
+3208	1
+3209	1
+3210	2
+3211	1
+3212	0
+3213	3
+3214	3
+3215	3
+3216	1
+3217	3
+3218	3
+3219	1
+3220	3
+3221	3
+3222	1
+3223	3
+3224	0
+3225	3
+3226	2
+3227	3
+3228	2
+3229	2
+3230	2
+3231	0
+3232	3
+3233	1
+3234	3
+3235	1
+3236	1
+3237	3
+3238	0
+3239	3
+3240	3
+3241	3
+3242	3
+3243	0
+3244	1
+3245	3
+3246	1
+3247	3
+3248	1
+3249	3
+3250	3
+3251	1
+3252	2
+3253	1
+3254	3
+3255	2
+3256	3
+3257	3
+3258	3
+3259	2
+3260	3
+3261	3
+3262	1
+3263	1
+3264	2
+3265	0
+3266	2
+3267	0
+3268	2
+3269	0
+3270	3
+3271	2
+3272	2
+3273	0
+3274	3
+3275	3
+3276	3
+3277	3
+3278	1
+3279	3
+3280	1
+3281	3
+3282	3
+3283	3
+3284	3
+3285	1
+3286	1
+3287	3
+3288	3
+3289	3
+3290	0
+3291	3
+3292	0
+3293	1
+3294	3
+3295	1
+3296	3
+3297	2
+3298	1
+3299	1
+3300	3
+3301	1
+3302	0
+3303	1
+3304	3
+3305	2
+3306	2
+3307	0
+3308	1
+3309	1
+3310	1
+3311	3
+3312	3
+3313	3
+3314	1
+3315	0
+3316	3
+3317	2
+3318	3
+3319	1
+3320	0
+3321	1
+3322	0
+3323	3
+3324	2
+3325	3
+3326	1
+3327	3
+3328	1
+3329	3
+3330	0
+3331	3
+3332	1
+3333	3
+3334	1
+3335	1
+3336	3
+3337	2
+3338	3
+3339	1
+3340	3
+3341	3
+3342	3
+3343	3
+3344	1
+3345	3
+3346	1
+3347	1
+3348	3
+3349	3
+3350	3
+3351	3
+3352	3
+3353	3
+3354	3
+3355	3
+3356	3
+3357	1
+3358	1
+3359	1
+3360	3
+3361	1
+3362	1
+3363	0
+3364	1
+3365	3
+3366	2
+3367	3
+3368	3
+3369	2
+3370	1
+3371	0
+3372	3
+3373	1
+3374	3
+3375	3
+3376	0
+3377	2
+3378	0
+3379	1
+3380	3
+3381	0
+3382	3
+3383	3
+3384	3
+3385	3
+3386	3
+3387	1
+3388	3
+3389	0
+3390	3
+3391	3
+3392	3
+3393	3
+3394	0
+3395	3
+3396	0
+3397	3
+3398	3
+3399	2
+3400	2
+3401	3
+3402	0
+3403	3
+3404	3
+3405	3
+3406	3
+3407	3
+3408	3
+3409	3
+3410	3
+3411	3
+3412	1
+3413	1
+3414	1
+3415	3
+3416	2
+3417	1
+3418	2
+3419	3
+3420	3
+3421	1
+3422	2
+3423	1
+3424	2
+3425	2
+3426	1
+3427	3
+3428	3
+3429	3
+3430	3
+3431	2
+3432	3
+3433	3
+3434	3
+3435	1
+3436	0
+3437	0
+3438	1
+3439	1
+3440	3
+3441	3
+3442	3
+3443	2
+3444	1
+3445	3
+3446	1
+3447	1
+3448	1
+3449	3
+3450	2
+3451	2
+3452	0
+3453	3
+3454	1
+3455	1
+3456	1
+3457	2
+3458	3
+3459	3
+3460	3
+3461	3
+3462	0
+3463	3
+3464	1
+3465	1
+3466	3
+3467	1
+3468	3
+3469	1
+3470	2
+3471	3
+3472	3
+3473	3
+3474	1
+3475	3
+3476	1
+3477	1
+3478	2
+3479	0
+3480	3
+3481	1
+3482	2
+3483	2
+3484	0
+3485	3
+3486	3
+3487	2
+3488	1
+3489	0
+3490	1
+3491	3
+3492	1
+3493	2
+3494	1
+3495	1
+3496	1
+3497	1
+3498	3
+3499	3
+3500	1
+3501	1
+3502	3
+3503	1
+3504	2
+3505	3
+3506	2
+3507	3
+3508	1
+3509	1
+3510	0
+3511	3
+3512	3
+3513	1
+3514	3
+3515	0
+3516	2
+3517	1
+3518	3
+3519	1
+3520	0
+3521	3
+3522	1
+3523	3
+3524	3
+3525	2
+3526	1
+3527	2
+3528	1
+3529	3
+3530	2
+3531	2
+3532	2
+3533	2
+3534	2
+3535	3
+3536	2
+3537	3
+3538	1
+3539	2
+3540	3
+3541	3
+3542	2
+3543	1
+3544	3
+3545	1
+3546	1
+3547	3
+3548	3
+3549	3
+3550	3
+3551	2
+3552	3
+3553	3
+3554	1
+3555	3
+3556	3
+3557	3
+3558	3
+3559	2
+3560	1
+3561	1
+3562	0
+3563	1
+3564	1
+3565	2
+3566	0
+3567	2
+3568	2
+3569	0
+3570	0
+3571	3
+3572	1
+3573	3
+3574	3
+3575	2
+3576	3
+3577	1
+3578	0
+3579	2
+3580	3
+3581	0
+3582	1
+3583	1
+3584	3
+3585	0
+3586	0
+3587	0
+3588	0
+3589	3
+3590	1
+3591	0
+3592	1
+3593	2
+3594	3
+3595	2
+3596	3
+3597	3
+3598	0
+3599	3
+3600	2
+3601	1
+3602	0
+3603	3
+3604	2
+3605	3
+3606	2
+3607	2
+3608	3
+3609	0
+3610	1
+3611	2
+3612	3
+3613	2
+3614	2
+3615	0
+3616	3
+3617	1
+3618	1
+3619	0
+3620	3
+3621	0
+3622	1
+3623	1
+3624	1
+3625	0
+3626	3
+3627	1
+3628	3
+3629	2
+3630	3
+3631	0
+3632	2
+3633	2
+3634	3
+3635	0
+3636	0
+3637	1
+3638	3
+3639	3
+3640	0
+3641	2
+3642	0
+3643	3
+3644	3
+3645	1
+3646	0
+3647	2
+3648	2
+3649	2
+3650	1
+3651	2
+3652	3
+3653	0
+3654	0
+3655	3
+3656	1
+3657	3
+3658	1
+3659	2
+3660	3
+3661	1
+3662	3
+3663	3
+3664	2
+3665	0
+3666	3
+3667	2
+3668	2
+3669	3
+3670	3
+3671	1
+3672	3
+3673	3
+3674	1
+3675	3
+3676	1
+3677	0
+3678	1
+3679	3
+3680	3
+3681	2
+3682	3
+3683	1
+3684	3
+3685	0
+3686	0
+3687	0
+3688	2
+3689	3
+3690	3
+3691	3
+3692	3
+3693	1
+3694	3
+3695	1
+3696	3
+3697	3
+3698	1
+3699	3
+3700	3
+3701	1
+3702	3
+3703	3
+3704	3
+3705	1
+3706	3
+3707	1
+3708	1
+3709	1
+3710	0
+3711	1
+3712	3
+3713	3
+3714	3
+3715	1
+3716	3
+3717	3
+3718	3
+3719	3
+3720	0
+3721	2
+3722	0
+3723	3
+3724	1
+3725	1
+3726	1
+3727	3
+3728	1
+3729	2
+3730	3
+3731	3
+3732	3
+3733	3
+3734	3
+3735	1
+3736	1
+3737	3
+3738	1
+3739	0
+3740	1
+3741	3
+3742	1
+3743	3
+3744	2
+3745	1
+3746	3
+3747	3
+3748	3
+3749	3
+3750	3
+3751	3
+3752	1
+3753	1
+3754	2
+3755	0
+3756	3
+3757	3
+3758	3
+3759	3
+3760	3
+3761	3
+3762	1
+3763	1
+3764	3
+3765	3
+3766	1
+3767	3
+3768	3
+3769	1
+3770	1
+3771	3
+3772	3
+3773	1
+3774	2
+3775	3
+3776	3
+3777	1
+3778	1
+3779	3
+3780	2
+3781	2
+3782	1
+3783	3
+3784	1
+3785	3
+3786	0
+3787	3
+3788	3
+3789	3
+3790	2
+3791	3
+3792	3
+3793	1
+3794	3
+3795	3
+3796	0
+3797	3
+3798	1
+3799	3
diff --git a/preparer_ag_nenws.py b/preparer_ag_nenws.py
new file mode 100644
index 0000000..ff77eef
--- /dev/null
+++ b/preparer_ag_nenws.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+import json
+import logging
+from pathlib import Path
+from typing import List, Dict
+
+from datasets import load_dataset
+
+logger = logging.getLogger(__name__)
+
+MAP_LABEL_TRANSLATION = {  # ag_news integer label -> word written to the 's2s-' files
+    0: 'world',
+    1: 'sport',
+    2: 'business',
+    3: 'scitech'
+}
+
+
+def save_as_translations(original_save_path: Path, data_to_save: List[Dict]) -> None:
+    """Write data_to_save as JSON lines to 's2s-<name>' beside original_save_path.
+
+    Integer labels are written as their MAP_LABEL_TRANSLATION words (KeyError if
+    unknown); the input records themselves are left untouched.
+    """
+    file_path = original_save_path.parent / ('s2s-' + original_save_path.name)
+    print(f'Saving into: {file_path}')
+    with open(file_path, 'wt', encoding='utf-8') as f_write:
+        for data_line in data_to_save:
+            translated = {**data_line, 'label': MAP_LABEL_TRANSLATION[data_line['label']]}
+            f_write.write(f'{json.dumps(translated)}\n')
+
+
+def main() -> None:
+    """Download AG News and write train/valid/test JSON-lines files under data/.
+
+    The 'train' split is saved as train.json. The 'test' split is grouped by
+    label and every group is cut in half: the first halves become valid.json
+    and the second halves become test.json, so both keep the same class mix.
+    Records whose label is not a MAP_LABEL_TRANSLATION key are left out of
+    valid/test. Every file is also passed to save_as_translations, which
+    writes its 's2s-' counterpart.
+    """
+    loaded_data = load_dataset('ag_news')
+    logger.info(f'Loaded dataset ag_news: {loaded_data}')
+
+    save_path = Path('data/')
+    save_train_path = save_path / 'train.json'
+    save_valid_path = save_path / 'valid.json'
+    save_test_path = save_path / 'test.json'
+    # exist_ok replaces the separate exists() check and its race window.
+    save_path.mkdir(parents=True, exist_ok=True)
+
+    def to_records(split) -> List[Dict]:
+        """Reduce each row of a split to its integer label and its text."""
+        return [
+            {'label': int(row['label']), 'text': row['text']}
+            for row in split
+        ]
+
+    data_train = to_records(loaded_data['train'])
+    held_out = to_records(loaded_data['test'])
+    logger.info(f'Train: {len(data_train):6d}')
+
+    # Bucket the held-out records per class, keyed in label-id order so the
+    # concatenation below keeps the world/sport/business/scitech ordering.
+    per_class: Dict[int, List[Dict]] = {
+        label: [] for label in MAP_LABEL_TRANSLATION
+    }
+    for record in held_out:
+        bucket = per_class.get(record['label'])
+        if bucket is not None:
+            bucket.append(record)
+
+    # No logging handler is configured in this file, so the per-class counts
+    # are printed as well as logged.
+    for label, bucket in per_class.items():
+        class_name = MAP_LABEL_TRANSLATION[label].capitalize()
+        logger.info(f'{class_name}: {len(bucket):6d}')
+        print(f'{class_name}: {len(bucket):6d}')
+
+    # Halve every class so valid and test end up with the same class balance.
+    data_valid: List[Dict] = []
+    data_test: List[Dict] = []
+    for bucket in per_class.values():
+        half = len(bucket) // 2
+        data_valid.extend(bucket[:half])
+        data_test.extend(bucket[half:])
+
+    # Report the sizes only now: before the split data_test would still be
+    # empty and data_valid would still be the whole held-out set.
+    logger.info(f'Valid: {len(data_valid):6d}')
+    logger.info(f'Test : {len(data_test):6d}')
+
+    # Save each split as JSON lines, one record per line.
+    for file_path, data_to_save in [
+        (save_train_path, data_train),
+        (save_valid_path, data_valid),
+        (save_test_path, data_test),
+    ]:
+        print(f'Saving into: {file_path}')
+        with open(file_path, 'wt', encoding='utf-8') as f_write:
+            f_write.writelines(
+                f'{json.dumps(data_line)}\n' for data_line in data_to_save
+            )
+
+        # Keep this call after the plain dump: it receives the same record
+        # objects, so anything it changed on them would leak into the file
+        # written above.
+        save_as_translations(file_path, data_to_save)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/projektV2.ipynb b/projektV2.ipynb
new file mode 100644
index 0000000..7450a9b
--- /dev/null
+++ b/projektV2.ipynb
@@ -0,0 +1,7335 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install transformers torch datasets evaluate scikit-learn sacremoses sentencepiece ipywidgets > /dev/null"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Roberta"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Modifications"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Custom classification head with a wider hidden layer (hidden_size -> 4 * hidden_size -> hidden_size)\n",
+    "- Changed the classification head's activation function to GELU"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch import nn\n",
+    "from transformers import RobertaForSequenceClassification, RobertaModel\n",
+    "\n",
+    "\n",
+    "# Simple version #\n",
+    "\n",
+    "class RobertaClassificationHeadCustomSimple(nn.Module):\n",
+    "    \"\"\"Head for sentence-level classification tasks.\"\"\"\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__()\n",
+    "        hidden_size = config.hidden_size\n",
+    "        self.dense_1 = nn.Linear(hidden_size, 4 * hidden_size)\n",
+    "        self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)\n",
+    "        classifier_dropout = (\n",
+    "            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob\n",
+    "        )\n",
+    "        self.dropout = nn.Dropout(classifier_dropout)\n",
+    "        self.out_proj = nn.Linear(hidden_size, config.num_labels)\n",
+    "        self.activation = nn.GELU()\n",
+    "\n",
+    "    def forward(self, features, **kwargs):\n",
+    "        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])\n",
+    "\n",
+    "        x = self.dense_1(x)\n",
+    "        x = self.activation(x)\n",
+    "        x = self.dropout(x)\n",
+    "\n",
+    "        x = self.dense_2(x)\n",
+    "        x = self.activation(x)\n",
+    "        x = self.dropout(x)\n",
+    "\n",
+    "        x = self.out_proj(x)\n",
+    "        return x\n",
+    "\n",
+    "\n",
+    "class RobertaForSequenceClassificationCustomSimple(RobertaForSequenceClassification):\n",
+    "    _keys_to_ignore_on_load_missing = [r\"position_ids\"]\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        self.num_labels = config.num_labels\n",
+    "        self.config = config\n",
+    "\n",
+    "        self.roberta = RobertaModel(config, add_pooling_layer=False)\n",
+    "        self.classifier = RobertaClassificationHeadCustomSimple(config)\n",
+    "\n",
+    "        # Initialize weights and apply final processing\n",
+    "        self.post_init()\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
+      "- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.weight', 'classifier.out_proj.bias', 'classifier.dense_2.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_1.bias']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "RobertaForSequenceClassificationCustomSimple(\n",
+       "  (roberta): RobertaModel(\n",
+       "    (embeddings): RobertaEmbeddings(\n",
+       "      (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
+       "      (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
+       "      (token_type_embeddings): Embedding(1, 768)\n",
+       "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "      (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    )\n",
+       "    (encoder): RobertaEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (1): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (2): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (3): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (4): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (5): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (6): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (7): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (8): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (9): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (10): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "        (11): RobertaLayer(\n",
+       "          (attention): RobertaAttention(\n",
+       "            (self): RobertaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): RobertaSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): RobertaIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): RobertaOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       "  (classifier): RobertaClassificationHeadCustomSimple(\n",
+       "    (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "    (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    (out_proj): Linear(in_features=768, out_features=2, bias=True)\n",
+       "    (activation): GELU(approximate='none')\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "RobertaForSequenceClassificationCustomSimple.from_pretrained(\"roberta-base\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "02/16/2023 15:21:14 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
+      "02/16/2023 15:21:14 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
+      "_n_gpu=1,\n",
+      "adafactor=False,\n",
+      "adam_beta1=0.9,\n",
+      "adam_beta2=0.999,\n",
+      "adam_epsilon=1e-08,\n",
+      "auto_find_batch_size=False,\n",
+      "bf16=False,\n",
+      "bf16_full_eval=False,\n",
+      "data_seed=None,\n",
+      "dataloader_drop_last=False,\n",
+      "dataloader_num_workers=0,\n",
+      "dataloader_pin_memory=True,\n",
+      "ddp_bucket_cap_mb=None,\n",
+      "ddp_find_unused_parameters=None,\n",
+      "ddp_timeout=1800,\n",
+      "debug=[],\n",
+      "deepspeed=None,\n",
+      "disable_tqdm=False,\n",
+      "do_eval=True,\n",
+      "do_predict=True,\n",
+      "do_train=True,\n",
+      "eval_accumulation_steps=None,\n",
+      "eval_delay=0,\n",
+      "eval_steps=250,\n",
+      "evaluation_strategy=steps,\n",
+      "fp16=False,\n",
+      "fp16_backend=auto,\n",
+      "fp16_full_eval=False,\n",
+      "fp16_opt_level=O1,\n",
+      "fsdp=[],\n",
+      "fsdp_min_num_params=0,\n",
+      "fsdp_transformer_layer_cls_to_wrap=None,\n",
+      "full_determinism=False,\n",
+      "gradient_accumulation_steps=1,\n",
+      "gradient_checkpointing=False,\n",
+      "greater_is_better=True,\n",
+      "group_by_length=False,\n",
+      "half_precision_backend=auto,\n",
+      "hub_model_id=None,\n",
+      "hub_private_repo=False,\n",
+      "hub_strategy=every_save,\n",
+      "hub_token=<HUB_TOKEN>,\n",
+      "ignore_data_skip=False,\n",
+      "include_inputs_for_metrics=False,\n",
+      "jit_mode_eval=False,\n",
+      "label_names=None,\n",
+      "label_smoothing_factor=0.0,\n",
+      "learning_rate=2e-05,\n",
+      "length_column_name=length,\n",
+      "load_best_model_at_end=True,\n",
+      "local_rank=-1,\n",
+      "log_level=passive,\n",
+      "log_level_replica=passive,\n",
+      "log_on_each_node=True,\n",
+      "logging_dir=out/roberta/runs/Feb16_15-21-13_DESKTOP-R7JO8BQ,\n",
+      "logging_first_step=False,\n",
+      "logging_nan_inf_filter=True,\n",
+      "logging_steps=100,\n",
+      "logging_strategy=steps,\n",
+      "lr_scheduler_type=linear,\n",
+      "max_grad_norm=1.0,\n",
+      "max_steps=2500,\n",
+      "metric_for_best_model=accuracy,\n",
+      "mp_parameters=,\n",
+      "no_cuda=False,\n",
+      "num_train_epochs=1.0,\n",
+      "optim=adamw_hf,\n",
+      "optim_args=None,\n",
+      "output_dir=out/roberta,\n",
+      "overwrite_output_dir=False,\n",
+      "past_index=-1,\n",
+      "per_device_eval_batch_size=8,\n",
+      "per_device_train_batch_size=8,\n",
+      "prediction_loss_only=False,\n",
+      "push_to_hub=False,\n",
+      "push_to_hub_model_id=None,\n",
+      "push_to_hub_organization=None,\n",
+      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+      "ray_scope=last,\n",
+      "remove_unused_columns=True,\n",
+      "report_to=[],\n",
+      "resume_from_checkpoint=None,\n",
+      "run_name=out/roberta,\n",
+      "save_on_each_node=False,\n",
+      "save_steps=250,\n",
+      "save_strategy=steps,\n",
+      "save_total_limit=5,\n",
+      "seed=42,\n",
+      "sharded_ddp=[],\n",
+      "skip_memory_metrics=True,\n",
+      "tf32=None,\n",
+      "torch_compile=False,\n",
+      "torch_compile_backend=None,\n",
+      "torch_compile_mode=None,\n",
+      "torchdynamo=None,\n",
+      "tpu_metrics_debug=False,\n",
+      "tpu_num_cores=None,\n",
+      "use_ipex=False,\n",
+      "use_legacy_prediction_loop=False,\n",
+      "use_mps_device=False,\n",
+      "warmup_ratio=0.0,\n",
+      "warmup_steps=0,\n",
+      "weight_decay=0.0,\n",
+      "xpu_backend=None,\n",
+      ")\n",
+      "02/16/2023 15:21:14 - INFO - __main__ - Checkpoint detected, resuming training at out/roberta/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n",
+      "02/16/2023 15:21:14 - INFO - __main__ - load a local file for train: data/train.json\n",
+      "02/16/2023 15:21:14 - INFO - __main__ - load a local file for validation: data/valid.json\n",
+      "02/16/2023 15:21:14 - INFO - __main__ - load a local file for test: data/test.json\n",
+      "02/16/2023 15:21:14 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
+      "02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
+      "02/16/2023 15:21:14 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
+      "02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "02/16/2023 15:21:14 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
+      "02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 48.00it/s]\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:21:15,174 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:21:15,175 >> Model config RobertaConfig {\n",
+      "  \"_name_or_path\": \"roberta-base\",\n",
+      "  \"architectures\": [\n",
+      "    \"RobertaForMaskedLM\"\n",
+      "  ],\n",
+      "  \"attention_probs_dropout_prob\": 0.1,\n",
+      "  \"bos_token_id\": 0,\n",
+      "  \"classifier_dropout\": null,\n",
+      "  \"eos_token_id\": 2,\n",
+      "  \"hidden_act\": \"gelu\",\n",
+      "  \"hidden_dropout_prob\": 0.1,\n",
+      "  \"hidden_size\": 768,\n",
+      "  \"id2label\": {\n",
+      "    \"0\": \"LABEL_0\",\n",
+      "    \"1\": \"LABEL_1\",\n",
+      "    \"2\": \"LABEL_2\",\n",
+      "    \"3\": \"LABEL_3\"\n",
+      "  },\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 3072,\n",
+      "  \"label2id\": {\n",
+      "    \"LABEL_0\": 0,\n",
+      "    \"LABEL_1\": 1,\n",
+      "    \"LABEL_2\": 2,\n",
+      "    \"LABEL_3\": 3\n",
+      "  },\n",
+      "  \"layer_norm_eps\": 1e-05,\n",
+      "  \"max_position_embeddings\": 514,\n",
+      "  \"model_type\": \"roberta\",\n",
+      "  \"num_attention_heads\": 12,\n",
+      "  \"num_hidden_layers\": 12,\n",
+      "  \"pad_token_id\": 1,\n",
+      "  \"position_embedding_type\": \"absolute\",\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"type_vocab_size\": 1,\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 50265\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_auto.py:458] 2023-02-16 15:21:15,654 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:21:16,123 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:21:16,123 >> Model config RobertaConfig {\n",
+      "  \"_name_or_path\": \"roberta-base\",\n",
+      "  \"architectures\": [\n",
+      "    \"RobertaForMaskedLM\"\n",
+      "  ],\n",
+      "  \"attention_probs_dropout_prob\": 0.1,\n",
+      "  \"bos_token_id\": 0,\n",
+      "  \"classifier_dropout\": null,\n",
+      "  \"eos_token_id\": 2,\n",
+      "  \"hidden_act\": \"gelu\",\n",
+      "  \"hidden_dropout_prob\": 0.1,\n",
+      "  \"hidden_size\": 768,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 3072,\n",
+      "  \"layer_norm_eps\": 1e-05,\n",
+      "  \"max_position_embeddings\": 514,\n",
+      "  \"model_type\": \"roberta\",\n",
+      "  \"num_attention_heads\": 12,\n",
+      "  \"num_hidden_layers\": 12,\n",
+      "  \"pad_token_id\": 1,\n",
+      "  \"position_embedding_type\": \"absolute\",\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"type_vocab_size\": 1,\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 50265\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file vocab.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file merges.txt from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file added_tokens.json from cache at None\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file special_tokens_map.json from cache at None\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer_config.json from cache at None\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:21:17,045 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:21:17,046 >> Model config RobertaConfig {\n",
+      "  \"_name_or_path\": \"roberta-base\",\n",
+      "  \"architectures\": [\n",
+      "    \"RobertaForMaskedLM\"\n",
+      "  ],\n",
+      "  \"attention_probs_dropout_prob\": 0.1,\n",
+      "  \"bos_token_id\": 0,\n",
+      "  \"classifier_dropout\": null,\n",
+      "  \"eos_token_id\": 2,\n",
+      "  \"hidden_act\": \"gelu\",\n",
+      "  \"hidden_dropout_prob\": 0.1,\n",
+      "  \"hidden_size\": 768,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 3072,\n",
+      "  \"layer_norm_eps\": 1e-05,\n",
+      "  \"max_position_embeddings\": 514,\n",
+      "  \"model_type\": \"roberta\",\n",
+      "  \"num_attention_heads\": 12,\n",
+      "  \"num_hidden_layers\": 12,\n",
+      "  \"pad_token_id\": 1,\n",
+      "  \"position_embedding_type\": \"absolute\",\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"type_vocab_size\": 1,\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 50265\n",
+      "}\n",
+      "\n",
+      "02/16/2023 15:21:17 - INFO - __main__ - Using hidden states in model: False\n",
+      "-------------------------------------------------------- Using hidden: False\n",
+      "02/16/2023 15:21:17 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple\n",
+      "[INFO|modeling_utils.py:2275] 2023-02-16 15:21:17,101 >> loading weights file pytorch_model.bin from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin\n",
+      "[WARNING|modeling_utils.py:2847] 2023-02-16 15:21:22,965 >> Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']\n",
+      "- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "[WARNING|modeling_utils.py:2859] 2023-02-16 15:21:22,965 >> Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_2.bias', 'classifier.out_proj.bias', 'classifier.dense_1.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "RobertaForSequenceClassificationCustomSimple(\n",
+      "  (roberta): RobertaModel(\n",
+      "    (embeddings): RobertaEmbeddings(\n",
+      "      (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
+      "      (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
+      "      (token_type_embeddings): Embedding(1, 768)\n",
+      "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "      (dropout): Dropout(p=0.1, inplace=False)\n",
+      "    )\n",
+      "    (encoder): RobertaEncoder(\n",
+      "      (layer): ModuleList(\n",
+      "        (0): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (1): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (2): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (3): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (4): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (5): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (6): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (7): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (8): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (9): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (10): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (11): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (classifier): RobertaClassificationHeadCustomSimple(\n",
+      "    (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "    (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "    (out_proj): Linear(in_features=768, out_features=4, bias=True)\n",
+      "    (activation): GELU(approximate='none')\n",
+      "  )\n",
+      ")\n",
+      "02/16/2023 15:21:22 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-204a6dc6fcae3352.arrow\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 15:21:23 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-9091129e58fb62d5.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.86ba/s]\n",
+      "02/16/2023 15:21:23 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bdfe4224bf4c9f20.arrow\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 0-class\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 1-class\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 2-class\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 3-class\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Sample 83810 of the training set: {'label': 0, 'text': \"Policeman 'saw fatal train crash' An off-duty policeman watched a train plough into a car on a level crossing  in Berkshire, killing six people.\", 'input_ids': [0, 510, 12589, 5649, 128, 35349, 6484, 2341, 2058, 108, 660, 160, 12, 15593, 20976, 3996, 10, 2341, 2968, 4894, 88, 10, 512, 15, 10, 672, 6724, 1437, 11, 16563, 6, 2429, 411, 82, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Sample 14592 of the training set: {'label': 1, 'text': 'Silver finale for USA In the last event of the 2004 Olympic Games, the United States track team produced one last surprise. Meb Keflezighi, a native of Eritrea who moved to the United States as ', 'input_ids': [0, 39008, 7712, 13, 2805, 96, 5, 94, 515, 9, 5, 4482, 3336, 3100, 6, 5, 315, 532, 1349, 165, 2622, 65, 94, 2755, 4, 256, 3209, 229, 4550, 23250, 8774, 118, 6, 10, 3763, 9, 24372, 9891, 54, 1410, 7, 5, 315, 532, 25, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n",
+      "02/16/2023 15:21:23 - INFO - __main__ - Sample 3278 of the training set: {'label': 3, 'text': 'Compuware Blasts IBM #39;s Legal Tactics Two years ago, IBM was ordered to produce the source code for its products, which Compuware identified as containing its pirated intellectual property. The code was missing. But lo and behold -- last week, they called and said they had it, quot; ...', 'input_ids': [0, 24699, 257, 10680, 2091, 13651, 11510, 849, 3416, 131, 29, 10661, 45689, 1596, 107, 536, 6, 11510, 21, 2740, 7, 2592, 5, 1300, 3260, 13, 63, 785, 6, 61, 10081, 257, 10680, 2006, 25, 8200, 63, 36287, 1070, 9594, 1038, 4, 20, 3260, 21, 1716, 4, 125, 4600, 8, 29308, 480, 94, 186, 6, 51, 373, 8, 26, 51, 56, 24, 6, 39809, 131, 1666, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n",
+      "[INFO|trainer.py:511] 2023-02-16 15:21:27,576 >> max_steps is given, it will override any value given in num_train_epochs\n",
+      "[INFO|trainer.py:1972] 2023-02-16 15:21:27,576 >> Loading model from out/roberta/checkpoint-2500.\n",
+      "[INFO|trainer.py:710] 2023-02-16 15:21:29,498 >> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`,  you can safely ignore this message.\n",
+      "/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n",
+      "[INFO|trainer.py:1650] 2023-02-16 15:21:31,949 >> ***** Running training *****\n",
+      "[INFO|trainer.py:1651] 2023-02-16 15:21:31,950 >>   Num examples = 120000\n",
+      "[INFO|trainer.py:1652] 2023-02-16 15:21:31,950 >>   Num Epochs = 1\n",
+      "[INFO|trainer.py:1653] 2023-02-16 15:21:31,950 >>   Instantaneous batch size per device = 8\n",
+      "[INFO|trainer.py:1654] 2023-02-16 15:21:31,950 >>   Total train batch size (w. parallel, distributed & accumulation) = 8\n",
+      "[INFO|trainer.py:1655] 2023-02-16 15:21:31,950 >>   Gradient Accumulation steps = 1\n",
+      "[INFO|trainer.py:1656] 2023-02-16 15:21:31,950 >>   Total optimization steps = 2500\n",
+      "[INFO|trainer.py:1657] 2023-02-16 15:21:31,951 >>   Number of trainable parameters = 128780548\n",
+      "[INFO|trainer.py:1679] 2023-02-16 15:21:31,951 >>   Continuing training from checkpoint, will skip to saved global_step\n",
+      "[INFO|trainer.py:1680] 2023-02-16 15:21:31,951 >>   Continuing training from epoch 0\n",
+      "[INFO|trainer.py:1681] 2023-02-16 15:21:31,951 >>   Continuing training from global step 2500\n",
+      "[INFO|trainer.py:1683] 2023-02-16 15:21:31,951 >>   Will skip the first 0 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.\n",
+      "Skipping the first batches:   0%|                      | 0/2500 [00:00<?, ?it/s]\n",
+      "Skipping the first batches: 100%|██████████| 2500/2500 [00:03<00:00, 717.10it/s]\u001b[A\n",
+      "\n",
+      "2501it [00:04, 522.91it/s]                                                      \u001b[A[INFO|trainer.py:1901] 2023-02-16 15:21:36,738 >> \n",
+      "\n",
+      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
+      "\n",
+      "\n",
+      "[INFO|trainer.py:2025] 2023-02-16 15:21:36,738 >> Loading best model from out/roberta/checkpoint-2500 (score: 0.9229999780654907).\n",
+      "\n",
+      "\u001b[A{'train_runtime': 5.7972, 'train_samples_per_second': 3449.95, 'train_steps_per_second': 431.244, 'train_loss': 3.2215512862971954e-06, 'epoch': 0.17}\n",
+      "\n",
+      "2501it [00:05, 431.57it/s]\u001b[A\n",
+      "[INFO|trainer.py:2709] 2023-02-16 15:21:37,750 >> Saving model checkpoint to out/roberta\n",
+      "[INFO|configuration_utils.py:453] 2023-02-16 15:21:37,751 >> Configuration saved in out/roberta/config.json\n",
+      "[INFO|modeling_utils.py:1704] 2023-02-16 15:21:38,719 >> Model weights saved in out/roberta/pytorch_model.bin\n",
+      "[INFO|tokenization_utils_base.py:2160] 2023-02-16 15:21:38,742 >> tokenizer config file saved in out/roberta/tokenizer_config.json\n",
+      "[INFO|tokenization_utils_base.py:2167] 2023-02-16 15:21:38,743 >> Special tokens file saved in out/roberta/special_tokens_map.json\n",
+      "***** train metrics *****\n",
+      "  epoch                    =       0.17\n",
+      "  train_loss               =        0.0\n",
+      "  train_runtime            = 0:00:05.79\n",
+      "  train_samples            =     120000\n",
+      "  train_samples_per_second =    3449.95\n",
+      "  train_steps_per_second   =    431.244\n",
+      "02/16/2023 15:21:38 - INFO - __main__ - *** Evaluate ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 15:21:38,862 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 15:21:38,863 >> ***** Running Evaluation *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 15:21:38,863 >>   Num examples = 2000\n",
+      "[INFO|trainer.py:2969] 2023-02-16 15:21:38,863 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 250/250 [00:16<00:00, 14.75it/s]\n",
+      "***** eval metrics *****\n",
+      "  epoch                   =       0.17\n",
+      "  eval_accuracy           =      0.923\n",
+      "  eval_loss               =      0.296\n",
+      "  eval_runtime            = 0:00:17.06\n",
+      "  eval_samples            =       2000\n",
+      "  eval_samples_per_second =    117.168\n",
+      "  eval_steps_per_second   =     14.646\n",
+      "02/16/2023 15:21:55 - INFO - __main__ - *** Predict ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 15:21:55,934 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 15:21:55,935 >> ***** Running Prediction *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 15:21:55,935 >>   Num examples = 3800\n",
+      "[INFO|trainer.py:2969] 2023-02-16 15:21:55,935 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.74it/s]\n",
+      "02/16/2023 15:22:28 - INFO - __main__ - ***** Predict results None *****\n",
+      "[INFO|modelcard.py:449] 2023-02-16 15:22:28,796 >> Dropping the following result as it does not have all the necessary fields:\n",
+      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9229999780654907}]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python run_glue.py \\\n",
+    "  --cache_dir .cache_training \\\n",
+    "  --model_name_or_path roberta-base \\\n",
+    "  --custom_model roberta_simple \\\n",
+    "  --train_file data/train.json  \\\n",
+    "  --validation_file data/valid.json \\\n",
+    "  --test_file data/test.json \\\n",
+    "  --per_device_train_batch_size 8 \\\n",
+    "  --per_device_eval_batch_size 8 \\\n",
+    "  --do_train \\\n",
+    "  --do_eval \\\n",
+    "  --do_predict \\\n",
+    "  --max_seq_length 128 \\\n",
+    "  --learning_rate 2e-5 \\\n",
+    "  --max_eval_samples 2000 \\\n",
+    "  --max_steps 2500 \\\n",
+    "  --num_train_epochs 1 \\\n",
+    "  --save_strategy steps \\\n",
+    "  --save_steps 250 \\\n",
+    "  --save_total_limit 5 \\\n",
+    "  --logging_strategy steps \\\n",
+    "  --logging_steps 100 \\\n",
+    "  --eval_steps 250 \\\n",
+    "  --evaluation_strategy steps \\\n",
+    "  --metric_for_best_model accuracy \\\n",
+    "  --greater_is_better True \\\n",
+    "  --load_best_model_at_end True \\\n",
+    "  --output_dir out/roberta"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "02/16/2023 16:46:49 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
+      "02/16/2023 16:46:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
+      "_n_gpu=1,\n",
+      "adafactor=False,\n",
+      "adam_beta1=0.9,\n",
+      "adam_beta2=0.999,\n",
+      "adam_epsilon=1e-08,\n",
+      "auto_find_batch_size=False,\n",
+      "bf16=False,\n",
+      "bf16_full_eval=False,\n",
+      "data_seed=None,\n",
+      "dataloader_drop_last=False,\n",
+      "dataloader_num_workers=0,\n",
+      "dataloader_pin_memory=True,\n",
+      "ddp_bucket_cap_mb=None,\n",
+      "ddp_find_unused_parameters=None,\n",
+      "ddp_timeout=1800,\n",
+      "debug=[],\n",
+      "deepspeed=None,\n",
+      "disable_tqdm=False,\n",
+      "do_eval=True,\n",
+      "do_predict=True,\n",
+      "do_train=False,\n",
+      "eval_accumulation_steps=None,\n",
+      "eval_delay=0,\n",
+      "eval_steps=250,\n",
+      "evaluation_strategy=steps,\n",
+      "fp16=False,\n",
+      "fp16_backend=auto,\n",
+      "fp16_full_eval=False,\n",
+      "fp16_opt_level=O1,\n",
+      "fsdp=[],\n",
+      "fsdp_min_num_params=0,\n",
+      "fsdp_transformer_layer_cls_to_wrap=None,\n",
+      "full_determinism=False,\n",
+      "gradient_accumulation_steps=1,\n",
+      "gradient_checkpointing=False,\n",
+      "greater_is_better=True,\n",
+      "group_by_length=False,\n",
+      "half_precision_backend=auto,\n",
+      "hub_model_id=None,\n",
+      "hub_private_repo=False,\n",
+      "hub_strategy=every_save,\n",
+      "hub_token=<HUB_TOKEN>,\n",
+      "ignore_data_skip=False,\n",
+      "include_inputs_for_metrics=False,\n",
+      "jit_mode_eval=False,\n",
+      "label_names=None,\n",
+      "label_smoothing_factor=0.0,\n",
+      "learning_rate=2e-05,\n",
+      "length_column_name=length,\n",
+      "load_best_model_at_end=True,\n",
+      "local_rank=-1,\n",
+      "log_level=passive,\n",
+      "log_level_replica=passive,\n",
+      "log_on_each_node=True,\n",
+      "logging_dir=out/roberta_results/runs/Feb16_16-46-48_DESKTOP-R7JO8BQ,\n",
+      "logging_first_step=False,\n",
+      "logging_nan_inf_filter=True,\n",
+      "logging_steps=100,\n",
+      "logging_strategy=steps,\n",
+      "lr_scheduler_type=linear,\n",
+      "max_grad_norm=1.0,\n",
+      "max_steps=2500,\n",
+      "metric_for_best_model=accuracy,\n",
+      "mp_parameters=,\n",
+      "no_cuda=False,\n",
+      "num_train_epochs=1.0,\n",
+      "optim=adamw_hf,\n",
+      "optim_args=None,\n",
+      "output_dir=out/roberta_results,\n",
+      "overwrite_output_dir=False,\n",
+      "past_index=-1,\n",
+      "per_device_eval_batch_size=8,\n",
+      "per_device_train_batch_size=8,\n",
+      "prediction_loss_only=False,\n",
+      "push_to_hub=False,\n",
+      "push_to_hub_model_id=None,\n",
+      "push_to_hub_organization=None,\n",
+      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+      "ray_scope=last,\n",
+      "remove_unused_columns=True,\n",
+      "report_to=[],\n",
+      "resume_from_checkpoint=None,\n",
+      "run_name=out/roberta_results,\n",
+      "save_on_each_node=False,\n",
+      "save_steps=250,\n",
+      "save_strategy=steps,\n",
+      "save_total_limit=5,\n",
+      "seed=42,\n",
+      "sharded_ddp=[],\n",
+      "skip_memory_metrics=True,\n",
+      "tf32=None,\n",
+      "torch_compile=False,\n",
+      "torch_compile_backend=None,\n",
+      "torch_compile_mode=None,\n",
+      "torchdynamo=None,\n",
+      "tpu_metrics_debug=False,\n",
+      "tpu_num_cores=None,\n",
+      "use_ipex=False,\n",
+      "use_legacy_prediction_loop=False,\n",
+      "use_mps_device=False,\n",
+      "warmup_ratio=0.0,\n",
+      "warmup_steps=0,\n",
+      "weight_decay=0.0,\n",
+      "xpu_backend=None,\n",
+      ")\n",
+      "02/16/2023 16:46:49 - INFO - __main__ - load a local file for train: data/train.json\n",
+      "02/16/2023 16:46:49 - INFO - __main__ - load a local file for validation: data/valid.json\n",
+      "02/16/2023 16:46:49 - INFO - __main__ - load a local file for test: data/test.json\n",
+      "02/16/2023 16:46:50 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
+      "02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
+      "02/16/2023 16:46:50 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
+      "02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "02/16/2023 16:46:50 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
+      "02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 752.21it/s]\n",
+      "[INFO|configuration_utils.py:658] 2023-02-16 16:46:50,276 >> loading configuration file out/roberta/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 16:46:50,277 >> Model config RobertaConfig {\n",
+      "  \"_name_or_path\": \"out/roberta\",\n",
+      "  \"architectures\": [\n",
+      "    \"RobertaForSequenceClassificationCustomSimple\"\n",
+      "  ],\n",
+      "  \"attention_probs_dropout_prob\": 0.1,\n",
+      "  \"bos_token_id\": 0,\n",
+      "  \"classifier_dropout\": null,\n",
+      "  \"eos_token_id\": 2,\n",
+      "  \"hidden_act\": \"gelu\",\n",
+      "  \"hidden_dropout_prob\": 0.1,\n",
+      "  \"hidden_size\": 768,\n",
+      "  \"id2label\": {\n",
+      "    \"0\": 0,\n",
+      "    \"1\": 1,\n",
+      "    \"2\": 2,\n",
+      "    \"3\": 3\n",
+      "  },\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 3072,\n",
+      "  \"label2id\": {\n",
+      "    \"0\": 0,\n",
+      "    \"1\": 1,\n",
+      "    \"2\": 2,\n",
+      "    \"3\": 3\n",
+      "  },\n",
+      "  \"layer_norm_eps\": 1e-05,\n",
+      "  \"max_position_embeddings\": 514,\n",
+      "  \"model_type\": \"roberta\",\n",
+      "  \"num_attention_heads\": 12,\n",
+      "  \"num_hidden_layers\": 12,\n",
+      "  \"pad_token_id\": 1,\n",
+      "  \"position_embedding_type\": \"absolute\",\n",
+      "  \"problem_type\": \"single_label_classification\",\n",
+      "  \"torch_dtype\": \"float32\",\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"type_vocab_size\": 1,\n",
+      "  \"use_cache\": true,\n",
+      "  \"use_hidden_states\": false,\n",
+      "  \"vocab_size\": 50265\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file vocab.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file merges.txt\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file added_tokens.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file special_tokens_map.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer_config.json\n",
+      "02/16/2023 16:46:50 - INFO - __main__ - Using hidden states in model: False\n",
+      "-------------------------------------------------------- Using hidden: False\n",
+      "02/16/2023 16:46:50 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple\n",
+      "[INFO|modeling_utils.py:2272] 2023-02-16 16:46:50,339 >> loading weights file out/roberta/pytorch_model.bin\n",
+      "[INFO|modeling_utils.py:2857] 2023-02-16 16:46:52,079 >> All model checkpoint weights were used when initializing RobertaForSequenceClassificationCustomSimple.\n",
+      "\n",
+      "[INFO|modeling_utils.py:2865] 2023-02-16 16:46:52,079 >> All the weights of RobertaForSequenceClassificationCustomSimple were initialized from the model checkpoint at out/roberta.\n",
+      "If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassificationCustomSimple for predictions without further training.\n",
+      "RobertaForSequenceClassificationCustomSimple(\n",
+      "  (roberta): RobertaModel(\n",
+      "    (embeddings): RobertaEmbeddings(\n",
+      "      (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
+      "      (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
+      "      (token_type_embeddings): Embedding(1, 768)\n",
+      "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "      (dropout): Dropout(p=0.1, inplace=False)\n",
+      "    )\n",
+      "    (encoder): RobertaEncoder(\n",
+      "      (layer): ModuleList(\n",
+      "        (0): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (1): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (2): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (3): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (4): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (5): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (6): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (7): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (8): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (9): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (10): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (11): RobertaLayer(\n",
+      "          (attention): RobertaAttention(\n",
+      "            (self): RobertaSelfAttention(\n",
+      "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "            (output): RobertaSelfOutput(\n",
+      "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "            )\n",
+      "          )\n",
+      "          (intermediate): RobertaIntermediate(\n",
+      "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "            (intermediate_act_fn): GELUActivation()\n",
+      "          )\n",
+      "          (output): RobertaOutput(\n",
+      "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (classifier): RobertaClassificationHeadCustomSimple(\n",
+      "    (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n",
+      "    (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "    (out_proj): Linear(in_features=768, out_features=4, bias=True)\n",
+      "    (activation): GELU(approximate='none')\n",
+      "  )\n",
+      ")\n",
+      "02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-df96547ec55a44ce.arrow\n",
+      "02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-67b1030adaffbb4a.arrow\n",
+      "02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ae09252df5e9bac1.arrow\n",
+      "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 0-class\n",
+      "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 1-class\n",
+      "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 2-class\n",
+      "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 3-class\n",
+      "[INFO|trainer.py:511] 2023-02-16 16:46:55,346 >> max_steps is given, it will override any value given in num_train_epochs\n",
+      "02/16/2023 16:46:55 - INFO - __main__ - *** Evaluate ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 16:46:55,346 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 16:46:55,348 >> ***** Running Evaluation *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 16:46:55,348 >>   Num examples = 2000\n",
+      "[INFO|trainer.py:2969] 2023-02-16 16:46:55,348 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 250/250 [00:17<00:00, 14.53it/s]\n",
+      "***** eval metrics *****\n",
+      "  eval_accuracy           =      0.923\n",
+      "  eval_loss               =      0.296\n",
+      "  eval_runtime            = 0:00:17.81\n",
+      "  eval_samples            =       2000\n",
+      "  eval_samples_per_second =    112.255\n",
+      "  eval_steps_per_second   =     14.032\n",
+      "02/16/2023 16:47:13 - INFO - __main__ - *** Predict ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 16:47:13,166 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 16:47:13,167 >> ***** Running Prediction *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 16:47:13,167 >>   Num examples = 3800\n",
+      "[INFO|trainer.py:2969] 2023-02-16 16:47:13,167 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.53it/s]\n",
+      "02/16/2023 16:47:45 - INFO - __main__ - ***** Predict results None *****\n",
+      "[INFO|modelcard.py:449] 2023-02-16 16:47:46,438 >> Dropping the following result as it does not have all the necessary fields:\n",
+      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python run_glue.py \\\n",
+    "  --cache_dir .cache_training \\\n",
+    "  --model_name_or_path out/roberta \\\n",
+    "  --custom_model roberta_simple \\\n",
+    "  --train_file data/train.json  \\\n",
+    "  --validation_file data/valid.json \\\n",
+    "  --test_file data/test.json \\\n",
+    "  --per_device_train_batch_size 8 \\\n",
+    "  --per_device_eval_batch_size 8 \\\n",
+    "  --do_eval \\\n",
+    "  --do_predict \\\n",
+    "  --max_seq_length 128 \\\n",
+    "  --learning_rate 2e-5 \\\n",
+    "  --max_eval_samples 2000 \\\n",
+    "  --max_steps 2500 \\\n",
+    "  --num_train_epochs 1 \\\n",
+    "  --save_strategy steps \\\n",
+    "  --save_steps 250 \\\n",
+    "  --save_total_limit 5 \\\n",
+    "  --logging_strategy steps \\\n",
+    "  --logging_steps 100 \\\n",
+    "  --eval_steps 250 \\\n",
+    "  --evaluation_strategy steps \\\n",
+    "  --metric_for_best_model accuracy \\\n",
+    "  --greater_is_better True \\\n",
+    "  --load_best_model_at_end True \\\n",
+    "  --output_dir out/roberta_results"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[0;39m0.9229999780654907\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cat out/roberta_results/eval_results.json | jq .eval_accuracy"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# GPT2"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Modifications"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
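+    "- Custom classification head built from dense layers: two parallel input projections, one merging layer, and the output projection\n",
+    "- Using the hidden states from the last layer as an additional input to the head"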
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from torch import nn\n",
+    "from transformers import GPT2PreTrainedModel, GPT2Model\n",
+    "from transformers.modeling_outputs import SequenceClassifierOutputWithPast\n",
+    "\n",
+    "class GPT2ForSequenceClassification(GPT2PreTrainedModel):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        self.num_labels = config.num_labels\n",
+    "        self.transformer = GPT2Model(config)\n",
+    "        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)\n",
+    "\n",
+    "        # Model parallel\n",
+    "        self.model_parallel = False\n",
+    "        self.device_map = None\n",
+    "\n",
+    "        # Initialize weights and apply final processing\n",
+    "        self.post_init()\n",
+    "\n",
+    "\n",
+    "class GPT2ClassificationHeadCustom(nn.Module):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__()\n",
+    "        hidden_size = config.n_embd\n",
+    "        self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size)\n",
+    "        self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size)\n",
+    "        self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)\n",
+    "        self.dropout = nn.Dropout(config.resid_pdrop)\n",
+    "        self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False)\n",
+    "\n",
+    "    def forward(self, x, **kwargs):\n",
+    "        if 'hidden_states' in kwargs and kwargs['hidden_states'] is not None:\n",
+    "            hidden = kwargs['hidden_states'][-1]\n",
+    "        else:\n",
+    "            hidden = torch.zeros(x.size(), dtype=x.dtype, device=x.device)\n",
+    "\n",
+    "        x = self.dense_1_input(x)\n",
+    "        x = torch.relu(x)\n",
+    "        x = self.dropout(x)\n",
+    "\n",
+    "        hidden = self.dense_1_hidden(hidden)\n",
+    "        hidden = torch.relu(hidden)\n",
+    "        hidden = self.dropout(hidden)\n",
+    "\n",
+    "        x = torch.cat((x, hidden), dim=2)\n",
+    "        x = self.dense_2(x)\n",
+    "        x = torch.relu(x)\n",
+    "        x = self.dropout(x)\n",
+    "\n",
+    "        x = self.out_proj(x)\n",
+    "        return x\n",
+    "\n",
+    "class GPT2ForSequenceClassificationCustom(GPT2ForSequenceClassification):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        self.num_labels = config.num_labels\n",
+    "        self.transformer = GPT2Model(config)\n",
+    "        self.score = GPT2ClassificationHeadCustom(config)\n",
+    "\n",
+    "        self.init_weights()\n",
+    "\n",
+    "        # Model parallel\n",
+    "        self.model_parallel = False\n",
+    "        self.device_map = None\n",
+    "\n",
+    "    def forward(\n",
+    "        self,\n",
+    "        input_ids=None,\n",
+    "        past_key_values=None,\n",
+    "        attention_mask=None,\n",
+    "        token_type_ids=None,\n",
+    "        position_ids=None,\n",
+    "        head_mask=None,\n",
+    "        inputs_embeds=None,\n",
+    "        labels=None,\n",
+    "        use_cache=None,\n",
+    "        output_attentions=None,\n",
+    "        output_hidden_states=None,\n",
+    "        return_dict=None,\n",
+    "    ):\n",
+    "        r\"\"\"\n",
+    "        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):\n",
+    "            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,\n",
+    "            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),\n",
+    "            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n",
+    "        \"\"\"\n",
+    "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
+    "\n",
+    "        transformer_outputs = self.transformer(\n",
+    "            input_ids,\n",
+    "            past_key_values=past_key_values,\n",
+    "            attention_mask=attention_mask,\n",
+    "            token_type_ids=token_type_ids,\n",
+    "            position_ids=position_ids,\n",
+    "            head_mask=head_mask,\n",
+    "            inputs_embeds=inputs_embeds,\n",
+    "            use_cache=use_cache,\n",
+    "            output_attentions=output_attentions,\n",
+    "            output_hidden_states=output_hidden_states,\n",
+    "            return_dict=return_dict,\n",
+    "        )\n",
+    "        hidden_states = transformer_outputs[0]\n",
+    "        if return_dict:\n",
+    "            logits = self.score(hidden_states, hidden_states=transformer_outputs.hidden_states)\n",
+    "        else:\n",
+    "            raise NotImplementedError('Not implemented for using non-dictionary object')\n",
+    "\n",
+    "        if input_ids is not None:\n",
+    "            batch_size, sequence_length = input_ids.shape[:2]\n",
+    "        else:\n",
+    "            batch_size, sequence_length = inputs_embeds.shape[:2]\n",
+    "\n",
+    "        assert (\n",
+    "            self.config.pad_token_id is not None or batch_size == 1\n",
+    "        ), \"Cannot handle batch sizes > 1 if no padding token is defined.\"\n",
+    "        if self.config.pad_token_id is None:\n",
+    "            sequence_lengths = -1\n",
+    "        else:\n",
+    "            if input_ids is not None:\n",
+    "                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1\n",
+    "            else:\n",
+    "                sequence_lengths = -1\n",
+    "\n",
+    "        pooled_logits = logits[range(batch_size), sequence_lengths]\n",
+    "\n",
+    "        loss = None\n",
+    "        if labels is not None:\n",
+    "            if self.num_labels == 1:\n",
+    "                #  We are doing regression\n",
+    "                loss_fct = nn.MSELoss()\n",
+    "                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))\n",
+    "            else:\n",
+    "                loss_fct = nn.CrossEntropyLoss()\n",
+    "                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))\n",
+    "\n",
+    "        if not return_dict:\n",
+    "            output = (pooled_logits,) + transformer_outputs[1:]\n",
+    "            return ((loss,) + output) if loss is not None else output\n",
+    "\n",
+    "        return SequenceClassifierOutputWithPast(\n",
+    "            loss=loss,\n",
+    "            logits=pooled_logits,\n",
+    "            past_key_values=transformer_outputs.past_key_values,\n",
+    "            hidden_states=transformer_outputs.hidden_states,\n",
+    "            attentions=transformer_outputs.attentions,\n",
+    "        )"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4f980b257c2b453797f63ddc89c98923",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_1_input.bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.4.attn.masked_bias', 'h.3.attn.masked_bias', 'h.11.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'score.dense_1_hidden.weight', 'h.1.attn.masked_bias', 'h.0.attn.masked_bias', 'score.dense_1_input.weight', 'score.dense_1_hidden.bias', 'score.dense_2.bias']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "GPT2ForSequenceClassificationCustom(\n",
+       "  (transformer): GPT2Model(\n",
+       "    (wte): Embedding(50257, 768)\n",
+       "    (wpe): Embedding(1024, 768)\n",
+       "    (drop): Dropout(p=0.1, inplace=False)\n",
+       "    (h): ModuleList(\n",
+       "      (0): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (1): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (2): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (3): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (4): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (5): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (6): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (7): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (8): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (9): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (10): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "      (11): GPT2Block(\n",
+       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (attn): GPT2Attention(\n",
+       "          (c_attn): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): GPT2MLP(\n",
+       "          (c_fc): Conv1D()\n",
+       "          (c_proj): Conv1D()\n",
+       "          (act): NewGELUActivation()\n",
+       "          (dropout): Dropout(p=0.1, inplace=False)\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+       "  )\n",
+       "  (score): GPT2ClassificationHeadCustom(\n",
+       "    (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n",
+       "    (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n",
+       "    (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    (out_proj): Linear(in_features=768, out_features=2, bias=False)\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "GPT2ForSequenceClassificationCustom.from_pretrained('gpt2')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "02/16/2023 15:22:37 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
+      "02/16/2023 15:22:37 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
+      "_n_gpu=1,\n",
+      "adafactor=False,\n",
+      "adam_beta1=0.9,\n",
+      "adam_beta2=0.999,\n",
+      "adam_epsilon=1e-08,\n",
+      "auto_find_batch_size=False,\n",
+      "bf16=False,\n",
+      "bf16_full_eval=False,\n",
+      "data_seed=None,\n",
+      "dataloader_drop_last=False,\n",
+      "dataloader_num_workers=0,\n",
+      "dataloader_pin_memory=True,\n",
+      "ddp_bucket_cap_mb=None,\n",
+      "ddp_find_unused_parameters=None,\n",
+      "ddp_timeout=1800,\n",
+      "debug=[],\n",
+      "deepspeed=None,\n",
+      "disable_tqdm=False,\n",
+      "do_eval=True,\n",
+      "do_predict=False,\n",
+      "do_train=True,\n",
+      "eval_accumulation_steps=None,\n",
+      "eval_delay=0,\n",
+      "eval_steps=250,\n",
+      "evaluation_strategy=steps,\n",
+      "fp16=False,\n",
+      "fp16_backend=auto,\n",
+      "fp16_full_eval=False,\n",
+      "fp16_opt_level=O1,\n",
+      "fsdp=[],\n",
+      "fsdp_min_num_params=0,\n",
+      "fsdp_transformer_layer_cls_to_wrap=None,\n",
+      "full_determinism=False,\n",
+      "gradient_accumulation_steps=1,\n",
+      "gradient_checkpointing=False,\n",
+      "greater_is_better=True,\n",
+      "group_by_length=False,\n",
+      "half_precision_backend=auto,\n",
+      "hub_model_id=None,\n",
+      "hub_private_repo=False,\n",
+      "hub_strategy=every_save,\n",
+      "hub_token=<HUB_TOKEN>,\n",
+      "ignore_data_skip=False,\n",
+      "include_inputs_for_metrics=False,\n",
+      "jit_mode_eval=False,\n",
+      "label_names=None,\n",
+      "label_smoothing_factor=0.0,\n",
+      "learning_rate=2e-05,\n",
+      "length_column_name=length,\n",
+      "load_best_model_at_end=True,\n",
+      "local_rank=-1,\n",
+      "log_level=passive,\n",
+      "log_level_replica=passive,\n",
+      "log_on_each_node=True,\n",
+      "logging_dir=out/gpt2/runs/Feb16_15-22-36_DESKTOP-R7JO8BQ,\n",
+      "logging_first_step=False,\n",
+      "logging_nan_inf_filter=True,\n",
+      "logging_steps=100,\n",
+      "logging_strategy=steps,\n",
+      "lr_scheduler_type=linear,\n",
+      "max_grad_norm=1.0,\n",
+      "max_steps=2500,\n",
+      "metric_for_best_model=accuracy,\n",
+      "mp_parameters=,\n",
+      "no_cuda=False,\n",
+      "num_train_epochs=1.0,\n",
+      "optim=adamw_hf,\n",
+      "optim_args=None,\n",
+      "output_dir=out/gpt2,\n",
+      "overwrite_output_dir=False,\n",
+      "past_index=-1,\n",
+      "per_device_eval_batch_size=8,\n",
+      "per_device_train_batch_size=8,\n",
+      "prediction_loss_only=False,\n",
+      "push_to_hub=False,\n",
+      "push_to_hub_model_id=None,\n",
+      "push_to_hub_organization=None,\n",
+      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+      "ray_scope=last,\n",
+      "remove_unused_columns=True,\n",
+      "report_to=[],\n",
+      "resume_from_checkpoint=None,\n",
+      "run_name=out/gpt2,\n",
+      "save_on_each_node=False,\n",
+      "save_steps=250,\n",
+      "save_strategy=steps,\n",
+      "save_total_limit=5,\n",
+      "seed=42,\n",
+      "sharded_ddp=[],\n",
+      "skip_memory_metrics=True,\n",
+      "tf32=None,\n",
+      "torch_compile=False,\n",
+      "torch_compile_backend=None,\n",
+      "torch_compile_mode=None,\n",
+      "torchdynamo=None,\n",
+      "tpu_metrics_debug=False,\n",
+      "tpu_num_cores=None,\n",
+      "use_ipex=False,\n",
+      "use_legacy_prediction_loop=False,\n",
+      "use_mps_device=False,\n",
+      "warmup_ratio=0.0,\n",
+      "warmup_steps=0,\n",
+      "weight_decay=0.0,\n",
+      "xpu_backend=None,\n",
+      ")\n",
+      "02/16/2023 15:22:37 - INFO - __main__ - Checkpoint detected, resuming training at out/gpt2/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n",
+      "02/16/2023 15:22:37 - INFO - __main__ - load a local file for train: data/train.json\n",
+      "02/16/2023 15:22:37 - INFO - __main__ - load a local file for validation: data/valid.json\n",
+      "02/16/2023 15:22:37 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a\n",
+      "02/16/2023 15:22:37 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
+      "02/16/2023 15:22:37 - INFO - datasets.builder - Generating dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
+      "Downloading and preparing dataset json/default to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n",
+      "Downloading data files: 100%|██████████████████| 2/2 [00:00<00:00, 14820.86it/s]\n",
+      "02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Downloading took 0.0 min\n",
+      "02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min\n",
+      "Extracting data files: 100%|████████████████████| 2/2 [00:00<00:00, 2476.71it/s]\n",
+      "02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify checksums.\n",
+      "02/16/2023 15:22:37 - INFO - datasets.builder - Generating train split\n",
+      "02/16/2023 15:22:37 - INFO - datasets.builder - Generating validation split\n",
+      "02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.\n",
+      "Dataset json downloaded and prepared to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n",
+      "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 642.61it/s]\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:22:38,465 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:22:38,465 >> Model config GPT2Config {\n",
+      "  \"_name_or_path\": \"gpt2\",\n",
+      "  \"activation_function\": \"gelu_new\",\n",
+      "  \"architectures\": [\n",
+      "    \"GPT2LMHeadModel\"\n",
+      "  ],\n",
+      "  \"attn_pdrop\": 0.1,\n",
+      "  \"bos_token_id\": 50256,\n",
+      "  \"embd_pdrop\": 0.1,\n",
+      "  \"eos_token_id\": 50256,\n",
+      "  \"id2label\": {\n",
+      "    \"0\": \"LABEL_0\",\n",
+      "    \"1\": \"LABEL_1\",\n",
+      "    \"2\": \"LABEL_2\",\n",
+      "    \"3\": \"LABEL_3\"\n",
+      "  },\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"label2id\": {\n",
+      "    \"LABEL_0\": 0,\n",
+      "    \"LABEL_1\": 1,\n",
+      "    \"LABEL_2\": 2,\n",
+      "    \"LABEL_3\": 3\n",
+      "  },\n",
+      "  \"layer_norm_epsilon\": 1e-05,\n",
+      "  \"model_type\": \"gpt2\",\n",
+      "  \"n_ctx\": 1024,\n",
+      "  \"n_embd\": 768,\n",
+      "  \"n_head\": 12,\n",
+      "  \"n_inner\": null,\n",
+      "  \"n_layer\": 12,\n",
+      "  \"n_positions\": 1024,\n",
+      "  \"reorder_and_upcast_attn\": false,\n",
+      "  \"resid_pdrop\": 0.1,\n",
+      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
+      "  \"scale_attn_weights\": true,\n",
+      "  \"summary_activation\": null,\n",
+      "  \"summary_first_dropout\": 0.1,\n",
+      "  \"summary_proj_to_labels\": true,\n",
+      "  \"summary_type\": \"cls_index\",\n",
+      "  \"summary_use_proj\": true,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"text-generation\": {\n",
+      "      \"do_sample\": true,\n",
+      "      \"max_length\": 50\n",
+      "    }\n",
+      "  },\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 50257\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_auto.py:458] 2023-02-16 15:22:38,945 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:22:39,423 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:22:39,424 >> Model config GPT2Config {\n",
+      "  \"_name_or_path\": \"gpt2\",\n",
+      "  \"activation_function\": \"gelu_new\",\n",
+      "  \"architectures\": [\n",
+      "    \"GPT2LMHeadModel\"\n",
+      "  ],\n",
+      "  \"attn_pdrop\": 0.1,\n",
+      "  \"bos_token_id\": 50256,\n",
+      "  \"embd_pdrop\": 0.1,\n",
+      "  \"eos_token_id\": 50256,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"layer_norm_epsilon\": 1e-05,\n",
+      "  \"model_type\": \"gpt2\",\n",
+      "  \"n_ctx\": 1024,\n",
+      "  \"n_embd\": 768,\n",
+      "  \"n_head\": 12,\n",
+      "  \"n_inner\": null,\n",
+      "  \"n_layer\": 12,\n",
+      "  \"n_positions\": 1024,\n",
+      "  \"reorder_and_upcast_attn\": false,\n",
+      "  \"resid_pdrop\": 0.1,\n",
+      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
+      "  \"scale_attn_weights\": true,\n",
+      "  \"summary_activation\": null,\n",
+      "  \"summary_first_dropout\": 0.1,\n",
+      "  \"summary_proj_to_labels\": true,\n",
+      "  \"summary_type\": \"cls_index\",\n",
+      "  \"summary_use_proj\": true,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"text-generation\": {\n",
+      "      \"do_sample\": true,\n",
+      "      \"max_length\": 50\n",
+      "    }\n",
+      "  },\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 50257\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file vocab.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file merges.txt from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file added_tokens.json from cache at None\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file special_tokens_map.json from cache at None\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer_config.json from cache at None\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:22:40,400 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:22:40,400 >> Model config GPT2Config {\n",
+      "  \"_name_or_path\": \"gpt2\",\n",
+      "  \"activation_function\": \"gelu_new\",\n",
+      "  \"architectures\": [\n",
+      "    \"GPT2LMHeadModel\"\n",
+      "  ],\n",
+      "  \"attn_pdrop\": 0.1,\n",
+      "  \"bos_token_id\": 50256,\n",
+      "  \"embd_pdrop\": 0.1,\n",
+      "  \"eos_token_id\": 50256,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"layer_norm_epsilon\": 1e-05,\n",
+      "  \"model_type\": \"gpt2\",\n",
+      "  \"n_ctx\": 1024,\n",
+      "  \"n_embd\": 768,\n",
+      "  \"n_head\": 12,\n",
+      "  \"n_inner\": null,\n",
+      "  \"n_layer\": 12,\n",
+      "  \"n_positions\": 1024,\n",
+      "  \"reorder_and_upcast_attn\": false,\n",
+      "  \"resid_pdrop\": 0.1,\n",
+      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
+      "  \"scale_attn_weights\": true,\n",
+      "  \"summary_activation\": null,\n",
+      "  \"summary_first_dropout\": 0.1,\n",
+      "  \"summary_proj_to_labels\": true,\n",
+      "  \"summary_type\": \"cls_index\",\n",
+      "  \"summary_use_proj\": true,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"text-generation\": {\n",
+      "      \"do_sample\": true,\n",
+      "      \"max_length\": 50\n",
+      "    }\n",
+      "  },\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 50257\n",
+      "}\n",
+      "\n",
+      "02/16/2023 15:22:40 - INFO - __main__ - Using hidden states in model: True\n",
+      "-------------------------------------------------------- Using hidden: True\n",
+      "02/16/2023 15:22:40 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom\n",
+      "[INFO|modeling_utils.py:2275] 2023-02-16 15:22:40,458 >> loading weights file pytorch_model.bin from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin\n",
+      "[INFO|modeling_utils.py:2857] 2023-02-16 15:22:42,848 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
+      "\n",
+      "[WARNING|modeling_utils.py:2859] 2023-02-16 15:22:42,849 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.11.attn.masked_bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_4.bias', 'score.dense_1_input.bias', 'score.dense_3.weight', 'score.dense_1_hidden.bias', 'score.dense_1_input.weight', 'h.1.attn.masked_bias', 'score.dense_3.bias', 'h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.4.attn.masked_bias', 'score.dense_1_hidden.weight', 'score.dense_2.bias', 'score.dense_4.weight', 'h.0.attn.masked_bias', 'h.3.attn.masked_bias']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "GPT2ForSequenceClassificationCustom(\n",
+      "  (transformer): GPT2Model(\n",
+      "    (wte): Embedding(50257, 768)\n",
+      "    (wpe): Embedding(1024, 768)\n",
+      "    (drop): Dropout(p=0.1, inplace=False)\n",
+      "    (h): ModuleList(\n",
+      "      (0): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (1): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (2): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (3): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (4): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (5): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (6): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (7): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (8): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (9): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (10): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (11): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "  )\n",
+      "  (score): GPT2ClassificationHeadCustom(\n",
+      "    (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n",
+      "    (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n",
+      "    (dense_2): Linear(in_features=3072, out_features=3072, bias=True)\n",
+      "    (dense_3): Linear(in_features=3072, out_features=3072, bias=True)\n",
+      "    (dense_4): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "    (out_proj): Linear(in_features=768, out_features=4, bias=False)\n",
+      "  )\n",
+      ")\n",
+      "[ERROR|tokenization_utils_base.py:1042] 2023-02-16 15:22:42,852 >> Using pad_token, but it is not set yet.\n",
+      "02/16/2023 15:22:42 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
+      "Running tokenizer on dataset:   0%|                     | 0/120 [00:00<?, ?ba/s]02/16/2023 15:22:42 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-d91f860557c08124.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████| 120/120 [00:06<00:00, 17.67ba/s]\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 15:22:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-b30f34d164a78c00.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 19.47ba/s]\n",
+      "02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 0-class\n",
+      "02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 1-class\n",
+      "02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 2-class\n",
+      "02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 3-class\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 685, in <module>\n",
+      "    main()\n",
+      "  File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 533, in main\n",
+      "    raise ValueError(\"--do_predict requires a test dataset\")\n",
+      "ValueError: --do_predict requires a test dataset\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python run_glue.py \\\n",
+    "  --cache_dir .cache_training \\\n",
+    "  --model_name_or_path gpt2 \\\n",
+    "  --custom_model gpt2_hidden \\\n",
+    "  --train_file data/train.json  \\\n",
+    "  --validation_file data/valid.json \\\n",
+    "  --test_file data/test.json \\\n",
+    "  --per_device_train_batch_size 8 \\\n",
+    "  --per_device_eval_batch_size 8 \\\n",
+    "  --do_train \\\n",
+    "  --do_eval \\\n",
+    "  --max_seq_length 128 \\\n",
+    "  --learning_rate 2e-5 \\\n",
+    "  --max_eval_samples 2000 \\\n",
+    "  --max_steps 2500 \\\n",
+    "  --num_train_epochs 1 \\\n",
+    "  --save_strategy steps \\\n",
+    "  --save_steps 250 \\\n",
+    "  --save_total_limit 5 \\\n",
+    "  --logging_strategy steps \\\n",
+    "  --logging_steps 100 \\\n",
+    "  --eval_steps 250 \\\n",
+    "  --evaluation_strategy steps \\\n",
+    "  --metric_for_best_model accuracy \\\n",
+    "  --greater_is_better True \\\n",
+    "  --load_best_model_at_end True \\\n",
+    "  --output_dir out/gpt2                 "
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "02/16/2023 16:51:20 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
+      "02/16/2023 16:51:20 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
+      "_n_gpu=1,\n",
+      "adafactor=False,\n",
+      "adam_beta1=0.9,\n",
+      "adam_beta2=0.999,\n",
+      "adam_epsilon=1e-08,\n",
+      "auto_find_batch_size=False,\n",
+      "bf16=False,\n",
+      "bf16_full_eval=False,\n",
+      "data_seed=None,\n",
+      "dataloader_drop_last=False,\n",
+      "dataloader_num_workers=0,\n",
+      "dataloader_pin_memory=True,\n",
+      "ddp_bucket_cap_mb=None,\n",
+      "ddp_find_unused_parameters=None,\n",
+      "ddp_timeout=1800,\n",
+      "debug=[],\n",
+      "deepspeed=None,\n",
+      "disable_tqdm=False,\n",
+      "do_eval=True,\n",
+      "do_predict=True,\n",
+      "do_train=False,\n",
+      "eval_accumulation_steps=None,\n",
+      "eval_delay=0,\n",
+      "eval_steps=250,\n",
+      "evaluation_strategy=steps,\n",
+      "fp16=False,\n",
+      "fp16_backend=auto,\n",
+      "fp16_full_eval=False,\n",
+      "fp16_opt_level=O1,\n",
+      "fsdp=[],\n",
+      "fsdp_min_num_params=0,\n",
+      "fsdp_transformer_layer_cls_to_wrap=None,\n",
+      "full_determinism=False,\n",
+      "gradient_accumulation_steps=1,\n",
+      "gradient_checkpointing=False,\n",
+      "greater_is_better=True,\n",
+      "group_by_length=False,\n",
+      "half_precision_backend=auto,\n",
+      "hub_model_id=None,\n",
+      "hub_private_repo=False,\n",
+      "hub_strategy=every_save,\n",
+      "hub_token=<HUB_TOKEN>,\n",
+      "ignore_data_skip=False,\n",
+      "include_inputs_for_metrics=False,\n",
+      "jit_mode_eval=False,\n",
+      "label_names=None,\n",
+      "label_smoothing_factor=0.0,\n",
+      "learning_rate=2e-05,\n",
+      "length_column_name=length,\n",
+      "load_best_model_at_end=True,\n",
+      "local_rank=-1,\n",
+      "log_level=passive,\n",
+      "log_level_replica=passive,\n",
+      "log_on_each_node=True,\n",
+      "logging_dir=out/gpt2_results/runs/Feb16_16-51-19_DESKTOP-R7JO8BQ,\n",
+      "logging_first_step=False,\n",
+      "logging_nan_inf_filter=True,\n",
+      "logging_steps=100,\n",
+      "logging_strategy=steps,\n",
+      "lr_scheduler_type=linear,\n",
+      "max_grad_norm=1.0,\n",
+      "max_steps=2500,\n",
+      "metric_for_best_model=accuracy,\n",
+      "mp_parameters=,\n",
+      "no_cuda=False,\n",
+      "num_train_epochs=1.0,\n",
+      "optim=adamw_hf,\n",
+      "optim_args=None,\n",
+      "output_dir=out/gpt2_results,\n",
+      "overwrite_output_dir=False,\n",
+      "past_index=-1,\n",
+      "per_device_eval_batch_size=8,\n",
+      "per_device_train_batch_size=8,\n",
+      "prediction_loss_only=False,\n",
+      "push_to_hub=False,\n",
+      "push_to_hub_model_id=None,\n",
+      "push_to_hub_organization=None,\n",
+      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+      "ray_scope=last,\n",
+      "remove_unused_columns=True,\n",
+      "report_to=[],\n",
+      "resume_from_checkpoint=None,\n",
+      "run_name=out/gpt2_results,\n",
+      "save_on_each_node=False,\n",
+      "save_steps=250,\n",
+      "save_strategy=steps,\n",
+      "save_total_limit=5,\n",
+      "seed=42,\n",
+      "sharded_ddp=[],\n",
+      "skip_memory_metrics=True,\n",
+      "tf32=None,\n",
+      "torch_compile=False,\n",
+      "torch_compile_backend=None,\n",
+      "torch_compile_mode=None,\n",
+      "torchdynamo=None,\n",
+      "tpu_metrics_debug=False,\n",
+      "tpu_num_cores=None,\n",
+      "use_ipex=False,\n",
+      "use_legacy_prediction_loop=False,\n",
+      "use_mps_device=False,\n",
+      "warmup_ratio=0.0,\n",
+      "warmup_steps=0,\n",
+      "weight_decay=0.0,\n",
+      "xpu_backend=None,\n",
+      ")\n",
+      "02/16/2023 16:51:20 - INFO - __main__ - load a local file for train: data/train.json\n",
+      "02/16/2023 16:51:20 - INFO - __main__ - load a local file for validation: data/valid.json\n",
+      "02/16/2023 16:51:20 - INFO - __main__ - load a local file for test: data/test.json\n",
+      "02/16/2023 16:51:20 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
+      "02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
+      "02/16/2023 16:51:20 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
+      "02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "02/16/2023 16:51:20 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
+      "02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 591.33it/s]\n",
+      "[INFO|configuration_utils.py:658] 2023-02-16 16:51:20,920 >> loading configuration file out/gpt2/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 16:51:20,921 >> Model config GPT2Config {\n",
+      "  \"_name_or_path\": \"out/gpt2\",\n",
+      "  \"activation_function\": \"gelu_new\",\n",
+      "  \"architectures\": [\n",
+      "    \"GPT2ForSequenceClassificationCustom\"\n",
+      "  ],\n",
+      "  \"attn_pdrop\": 0.1,\n",
+      "  \"bos_token_id\": 50256,\n",
+      "  \"embd_pdrop\": 0.1,\n",
+      "  \"eos_token_id\": 50256,\n",
+      "  \"id2label\": {\n",
+      "    \"0\": 0,\n",
+      "    \"1\": 1,\n",
+      "    \"2\": 2,\n",
+      "    \"3\": 3\n",
+      "  },\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"label2id\": {\n",
+      "    \"0\": 0,\n",
+      "    \"1\": 1,\n",
+      "    \"2\": 2,\n",
+      "    \"3\": 3\n",
+      "  },\n",
+      "  \"layer_norm_epsilon\": 1e-05,\n",
+      "  \"model_type\": \"gpt2\",\n",
+      "  \"n_ctx\": 1024,\n",
+      "  \"n_embd\": 768,\n",
+      "  \"n_head\": 12,\n",
+      "  \"n_inner\": null,\n",
+      "  \"n_layer\": 12,\n",
+      "  \"n_positions\": 1024,\n",
+      "  \"pad_token_id\": 50256,\n",
+      "  \"reorder_and_upcast_attn\": false,\n",
+      "  \"resid_pdrop\": 0.1,\n",
+      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
+      "  \"scale_attn_weights\": true,\n",
+      "  \"summary_activation\": null,\n",
+      "  \"summary_first_dropout\": 0.1,\n",
+      "  \"summary_proj_to_labels\": true,\n",
+      "  \"summary_type\": \"cls_index\",\n",
+      "  \"summary_use_proj\": true,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"text-generation\": {\n",
+      "      \"do_sample\": true,\n",
+      "      \"max_length\": 50\n",
+      "    }\n",
+      "  },\n",
+      "  \"torch_dtype\": \"float32\",\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"use_hidden_states\": true,\n",
+      "  \"vocab_size\": 50257\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file vocab.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file merges.txt\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file added_tokens.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file special_tokens_map.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer_config.json\n",
+      "02/16/2023 16:51:20 - INFO - __main__ - Using hidden states in model: True\n",
+      "-------------------------------------------------------- Using hidden: True\n",
+      "02/16/2023 16:51:20 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom\n",
+      "[INFO|modeling_utils.py:2272] 2023-02-16 16:51:20,982 >> loading weights file out/gpt2/pytorch_model.bin\n",
+      "[INFO|modeling_utils.py:2857] 2023-02-16 16:51:23,451 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
+      "\n",
+      "[INFO|modeling_utils.py:2865] 2023-02-16 16:51:23,451 >> All the weights of GPT2ForSequenceClassificationCustom were initialized from the model checkpoint at out/gpt2.\n",
+      "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassificationCustom for predictions without further training.\n",
+      "GPT2ForSequenceClassificationCustom(\n",
+      "  (transformer): GPT2Model(\n",
+      "    (wte): Embedding(50257, 768)\n",
+      "    (wpe): Embedding(1024, 768)\n",
+      "    (drop): Dropout(p=0.1, inplace=False)\n",
+      "    (h): ModuleList(\n",
+      "      (0): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (1): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (2): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (3): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (4): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (5): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (6): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (7): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (8): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (9): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (10): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (11): GPT2Block(\n",
+      "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (attn): GPT2Attention(\n",
+      "          (c_attn): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
+      "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "        (mlp): GPT2MLP(\n",
+      "          (c_fc): Conv1D()\n",
+      "          (c_proj): Conv1D()\n",
+      "          (act): NewGELUActivation()\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
+      "  )\n",
+      "  (score): GPT2ClassificationHeadCustom(\n",
+      "    (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n",
+      "    (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n",
+      "    (dense_2): Linear(in_features=3072, out_features=3072, bias=True)\n",
+      "    (dense_3): Linear(in_features=3072, out_features=3072, bias=True)\n",
+      "    (dense_4): Linear(in_features=3072, out_features=768, bias=True)\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "    (out_proj): Linear(in_features=768, out_features=4, bias=False)\n",
+      "  )\n",
+      ")\n",
+      "Running tokenizer on dataset:   0%|                     | 0/120 [00:00<?, ?ba/s]02/16/2023 16:51:23 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7179a56e6d5f6003.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████| 120/120 [00:07<00:00, 15.47ba/s]\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 16:51:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-dd7e86ec7f74125a.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.75ba/s]\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 16:51:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-a11e14ac330179d1.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.37ba/s]\n",
+      "02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 0-class\n",
+      "02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 1-class\n",
+      "02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 2-class\n",
+      "02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 3-class\n",
+      "[INFO|trainer.py:511] 2023-02-16 16:51:35,119 >> max_steps is given, it will override any value given in num_train_epochs\n",
+      "02/16/2023 16:51:35 - INFO - __main__ - *** Evaluate ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 16:51:35,120 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 16:51:35,123 >> ***** Running Evaluation *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 16:51:35,123 >>   Num examples = 2000\n",
+      "[INFO|trainer.py:2969] 2023-02-16 16:51:35,123 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 250/250 [00:23<00:00, 10.65it/s]\n",
+      "***** eval metrics *****\n",
+      "  eval_accuracy           =     0.9195\n",
+      "  eval_loss               =      0.302\n",
+      "  eval_runtime            = 0:00:24.11\n",
+      "  eval_samples            =       2000\n",
+      "  eval_samples_per_second =      82.94\n",
+      "  eval_steps_per_second   =     10.367\n",
+      "02/16/2023 16:51:59 - INFO - __main__ - *** Predict ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 16:51:59,239 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 16:51:59,240 >> ***** Running Prediction *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 16:51:59,240 >>   Num examples = 3800\n",
+      "[INFO|trainer.py:2969] 2023-02-16 16:51:59,240 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 475/475 [00:43<00:00, 10.84it/s]\n",
+      "02/16/2023 16:52:43 - INFO - __main__ - ***** Predict results None *****\n",
+      "[INFO|modelcard.py:449] 2023-02-16 16:52:43,692 >> Dropping the following result as it does not have all the necessary fields:\n",
+      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python run_glue.py \\\n",
+    "  --cache_dir .cache_training \\\n",
+    "  --model_name_or_path out/gpt2 \\\n",
+    "  --custom_model gpt2_hidden \\\n",
+    "  --train_file data/train.json  \\\n",
+    "  --validation_file data/valid.json \\\n",
+    "  --test_file data/test.json \\\n",
+    "  --per_device_train_batch_size 8 \\\n",
+    "  --per_device_eval_batch_size 8 \\\n",
+    "  --do_eval \\\n",
+    "  --do_predict \\\n",
+    "  --max_seq_length 128 \\\n",
+    "  --learning_rate 2e-5 \\\n",
+    "  --max_eval_samples 2000 \\\n",
+    "  --max_steps 2500 \\\n",
+    "  --num_train_epochs 1 \\\n",
+    "  --save_strategy steps \\\n",
+    "  --save_steps 250 \\\n",
+    "  --save_total_limit 5 \\\n",
+    "  --logging_strategy steps \\\n",
+    "  --logging_steps 100 \\\n",
+    "  --eval_steps 250 \\\n",
+    "  --evaluation_strategy steps \\\n",
+    "  --metric_for_best_model accuracy \\\n",
+    "  --greater_is_better True \\\n",
+    "  --load_best_model_at_end True \\\n",
+    "  --output_dir out/gpt2_results              "
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[0;39m0.9194999933242798\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cat out/gpt2_results/eval_results.json | jq .eval_accuracy"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# T5"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Modifications"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Custom classification head with 3 dense layers\n",
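+    "- Encoder self-attention sublayers frozen (feed-forward sublayers remain trainable)\n",
+    "- Decoder self-attention and cross-attention sublayers frozen (feed-forward sublayers remain trainable)"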
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import copy\n",
+    "from torch import nn\n",
+    "from transformers import T5PreTrainedModel, T5Config\n",
+    "from transformers.models.t5.modeling_t5 import T5Stack\n",
+    "from transformers.modeling_outputs import SequenceClassifierOutput\n",
+    "\n",
+    "\n",
+    "class T5ClassificationHead(nn.Module):\n",
+    "    \"\"\"Three-layer feed-forward head that classifies from sequence position 0.\n",
+    "\n",
+    "    Layer widths are `config.d_model` -> 768 -> 768 -> `config.num_labels`.\n",
+    "    Dropout (p=0.1) precedes every linear layer and ReLU follows the first two.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, config: T5Config):\n",
+    "        super().__init__()\n",
+    "\n",
+    "        # Width of the two hidden projections; fixed, independent of the backbone size.\n",
+    "        width = 768\n",
+    "        self.dense_in = nn.Linear(config.d_model, width)\n",
+    "        self.dense = nn.Linear(width, width)\n",
+    "        self.dense_out = nn.Linear(width, config.num_labels)\n",
+    "        self.dropout = nn.Dropout(0.1)\n",
+    "\n",
+    "    def forward(self, features, **kwargs):\n",
+    "        \"\"\"Return the logits computed from `features[:, 0, :]`.\n",
+    "\n",
+    "        `features` is indexed along three axes, so it must be 3-D; only index 0\n",
+    "        of the second axis is read. Extra keyword arguments are accepted and ignored.\n",
+    "        \"\"\"\n",
+    "        hidden = features[:, 0, :]\n",
+    "        # Same sequence as writing it out by hand: dropout -> linear -> relu, twice.\n",
+    "        for projection in (self.dense_in, self.dense):\n",
+    "            hidden = torch.relu(projection(self.dropout(hidden)))\n",
+    "\n",
+    "        return self.dense_out(self.dropout(hidden))\n",
+    "\n",
+    "\n",
+    "class T5ForClassification(T5PreTrainedModel):\n",
+    "    \"\"\"T5 encoder-decoder stack topped with a `T5ClassificationHead`.\n",
+    "\n",
+    "    The input is encoded, the decoder attends to the encoder output through\n",
+    "    cross-attention, and the decoder output is handed to the classification head.\n",
+    "    Attention sublayers of both stacks are frozen; everything else stays trainable.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, config: T5Config):\n",
+    "        super().__init__(config)\n",
+    "        self.model_dim = config.d_model\n",
+    "\n",
+    "        # One embedding matrix shared by the encoder and the decoder stack.\n",
+    "        self.shared = nn.Embedding(config.vocab_size, config.d_model)\n",
+    "\n",
+    "        encoder_config = copy.deepcopy(config)\n",
+    "        encoder_config.is_decoder = False\n",
+    "        encoder_config.use_cache = False\n",
+    "        encoder_config.is_encoder_decoder = False\n",
+    "        self.encoder = T5Stack(encoder_config, self.shared)\n",
+    "\n",
+    "        decoder_config = copy.deepcopy(config)\n",
+    "        decoder_config.is_decoder = True\n",
+    "        decoder_config.is_encoder_decoder = False\n",
+    "        decoder_config.num_layers = config.num_decoder_layers\n",
+    "        self.decoder = T5Stack(decoder_config, self.shared)\n",
+    "\n",
+    "        # layer[0] is the self-attention sublayer in both stacks; layer[1] of a\n",
+    "        # decoder block is its cross-attention sublayer. The feed-forward\n",
+    "        # sublayers are deliberately left trainable.\n",
+    "        modules_to_freeze = [block.layer[0] for block in self.encoder.block]\n",
+    "        modules_to_freeze.extend(block.layer[0] for block in self.decoder.block)\n",
+    "        modules_to_freeze.extend(block.layer[1] for block in self.decoder.block)\n",
+    "\n",
+    "        for module in modules_to_freeze:\n",
+    "            module.requires_grad_(False)\n",
+    "\n",
+    "        # The attribute keeps the name `lm_head` so existing checkpoints still load.\n",
+    "        self.lm_head = T5ClassificationHead(config)\n",
+    "\n",
+    "        # Initialize weights and apply final processing\n",
+    "        self.post_init()\n",
+    "\n",
+    "        # Model parallel\n",
+    "        self.model_parallel = False\n",
+    "        self.device_map = None\n",
+    "\n",
+    "\n",
+    "    def forward(\n",
+    "            self,\n",
+    "            input_ids=None,\n",
+    "            attention_mask=None,\n",
+    "            head_mask=None,\n",
+    "            cross_attn_head_mask=None,\n",
+    "            past_key_values=None,\n",
+    "            inputs_embeds=None,\n",
+    "            decoder_inputs_embeds=None,\n",
+    "            use_cache=None,\n",
+    "            output_attentions=None,\n",
+    "            output_hidden_states=None,\n",
+    "            return_dict=None,\n",
+    "            labels=None\n",
+    "            ):\n",
+    "        \"\"\"Encode the input, decode it against the encoder states and classify.\n",
+    "\n",
+    "        Args:\n",
+    "            input_ids: token ids, fed to both the encoder and the decoder.\n",
+    "            attention_mask: mask for `input_ids`; also used as the encoder mask\n",
+    "                in the decoder's cross-attention.\n",
+    "            head_mask: forwarded to both stacks.\n",
+    "            cross_attn_head_mask, past_key_values, use_cache: forwarded to the\n",
+    "                decoder only.\n",
+    "            inputs_embeds: alternative to `input_ids`, forwarded to both stacks.\n",
+    "            decoder_inputs_embeds: accepted for signature compatibility; unused.\n",
+    "            output_attentions, output_hidden_states, return_dict: forwarded to\n",
+    "                both stacks; they do not change what this method returns.\n",
+    "            labels: class indices; when given, a cross-entropy loss is computed.\n",
+    "\n",
+    "        Returns:\n",
+    "            SequenceClassifierOutput with `loss` (None without labels) and `logits`.\n",
+    "        \"\"\"\n",
+    "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
+    "\n",
+    "        # Cache-related and cross-attention arguments only make sense for the\n",
+    "        # decoder, so they are not forwarded to the encoder stack.\n",
+    "        encoder_outputs = self.encoder(\n",
+    "                input_ids,\n",
+    "                attention_mask=attention_mask,\n",
+    "                head_mask=head_mask,\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                output_attentions=output_attentions,\n",
+    "                output_hidden_states=output_hidden_states,\n",
+    "                return_dict=return_dict,\n",
+    "            )\n",
+    "\n",
+    "        # Hand the encoder states to the decoder. Previously the encoder result\n",
+    "        # was overwritten without being used, so cross-attention never ran.\n",
+    "        decoder_outputs = self.decoder(\n",
+    "                input_ids,\n",
+    "                attention_mask=attention_mask,\n",
+    "                encoder_hidden_states=encoder_outputs[0],\n",
+    "                encoder_attention_mask=attention_mask,\n",
+    "                head_mask=head_mask,\n",
+    "                cross_attn_head_mask=cross_attn_head_mask,\n",
+    "                past_key_values=past_key_values,\n",
+    "                inputs_embeds=inputs_embeds,\n",
+    "                use_cache=use_cache,\n",
+    "                output_attentions=output_attentions,\n",
+    "                output_hidden_states=output_hidden_states,\n",
+    "                return_dict=return_dict,\n",
+    "            )\n",
+    "\n",
+    "        # Index 0 is the last hidden state for both tuple and ModelOutput returns.\n",
+    "        logits = self.lm_head(decoder_outputs[0])\n",
+    "\n",
+    "        loss = None\n",
+    "        if labels is not None:\n",
+    "            loss_fct = nn.CrossEntropyLoss()\n",
+    "            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))\n",
+    "\n",
+    "        return SequenceClassifierOutput(\n",
+    "            loss=loss,\n",
+    "            logits=logits,\n",
+    "        )\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fda885ac92b1459ba9c0faf41a9d925f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7b82fb0c2b284fcd940e67f81abbf397",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)\"pytorch_model.bin\";:   0%|          | 0.00/892M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n",
+      "- This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['lm_head.dense_out.bias', 'lm_head.dense.bias', 'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.dense_out.weight', 'lm_head.dense.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "T5ForClassification(\n",
+       "  (shared): Embedding(32128, 768)\n",
+       "  (encoder): T5Stack(\n",
+       "    (embed_tokens): Embedding(32128, 768)\n",
+       "    (block): ModuleList(\n",
+       "      (0): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (relative_attention_bias): Embedding(32, 12)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (1): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (2): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (3): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (4): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (5): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (6): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (7): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (8): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (9): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (10): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (11): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "    (final_layer_norm): T5LayerNorm()\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  )\n",
+       "  (decoder): T5Stack(\n",
+       "    (embed_tokens): Embedding(32128, 768)\n",
+       "    (block): ModuleList(\n",
+       "      (0): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (relative_attention_bias): Embedding(32, 12)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (1): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (2): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (3): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (4): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (5): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (6): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (7): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (8): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (9): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (10): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (11): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseActDense(\n",
+       "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+       "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): ReLU()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "    (final_layer_norm): T5LayerNorm()\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  )\n",
+       "  (lm_head): T5ClassificationHead(\n",
+       "    (dense_in): Linear(in_features=768, out_features=768, bias=True)\n",
+       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "    (dense_out): Linear(in_features=768, out_features=2, bias=True)\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "T5ForClassification.from_pretrained(\"t5-base\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "02/16/2023 15:24:13 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
+      "02/16/2023 15:24:13 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
+      "_n_gpu=1,\n",
+      "adafactor=False,\n",
+      "adam_beta1=0.9,\n",
+      "adam_beta2=0.999,\n",
+      "adam_epsilon=1e-08,\n",
+      "auto_find_batch_size=False,\n",
+      "bf16=False,\n",
+      "bf16_full_eval=False,\n",
+      "data_seed=None,\n",
+      "dataloader_drop_last=False,\n",
+      "dataloader_num_workers=0,\n",
+      "dataloader_pin_memory=True,\n",
+      "ddp_bucket_cap_mb=None,\n",
+      "ddp_find_unused_parameters=None,\n",
+      "ddp_timeout=1800,\n",
+      "debug=[],\n",
+      "deepspeed=None,\n",
+      "disable_tqdm=False,\n",
+      "do_eval=True,\n",
+      "do_predict=False,\n",
+      "do_train=True,\n",
+      "eval_accumulation_steps=None,\n",
+      "eval_delay=0,\n",
+      "eval_steps=250,\n",
+      "evaluation_strategy=steps,\n",
+      "fp16=False,\n",
+      "fp16_backend=auto,\n",
+      "fp16_full_eval=False,\n",
+      "fp16_opt_level=O1,\n",
+      "fsdp=[],\n",
+      "fsdp_min_num_params=0,\n",
+      "fsdp_transformer_layer_cls_to_wrap=None,\n",
+      "full_determinism=False,\n",
+      "gradient_accumulation_steps=1,\n",
+      "gradient_checkpointing=False,\n",
+      "greater_is_better=True,\n",
+      "group_by_length=False,\n",
+      "half_precision_backend=auto,\n",
+      "hub_model_id=None,\n",
+      "hub_private_repo=False,\n",
+      "hub_strategy=every_save,\n",
+      "hub_token=<HUB_TOKEN>,\n",
+      "ignore_data_skip=False,\n",
+      "include_inputs_for_metrics=False,\n",
+      "jit_mode_eval=False,\n",
+      "label_names=None,\n",
+      "label_smoothing_factor=0.0,\n",
+      "learning_rate=2e-05,\n",
+      "length_column_name=length,\n",
+      "load_best_model_at_end=True,\n",
+      "local_rank=-1,\n",
+      "log_level=passive,\n",
+      "log_level_replica=passive,\n",
+      "log_on_each_node=True,\n",
+      "logging_dir=out/t5/runs/Feb16_15-24-12_DESKTOP-R7JO8BQ,\n",
+      "logging_first_step=False,\n",
+      "logging_nan_inf_filter=True,\n",
+      "logging_steps=100,\n",
+      "logging_strategy=steps,\n",
+      "lr_scheduler_type=linear,\n",
+      "max_grad_norm=1.0,\n",
+      "max_steps=2500,\n",
+      "metric_for_best_model=accuracy,\n",
+      "mp_parameters=,\n",
+      "no_cuda=False,\n",
+      "num_train_epochs=1.0,\n",
+      "optim=adamw_hf,\n",
+      "optim_args=None,\n",
+      "output_dir=out/t5,\n",
+      "overwrite_output_dir=False,\n",
+      "past_index=-1,\n",
+      "per_device_eval_batch_size=8,\n",
+      "per_device_train_batch_size=8,\n",
+      "prediction_loss_only=False,\n",
+      "push_to_hub=False,\n",
+      "push_to_hub_model_id=None,\n",
+      "push_to_hub_organization=None,\n",
+      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+      "ray_scope=last,\n",
+      "remove_unused_columns=True,\n",
+      "report_to=[],\n",
+      "resume_from_checkpoint=None,\n",
+      "run_name=out/t5,\n",
+      "save_on_each_node=False,\n",
+      "save_steps=250,\n",
+      "save_strategy=steps,\n",
+      "save_total_limit=5,\n",
+      "seed=42,\n",
+      "sharded_ddp=[],\n",
+      "skip_memory_metrics=True,\n",
+      "tf32=None,\n",
+      "torch_compile=False,\n",
+      "torch_compile_backend=None,\n",
+      "torch_compile_mode=None,\n",
+      "torchdynamo=None,\n",
+      "tpu_metrics_debug=False,\n",
+      "tpu_num_cores=None,\n",
+      "use_ipex=False,\n",
+      "use_legacy_prediction_loop=False,\n",
+      "use_mps_device=False,\n",
+      "warmup_ratio=0.0,\n",
+      "warmup_steps=0,\n",
+      "weight_decay=0.0,\n",
+      "xpu_backend=None,\n",
+      ")\n",
+      "02/16/2023 15:24:13 - INFO - __main__ - Checkpoint detected, resuming training at out/t5/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n",
+      "02/16/2023 15:24:13 - INFO - __main__ - load a local file for train: data/train.json\n",
+      "02/16/2023 15:24:13 - INFO - __main__ - load a local file for validation: data/valid.json\n",
+      "02/16/2023 15:24:13 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a\n",
+      "02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
+      "02/16/2023 15:24:13 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
+      "02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "02/16/2023 15:24:13 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
+      "02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 426.97it/s]\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:24:14,422 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:24:14,423 >> Model config T5Config {\n",
+      "  \"_name_or_path\": \"t5-base\",\n",
+      "  \"architectures\": [\n",
+      "    \"T5ForConditionalGeneration\"\n",
+      "  ],\n",
+      "  \"d_ff\": 3072,\n",
+      "  \"d_kv\": 64,\n",
+      "  \"d_model\": 768,\n",
+      "  \"decoder_start_token_id\": 0,\n",
+      "  \"dense_act_fn\": \"relu\",\n",
+      "  \"dropout_rate\": 0.1,\n",
+      "  \"eos_token_id\": 1,\n",
+      "  \"feed_forward_proj\": \"relu\",\n",
+      "  \"id2label\": {\n",
+      "    \"0\": \"LABEL_0\",\n",
+      "    \"1\": \"LABEL_1\",\n",
+      "    \"2\": \"LABEL_2\",\n",
+      "    \"3\": \"LABEL_3\"\n",
+      "  },\n",
+      "  \"initializer_factor\": 1.0,\n",
+      "  \"is_encoder_decoder\": true,\n",
+      "  \"is_gated_act\": false,\n",
+      "  \"label2id\": {\n",
+      "    \"LABEL_0\": 0,\n",
+      "    \"LABEL_1\": 1,\n",
+      "    \"LABEL_2\": 2,\n",
+      "    \"LABEL_3\": 3\n",
+      "  },\n",
+      "  \"layer_norm_epsilon\": 1e-06,\n",
+      "  \"model_type\": \"t5\",\n",
+      "  \"n_positions\": 512,\n",
+      "  \"num_decoder_layers\": 12,\n",
+      "  \"num_heads\": 12,\n",
+      "  \"num_layers\": 12,\n",
+      "  \"output_past\": true,\n",
+      "  \"pad_token_id\": 0,\n",
+      "  \"relative_attention_max_distance\": 128,\n",
+      "  \"relative_attention_num_buckets\": 32,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"summarization\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"length_penalty\": 2.0,\n",
+      "      \"max_length\": 200,\n",
+      "      \"min_length\": 30,\n",
+      "      \"no_repeat_ngram_size\": 3,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"summarize: \"\n",
+      "    },\n",
+      "    \"translation_en_to_de\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to German: \"\n",
+      "    },\n",
+      "    \"translation_en_to_fr\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to French: \"\n",
+      "    },\n",
+      "    \"translation_en_to_ro\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to Romanian: \"\n",
+      "    }\n",
+      "  },\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 32128\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_auto.py:458] 2023-02-16 15:24:14,918 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:24:15,378 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:24:15,378 >> Model config T5Config {\n",
+      "  \"_name_or_path\": \"t5-base\",\n",
+      "  \"architectures\": [\n",
+      "    \"T5ForConditionalGeneration\"\n",
+      "  ],\n",
+      "  \"d_ff\": 3072,\n",
+      "  \"d_kv\": 64,\n",
+      "  \"d_model\": 768,\n",
+      "  \"decoder_start_token_id\": 0,\n",
+      "  \"dense_act_fn\": \"relu\",\n",
+      "  \"dropout_rate\": 0.1,\n",
+      "  \"eos_token_id\": 1,\n",
+      "  \"feed_forward_proj\": \"relu\",\n",
+      "  \"initializer_factor\": 1.0,\n",
+      "  \"is_encoder_decoder\": true,\n",
+      "  \"is_gated_act\": false,\n",
+      "  \"layer_norm_epsilon\": 1e-06,\n",
+      "  \"model_type\": \"t5\",\n",
+      "  \"n_positions\": 512,\n",
+      "  \"num_decoder_layers\": 12,\n",
+      "  \"num_heads\": 12,\n",
+      "  \"num_layers\": 12,\n",
+      "  \"output_past\": true,\n",
+      "  \"pad_token_id\": 0,\n",
+      "  \"relative_attention_max_distance\": 128,\n",
+      "  \"relative_attention_num_buckets\": 32,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"summarization\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"length_penalty\": 2.0,\n",
+      "      \"max_length\": 200,\n",
+      "      \"min_length\": 30,\n",
+      "      \"no_repeat_ngram_size\": 3,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"summarize: \"\n",
+      "    },\n",
+      "    \"translation_en_to_de\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to German: \"\n",
+      "    },\n",
+      "    \"translation_en_to_fr\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to French: \"\n",
+      "    },\n",
+      "    \"translation_en_to_ro\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to Romanian: \"\n",
+      "    }\n",
+      "  },\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 32128\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file spiece.model from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/spiece.model\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/tokenizer.json\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file added_tokens.json from cache at None\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file special_tokens_map.json from cache at None\n",
+      "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer_config.json from cache at None\n",
+      "[INFO|configuration_utils.py:660] 2023-02-16 15:24:16,342 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 15:24:16,342 >> Model config T5Config {\n",
+      "  \"_name_or_path\": \"t5-base\",\n",
+      "  \"architectures\": [\n",
+      "    \"T5ForConditionalGeneration\"\n",
+      "  ],\n",
+      "  \"d_ff\": 3072,\n",
+      "  \"d_kv\": 64,\n",
+      "  \"d_model\": 768,\n",
+      "  \"decoder_start_token_id\": 0,\n",
+      "  \"dense_act_fn\": \"relu\",\n",
+      "  \"dropout_rate\": 0.1,\n",
+      "  \"eos_token_id\": 1,\n",
+      "  \"feed_forward_proj\": \"relu\",\n",
+      "  \"initializer_factor\": 1.0,\n",
+      "  \"is_encoder_decoder\": true,\n",
+      "  \"is_gated_act\": false,\n",
+      "  \"layer_norm_epsilon\": 1e-06,\n",
+      "  \"model_type\": \"t5\",\n",
+      "  \"n_positions\": 512,\n",
+      "  \"num_decoder_layers\": 12,\n",
+      "  \"num_heads\": 12,\n",
+      "  \"num_layers\": 12,\n",
+      "  \"output_past\": true,\n",
+      "  \"pad_token_id\": 0,\n",
+      "  \"relative_attention_max_distance\": 128,\n",
+      "  \"relative_attention_num_buckets\": 32,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"summarization\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"length_penalty\": 2.0,\n",
+      "      \"max_length\": 200,\n",
+      "      \"min_length\": 30,\n",
+      "      \"no_repeat_ngram_size\": 3,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"summarize: \"\n",
+      "    },\n",
+      "    \"translation_en_to_de\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to German: \"\n",
+      "    },\n",
+      "    \"translation_en_to_fr\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to French: \"\n",
+      "    },\n",
+      "    \"translation_en_to_ro\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to Romanian: \"\n",
+      "    }\n",
+      "  },\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"vocab_size\": 32128\n",
+      "}\n",
+      "\n",
+      "/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
+      "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
+      "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
+      "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
+      "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
+      "  warnings.warn(\n",
+      "02/16/2023 15:24:16 - INFO - __main__ - Using hidden states in model: False\n",
+      "-------------------------------------------------------- Using hidden: False\n",
+      "02/16/2023 15:24:16 - INFO - __main__ - Using implementation from class: T5ForClassification\n",
+      "[INFO|modeling_utils.py:2275] 2023-02-16 15:24:16,391 >> loading weights file pytorch_model.bin from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/pytorch_model.bin\n",
+      "[WARNING|modeling_utils.py:2847] 2023-02-16 15:24:19,101 >> Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n",
+      "- This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "[WARNING|modeling_utils.py:2859] 2023-02-16 15:24:19,102 >> Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.dense.bias', 'lm_head.dense_out.bias', 'encoder.embed_tokens.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight', 'lm_head.dense.weight', 'lm_head.dense_out.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "T5ForClassification(\n",
+      "  (shared): Embedding(32128, 768)\n",
+      "  (encoder): T5Stack(\n",
+      "    (embed_tokens): Embedding(32128, 768)\n",
+      "    (block): ModuleList(\n",
+      "      (0): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (relative_attention_bias): Embedding(32, 12)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (1): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (2): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (3): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (4): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (5): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (6): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (7): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (8): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (9): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (10): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (11): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (final_layer_norm): T5LayerNorm()\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "  )\n",
+      "  (decoder): T5Stack(\n",
+      "    (embed_tokens): Embedding(32128, 768)\n",
+      "    (block): ModuleList(\n",
+      "      (0): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (relative_attention_bias): Embedding(32, 12)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (1): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (2): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (3): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (4): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (5): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (6): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (7): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (8): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (9): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (10): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (11): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (final_layer_norm): T5LayerNorm()\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "  )\n",
+      "  (lm_head): T5ClassificationHead(\n",
+      "    (dense_in): Linear(in_features=768, out_features=768, bias=True)\n",
+      "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "    (dense_out): Linear(in_features=768, out_features=4, bias=True)\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "  )\n",
+      ")\n",
+      "Running tokenizer on dataset:   0%|                     | 0/120 [00:00<?, ?ba/s]02/16/2023 15:24:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0f99c998b010fbf8.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████| 120/120 [00:07<00:00, 15.69ba/s]\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 15:24:26 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0cfaba6ab7fdc0e3.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 17.12ba/s]\n",
+      "02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 0-class\n",
+      "02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 1-class\n",
+      "02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 2-class\n",
+      "02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 3-class\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 685, in <module>\n",
+      "    main()\n",
+      "  File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 533, in main\n",
+      "    raise ValueError(\"--do_predict requires a test dataset\")\n",
+      "ValueError: --do_predict requires a test dataset\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python run_glue.py \\\n",
+    "  --cache_dir .cache_training \\\n",
+    "  --model_name_or_path t5-base \\\n",
+    "  --custom_model t5_custom \\\n",
+    "  --train_file data/train.json  \\\n",
+    "  --validation_file data/valid.json \\\n",
+    "  --test_file data/test.json \\\n",
+    "  --per_device_train_batch_size 8 \\\n",
+    "  --per_device_eval_batch_size 8 \\\n",
+    "  --do_train \\\n",
+    "  --do_eval \\\n",
+    "  --max_seq_length 128 \\\n",
+    "  --learning_rate 2e-5 \\\n",
+    "  --max_eval_samples 2000 \\\n",
+    "  --max_steps 2500 \\\n",
+    "  --num_train_epochs 1 \\\n",
+    "  --save_strategy steps \\\n",
+    "  --save_steps 250 \\\n",
+    "  --save_total_limit 5 \\\n",
+    "  --logging_strategy steps \\\n",
+    "  --logging_steps 100 \\\n",
+    "  --eval_steps 250 \\\n",
+    "  --evaluation_strategy steps \\\n",
+    "  --metric_for_best_model accuracy \\\n",
+    "  --greater_is_better True \\\n",
+    "  --load_best_model_at_end True \\\n",
+    "  --output_dir out/t5"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "02/16/2023 16:52:57 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
+      "02/16/2023 16:52:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
+      "_n_gpu=1,\n",
+      "adafactor=False,\n",
+      "adam_beta1=0.9,\n",
+      "adam_beta2=0.999,\n",
+      "adam_epsilon=1e-08,\n",
+      "auto_find_batch_size=False,\n",
+      "bf16=False,\n",
+      "bf16_full_eval=False,\n",
+      "data_seed=None,\n",
+      "dataloader_drop_last=False,\n",
+      "dataloader_num_workers=0,\n",
+      "dataloader_pin_memory=True,\n",
+      "ddp_bucket_cap_mb=None,\n",
+      "ddp_find_unused_parameters=None,\n",
+      "ddp_timeout=1800,\n",
+      "debug=[],\n",
+      "deepspeed=None,\n",
+      "disable_tqdm=False,\n",
+      "do_eval=True,\n",
+      "do_predict=True,\n",
+      "do_train=False,\n",
+      "eval_accumulation_steps=None,\n",
+      "eval_delay=0,\n",
+      "eval_steps=250,\n",
+      "evaluation_strategy=steps,\n",
+      "fp16=False,\n",
+      "fp16_backend=auto,\n",
+      "fp16_full_eval=False,\n",
+      "fp16_opt_level=O1,\n",
+      "fsdp=[],\n",
+      "fsdp_min_num_params=0,\n",
+      "fsdp_transformer_layer_cls_to_wrap=None,\n",
+      "full_determinism=False,\n",
+      "gradient_accumulation_steps=1,\n",
+      "gradient_checkpointing=False,\n",
+      "greater_is_better=True,\n",
+      "group_by_length=False,\n",
+      "half_precision_backend=auto,\n",
+      "hub_model_id=None,\n",
+      "hub_private_repo=False,\n",
+      "hub_strategy=every_save,\n",
+      "hub_token=<HUB_TOKEN>,\n",
+      "ignore_data_skip=False,\n",
+      "include_inputs_for_metrics=False,\n",
+      "jit_mode_eval=False,\n",
+      "label_names=None,\n",
+      "label_smoothing_factor=0.0,\n",
+      "learning_rate=2e-05,\n",
+      "length_column_name=length,\n",
+      "load_best_model_at_end=True,\n",
+      "local_rank=-1,\n",
+      "log_level=passive,\n",
+      "log_level_replica=passive,\n",
+      "log_on_each_node=True,\n",
+      "logging_dir=out/t5_results/runs/Feb16_16-52-56_DESKTOP-R7JO8BQ,\n",
+      "logging_first_step=False,\n",
+      "logging_nan_inf_filter=True,\n",
+      "logging_steps=100,\n",
+      "logging_strategy=steps,\n",
+      "lr_scheduler_type=linear,\n",
+      "max_grad_norm=1.0,\n",
+      "max_steps=2500,\n",
+      "metric_for_best_model=accuracy,\n",
+      "mp_parameters=,\n",
+      "no_cuda=False,\n",
+      "num_train_epochs=1.0,\n",
+      "optim=adamw_hf,\n",
+      "optim_args=None,\n",
+      "output_dir=out/t5_results,\n",
+      "overwrite_output_dir=False,\n",
+      "past_index=-1,\n",
+      "per_device_eval_batch_size=8,\n",
+      "per_device_train_batch_size=8,\n",
+      "prediction_loss_only=False,\n",
+      "push_to_hub=False,\n",
+      "push_to_hub_model_id=None,\n",
+      "push_to_hub_organization=None,\n",
+      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
+      "ray_scope=last,\n",
+      "remove_unused_columns=True,\n",
+      "report_to=[],\n",
+      "resume_from_checkpoint=None,\n",
+      "run_name=out/t5_results,\n",
+      "save_on_each_node=False,\n",
+      "save_steps=250,\n",
+      "save_strategy=steps,\n",
+      "save_total_limit=5,\n",
+      "seed=42,\n",
+      "sharded_ddp=[],\n",
+      "skip_memory_metrics=True,\n",
+      "tf32=None,\n",
+      "torch_compile=False,\n",
+      "torch_compile_backend=None,\n",
+      "torch_compile_mode=None,\n",
+      "torchdynamo=None,\n",
+      "tpu_metrics_debug=False,\n",
+      "tpu_num_cores=None,\n",
+      "use_ipex=False,\n",
+      "use_legacy_prediction_loop=False,\n",
+      "use_mps_device=False,\n",
+      "warmup_ratio=0.0,\n",
+      "warmup_steps=0,\n",
+      "weight_decay=0.0,\n",
+      "xpu_backend=None,\n",
+      ")\n",
+      "02/16/2023 16:52:57 - INFO - __main__ - load a local file for train: data/train.json\n",
+      "02/16/2023 16:52:57 - INFO - __main__ - load a local file for validation: data/valid.json\n",
+      "02/16/2023 16:52:57 - INFO - __main__ - load a local file for test: data/test.json\n",
+      "02/16/2023 16:52:58 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
+      "02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
+      "02/16/2023 16:52:58 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
+      "02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "02/16/2023 16:52:58 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
+      "02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
+      "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 769.41it/s]\n",
+      "[INFO|configuration_utils.py:658] 2023-02-16 16:52:58,326 >> loading configuration file out/t5/config.json\n",
+      "[INFO|configuration_utils.py:712] 2023-02-16 16:52:58,327 >> Model config T5Config {\n",
+      "  \"_name_or_path\": \"out/t5\",\n",
+      "  \"architectures\": [\n",
+      "    \"T5ForClassification\"\n",
+      "  ],\n",
+      "  \"d_ff\": 3072,\n",
+      "  \"d_kv\": 64,\n",
+      "  \"d_model\": 768,\n",
+      "  \"decoder_start_token_id\": 0,\n",
+      "  \"dense_act_fn\": \"relu\",\n",
+      "  \"dropout_rate\": 0.1,\n",
+      "  \"eos_token_id\": 1,\n",
+      "  \"feed_forward_proj\": \"relu\",\n",
+      "  \"id2label\": {\n",
+      "    \"0\": 0,\n",
+      "    \"1\": 1,\n",
+      "    \"2\": 2,\n",
+      "    \"3\": 3\n",
+      "  },\n",
+      "  \"initializer_factor\": 1.0,\n",
+      "  \"is_encoder_decoder\": true,\n",
+      "  \"is_gated_act\": false,\n",
+      "  \"label2id\": {\n",
+      "    \"0\": 0,\n",
+      "    \"1\": 1,\n",
+      "    \"2\": 2,\n",
+      "    \"3\": 3\n",
+      "  },\n",
+      "  \"layer_norm_epsilon\": 1e-06,\n",
+      "  \"model_type\": \"t5\",\n",
+      "  \"n_positions\": 512,\n",
+      "  \"num_decoder_layers\": 12,\n",
+      "  \"num_heads\": 12,\n",
+      "  \"num_layers\": 12,\n",
+      "  \"output_past\": true,\n",
+      "  \"pad_token_id\": 0,\n",
+      "  \"relative_attention_max_distance\": 128,\n",
+      "  \"relative_attention_num_buckets\": 32,\n",
+      "  \"task_specific_params\": {\n",
+      "    \"summarization\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"length_penalty\": 2.0,\n",
+      "      \"max_length\": 200,\n",
+      "      \"min_length\": 30,\n",
+      "      \"no_repeat_ngram_size\": 3,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"summarize: \"\n",
+      "    },\n",
+      "    \"translation_en_to_de\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to German: \"\n",
+      "    },\n",
+      "    \"translation_en_to_fr\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to French: \"\n",
+      "    },\n",
+      "    \"translation_en_to_ro\": {\n",
+      "      \"early_stopping\": true,\n",
+      "      \"max_length\": 300,\n",
+      "      \"num_beams\": 4,\n",
+      "      \"prefix\": \"translate English to Romanian: \"\n",
+      "    }\n",
+      "  },\n",
+      "  \"torch_dtype\": \"float32\",\n",
+      "  \"transformers_version\": \"4.26.1\",\n",
+      "  \"use_cache\": true,\n",
+      "  \"use_hidden_states\": false,\n",
+      "  \"vocab_size\": 32128\n",
+      "}\n",
+      "\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file spiece.model\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file added_tokens.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file special_tokens_map.json\n",
+      "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer_config.json\n",
+      "02/16/2023 16:52:58 - INFO - __main__ - Using hidden states in model: False\n",
+      "-------------------------------------------------------- Using hidden: False\n",
+      "02/16/2023 16:52:58 - INFO - __main__ - Using implementation from class: T5ForClassification\n",
+      "[INFO|modeling_utils.py:2272] 2023-02-16 16:52:58,375 >> loading weights file out/t5/pytorch_model.bin\n",
+      "[INFO|modeling_utils.py:2857] 2023-02-16 16:53:00,690 >> All model checkpoint weights were used when initializing T5ForClassification.\n",
+      "\n",
+      "[INFO|modeling_utils.py:2865] 2023-02-16 16:53:00,690 >> All the weights of T5ForClassification were initialized from the model checkpoint at out/t5.\n",
+      "If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForClassification for predictions without further training.\n",
+      "T5ForClassification(\n",
+      "  (shared): Embedding(32128, 768)\n",
+      "  (encoder): T5Stack(\n",
+      "    (embed_tokens): Embedding(32128, 768)\n",
+      "    (block): ModuleList(\n",
+      "      (0): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (relative_attention_bias): Embedding(32, 12)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (1): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (2): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (3): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (4): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (5): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (6): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (7): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (8): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (9): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (10): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (11): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (final_layer_norm): T5LayerNorm()\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "  )\n",
+      "  (decoder): T5Stack(\n",
+      "    (embed_tokens): Embedding(32128, 768)\n",
+      "    (block): ModuleList(\n",
+      "      (0): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (relative_attention_bias): Embedding(32, 12)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (1): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (2): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (3): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (4): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (5): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (6): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (7): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (8): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (9): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (10): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (11): T5Block(\n",
+      "        (layer): ModuleList(\n",
+      "          (0): T5LayerSelfAttention(\n",
+      "            (SelfAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (1): T5LayerCrossAttention(\n",
+      "            (EncDecAttention): T5Attention(\n",
+      "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+      "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "          (2): T5LayerFF(\n",
+      "            (DenseReluDense): T5DenseActDense(\n",
+      "              (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
+      "              (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
+      "              (dropout): Dropout(p=0.1, inplace=False)\n",
+      "              (act): ReLU()\n",
+      "            )\n",
+      "            (layer_norm): T5LayerNorm()\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "    (final_layer_norm): T5LayerNorm()\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "  )\n",
+      "  (lm_head): T5ClassificationHead(\n",
+      "    (dense_in): Linear(in_features=768, out_features=768, bias=True)\n",
+      "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+      "    (dense_out): Linear(in_features=768, out_features=4, bias=True)\n",
+      "    (dropout): Dropout(p=0.1, inplace=False)\n",
+      "  )\n",
+      ")\n",
+      "Running tokenizer on dataset:   0%|                     | 0/120 [00:00<?, ?ba/s]02/16/2023 16:53:00 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-461127b59c7ea04e.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████| 120/120 [00:08<00:00, 14.36ba/s]\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 16:53:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bbee377e7bea95e7.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.94ba/s]\n",
+      "Running tokenizer on dataset:   0%|                       | 0/4 [00:00<?, ?ba/s]02/16/2023 16:53:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-4e0cbdadca2e6dc6.arrow\n",
+      "Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 16.87ba/s]\n",
+      "02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 0-class\n",
+      "02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 1-class\n",
+      "02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 2-class\n",
+      "02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 3-class\n",
+      "[INFO|trainer.py:511] 2023-02-16 16:53:12,738 >> max_steps is given, it will override any value given in num_train_epochs\n",
+      "02/16/2023 16:53:12 - INFO - __main__ - *** Evaluate ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 16:53:12,739 >> The following columns in the evaluation set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 16:53:12,740 >> ***** Running Evaluation *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 16:53:12,740 >>   Num examples = 2000\n",
+      "[INFO|trainer.py:2969] 2023-02-16 16:53:12,740 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 250/250 [00:39<00:00,  6.26it/s]\n",
+      "***** eval metrics *****\n",
+      "  eval_accuracy           =     0.4675\n",
+      "  eval_loss               =     1.2139\n",
+      "  eval_runtime            = 0:00:40.56\n",
+      "  eval_samples            =       2000\n",
+      "  eval_samples_per_second =     49.303\n",
+      "  eval_steps_per_second   =      6.163\n",
+      "02/16/2023 16:53:53 - INFO - __main__ - *** Predict ***\n",
+      "[INFO|trainer.py:710] 2023-02-16 16:53:53,307 >> The following columns in the test set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`,  you can safely ignore this message.\n",
+      "[INFO|trainer.py:2964] 2023-02-16 16:53:53,308 >> ***** Running Prediction *****\n",
+      "[INFO|trainer.py:2966] 2023-02-16 16:53:53,308 >>   Num examples = 3800\n",
+      "[INFO|trainer.py:2969] 2023-02-16 16:53:53,308 >>   Batch size = 8\n",
+      "100%|█████████████████████████████████████████| 475/475 [01:15<00:00,  6.32it/s]\n",
+      "02/16/2023 16:55:08 - INFO - __main__ - ***** Predict results None *****\n",
+      "[INFO|modelcard.py:449] 2023-02-16 16:55:09,179 >> Dropping the following result as it does not have all the necessary fields:\n",
+      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python run_glue.py \\\n",
+    "  --cache_dir .cache_training \\\n",
+    "  --model_name_or_path out/t5 \\\n",
+    "  --custom_model t5_custom \\\n",
+    "  --train_file data/train.json  \\\n",
+    "  --validation_file data/valid.json \\\n",
+    "  --test_file data/test.json \\\n",
+    "  --per_device_train_batch_size 8 \\\n",
+    "  --per_device_eval_batch_size 8 \\\n",
+    "  --do_eval \\\n",
+    "  --do_predict \\\n",
+    "  --max_seq_length 128 \\\n",
+    "  --learning_rate 2e-5 \\\n",
+    "  --max_eval_samples 2000 \\\n",
+    "  --max_steps 2500 \\\n",
+    "  --num_train_epochs 1 \\\n",
+    "  --save_strategy steps \\\n",
+    "  --save_steps 250 \\\n",
+    "  --save_total_limit 5 \\\n",
+    "  --logging_strategy steps \\\n",
+    "  --logging_steps 100 \\\n",
+    "  --eval_steps 250 \\\n",
+    "  --evaluation_strategy steps \\\n",
+    "  --metric_for_best_model accuracy \\\n",
+    "  --greater_is_better True \\\n",
+    "  --load_best_model_at_end True \\\n",
+    "  --output_dir out/t5_results"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[0;39m0.4675000011920929\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cat out/t5_results/eval_results.json | jq .eval_accuracy"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FLAN-T5 - Zero shot"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8de84b2cf8ed46488a6eb0bb4e0b11ef",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a0821410f9c64d608250175972c7e65e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)\"pytorch_model.bin\";:   0%|          | 0.00/990M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "10ea3442bf2e4af88050e6b6bf9ced14",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aef52d2ec9594d21a1e328b8cd9b78e4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "68abdd279b314c9794d8d7c697f534cd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)\"spiece.model\";:   0%|          | 0.00/792k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ee44259445634805b86f28a00817e036",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5a7a9a0ca77d4cfba7132ee76fec44e7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline\n",
+    "from datasets import load_dataset\n",
+    "from tqdm.notebook import tqdm\n",
+    "model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-base\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-base\")\n",
+    "\n",
+    "pipeline = pipeline(\"text2text-generation\", model=model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using custom data configuration default-20e4aa4ef5e587fb\n",
+      "Found cached dataset json (/home/jacob/.cache/huggingface/datasets/json/default-20e4aa4ef5e587fb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "575936fbea7d4ceabb455ed732bace7e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /home/jacob/.cache/huggingface/datasets/json/default-20e4aa4ef5e587fb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-6a7c4b64ea03ea9d.arrow\n"
+     ]
+    }
+   ],
+   "source": [
+    "MAP_LABEL_TRANSLATION = {\n",
+    "    0: 'world',\n",
+    "    1: 'sport',\n",
+    "    2: 'business',\n",
+    "    3: 'scitech'\n",
+    "}\n",
+    "dataset = load_dataset(\"json\", data_files={'test': 'data/test.json'})\n",
+    "\n",
+    "dataset['test'] = dataset['test'].map(lambda x: { 'label': MAP_LABEL_TRANSLATION[x['label']], 'text': x['text']})"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "T5ForConditionalGeneration(\n",
+       "  (shared): Embedding(32128, 768)\n",
+       "  (encoder): T5Stack(\n",
+       "    (embed_tokens): Embedding(32128, 768)\n",
+       "    (block): ModuleList(\n",
+       "      (0): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (relative_attention_bias): Embedding(32, 12)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (1): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (2): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (3): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (4): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (5): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (6): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (7): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (8): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (9): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (10): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (11): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "    (final_layer_norm): T5LayerNorm()\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  )\n",
+       "  (decoder): T5Stack(\n",
+       "    (embed_tokens): Embedding(32128, 768)\n",
+       "    (block): ModuleList(\n",
+       "      (0): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (relative_attention_bias): Embedding(32, 12)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (1): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (2): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (3): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (4): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (5): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (6): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (7): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (8): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (9): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (10): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "      (11): T5Block(\n",
+       "        (layer): ModuleList(\n",
+       "          (0): T5LayerSelfAttention(\n",
+       "            (SelfAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (1): T5LayerCrossAttention(\n",
+       "            (EncDecAttention): T5Attention(\n",
+       "              (q): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (k): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (v): Linear(in_features=768, out_features=768, bias=False)\n",
+       "              (o): Linear(in_features=768, out_features=768, bias=False)\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "          (2): T5LayerFF(\n",
+       "            (DenseReluDense): T5DenseGatedActDense(\n",
+       "              (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
+       "              (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "              (act): NewGELUActivation()\n",
+       "            )\n",
+       "            (layer_norm): T5LayerNorm()\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "    (final_layer_norm): T5LayerNorm()\n",
+       "    (dropout): Dropout(p=0.1, inplace=False)\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=768, out_features=32128, bias=False)\n",
+       ")"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Validation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.7560526315789474\n"
+     ]
+    }
+   ],
+   "source": [
+    "correct = 0\n",
+    "labels = \"sport, world, business, scitech\"\n",
+    "\n",
+    "for entry in dataset['test']:\n",
+    "    prompt = f\"classify with possible labels: {labels}\\ntext: {entry['text']}\"\n",
+    "    output = pipeline(prompt, do_sample=False)[0]['generated_text'].lower()\n",
+    "    if output == entry['label']:\n",
+    "        correct += 1\n",
+    "\n",
+    "accuracy = correct / len(dataset['test'])\n",
+    "print(f\"Accuracy: {accuracy}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Summary"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "|        |Roberta|GPT2 |T5   |Flan-T5|\n",
+    "|--------|-------|-----|-----|-------|\n",
+    "|Accuracy|92.2%  |91.9%|46.7%|75.6% |"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.9 ('ugp')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "4f917e9727e89f2278497f95f2732cc5b9cb99f840615e0399f81b235c1c2211"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/roberta.py b/roberta.py
new file mode 100644
index 0000000..962d237
--- /dev/null
+++ b/roberta.py
@@ -0,0 +1,54 @@
+from typing import Optional, Union, Tuple
+
+import torch
+from torch import nn
+from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss
+from transformers import RobertaForSequenceClassification, RobertaModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+
+# Simple version #
+
+class RobertaClassificationHeadCustomSimple(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense_1 = nn.Linear(hidden_size, 4 * hidden_size)
+        self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.out_proj = nn.Linear(hidden_size, config.num_labels)
+        self.activation = nn.GELU()
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+
+        x = self.dense_1(x)
+        x = self.activation(x)
+        x = self.dropout(x)
+
+        x = self.dense_2(x)
+        x = self.activation(x)
+        x = self.dropout(x)
+
+        x = self.out_proj(x)
+        return x
+
+
+class RobertaForSequenceClassificationCustomSimple(RobertaForSequenceClassification):
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.roberta = RobertaModel(config, add_pooling_layer=False)
+        self.classifier = RobertaClassificationHeadCustomSimple(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
diff --git a/run_glue.py b/run_glue.py
new file mode 100644
index 0000000..f835943
--- /dev/null
+++ b/run_glue.py
@@ -0,0 +1,685 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on GLUE."""
+# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
+
+import logging
+import os
+import random
+import sys
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+import numpy as np
+from datasets import load_dataset
+
+import evaluate
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    HfArgumentParser,
+    PretrainedConfig,
+    Trainer,
+    TrainingArguments,
+    default_data_collator,
+    set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version, send_example_telemetry
+from transformers.utils.versions import require_version
+
+from roberta import RobertaForSequenceClassificationCustomSimple
+from gpt2 import GPT2ForSequenceClassificationCustom
+from t5 import T5ForClassification
+from transformers import BartForSequenceClassification
+
+MODEL_NAME_TO_CLASS = {  # allowed `custom_model` values -> class used to load the model
+    'roberta_simple': RobertaForSequenceClassificationCustomSimple,
+    'gpt2_hidden': GPT2ForSequenceClassificationCustom,
+    't5_custom': T5ForClassification,
+    'bart_base': BartForSequenceClassification,
+}
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
+check_min_version("4.23.0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
+
+task_to_keys = {  # GLUE task name -> (first, second) text column names; None = no second column
+    "cola": ("sentence", None),
+    "mnli": ("premise", "hypothesis"),
+    "mrpc": ("sentence1", "sentence2"),
+    "qnli": ("question", "sentence"),
+    "qqp": ("question1", "question2"),
+    "rte": ("sentence1", "sentence2"),
+    "sst2": ("sentence", None),
+    "stsb": ("sentence1", "sentence2"),
+    "wnli": ("sentence1", "sentence2"),
+}
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    task_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
+    )
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": (
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    pad_to_max_length: bool = field(
+        default=True,
+        metadata={
+            "help": (
+                "Whether to pad all samples to `max_seq_length`. "
+                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+            )
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of training examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+                "value if set."
+            )
+        },
+    )
+    max_predict_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+                "value if set."
+            )
+        },
+    )
+    train_file: Optional[str] = field(
+        default=None, metadata={"help": "A csv or a json file containing the training data."}
+    )
+    validation_file: Optional[str] = field(
+        default=None, metadata={"help": "A csv or a json file containing the validation data."}
+    )
+    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})
+
+    def __post_init__(self):
+        if self.task_name is not None:
+            self.task_name = self.task_name.lower()
+            if self.task_name not in task_to_keys.keys():
+                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
+        elif self.dataset_name is not None:
+            pass
+        elif self.train_file is None or self.validation_file is None:
+            raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
+        else:
+            train_extension = self.train_file.split(".")[-1]
+            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+            validation_extension = self.validation_file.split(".")[-1]
+            assert (
+                validation_extension == train_extension
+            ), "`validation_file` should have the same extension (csv or json) as `train_file`."
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
+                "with private models)."
+            )
+        },
+    )
+    ignore_mismatched_sizes: bool = field(
+        default=False,
+        metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
+    )
+    custom_model: str = field(
+        default=None,
+        metadata={
+            "help": "Use custom implementation from available list",
+            "choices": list(MODEL_NAME_TO_CLASS.keys()),
+        },
+    )
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    send_example_telemetry("run_glue", model_args, data_args)
+
+    if 'bart' in model_args.model_name_or_path:
+        model_args.ignore_mismatched_sizes = True
+    
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
+    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
+    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
+    # label if at least two columns are provided.
+    #
+    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
+    # single column. You can easily tweak this behavior (see below)
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.task_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            "glue",
+            data_args.task_name,
+            cache_dir=model_args.cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    elif data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    else:
+        # Loading a dataset from your local files.
+        # CSV/JSON training and evaluation files are needed.
+        data_files = {"train": data_args.train_file, "validation": data_args.validation_file}
+
+        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
+        # when you use `do_predict` without specifying a GLUE benchmark task.
+        if training_args.do_predict:
+            if data_args.test_file is not None:
+                train_extension = data_args.train_file.split(".")[-1]
+                test_extension = data_args.test_file.split(".")[-1]
+                assert (
+                    test_extension == train_extension
+                ), "`test_file` should have the same extension (csv or json) as `train_file`."
+                data_files["test"] = data_args.test_file
+            else:
+                raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
+
+        for key in data_files.keys():
+            logger.info(f"load a local file for {key}: {data_files[key]}")
+
+        if data_args.train_file.endswith(".csv"):
+            # Loading a dataset from local csv files
+            raw_datasets = load_dataset(
+                "csv",
+                data_files=data_files,
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+            )
+        else:
+            # Loading a dataset from local json files
+            raw_datasets = load_dataset(
+                "json",
+                data_files=data_files,
+                cache_dir=model_args.cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+            )
+    # See more about loading any type of standard or custom dataset at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Labels
+    if data_args.task_name is not None:
+        is_regression = data_args.task_name == "stsb"
+        if not is_regression:
+            label_list = raw_datasets["train"].features["label"].names
+            num_labels = len(label_list)
+        else:
+            num_labels = 1
+    else:
+        # Trying to have good defaults here, don't hesitate to tweak to your needs.
+        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
+        if is_regression:
+            num_labels = 1
+        else:
+            # A useful fast method:
+            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
+            label_list = raw_datasets["train"].unique("label")
+            label_list.sort()  # Let's sort it for determinism
+            num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        use_fast=model_args.use_fast_tokenizer,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    custom_model = model_args.custom_model
+
+    if custom_model is not None:
+        # Check model and implementation is the same
+        if 'roberta' in custom_model and 'roberta' not in model_args.model_name_or_path:
+            raise RuntimeError('Model and custom implementation should be the same type: RoBERTa')
+        elif 'gpt2' in custom_model and 'gpt2' not in model_args.model_name_or_path:
+            raise RuntimeError('Model and custom implementation should be the same type: GPT-2')
+
+        # Set custom configuration in model configuration
+        config.use_hidden_states = 'hidden' in custom_model
+        logger.info(f'Using hidden states in model: {config.use_hidden_states}')
+
+        print(f'-------------------------------------------------------- Using hidden: {config.use_hidden_states}')
+
+        # Get class to initialize model
+        model_cls = MODEL_NAME_TO_CLASS[custom_model]
+    else:
+        model_cls = AutoModelForSequenceClassification
+    logger.info(f'Using implementation from class: {model_cls.__name__}')
+    model = model_cls.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,        
+    )
+
+    print(model)
+
+    if 'gpt2' in tokenizer.name_or_path and tokenizer.pad_token is None:
+        logger.info(f'Set PAD token to EOS: {tokenizer.eos_token}')
+        tokenizer._pad_token = tokenizer.eos_token
+        model.config.pad_token_id = model.config.eos_token_id
+
+    # Preprocessing the raw_datasets
+    if data_args.task_name is not None:
+         sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
+
+    # Preprocessing the raw_datasets
+    if data_args.task_name is not None:
+        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
+    else:
+        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
+        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
+        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
+            sentence1_key, sentence2_key = "sentence1", "sentence2"
+        else:
+            if len(non_label_column_names) >= 2:
+                sentence1_key, sentence2_key = non_label_column_names[:2]
+            else:
+                sentence1_key, sentence2_key = non_label_column_names[0], None
+
+    # Padding strategy
+    if data_args.pad_to_max_length:
+        padding = "max_length"
+    else:
+        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
+        padding = False
+
+    # Some models have set the order of the labels to use, so let's make sure we do use it.
+    label_to_id = None
+    if (
+        model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
+        and data_args.task_name is not None
+        and not is_regression
+    ):
+        # Some have all caps in their config, some don't.
+        label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
+        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
+            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
+        else:
+            logger.warning(
+                "Your model seems to have been trained with labels, but they don't match the dataset: ",
+                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                "\nIgnoring the model labels as a result.",
+            )
+    elif data_args.task_name is None and not is_regression:
+        label_to_id = {v: i for i, v in enumerate(label_list)}
+
+    if label_to_id is not None:
+        model.config.label2id = label_to_id
+        model.config.id2label = {id: label for label, id in config.label2id.items()}
+    elif data_args.task_name is not None and not is_regression:
+        model.config.label2id = {l: i for i, l in enumerate(label_list)}
+        model.config.id2label = {id: label for label, id in config.label2id.items()}
+
+    if data_args.max_seq_length > tokenizer.model_max_length:
+        logger.warning(
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+        )
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    def preprocess_function(examples):
+        # Tokenize the texts
+        args = (
+            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
+        )
+        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
+
+        # Map labels to IDs (not necessary for GLUE tasks)
+        if label_to_id is not None and "label" in examples:
+            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
+        return result
+
+    with training_args.main_process_first(desc="dataset map pre-processing"):
+        raw_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
+    if training_args.do_train:
+        if "train" not in raw_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = raw_datasets["train"]
+        if data_args.max_train_samples is not None:
+            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+            train_dataset = train_dataset.select(range(max_train_samples))
+
+    if training_args.do_eval:
+        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
+        if data_args.max_eval_samples is not None:
+            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+            label_to_indexes = defaultdict(list)
+            for index, eval_sample in enumerate(eval_dataset):
+                label_to_indexes[eval_sample['label']].append(index)
+            max_samples_per_label = int(max_eval_samples / len(label_to_indexes))
+            eval_sample_indexes = []
+            for label, indexes in label_to_indexes.items():
+                eval_sample_indexes.extend(indexes[:max_samples_per_label])
+                logger.info(f"Set {max_samples_per_label} samples for {label}-class")
+            eval_sample_indexes.sort()
+            eval_dataset = eval_dataset.select(eval_sample_indexes)
+
+    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
+        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
+        if data_args.max_predict_samples is not None:
+            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
+            predict_dataset = predict_dataset.select(range(max_predict_samples))
+
+    # Log a few random samples from the training set:
+    if training_args.do_train:
+        for index in random.sample(range(len(train_dataset)), 3):
+            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # Get the metric function
+    if data_args.task_name is not None:
+        metric = evaluate.load("glue", data_args.task_name)
+    else:
+        metric = evaluate.load("accuracy")
+
+    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
+    # predictions and label_ids field) and has to return a dictionary string to float.
+    def compute_metrics(p: EvalPrediction):
+        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
+        if data_args.task_name is not None:
+            result = metric.compute(predictions=preds, references=p.label_ids)
+            if len(result) > 1:
+                result["combined_score"] = np.mean(list(result.values())).item()
+            return result
+        elif is_regression:
+            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
+        else:
+            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
+
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
+    # we already did the padding.
+    if data_args.pad_to_max_length:
+        data_collator = default_data_collator
+    elif training_args.fp16:
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
+    else:
+        data_collator = None
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        compute_metrics=compute_metrics,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+
+    # Training
+    ignore_keys_for_eval = ['hidden_states', 'attentions', 'past_key_values']
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint, ignore_keys_for_eval=ignore_keys_for_eval)
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+        )
+        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+
+    # Evaluation
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        tasks = [data_args.task_name]
+        eval_datasets = [eval_dataset]
+        if data_args.task_name == "mnli":
+            tasks.append("mnli-mm")
+            valid_mm_dataset = raw_datasets["validation_mismatched"]
+            if data_args.max_eval_samples is not None:
+                max_eval_samples = min(len(valid_mm_dataset), data_args.max_eval_samples)
+                valid_mm_dataset = valid_mm_dataset.select(range(max_eval_samples))
+            eval_datasets.append(valid_mm_dataset)
+            combined = {}
+
+        for eval_dataset, task in zip(eval_datasets, tasks):
+            metrics = trainer.evaluate(eval_dataset=eval_dataset, ignore_keys=ignore_keys_for_eval)
+
+            max_eval_samples = (
+                data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
+            )
+            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
+
+            if task == "mnli-mm":
+                metrics = {k + "_mm": v for k, v in metrics.items()}
+            if task is not None and "mnli" in task:
+                combined.update(metrics)
+
+            trainer.log_metrics("eval", metrics)
+            trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics)
+
+    if training_args.do_predict:
+        logger.info("*** Predict ***")
+
+        # Loop to handle MNLI double evaluation (matched, mis-matched)
+        tasks = [data_args.task_name]
+        predict_datasets = [predict_dataset]
+        if data_args.task_name == "mnli":
+            tasks.append("mnli-mm")
+            predict_datasets.append(raw_datasets["test_mismatched"])
+
+        for predict_dataset, task in zip(predict_datasets, tasks):
+            # Removing the `label` columns because it contains -1 and Trainer won't like that.
+            predict_dataset = predict_dataset.remove_columns("label")
+            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict", ignore_keys=ignore_keys_for_eval).predictions
+            predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
+
+            output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
+            if trainer.is_world_process_zero():
+                with open(output_predict_file, "w") as writer:
+                    logger.info(f"***** Predict results {task} *****")
+                    writer.write("index\tprediction\n")
+                    for index, item in enumerate(predictions):
+                        if is_regression:
+                            writer.write(f"{index}\t{item:3.3f}\n")
+                        else:
+                            item = label_list[item]
+                            writer.write(f"{index}\t{item}\n")
+
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
+    if data_args.task_name is not None:
+        kwargs["language"] = "en"
+        kwargs["dataset_tags"] = "glue"
+        kwargs["dataset_args"] = data_args.task_name
+        kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"
+
+    if training_args.push_to_hub:
+        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)
+
+
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    # `index` is accepted so the function can be called with one positional
+    # argument, but it is not used: every call simply runs main().
+    main()
+
+
+# Script entry point: run main() only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
diff --git a/t5.py b/t5.py
new file mode 100644
index 0000000..cc042dc
--- /dev/null
+++ b/t5.py
@@ -0,0 +1,125 @@
+import torch
+import copy
+from torch import nn
+from transformers import T5PreTrainedModel, T5Config
+from transformers.models.t5.modeling_t5 import T5Stack
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+
+class T5ClassificationHead(nn.Module):
+    def __init__(self, config: T5Config):
+        super().__init__()
+
+        self.dense_in = nn.Linear(config.d_model, 768)
+        self.dense = nn.Linear(768, 768)
+        self.dense_out = nn.Linear(768, config.num_labels)
+        self.dropout = nn.Dropout(0.1)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]
+        x = self.dropout(x)
+        x = self.dense_in(x)
+        x = torch.relu(x)
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.relu(x)
+        x = self.dropout(x)
+        x = self.dense_out(x)
+
+        return x
+
+
+class T5ForClassification(T5PreTrainedModel):
+    def __init__(self, config: T5Config):
+        super().__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.is_encoder_decoder = False
+        self.encoder = T5Stack(encoder_config, self.shared)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = T5Stack(decoder_config, self.shared)
+
+        modules_to_freeze = [self.encoder.block[i].layer[0] for i in range(len(self.encoder.block))]
+        modules_to_freeze.extend([self.decoder.block[i].layer[0] for i in range(len(self.decoder.block))])
+        modules_to_freeze.extend([self.decoder.block[i].layer[1] for i in range(len(self.decoder.block))])
+
+        for module in modules_to_freeze:
+            for param in module.parameters():
+                param.requires_grad = False
+
+        self.lm_head = T5ClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+
+    def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            head_mask=None,
+            cross_attn_head_mask=None,
+            past_key_values=None,
+            inputs_embeds=None,
+            decoder_inputs_embeds=None,
+            use_cache=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            labels=None
+            ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.encoder(
+                input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                cross_attn_head_mask=cross_attn_head_mask,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        outputs = self.decoder(
+                input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                cross_attn_head_mask=cross_attn_head_mask,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+
+        logits = self.lm_head(outputs[0])
+
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+        )