{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "6d6f716e",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"import transformers\n",
"import torch\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e7f22bb8",
"metadata": {},
"outputs": [],
"source": [
"model = \"tiiuae/falcon-40b\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c5aa7b73",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(model)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b27c38b7",
"metadata": {},
"outputs": [],
"source": [
"vocab = tokenizer.get_vocab()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d82e4242",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"65024"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(vocab)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a0827f01",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['D',\n",
" 'z',\n",
" 'isia',\n",
" 'j',\n",
" 'Ġr',\n",
" 'ano',\n",
" 'Ġw',\n",
" 'ĠPoz',\n",
" 'n',\n",
" 'aniu',\n",
" 'Ġna',\n",
" 'ĠÅģ',\n",
" 'ÄĻ',\n",
" 'g',\n",
" 'ach',\n",
" 'Ġw',\n",
" 'yl',\n",
" 'Äħd',\n",
" 'owaÅĤ',\n",
" 'Ġl',\n",
" 'ata',\n",
" 'jÄħ',\n",
" 'cy',\n",
" 'Ġtal',\n",
" 'er',\n",
" 'z',\n",
" '.',\n",
" 'ĠPremier']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.tokenize('Dzisiaj rano w Poznaniu na Łęgach wylądował latający talerz. Premier')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1013bedc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Today',\n",
" 'Ġa',\n",
" 'Ġflying',\n",
" 'Ġsau',\n",
" 'cer',\n",
" 'Ġhas',\n",
" 'Ġlanded',\n",
" 'Ġin',\n",
" 'ĠPoz',\n",
" 'nan',\n",
" '.',\n",
" 'ĠThe',\n",
" 'Ġprime']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.tokenize('Today a flying saucer has landed in Poznan. The prime')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7da80f8a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-40b:\n",
"- configuration_RW.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
"A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-40b:\n",
"- modelling_RW.py\n",
". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "02beba8d3bab444fa90c6b3b22e38916",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading shards: 0%| | 0/9 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "14cc1a15b75540978f45085eb60072ab",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/9 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers\n",
"pip install xformers.\n",
"The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM'].\n"
]
}
],
"source": [
"pipeline = transformers.pipeline(\n",
" \"text-generation\",\n",
" model=model,\n",
" tokenizer=tokenizer,\n",
" torch_dtype=torch.bfloat16,\n",
" trust_remote_code=True,\n",
" device_map=\"auto\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "4f862bb1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/filipg/miniconda3/envs/torch2b/lib/python3.11/site-packages/transformers/generation/utils.py:1255: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)\n",
" warnings.warn(\n",
"Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.\n"
]
},
{
"data": {
"text/plain": [
"'Dzisiaj rano w Piotrkowie Trybunalskim wybuchła epidemia tajemniczego nowego wirusa. Premier w osobie Krzysztofa Rutkowskiego ogłosił stan wyjątkowy i areszt domowy. Tymczasem bary i restauracje otwierają się, a ludzie wchodzą do nich.\\nBary i restauracje otwierają się, a ludzie wchodzą do nich.\\nKiedy wchodzę do baru, to widzę, że po całym mieście otwierają się bary i restauracje. Ludzie wchodzą do tych miejsc, nie zdając sobie sprawy, że wirus może przenosić się w powietrzu. Mimo stanu wyjątkowego, ludzie jakby nic nie widzieli i nie słyszeli.\\nJak można'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline.model.eval()\n",
"\n",
"sequences = pipeline(\n",
" \"Dzisiaj rano w Piotrkowie Trybunalskim wybuchła epidemia tajemniczego nowego wirusa. Premier\",\n",
" max_length=200,\n",
" do_sample=True,\n",
" top_k=100,\n",
" temperature=0.7,\n",
" num_return_sequences=1,\n",
" eos_token_id=tokenizer.eos_token_id,\n",
")\n",
"\n",
"sequences[0]['generated_text']\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c2affa6e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RWForCausalLM(\n",
" (transformer): RWModel(\n",
" (word_embeddings): Embedding(65024, 8192)\n",
" (h): ModuleList(\n",
" (0-59): 60 x DecoderLayer(\n",
" (ln_attn): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)\n",
" (ln_mlp): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)\n",
" (self_attention): Attention(\n",
" (maybe_rotary): RotaryEmbedding()\n",
" (query_key_value): Linear(in_features=8192, out_features=9216, bias=False)\n",
" (dense): Linear(in_features=8192, out_features=8192, bias=False)\n",
" (attention_dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (mlp): MLP(\n",
" (dense_h_to_4h): Linear(in_features=8192, out_features=32768, bias=False)\n",
" (act): GELU(approximate='none')\n",
" (dense_4h_to_h): Linear(in_features=32768, out_features=8192, bias=False)\n",
" )\n",
" )\n",
" )\n",
" (ln_f): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (lm_head): Linear(in_features=8192, out_features=65024, bias=False)\n",
")"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline.model"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "839cd492",
"metadata": {},
"outputs": [],
"source": [
"tokens = tokenizer.tokenize(\"Dzisiaj rano w Madrycie wylądował latający talerz. Premier\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "d64d3368",
"metadata": {},
"outputs": [],
"source": [
"token_ids = tokenizer.convert_tokens_to_ids(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7c11a72c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[47,\n",
" 101,\n",
" 33623,\n",
" 85,\n",
" 392,\n",
" 2808,\n",
" 251,\n",
" 5509,\n",
" 547,\n",
" 9228,\n",
" 251,\n",
" 1985,\n",
" 27255,\n",
" 38281,\n",
" 282,\n",
" 785,\n",
" 14585,\n",
" 2586,\n",
" 3438,\n",
" 246,\n",
" 101,\n",
" 25,\n",
" 15222]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_ids"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6813774c",
"metadata": {},
"outputs": [],
"source": [
"output = pipeline.model(torch.tensor([token_ids]))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "b801dd24",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CausalLMOutputWithCrossAttentions(loss={'logits': tensor([[[ -8.7500, -10.6250, -11.9375, ..., -10.8125, -12.0000, -9.5000],\n",
" [ -9.1250, -10.5000, -12.1250, ..., -8.1250, -9.4375, -7.1562],\n",
" [-16.1250, -22.2500, -24.0000, ..., -19.1250, -19.5000, -18.6250],\n",
" ...,\n",
" [-14.1250, -16.0000, -20.2500, ..., -12.3750, -17.7500, -11.3125],\n",
" [-13.1875, -14.3125, -18.8750, ..., -15.3125, -17.6250, -14.6250],\n",
" [-13.5000, -14.4375, -18.0000, ..., -12.5000, -16.5000, -8.8750]]],\n",
" dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>), 'past_key_values': ((tensor([[[ 0.5547, -1.4062, -0.3340, ..., 0.1514, -0.9297, -1.0312],\n",
" [-0.4785, -1.4062, -0.4316, ..., 0.0583, -1.4062, 0.1992],\n",
" [-2.2500, -2.3281, 0.4707, ..., -1.1562, -0.1973, 0.9883],\n",
" ...,\n",
" [-0.5508, 1.2656, 0.2754, ..., 0.0527, -1.4062, 0.1992],\n",
" [-0.6953, 1.1406, -0.3242, ..., -0.9414, -0.6211, -1.1953],\n",
" [-1.4141, 1.2656, 0.2314, ..., -0.1182, -0.2041, -0.2812]],\n",
"\n",
" [[ 0.5547, -1.4062, -0.3340, ..., 0.1514, -0.9297, -1.0312],\n",
" [-0.4785, -1.4062, -0.4316, ..., 0.0583, -1.4062, 0.1992],\n",
" [-2.2500, -2.3281, 0.4707, ..., -1.1562, -0.1973, 0.9883],\n",
" ...,\n",
" [-0.5508, 1.2656, 0.2754, ..., 0.0527, -1.4062, 0.1992],\n",
" [-0.6953, 1.1406, -0.3242, ..., -0.9414, -0.6211, -1.1953],\n",
" [-1.4141, 1.2656, 0.2314, ..., -0.1182, -0.2041, -0.2812]],\n",
"\n",
" [[ 0.5547, -1.4062, -0.3340, ..., 0.1514, -0.9297, -1.0312],\n",
" [-0.4785, -1.4062, -0.4316, ..., 0.0583, -1.4062, 0.1992],\n",
" [-2.2500, -2.3281, 0.4707, ..., -1.1562, -0.1973, 0.9883],\n",
" ...,\n",
" [-0.5508, 1.2656, 0.2754, ..., 0.0527, -1.4062, 0.1992],\n",
" [-0.6953, 1.1406, -0.3242, ..., -0.9414, -0.6211, -1.1953],\n",
" [-1.4141, 1.2656, 0.2314, ..., -0.1182, -0.2041, -0.2812]],\n",
"\n",
" ...,\n",
"\n",
" [[ 0.4551, -0.1377, -0.2383, ..., 0.9648, -1.7422, 0.1562],\n",
" [ 0.2422, -1.1875, -0.4629, ..., -0.1816, -0.8867, -0.7070],\n",
" [-1.3672, -1.3750, -1.0781, ..., 0.2119, 1.6172, 0.6758],\n",
" ...,\n",
" [ 0.0078, -0.5391, 0.4355, ..., -0.1787, -0.8945, -0.7070],\n",
" [-0.1738, 0.0957, 0.4004, ..., 0.7578, -1.8203, 1.6328],\n",
" [-0.4863, 1.2500, -2.1094, ..., 0.9961, 1.2109, 1.5938]],\n",
"\n",
" [[ 0.4551, -0.1377, -0.2383, ..., 0.9648, -1.7422, 0.1562],\n",
" [ 0.2422, -1.1875, -0.4629, ..., -0.1816, -0.8867, -0.7070],\n",
" [-1.3672, -1.3750, -1.0781, ..., 0.2119, 1.6172, 0.6758],\n",
" ...,\n",
" [ 0.0078, -0.5391, 0.4355, ..., -0.1787, -0.8945, -0.7070],\n",
" [-0.1738, 0.0957, 0.4004, ..., 0.7578, -1.8203, 1.6328],\n",
" [-0.4863, 1.2500, -2.1094, ..., 0.9961, 1.2109, 1.5938]],\n",
"\n",
" [[ 0.4551, -0.1377, -0.2383, ..., 0.9648, -1.7422, 0.1562],\n",
" [ 0.2422, -1.1875, -0.4629, ..., -0.1816, -0.8867, -0.7070],\n",
" [-1.3672, -1.3750, -1.0781, ..., 0.2119, 1.6172, 0.6758],\n",
" ...,\n",
" [ 0.0078, -0.5391, 0.4355, ..., -0.1787, -0.8945, -0.7070],\n",
" [-0.1738, 0.0957, 0.4004, ..., 0.7578, -1.8203, 1.6328],\n",
" [-0.4863, 1.2500, -2.1094, ..., 0.9961, 1.2109, 1.5938]]],\n",
" dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>), tensor([[[-0.0036, 0.0417, 0.0364, ..., -0.0087, -0.0391, -0.0474],\n",
" [-0.0889, 0.0193, -0.0654, ..., -0.0253, 0.0302, -0.0610],\n",
" [-0.0591, -0.0118, -0.1279, ..., -0.1592, 0.1338, 0.0255],\n",
" ...,\n",
" [-0.0889, 0.0193, -0.0654, ..., -0.0253, 0.0302, -0.0610],\n",
" [ 0.0087, 0.0205, 0.0557, ..., 0.0085, 0.0039, -0.0454],\n",
" [-0.0659, 0.0427, 0.1006, ..., 0.1055, -0.0527, -0.0339]],\n",
"\n",
" [[-0.0036, 0.0417, 0.0364, ..., -0.0087, -0.0391, -0.0474],\n",
" [-0.0889, 0.0193, -0.0654, ..., -0.0253, 0.0302, -0.0610],\n",
" [-0.0591, -0.0118, -0.1279, ..., -0.1592, 0.1338, 0.0255],\n",
" ...,\n",
" [-0.0889, 0.0193, -0.0654, ..., -0.0253, 0.0302, -0.0610],\n",
" [ 0.0087, 0.0205, 0.0557, ..., 0.0085, 0.0039, -0.0454],\n",
" [-0.0659, 0.0427, 0.1006, ..., 0.1055, -0.0527, -0.0339]],\n",
"\n",
" [[-0.0036, 0.0417, 0.0364, ..., -0.0087, -0.0391, -0.0474],\n",
" [-0.0889, 0.0193, -0.0654, ..., -0.0253, 0.0302, -0.0610],\n",
" [-0.0591, -0.0118, -0.1279, ..., -0.1592, 0.1338, 0.0255],\n",
" ...,\n",
" [-0.0889, 0.0193, -0.0654, ..., -0.0253, 0.0302, -0.0610],\n",
" [ 0.0087, 0.0205, 0.0557, ..., 0.0085, 0.0039, -0.0454],\n",
" [-0.0659, 0.0427, 0.1006, ..., 0.1055, -0.0527, -0.0339]],\n",
"\n",
" ...,\n",
"\n",
" [[-0.0593, -0.0640, -0.0276, ..., -0.0116, -0.0459, -0.0016],\n",
" [-0.1680, 0.1338, 0.0145, ..., 0.0097, -0.0281, 0.0104],\n",
" [-0.0708, -0.0918, 0.2285, ..., -0.0635, 0.1396, 0.0603],\n",
" ...,\n",
" [-0.1680, 0.1338, 0.0145, ..., 0.0097, -0.0281, 0.0104],\n",
" [-0.0923, 0.0505, 0.0068, ..., 0.0239, -0.0119, 0.0031],\n",
" [-0.2773, -0.3125, -0.3086, ..., 0.0464, 0.1826, -0.2871]],\n",
"\n",
" [[-0.0593, -0.0640, -0.0276, ..., -0.0116, -0.0459, -0.0016],\n",
" [-0.1680, 0.1338, 0.0145, ..., 0.0097, -0.0281, 0.0104],\n",
" [-0.0708, -0.0918, 0.2285, ..., -0.0635, 0.1396, 0.0603],\n",
" ...,\n",
" [-0.1680, 0.1338, 0.0145, ..., 0.0097, -0.0281, 0.0104],\n",
" [-0.0923, 0.0505, 0.0068, ..., 0.0239, -0.0119, 0.0031],\n",
" [-0.2773, -0.3125, -0.3086, ..., 0.0464, 0.1826, -0.2871]],\n",
"\n",
" [[-0.0593, -0.0640, -0.0276, ..., -0.0116, -0.0459, -0.0016],\n",
" [-0.1680, 0.1338, 0.0145, ..., 0.0097, -0.0281, 0.0104],\n",
" [-0.0708, -0.0918, 0.2285, ..., -0.0635, 0.1396, 0.0603],\n",
" ...,\n",
" [-0.1680, 0.1338, 0.0145, ..., 0.0097, -0.0281, 0.0104],\n",
" [-0.0923, 0.0505, 0.0068, ..., 0.0239, -0.0119, 0.0031],\n",
" [-0.2773, -0.3125, -0.3086, ..., 0.0464, 0.1826, -0.2871]]],\n",
" dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>)), (tensor([[[ 2.4531, 1.6719, -4.0938, ..., -1.4219, 0.8477, 1.2422],\n",
" [ 7.1250, 2.4375, -2.5938, ..., -1.1406, 0.6445, 1.5938],\n",
" [ 4.6875, 4.4375, -2.6875, ..., -0.1709, 0.5898, 3.0156],\n",
" ...,\n",
" [ 7.1875, 3.5156, 0.3594, ..., -2.2969, 0.7539, 3.3750],\n",
" [ 2.5312, 0.1777, -1.4219, ..., 0.1572, 2.0312, -0.9766],\n",
" [-3.5000, -2.9531, -4.5312, ..., -1.9609, 1.1016, 1.3672]],\n",
"\n",
" [[ 2.4531, 1.6719, -4.0938, ..., -1.4219, 0.8477, 1.2422],\n",
" [ 7.1250, 2.4375, -2.5938, ..., -1.1406, 0.6445, 1.5938],\n",
" [ 4.6875, 4.4375, -2.6875, ..., -0.1709, 0.5898, 3.0156],\n",
" ...,\n",
" [ 7.1875, 3.5156, 0.3594, ..., -2.2969, 0.7539, 3.3750],\n",
" [ 2.5312, 0.1777, -1.4219, ..., 0.1572, 2.0312, -0.9766],\n",
" [-3.5000, -2.9531, -4.5312, ..., -1.9609, 1.1016, 1.3672]],\n",
"\n",
" [[ 2.4531, 1.6719, -4.0938, ..., -1.4219, 0.8477, 1.2422],\n",
" [ 7.1250, 2.4375, -2.5938, ..., -1.1406, 0.6445, 1.5938],\n",
" [ 4.6875, 4.4375, -2.6875, ..., -0.1709, 0.5898, 3.0156],\n",
" ...,\n",
" [ 7.1875, 3.5156, 0.3594, ..., -2.2969, 0.7539, 3.3750],\n",
" [ 2.5312, 0.1777, -1.4219, ..., 0.1572, 2.0312, -0.9766],\n",
" [-3.5000, -2.9531, -4.5312, ..., -1.9609, 1.1016, 1.3672]],\n",
"\n",
" ...,\n",
"\n",
" [[ 0.1846, 0.7070, -2.5938, ..., -1.6562, 3.1719, -1.0938],\n",
" [ 3.0312, 1.2500, -1.8672, ..., -2.9219, 2.3594, -0.6680],\n",
" [ 5.0625, 0.7070, -0.4258, ..., -4.0938, 3.5781, -3.2344],\n",
" ...,\n",
" [ 3.1094, -0.4531, -1.0938, ..., -2.7969, 4.0625, -3.1562],\n",
" [ 0.1914, -0.6250, -0.1875, ..., -4.5000, 2.4844, -2.0000],\n",
" [-2.2344, 3.1250, -4.0625, ..., -2.6406, 3.4844, -0.3887]],\n",
"\n",
" [[ 0.1846, 0.7070, -2.5938, ..., -1.6562, 3.1719, -1.0938],\n",
" [ 3.0312, 1.2500, -1.8672, ..., -2.9219, 2.3594, -0.6680],\n",
" [ 5.0625, 0.7070, -0.4258, ..., -4.0938, 3.5781, -3.2344],\n",
" ...,\n",
" [ 3.1094, -0.4531, -1.0938, ..., -2.7969, 4.0625, -3.1562],\n",
" [ 0.1914, -0.6250, -0.1875, ..., -4.5000, 2.4844, -2.0000],\n",
" [-2.2344, 3.1250, -4.0625, ..., -2.6406, 3.4844, -0.3887]],\n",
"\n",
" [[ 0.1846, 0.7070, -2.5938, ..., -1.6562, 3.1719, -1.0938],\n",
" [ 3.0312, 1.2500, -1.8672, ..., -2.9219, 2.3594, -0.6680],\n",
" [ 5.0625, 0.7070, -0.4258, ..., -4.0938, 3.5781, -3.2344],\n",
" ...,\n",
" [ 3.1094, -0.4531, -1.0938, ..., -2.7969, 4.0625, -3.1562],\n",
" [ 0.1914, -0.6250, -0.1875, ..., -4.5000, 2.4844, -2.0000],\n",
" [-2.2344, 3.1250, -4.0625, ..., -2.6406, 3.4844, -0.3887]]],\n",
" dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>), tensor([[[ 0.0366, 0.0337, 0.0234, ..., 0.1299, 0.2031, -0.1338],\n",
" [-0.0269, -0.0304, 0.0752, ..., -0.1118, -0.0603, 0.0413],\n",
" [-0.0747, 0.0601, 0.0187, ..., -0.0109, -0.2598, -0.1670],\n",
" ...,\n",
" [ 0.2061, -0.0820, 0.0376, ..., -0.0986, 0.1738, 0.1660],\n",
" [-0.0284, -0.0187, 0.0200, ..., 0.0508, -0.0062, -0.0474],\n",
" [-0.1807, 0.1826, 0.0069, ..., 0.1045, -0.3145, -0.1138]],\n",
"\n",
" [[ 0.0366, 0.0337, 0.0234, ..., 0.1299, 0.2031, -0.1338],\n",
" [-0.0269, -0.0304, 0.0752, ..., -0.1118, -0.0603, 0.0413],\n",
" [-0.0747, 0.0601, 0.0187, ..., -0.0109, -0.2598, -0.1670],\n",
" ...,\n",
" [ 0.2061, -0.0820, 0.0376, ..., -0.0986, 0.1738, 0.1660],\n",
" [-0.0284, -0.0187, 0.0200, ..., 0.0508, -0.0062, -0.0474],\n",
" [-0.1807, 0.1826, 0.0069, ..., 0.1045, -0.3145, -0.1138]],\n",
"\n",
" [[ 0.0366, 0.0337, 0.0234, ..., 0.1299, 0.2031, -0.1338],\n",
" [-0.0269, -0.0304, 0.0752, ..., -0.1118, -0.0603, 0.0413],\n",
" [-0.0747, 0.0601, 0.0187, ..., -0.0109, -0.2598, -0.1670],\n",
" ...,\n",
" [ 0.2061, -0.0820, 0.0376, ..., -0.0986, 0.1738, 0.1660],\n",
" [-0.0284, -0.0187, 0.0200, ..., 0.0508, -0.0062, -0.0474],\n",
" [-0.1807, 0.1826, 0.0069, ..., 0.1045, -0.3145, -0.1138]],\n",
"\n",
" ...,\n",
"\n",
" [[ 0.0688, 0.0198, 0.0096, ..., -0.0469, -0.0825, -0.0283],\n",
" [-0.0437, 0.0231, -0.0981, ..., 0.0354, -0.0835, -0.0356],\n",
" [-0.1641, 0.0330, -0.0334, ..., 0.0674, -0.1543, 0.1328],\n",
" ...,\n",
" [ 0.0854, -0.0124, -0.1245, ..., 0.0864, -0.0591, -0.0588],\n",
" [-0.0104, -0.0232, 0.0012, ..., 0.0289, 0.0244, 0.0532],\n",
" [ 0.0466, 0.1074, 0.2637, ..., -0.0938, 0.0044, 0.0801]],\n",
"\n",
" [[ 0.0688, 0.0198, 0.0096, ..., -0.0469, -0.0825, -0.0283],\n",
" [-0.0437, 0.0231, -0.0981, ..., 0.0354, -0.0835, -0.0356],\n",
" [-0.1641, 0.0330, -0.0334, ..., 0.0674, -0.1543, 0.1328],\n",
" ...,\n",
" [ 0.0854, -0.0124, -0.1245, ..., 0.0864, -0.0591, -0.0588],\n",
" [-0.0104, -0.0232, 0.0012, ..., 0.0289, 0.0244, 0.0532],\n",
" [ 0.0466, 0.1074, 0.2637, ..., -0.0938, 0.0044, 0.0801]],\n",
"\n",
" [[ 0.0688, 0.0198, 0.0096, ..., -0.0469, -0.0825, -0.0283],\n",
" [-0.0437, 0.0231, -0.0981, ..., 0.0354, -0.0835, -0.0356],\n",
" [-0.1641, 0.0330, -0.0334, ..., 0.0674, -0.1543, 0.1328],\n",
" ...,\n",
" [ 0.0854, -0.0124, -0.1245, ..., 0.0864, -0.0591, -0.0588],\n",
" [-0.0104, -0.0232, 0.0012, ..., 0.0289, 0.0244, 0.0532],\n",
" [ 0.0466, 0.1074, 0.2637, ..., -0.0938, 0.0044, 0.0801]]],\n",
" dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>)), (tensor([[[-2.1820e-03, -5.0964e-03, 8.5831e-04, ..., -9.5312e-01,\n",
" 6.8970e-03, 4.1406e-01],\n",
" [ 1.4219e+00, 1.4141e+00, 2.2188e+00, ..., 6.9062e+00,\n",
" 1.8164e-01, -1.7188e+00],\n",
" [ 1.3672e+00, 7.8613e-02, 1.4531e+00, ..., 6.3438e+00,\n",
" -1.0547e+00, -2.1562e+00],\n",
" ...,\n",
" [ 2.2031e+00, -3.1250e-01, -2.8516e-01, ..., 7.3438e+00,\n",
" -2.1406e+00, -2.9688e+00],\n",
" [ 1.5234e-01, -2.6367e-01, 1.6699e-01, ..., 6.3125e+00,\n",
" -8.0469e-01, -1.4844e+00],\n",
" [ 1.2031e+00, -1.0469e+00, 1.1016e+00, ..., 6.2812e+00,\n",
" 1.1250e+00, -1.1328e+00]],\n",
"\n",
" [[-2.1820e-03, -5.0964e-03, 8.5831e-04, ..., -9.5312e-01,\n",
" 6.8970e-03, 4.1406e-01],\n",
" [ 1.4219e+00, 1.4141e+00, 2.2188e+00, ..., 6.9062e+00,\n",
" 1.8164e-01, -1.7188e+00],\n",
" [ 1.3672e+00, 7.8613e-02, 1.4531e+00, ..., 6.3438e+00,\n",
" -1.0547e+00, -2.1562e+00],\n",
" ...,\n",
" [ 2.2031e+00, -3.1250e-01, -2.8516e-01, ..., 7.3438e+00,\n",
" -2.1406e+00, -2.9688e+00],\n",
" [ 1.5234e-01, -2.6367e-01, 1.6699e-01, ..., 6.3125e+00,\n",
" -8.0469e-01, -1.4844e+00],\n",
" [ 1.2031e+00, -1.0469e+00, 1.1016e+00, ..., 6.2812e+00,\n",
" 1.1250e+00, -1.1328e+00]],\n",
"\n",
" [[-2.1820e-03, -5.0964e-03, 8.5831e-04, ..., -9.5312e-01,\n",
" 6.8970e-03, 4.1406e-01],\n",
" [ 1.4219e+00, 1.4141e+00, 2.2188e+00, ..., 6.9062e+00,\n",
" 1.8164e-01, -1.7188e+00],\n",
" [ 1.3672e+00, 7.8613e-02, 1.4531e+00, ..., 6.3438e+00,\n",
" -1.0547e+00, -2.1562e+00],\n",
" ...,\n",
" [ 2.2031e+00, -3.1250e-01, -2.8516e-01, ..., 7.3438e+00,\n",
" -2.1406e+00, -2.9688e+00],\n",
" [ 1.5234e-01, -2.6367e-01, 1.6699e-01, ..., 6.3125e+00,\n",
" -8.0469e-01, -1.4844e+00],\n",
" [ 1.2031e+00, -1.0469e+00, 1.1016e+00, ..., 6.2812e+00,\n",
" 1.1250e+00, -1.1328e+00]],\n",
"\n",
" ...,\n",
"\n",
" [[ 1.0315e-02, 7.4005e-04, -5.4626e-03, ..., -4.5654e-02,\n",
" 9.8145e-02, 1.5015e-02],\n",
" [ 4.0000e+00, 1.5156e+00, 1.7188e+00, ..., 8.3008e-02,\n",
" -2.4844e+00, 1.7188e+00],\n",
" [ 2.1875e+00, -4.4434e-02, 1.3672e+00, ..., 1.4609e+00,\n",
" -1.6250e+00, 1.2578e+00],\n",
" ...,\n",
" [ 3.4844e+00, 9.6484e-01, 5.4297e-01, ..., 1.0625e+00,\n",
" -1.5938e+00, 7.6172e-01],\n",
" [ 8.1250e-01, 7.3242e-04, 7.5000e-01, ..., 1.5156e+00,\n",
" 4.2773e-01, 2.3594e+00],\n",
" [-5.3516e-01, 1.0469e+00, 2.1719e+00, ..., 6.0547e-01,\n",
" -1.0391e+00, 2.4805e-01]],\n",
"\n",
" [[ 1.0315e-02, 7.4005e-04, -5.4626e-03, ..., -4.5654e-02,\n",
" 9.8145e-02, 1.5015e-02],\n",
" [ 4.0000e+00, 1.5156e+00, 1.7188e+00, ..., 8.3008e-02,\n",
" -2.4844e+00, 1.7188e+00],\n",
" [ 2.1875e+00, -4.4434e-02, 1.3672e+00, ..., 1.4609e+00,\n",
" -1.6250e+00, 1.2578e+00],\n",
" ...,\n",
" [ 3.4844e+00, 9.6484e-01, 5.4297e-01, ..., 1.0625e+00,\n",
" -1.5938e+00, 7.6172e-01],\n",
" [ 8.1250e-01, 7.3242e-04, 7.5000e-01, ..., 1.5156e+00,\n",
" 4.2773e-01, 2.3594e+00],\n",
" [-5.3516e-01, 1.0469e+00, 2.1719e+00, ..., 6.0547e-01,\n",
" -1.0391e+00, 2.4805e-01]],\n",
"\n",
" [[ 1.0315e-02, 7.4005e-04, -5.4626e-03, ..., -4.5654e-02,\n",
" 9.8145e-02, 1.5015e-02],\n",
" [ 4.0000e+00, 1.5156e+00, 1.7188e+00, ..., 8.3008e-02,\n",
" -2.4844e+00, 1.7188e+00],\n",
" [ 2.1875e+00, -4.4434e-02, 1.3672e+00, ..., 1.4609e+00,\n",
" -1.6250e+00, 1.2578e+00],\n",
" ...,\n",
" [ 3.4844e+00, 9.6484e-01, 5.4297e-01, ..., 1.0625e+00,\n",
" -1.5938e+00, 7.6172e-01],\n",
" [ 8.1250e-01, 7.3242e-04, 7.5000e-01, ..., 1.5156e+00,\n",
" 4.2773e-01, 2.3594e+00],\n",
" [-5.3516e-01, 1.0469e+00, 2.1719e+00, ..., 6.0547e-01,\n",
" -1.0391e+00, 2.4805e-01]]], dtype=torch.bfloat16,\n",
" grad_fn=<ToCopyBackward0>), tensor([[[-1.8997e-03, -1.0864e-02, 2.7313e-03, ..., -5.3711e-03,\n",
" 7.7820e-04, 3.0975e-03],\n",
" [ 1.2793e-01, 3.1445e-01, 5.7812e-01, ..., 5.1953e-01,\n",
" -1.4771e-02, -3.0078e-01],\n",
" [-2.1973e-01, 5.4443e-02, -1.6699e-01, ..., -5.3516e-01,\n",
" 2.7930e-01, 1.0205e-01],\n",
" ...,\n",
" [-1.7480e-01, 7.2266e-02, 2.2949e-01, ..., 2.8809e-02,\n",
" 2.4512e-01, -1.9775e-02],\n",
" [ 3.5742e-01, -2.8198e-02, 5.3955e-02, ..., 3.4766e-01,\n",
" -1.0400e-01, -1.5820e-01],\n",
" [ 4.0234e-01, 1.1719e+00, -1.2812e+00, ..., 1.0156e+00,\n",
" -5.1172e-01, 5.5469e-01]],\n",
"\n",
" [[-1.8997e-03, -1.0864e-02, 2.7313e-03, ..., -5.3711e-03,\n",
" 7.7820e-04, 3.0975e-03],\n",
" [ 1.2793e-01, 3.1445e-01, 5.7812e-01, ..., 5.1953e-01,\n",
" -1.4771e-02, -3.0078e-01],\n",
" [-2.1973e-01, 5.4443e-02, -1.6699e-01, ..., -5.3516e-01,\n",
" 2.7930e-01, 1.0205e-01],\n",
" ...,\n",
" [-1.7480e-01, 7.2266e-02, 2.2949e-01, ..., 2.8809e-02,\n",
" 2.4512e-01, -1.9775e-02],\n",
" [ 3.5742e-01, -2.8198e-02, 5.3955e-02, ..., 3.4766e-01,\n",
" -1.0400e-01, -1.5820e-01],\n",
" [ 4.0234e-01, 1.1719e+00, -1.2812e+00, ..., 1.0156e+00,\n",
" -5.1172e-01, 5.5469e-01]],\n",
"\n",
" [[-1.8997e-03, -1.0864e-02, 2.7313e-03, ..., -5.3711e-03,\n",
" 7.7820e-04, 3.0975e-03],\n",
" [ 1.2793e-01, 3.1445e-01, 5.7812e-01, ..., 5.1953e-01,\n",
" -1.4771e-02, -3.0078e-01],\n",
" [-2.1973e-01, 5.4443e-02, -1.6699e-01, ..., -5.3516e-01,\n",
" 2.7930e-01, 1.0205e-01],\n",
" ...,\n",
" [-1.7480e-01, 7.2266e-02, 2.2949e-01, ..., 2.8809e-02,\n",
" 2.4512e-01, -1.9775e-02],\n",
" [ 3.5742e-01, -2.8198e-02, 5.3955e-02, ..., 3.4766e-01,\n",
" -1.0400e-01, -1.5820e-01],\n",
" [ 4.0234e-01, 1.1719e+00, -1.2812e+00, ..., 1.0156e+00,\n",
" -5.1172e-01, 5.5469e-01]],\n",
"\n",
" ...,\n",
"\n",
" [[ 1.1658e-02, -7.6599e-03, -4.5967e-04, ..., -3.1128e-03,\n",
" 3.2349e-03, 9.1934e-04],\n",
" [ 2.6953e-01, 2.1582e-01, -2.3633e-01, ..., 2.2095e-02,\n",
" 2.5195e-01, 5.7373e-02],\n",
" [-3.2031e-01, 1.0315e-02, 5.2979e-02, ..., 4.0820e-01,\n",
" 1.6895e-01, -2.3047e-01],\n",
" ...,\n",
" [-2.4048e-02, 1.3184e-01, -2.1289e-01, ..., 2.3340e-01,\n",
" 2.2949e-01, 3.2617e-01],\n",
" [-3.1250e-01, 1.7090e-01, -1.5918e-01, ..., 2.1289e-01,\n",
" -2.3926e-01, -2.6245e-02],\n",
" [-1.4062e-01, -1.6504e-01, 1.4282e-02, ..., -2.7344e-01,\n",
" -2.6758e-01, -4.4727e-01]],\n",
"\n",
" [[ 1.1658e-02, -7.6599e-03, -4.5967e-04, ..., -3.1128e-03,\n",
" 3.2349e-03, 9.1934e-04],\n",
" [ 2.6953e-01, 2.1582e-01, -2.3633e-01, ..., 2.2095e-02,\n",
" 2.5195e-01, 5.7373e-02],\n",
" [-3.2031e-01, 1.0315e-02, 5.2979e-02, ..., 4.0820e-01,\n",
" 1.6895e-01, -2.3047e-01],\n",
" ...,\n",
" [-2.4048e-02, 1.3184e-01, -2.1289e-01, ..., 2.3340e-01,\n",
" 2.2949e-01, 3.2617e-01],\n",
" [-3.1250e-01, 1.7090e-01, -1.5918e-01, ..., 2.1289e-01,\n",
" -2.3926e-01, -2.6245e-02],\n",
" [-1.4062e-01, -1.6504e-01, 1.4282e-02, ..., -2.7344e-01,\n",
" -2.6758e-01, -4.4727e-01]],\n",
"\n",
" [[ 1.1658e-02, -7.6599e-03, -4.5967e-04, ..., -3.1128e-03,\n",
" 3.2349e-03, 9.1934e-04],\n",
" [ 2.6953e-01, 2.1582e-01, -2.3633e-01, ..., 2.2095e-02,\n",
" 2.5195e-01, 5.7373e-02],\n",
" [-3.2031e-01, 1.0315e-02, 5.2979e-02, ..., 4.0820e-01,\n",
" 1.6895e-01, -2.3047e-01],\n",
" ...,\n",
" [-2.4048e-02, 1.3184e-01, -2.1289e-01, ..., 2.3340e-01,\n",
" 2.2949e-01, 3.2617e-01],\n",
" [-3.1250e-01, 1.7090e-01, -1.5918e-01, ..., 2.1289e-01,\n",
" -2.3926e-01, -2.6245e-02],\n",
" [-1.4062e-01, -1.6504e-01, 1.4282e-02, ..., -2.7344e-01,\n",
" -2.6758e-01, -4.4727e-01]]], dtype=torch.bfloat16,\n",
" grad_fn=<ToCopyBackward0>)), (tensor([[[ 7.3242e-03, 4.6387e-03, 8.1787e-03, ..., 6.4844e-01,\n",
" 5.0000e-01, -3.8477e-01],\n",
" [-1.9688e+00, 1.5000e+00, -7.1484e-01, ..., -3.2031e+00,\n",
" -2.9375e+00, 2.9883e-01],\n",
" [-5.7031e-01, 5.5078e-01, -1.0781e+00, ..., -3.3281e+00,\n",
" -3.3906e+00, 1.0469e+00],\n",
" ...,\n",
" [-6.6406e-01, 3.0078e-01, 8.3984e-01, ..., -4.9375e+00,\n",
" -2.9844e+00, 8.1543e-02],\n",
" [-3.3203e-01, -7.6172e-02, 1.9531e-01, ..., -4.9062e+00,\n",
" -3.5625e+00, 6.4453e-01],\n",
" [ 4.2188e-01, -8.7891e-01, 1.4355e-01, ..., -5.2188e+00,\n",
" -2.3281e+00, 1.0625e+00]],\n",
"\n",
" [[ 7.3242e-03, 4.6387e-03, 8.1787e-03, ..., 6.4844e-01,\n",
" 5.0000e-01, -3.8477e-01],\n",
" [-1.9688e+00, 1.5000e+00, -7.1484e-01, ..., -3.2031e+00,\n",
" -2.9375e+00, 2.9883e-01],\n",
" [-5.7031e-01, 5.5078e-01, -1.0781e+00, ..., -3.3281e+00,\n",
" -3.3906e+00, 1.0469e+00],\n",
" ...,\n",
" [-6.6406e-01, 3.0078e-01, 8.3984e-01, ..., -4.9375e+00,\n",
" -2.9844e+00, 8.1543e-02],\n",
" [-3.3203e-01, -7.6172e-02, 1.9531e-01, ..., -4.9062e+00,\n",
" -3.5625e+00, 6.4453e-01],\n",
" [ 4.2188e-01, -8.7891e-01, 1.4355e-01, ..., -5.2188e+00,\n",
" -2.3281e+00, 1.0625e+00]],\n",
"\n",
" [[ 7.3242e-03, 4.6387e-03, 8.1787e-03, ..., 6.4844e-01,\n",
" 5.0000e-01, -3.8477e-01],\n",
" [-1.9688e+00, 1.5000e+00, -7.1484e-01, ..., -3.2031e+00,\n",
" -2.9375e+00, 2.9883e-01],\n",
" [-5.7031e-01, 5.5078e-01, -1.0781e+00, ..., -3.3281e+00,\n",
" -3.3906e+00, 1.0469e+00],\n",
" ...,\n",
" [-6.6406e-01, 3.0078e-01, 8.3984e-01, ..., -4.9375e+00,\n",
" -2.9844e+00, 8.1543e-02],\n",
" [-3.3203e-01, -7.6172e-02, 1.9531e-01, ..., -4.9062e+00,\n",
" -3.5625e+00, 6.4453e-01],\n",
" [ 4.2188e-01, -8.7891e-01, 1.4355e-01, ..., -5.2188e+00,\n",
" -2.3281e+00, 1.0625e+00]],\n",
"\n",
" ...,\n",
"\n",
" [[ 2.6398e-03, -5.1575e-03, 3.1586e-03, ..., 8.0078e-02,\n",
" -5.4321e-03, -2.4219e-01],\n",
" [-5.1875e+00, -8.9062e-01, 7.8516e-01, ..., -6.6406e-01,\n",
" -1.3281e+00, -6.8750e-01],\n",
" [-3.2188e+00, -5.5664e-02, 1.1719e-02, ..., 5.1172e-01,\n",
" 3.5352e-01, -9.7266e-01],\n",
" ...,\n",
" [-4.7188e+00, 2.7344e+00, 1.2500e+00, ..., -6.9531e-01,\n",
" 4.5508e-01, -7.4219e-01],\n",
" [-2.1406e+00, 2.5312e+00, 1.1875e+00, ..., 6.6406e-01,\n",
" 7.3828e-01, -7.6953e-01],\n",
" [ 3.0469e+00, 1.2812e+00, 2.6562e+00, ..., -1.1094e+00,\n",
" 2.1875e-01, -2.4292e-02]],\n",
"\n",
" [[ 2.6398e-03, -5.1575e-03, 3.1586e-03, ..., 8.0078e-02,\n",
" -5.4321e-03, -2.4219e-01],\n",
" [-5.1875e+00, -8.9062e-01, 7.8516e-01, ..., -6.6406e-01,\n",
" -1.3281e+00, -6.8750e-01],\n",
" [-3.2188e+00, -5.5664e-02, 1.1719e-02, ..., 5.1172e-01,\n",
" 3.5352e-01, -9.7266e-01],\n",
" ...,\n",
" [-4.7188e+00, 2.7344e+00, 1.2500e+00, ..., -6.9531e-01,\n",
" 4.5508e-01, -7.4219e-01],\n",
" [-2.1406e+00, 2.5312e+00, 1.1875e+00, ..., 6.6406e-01,\n",
" 7.3828e-01, -7.6953e-01],\n",
" [ 3.0469e+00, 1.2812e+00, 2.6562e+00, ..., -1.1094e+00,\n",
" 2.1875e-01, -2.4292e-02]],\n",
"\n",
" [[ 2.6398e-03, -5.1575e-03, 3.1586e-03, ..., 8.0078e-02,\n",
" -5.4321e-03, -2.4219e-01],\n",
" [-5.1875e+00, -8.9062e-01, 7.8516e-01, ..., -6.6406e-01,\n",
" -1.3281e+00, -6.8750e-01],\n",
" [-3.2188e+00, -5.5664e-02, 1.1719e-02, ..., 5.1172e-01,\n",
" 3.5352e-01, -9.7266e-01],\n",
" ...,\n",
" [-4.7188e+00, 2.7344e+00, 1.2500e+00, ..., -6.9531e-01,\n",
" 4.5508e-01, -7.4219e-01],\n",
" [-2.1406e+00, 2.5312e+00, 1.1875e+00, ..., 6.6406e-01,\n",
" 7.3828e-01, -7.6953e-01],\n",
" [ 3.0469e+00, 1.2812e+00, 2.6562e+00, ..., -1.1094e+00,\n",
" 2.1875e-01, -2.4292e-02]]], dtype=torch.bfloat16,\n",
" grad_fn=<ToCopyBackward0>), tensor([[[ 5.3406e-04, 6.9580e-03, -4.0283e-03, ..., 1.3428e-03,\n",
" -3.2349e-03, -1.6174e-03],\n",
" [ 1.1963e-01, -6.2109e-01, 2.1680e-01, ..., -2.0874e-02,\n",
" 1.0193e-02, 2.7930e-01],\n",
" [-3.1055e-01, 4.1016e-01, 8.4375e-01, ..., 1.8457e-01,\n",
" -7.3730e-02, -3.7305e-01],\n",
" ...,\n",
" [ 1.3672e-01, 4.7656e-01, 3.8477e-01, ..., 1.4258e-01,\n",
" 3.2422e-01, 1.6479e-02],\n",
" [ 7.4609e-01, -6.8359e-01, 1.6211e-01, ..., -2.5781e-01,\n",
" 4.6875e-01, 9.9121e-02],\n",
" [-1.3867e-01, 1.6699e-01, 1.1279e-01, ..., 4.2969e-01,\n",
" -8.2397e-03, -2.7539e-01]],\n",
"\n",
" [[ 5.3406e-04, 6.9580e-03, -4.0283e-03, ..., 1.3428e-03,\n",
" -3.2349e-03, -1.6174e-03],\n",
" [ 1.1963e-01, -6.2109e-01, 2.1680e-01, ..., -2.0874e-02,\n",
" 1.0193e-02, 2.7930e-01],\n",
" [-3.1055e-01, 4.1016e-01, 8.4375e-01, ..., 1.8457e-01,\n",
" -7.3730e-02, -3.7305e-01],\n",
" ...,\n",
" [ 1.3672e-01, 4.7656e-01, 3.8477e-01, ..., 1.4258e-01,\n",
" 3.2422e-01, 1.6479e-02],\n",
" [ 7.4609e-01, -6.8359e-01, 1.6211e-01, ..., -2.5781e-01,\n",
" 4.6875e-01, 9.9121e-02],\n",
" [-1.3867e-01, 1.6699e-01, 1.1279e-01, ..., 4.2969e-01,\n",
" -8.2397e-03, -2.7539e-01]],\n",
"\n",
" [[ 5.3406e-04, 6.9580e-03, -4.0283e-03, ..., 1.3428e-03,\n",
" -3.2349e-03, -1.6174e-03],\n",
" [ 1.1963e-01, -6.2109e-01, 2.1680e-01, ..., -2.0874e-02,\n",
" 1.0193e-02, 2.7930e-01],\n",
" [-3.1055e-01, 4.1016e-01, 8.4375e-01, ..., 1.8457e-01,\n",
" -7.3730e-02, -3.7305e-01],\n",
" ...,\n",
" [ 1.3672e-01, 4.7656e-01, 3.8477e-01, ..., 1.4258e-01,\n",
" 3.2422e-01, 1.6479e-02],\n",
" [ 7.4609e-01, -6.8359e-01, 1.6211e-01, ..., -2.5781e-01,\n",
" 4.6875e-01, 9.9121e-02],\n",
" [-1.3867e-01, 1.6699e-01, 1.1279e-01, ..., 4.2969e-01,\n",
" -8.2397e-03, -2.7539e-01]],\n",
"\n",
" ...,\n",
"\n",
" [[-3.9368e-03, -2.0752e-03, 3.1433e-03, ..., 3.3569e-04,\n",
" 3.0212e-03, -7.6675e-04],\n",
" [-3.5547e-01, 2.2339e-02, -1.4648e-01, ..., -1.3770e-01,\n",
" -2.1973e-01, 1.2024e-02],\n",
" [ 8.7891e-02, -1.6699e-01, 6.2500e-02, ..., 4.0039e-01,\n",
" -2.6953e-01, 1.7383e-01],\n",
" ...,\n",
" [ 7.7148e-02, 2.5586e-01, -2.6172e-01, ..., 1.7578e-01,\n",
" 9.2773e-02, -1.6968e-02],\n",
" [ 1.5039e-01, 4.4678e-02, 1.2061e-01, ..., 1.4648e-02,\n",
" 2.7539e-01, -1.4453e-01],\n",
" [-1.5234e-01, -3.2617e-01, -4.0625e-01, ..., 7.1716e-03,\n",
" -9.5215e-02, 6.0791e-02]],\n",
"\n",
" [[-3.9368e-03, -2.0752e-03, 3.1433e-03, ..., 3.3569e-04,\n",
" 3.0212e-03, -7.6675e-04],\n",
" [-3.5547e-01, 2.2339e-02, -1.4648e-01, ..., -1.3770e-01,\n",
" -2.1973e-01, 1.2024e-02],\n",
" [ 8.7891e-02, -1.6699e-01, 6.2500e-02, ..., 4.0039e-01,\n",
" -2.6953e-01, 1.7383e-01],\n",
" ...,\n",
" [ 7.7148e-02, 2.5586e-01, -2.6172e-01, ..., 1.7578e-01,\n",
" 9.2773e-02, -1.6968e-02],\n",
" [ 1.5039e-01, 4.4678e-02, 1.2061e-01, ..., 1.4648e-02,\n",
" 2.7539e-01, -1.4453e-01],\n",
" [-1.5234e-01, -3.2617e-01, -4.0625e-01, ..., 7.1716e-03,\n",
" -9.5215e-02, 6.0791e-02]],\n",
"\n",
" [[-3.9368e-03, -2.0752e-03, 3.1433e-03, ..., 3.3569e-04,\n",
" 3.0212e-03, -7.6675e-04],\n",
" [-3.5547e-01, 2.2339e-02, -1.4648e-01, ..., -1.3770e-01,\n",
" -2.1973e-01, 1.2024e-02],\n",
" [ 8.7891e-02, -1.6699e-01, 6.2500e-02, ..., 4.0039e-01,\n",
" -2.6953e-01, 1.7383e-01],\n",
" ...,\n",
" [ 7.7148e-02, 2.5586e-01, -2.6172e-01, ..., 1.7578e-01,\n",
" 9.2773e-02, -1.6968e-02],\n",
" [ 1.5039e-01, 4.4678e-02, 1.2061e-01, ..., 1.4648e-02,\n",
" 2.7539e-01, -1.4453e-01],\n",
" [-1.5234e-01, -3.2617e-01, -4.0625e-01, ..., 7.1716e-03,\n",
" -9.5215e-02, 6.0791e-02]]], dtype=torch.bfloat16,\n",
" grad_fn=<ToCopyBackward0>)), (tensor([[[ 0.0146, 0.0317, -0.0143, ..., -0.2393, -0.6523, -0.1162],\n",
" [ 1.6094, -1.1406, -0.0879, ..., 0.4395, 2.6562, -0.4863],\n",
" [ 2.5781, -0.8555, -0.3262, ..., 0.2451, 3.0469, -0.1484],\n",
" ...,\n",
" [ 1.9609, 0.9023, -1.1562, ..., 2.4219, 3.5000, 1.3750],\n",
" [-0.1348, -0.6406, -0.3516, ..., 1.4609, 0.8047, 0.5312],\n",
" [ 0.5703, 0.3828, -0.3555, ..., 1.1250, 2.1094, -2.1875]],\n",
"\n",
" [[ 0.0146, 0.0317, -0.0143, ..., -0.2393, -0.6523, -0.1162],\n",
" [ 1.6094, -1.1406, -0.0879, ..., 0.4395, 2.6562, -0.4863],\n",
" [ 2.5781, -0.8555, -0.3262, ..., 0.2451, 3.0469, -0.1484],\n",
" ...,\n",
" [ 1.9609, 0.9023, -1.1562, ..., 2.4219, 3.5000, 1.3750],\n",
" [-0.1348, -0.6406, -0.3516, ..., 1.4609, 0.8047, 0.5312],\n",
" [ 0.5703, 0.3828, -0.3555, ..., 1.1250, 2.1094, -2.1875]],\n",
"\n",
" [[ 0.0146, 0.0317, -0.0143, ..., -0.2393, -0.6523, -0.1162],\n",
" [ 1.6094, -1.1406, -0.0879, ..., 0.4395, 2.6562, -0.4863],\n",
" [ 2.5781, -0.8555, -0.3262, ..., 0.2451, 3.0469, -0.1484],\n",
" ...,\n",
" [ 1.9609, 0.9023, -1.1562, ..., 2.4219, 3.5000, 1.3750],\n",
" [-0.1348, -0.6406, -0.3516, ..., 1.4609, 0.8047, 0.5312],\n",
" [ 0.5703, 0.3828, -0.3555, ..., 1.1250, 2.1094, -2.1875]],\n",
"\n",
" ...,\n",
"\n",
" [[-0.0132, -0.0184, 0.0106, ..., 0.0322, -0.2773, 0.1357],\n",
" [ 2.0625, -1.0625, -1.7812, ..., -0.1167, -0.0369, -0.5156],\n",
" [ 0.3672, -0.5586, -0.3867, ..., 0.9570, -0.5742, -0.1230],\n",
" ...,\n",
" [ 1.5938, -0.4102, 0.0332, ..., 1.7266, 1.3203, -1.1406],\n",
" [ 0.0674, 0.0227, 0.0391, ..., 0.0649, -1.4062, 1.2812],\n",
" [-0.1367, 0.7305, -0.2539, ..., 1.7578, 0.3047, -4.2812]],\n",
"\n",
" [[-0.0132, -0.0184, 0.0106, ..., 0.0322, -0.2773, 0.1357],\n",
" [ 2.0625, -1.0625, -1.7812, ..., -0.1167, -0.0369, -0.5156],\n",
" [ 0.3672, -0.5586, -0.3867, ..., 0.9570, -0.5742, -0.1230],\n",
" ...,\n",
" [ 1.5938, -0.4102, 0.0332, ..., 1.7266, 1.3203, -1.1406],\n",
" [ 0.0674, 0.0227, 0.0391, ..., 0.0649, -1.4062, 1.2812],\n",
" [-0.1367, 0.7305, -0.2539, ..., 1.7578, 0.3047, -4.2812]],\n",
"\n",
" [[-0.0132, -0.0184, 0.0106, ..., 0.0322, -0.2773, 0.1357],\n",
" [ 2.0625, -1.0625, -1.7812, ..., -0.1167, -0.0369, -0.5156],\n",
" [ 0.3672, -0.5586, -0.3867, ..., 0.9570, -0.5742, -0.1230],\n",
" ...,\n",
" [ 1.5938, -0.4102, 0.0332, ..., 1.7266, 1.3203, -1.1406],\n",
" [ 0.0674, 0.0227, 0.0391, ..., 0.0649, -1.4062, 1.2812],\n",
" [-0.1367, 0.7305, -0.2539, ..., 1.7578, 0.3047, -4.2812]]],\n",
" dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>), tensor([[[-9.8267e-03, 3.4027e-03, -1.1963e-02, ..., 4.8523e-03,\n",
" -4.0894e-03, -9.4604e-03],\n",
" [-1.2988e-01, 6.7578e-01, -2.5586e-01, ..., -2.1191e-01,\n",
" -2.3828e-01, 6.7188e-01],\n",
" [ 3.7109e-01, -2.2363e-01, -2.2559e-01, ..., 5.8594e-01,\n",
" 6.9531e-01, -2.0117e-01],\n",
" ...,\n",
" [ 2.3193e-02, 1.3867e-01, 7.3828e-01, ..., 5.7031e-01,\n",
" 1.9434e-01, 1.4648e-01],\n",
" [ 8.2422e-01, -5.4688e-01, 6.0938e-01, ..., -4.4141e-01,\n",
" -1.9434e-01, 5.6641e-01],\n",
" [-1.5332e-01, 3.4912e-02, -2.3535e-01, ..., 3.2715e-02,\n",
" 2.0508e-01, -1.7285e-01]],\n",
"\n",
" [[-9.8267e-03, 3.4027e-03, -1.1963e-02, ..., 4.8523e-03,\n",
" -4.0894e-03, -9.4604e-03],\n",
" [-1.2988e-01, 6.7578e-01, -2.5586e-01, ..., -2.1191e-01,\n",
" -2.3828e-01, 6.7188e-01],\n",
" [ 3.7109e-01, -2.2363e-01, -2.2559e-01, ..., 5.8594e-01,\n",
" 6.9531e-01, -2.0117e-01],\n",
" ...,\n",
" [ 2.3193e-02, 1.3867e-01, 7.3828e-01, ..., 5.7031e-01,\n",
" 1.9434e-01, 1.4648e-01],\n",
" [ 8.2422e-01, -5.4688e-01, 6.0938e-01, ..., -4.4141e-01,\n",
" -1.9434e-01, 5.6641e-01],\n",
" [-1.5332e-01, 3.4912e-02, -2.3535e-01, ..., 3.2715e-02,\n",
" 2.0508e-01, -1.7285e-01]],\n",
"\n",
" [[-9.8267e-03, 3.4027e-03, -1.1963e-02, ..., 4.8523e-03,\n",
" -4.0894e-03, -9.4604e-03],\n",
" [-1.2988e-01, 6.7578e-01, -2.5586e-01, ..., -2.1191e-01,\n",
" -2.3828e-01, 6.7188e-01],\n",
" [ 3.7109e-01, -2.2363e-01, -2.2559e-01, ..., 5.8594e-01,\n",
" 6.9531e-01, -2.0117e-01],\n",
" ...,\n",
" [ 2.3193e-02, 1.3867e-01, 7.3828e-01, ..., 5.7031e-01,\n",
" 1.9434e-01, 1.4648e-01],\n",
" [ 8.2422e-01, -5.4688e-01, 6.0938e-01, ..., -4.4141e-01,\n",
" -1.9434e-01, 5.6641e-01],\n",
" [-1.5332e-01, 3.4912e-02, -2.3535e-01, ..., 3.2715e-02,\n",
" 2.0508e-01, -1.7285e-01]],\n",
"\n",
" ...,\n",
"\n",
" [[-4.5967e-04, -4.8523e-03, -3.3447e-02, ..., 6.7139e-03,\n",
" 7.4768e-03, 5.1880e-03],\n",
" [-1.9629e-01, -2.9541e-02, 9.2163e-03, ..., -6.4844e-01,\n",
" -5.3516e-01, 3.8574e-02],\n",
" [ 4.1602e-01, -6.7578e-01, 1.9531e-01, ..., -4.4531e-01,\n",
" 3.5156e-02, 4.7070e-01],\n",
" ...,\n",
" [ 4.4141e-01, 5.0391e-01, -5.2002e-02, ..., -3.4375e-01,\n",
" 6.8848e-02, 2.1973e-01],\n",
" [ 2.0996e-01, 4.8828e-01, 4.5508e-01, ..., -2.5195e-01,\n",
" -1.0547e-01, 3.1836e-01],\n",
" [-1.2500e-01, -6.4844e-01, -1.1816e-01, ..., -1.4648e-01,\n",
" 8.2016e-04, -3.0859e-01]],\n",
"\n",
" [[-4.5967e-04, -4.8523e-03, -3.3447e-02, ..., 6.7139e-03,\n",
" 7.4768e-03, 5.1880e-03],\n",
" [-1.9629e-01, -2.9541e-02, 9.2163e-03, ..., -6.4844e-01,\n",
" -5.3516e-01, 3.8574e-02],\n",
" [ 4.1602e-01, -6.7578e-01, 1.9531e-01, ..., -4.4531e-01,\n",
" 3.5156e-02, 4.7070e-01],\n",
" ...,\n",
" [ 4.4141e-01, 5.0391e-01, -5.2002e-02, ..., -3.4375e-01,\n",
" 6.8848e-02, 2.1973e-01],\n",
" [ 2.0996e-01, 4.8828e-01, 4.5508e-01, ..., -2.5195e-01,\n",
" -1.0547e-01, 3.1836e-01],\n",
" [-1.2500e-01, -6.4844e-01, -1.1816e-01, ..., -1.4648e-01,\n",
" 8.2016e-04, -3.0859e-01]],\n",
"\n",
" [[-4.5967e-04, -4.8523e-03, -3.3447e-02, ..., 6.7139e-03,\n",
" 7.4768e-03, 5.1880e-03],\n",
" [-1.9629e-01, -2.9541e-02, 9.2163e-03, ..., -6.4844e-01,\n",
" -5.3516e-01, 3.8574e-02],\n",
" [ 4.1602e-01, -6.7578e-01, 1.9531e-01, ..., -4.4531e-01,\n",
" 3.5156e-02, 4.7070e-01],\n",
" ...,\n",
" [ 4.4141e-01, 5.0391e-01, -5.2002e-02, ..., -3.4375e-01,\n",
" 6.8848e-02, 2.1973e-01],\n",
" [ 2.0996e-01, 4.8828e-01, 4.5508e-01, ..., -2.5195e-01,\n",
" -1.0547e-01, 3.1836e-01],\n",
" [-1.2500e-01, -6.4844e-01, -1.1816e-01, ..., -1.4648e-01,\n",
" 8.2016e-04, -3.0859e-01]]], dtype=torch.bfloat16,\n",
" grad_fn=<ToCopyBackward0>)), (tensor([[[-5.8289e-03, -5.2185e-03, -3.2043e-03, ..., 1.9141e-01,\n",
" -9.6191e-02, -6.4062e-01],\n",
" [-3.3750e+00, -1.9297e+00, 2.3828e-01, ..., -1.8359e-01,\n",
" 7.5391e-01, 2.9375e+00],\n",
" [ 5.9375e-01, -2.5312e+00, -1.2266e+00, ..., -3.8574e-02,\n",
" 8.2812e-01, 1.7812e+00],\n",
" ...,\n",
" [-1.8828e+00, -1.8125e+00, 1.3438e+00, ..., -1.3125e+00,\n",
" -4.3359e-01, 2.8594e+00],\n",
" [ 1.2188e+00, -1.3867e-01, 1.0703e+00, ..., -2.2812e+00,\n",
" -1.9824e-01, 3.6406e+00],\n",
" [ 3.5156e+00, 1.7188e+00, 1.7188e+00, ..., 6.7188e-01,\n",
" -2.3340e-01, 2.0469e+00]],\n",
"\n",
" [[-5.8289e-03, -5.2185e-03, -3.2043e-03, ..., 1.9141e-01,\n",
" -9.6191e-02, -6.4062e-01],\n",
" [-3.3750e+00, -1.9297e+00, 2.3828e-01, ..., -1.8359e-01,\n",
" 7.5391e-01, 2.9375e+00],\n",
" [ 5.9375e-01, -2.5312e+00, -1.2266e+00, ..., -3.8574e-02,\n",
" 8.2812e-01, 1.7812e+00],\n",
" ...,\n",
" [-1.8828e+00, -1.8125e+00, 1.3438e+00, ..., -1.3125e+00,\n",
" -4.3359e-01, 2.8594e+00],\n",
" [ 1.2188e+00, -1.3867e-01, 1.0703e+00, ..., -2.2812e+00,\n",
" -1.9824e-01, 3.6406e+00],\n",
" [ 3.5156e+00, 1.7188e+00, 1.7188e+00, ..., 6.7188e-01,\n",
" -2.3340e-01, 2.0469e+00]],\n",
"\n",
" [[-5.8289e-03, -5.2185e-03, -3.2043e-03, ..., 1.9141e-01,\n",
" -9.6191e-02, -6.4062e-01],\n",
" [-3.3750e+00, -1.9297e+00, 2.3828e-01, ..., -1.8359e-01,\n",
" 7.5391e-01, 2.9375e+00],\n",
" [ 5.9375e-01, -2.5312e+00, -1.2266e+00, ..., -3.8574e-02,\n",
" 8.2812e-01, 1.7812e+00],\n",
" ...,\n",
" [-1.8828e+00, -1.8125e+00, 1.3438e+00, ..., -1.3125e+00,\n",
" -4.3359e-01, 2.8594e+00],\n",
" [ 1.2188e+00, -1.3867e-01, 1.0703e+00, ..., -2.2812e+00,\n",
" -1.9824e-01, 3.6406e+00],\n",
" [ 3.5156e+00, 1.7188e+00, 1.7188e+00, ..., 6.7188e-01,\n",
" -2.3340e-01, 2.0469e+00]],\n",
"\n",
" ...,\n",
"\n",
" [[-2.0599e-03, -3.8338e-04, 5.0049e-03, ..., 1.0437e-02,\n",
" 1.6235e-02, 2.6758e-01],\n",
" [-6.3672e-01, 5.5469e-01, -4.7656e-01, ..., -2.0625e+00,\n",
" 9.9609e-01, -1.2969e+00],\n",
" [ 1.2266e+00, 1.9531e+00, 3.3984e-01, ..., -7.0703e-01,\n",
" -2.5781e+00, 2.0605e-01],\n",
" ...,\n",
" [ 3.9062e-01, 1.8906e+00, 3.1250e-02, ..., 6.8359e-01,\n",
" 2.2852e-01, -3.0664e-01],\n",
" [ 2.9688e-01, -9.6094e-01, 9.8828e-01, ..., 1.2578e+00,\n",
" -2.0000e+00, -1.3516e+00],\n",
" [ 1.7734e+00, -2.6953e-01, -1.8906e+00, ..., 1.6328e+00,\n",
" -9.9609e-01, -6.7188e-01]],\n",
"\n",
" [[-2.0599e-03, -3.8338e-04, 5.0049e-03, ..., 1.0437e-02,\n",
" 1.6235e-02, 2.6758e-01],\n",
" [-6.3672e-01, 5.5469e-01, -4.7656e-01, ..., -2.0625e+00,\n",
" 9.9609e-01, -1.2969e+00],\n",
" [ 1.2266e+00, 1.9531e+00, 3.3984e-01, ..., -7.0703e-01,\n",