from transformers import VisionEncoderDecoderConfig, DonutProcessor, VisionEncoderDecoderModel
from datasets import load_dataset
import json
import random
from typing import Any, List, Tuple
import torch
from torch.utils.data import Dataset, DataLoader
import re
from nltk import edit_distance
import numpy as np
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import Callback
import pytorch_lightning as pl
import os
from huggingface_hub import login
DATASET_PATH = "Zombely/pl-text-images"
PRETRAINED_MODEL_PATH = "nielsr/donut-proto"
OUTPUT_MODEL_PATH = "Zombely/plwiki-test-proto"
LOGGING_PATH = "plwiki-test-run-proto"
train_config = {
"max_epochs":5,
"val_check_interval":0.2, # how many times we want to validate during an epoch
"check_val_every_n_epoch":1,
"gradient_clip_val":1.0,
"num_training_samples_per_epoch": 800,
"lr":3e-5,
"train_batch_sizes": [8],
"val_batch_sizes": [1],
# "seed":2022,
"num_nodes": 1,
"warmup_steps": 300, # 800/8*30/10, 10%
"result_path": "./result",
"verbose": True,
}
dataset = load_dataset(DATASET_PATH)
max_length = 768
image_size = [1280, 960]
config = VisionEncoderDecoderConfig.from_pretrained(PRETRAINED_MODEL_PATH)
config.encoder.image_size = image_size # (height, width)
config.decoder.max_length = max_length
processor = DonutProcessor.from_pretrained(PRETRAINED_MODEL_PATH)
model = VisionEncoderDecoderModel.from_pretrained(PRETRAINED_MODEL_PATH, config=config)
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at nielsr/donut-proto were not used when initializing VisionEncoderDecoderModel (twelve encoder.encoder.layers.*.blocks.*.attn_mask buffers).
- This IS expected if you are initializing VisionEncoderDecoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisionEncoderDecoderModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at nielsr/donut-proto and are newly initialized: ['encoder.layernorm.weight', 'encoder.layernorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
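Since the config was overridden before loading, a quick hedged check (not part of the original notebook) confirms the new values actually landed on the model:

# Illustrative check that the config overrides reached the loaded model.
print(model.config.encoder.image_size)   # expected: [1280, 960]
print(model.config.decoder.max_length)   # expected: 768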
added_tokens = []
class DonutDataset(Dataset):
"""
DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
Each row, consists of image path(png/jpg/jpeg) and gt data (json/jsonl/txt),
and it will be converted into input_tensor(vectorized image) and input_ids(tokenized string).
Args:
dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
max_length: the max number of tokens for the target sequences
split: whether to load "train", "validation" or "test" split
ignore_id: ignore_index for torch.nn.CrossEntropyLoss
task_start_token: the special token to be fed to the decoder to conduct the target task
prompt_end_token: the special token at the end of the sequences
sort_json_key: whether or not to sort the JSON keys
"""
def __init__(
self,
dataset_name_or_path: str,
max_length: int,
split: str = "train",
ignore_id: int = -100,
task_start_token: str = "<s>",
prompt_end_token: str = None,
sort_json_key: bool = True,
):
super().__init__()
self.max_length = max_length
self.split = split
self.ignore_id = ignore_id
self.task_start_token = task_start_token
self.prompt_end_token = prompt_end_token if prompt_end_token else task_start_token
self.sort_json_key = sort_json_key
self.dataset = load_dataset(dataset_name_or_path, split=self.split)
self.dataset_length = len(self.dataset)
self.gt_token_sequences = []
for sample in self.dataset:
ground_truth = json.loads(sample["ground_truth"])
if "gt_parses" in ground_truth: # when multiple ground truths are available, e.g., docvqa
assert isinstance(ground_truth["gt_parses"], list)
gt_jsons = ground_truth["gt_parses"]
else:
assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict)
gt_jsons = [ground_truth["gt_parse"]]
self.gt_token_sequences.append(
[
self.json2token(
gt_json,
update_special_tokens_for_json_key=self.split == "train",
sort_json_key=self.sort_json_key,
)
+ processor.tokenizer.eos_token
for gt_json in gt_jsons # load json from list of json
]
)
self.add_tokens([self.task_start_token, self.prompt_end_token])
self.prompt_end_token_id = processor.tokenizer.convert_tokens_to_ids(self.prompt_end_token)
def json2token(self, obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
"""
Convert an ordered JSON object into a token sequence
"""
if isinstance(obj, dict):
if len(obj) == 1 and "text_sequence" in obj:
return obj["text_sequence"]
else:
output = ""
if sort_json_key:
keys = sorted(obj.keys(), reverse=True)
else:
keys = obj.keys()
for k in keys:
if update_special_tokens_for_json_key:
self.add_tokens([fr"<s_{k}>", fr"</s_{k}>"])
output += (
fr"<s_{k}>"
+ self.json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
+ fr"</s_{k}>"
)
return output
elif isinstance(obj, list):
return r"<sep/>".join(
[self.json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
)
else:
obj = str(obj)
if f"<{obj}/>" in added_tokens:
obj = f"<{obj}/>" # for categorical special tokens
return obj
def add_tokens(self, list_of_tokens: List[str]):
"""
Add special tokens to tokenizer and resize the token embeddings of the decoder
"""
newly_added_num = processor.tokenizer.add_tokens(list_of_tokens)
if newly_added_num > 0:
model.decoder.resize_token_embeddings(len(processor.tokenizer))
added_tokens.extend(list_of_tokens)
def __len__(self) -> int:
return self.dataset_length
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, str]:
"""
Load image from image_path of given dataset_path and convert into input_tensor and labels
Convert gt data into input_ids (tokenized string)
Returns:
input_tensor : preprocessed image
input_ids : tokenized gt_data
labels : masked labels (model doesn't need to predict prompt and pad token)
"""
sample = self.dataset[idx]
# inputs
pixel_values = processor(sample["image"], random_padding=self.split == "train", return_tensors="pt").pixel_values
pixel_values = pixel_values.squeeze()
# targets
target_sequence = random.choice(self.gt_token_sequences[idx]) # can be more than one, e.g., DocVQA Task 1
input_ids = processor.tokenizer(
target_sequence,
add_special_tokens=False,
max_length=self.max_length,
padding="max_length",
truncation=True,
return_tensors="pt",
)["input_ids"].squeeze(0)
labels = input_ids.clone()
labels[labels == processor.tokenizer.pad_token_id] = self.ignore_id # model doesn't need to predict pad token
# labels[: torch.nonzero(labels == self.prompt_end_token_id).sum() + 1] = self.ignore_id # model doesn't need to predict prompt (for VQA)
return pixel_values, labels, target_sequence
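To make the target format concrete, here is a minimal standalone re-implementation of the json2token conversion (illustration only: it skips the special-token bookkeeping and, like the sort_json_key=False setting used below, keeps insertion-order keys):

def demo_json2token(obj):
    """Toy mirror of DonutDataset.json2token, for illustration only."""
    if isinstance(obj, dict):
        if len(obj) == 1 and "text_sequence" in obj:
            return obj["text_sequence"]
        return "".join(f"<s_{k}>{demo_json2token(v)}</s_{k}>" for k, v in obj.items())
    if isinstance(obj, list):
        return "<sep/>".join(demo_json2token(item) for item in obj)
    return str(obj)

print(demo_json2token({"menu": {"name": "latte", "price": "4.50"}}))
# -> <s_menu><s_name>latte</s_name><s_price>4.50</s_price></s_menu>
# The stored training target then appends the tokenizer's EOS token.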
processor.image_processor.size = image_size[::-1] # should be (width, height)
processor.image_processor.do_align_long_axis = False
train_dataset = DonutDataset(DATASET_PATH, max_length=max_length,
split="train", task_start_token="<s_cord-v2>", prompt_end_token="<s_cord-v2>",
sort_json_key=False, # ground truth here is a plain text_sequence, so key sorting is unnecessary
)
val_dataset = DonutDataset(DATASET_PATH, max_length=max_length,
split="validation", task_start_token="<s_cord-v2>", prompt_end_token="<s_cord-v2>",
sort_json_key=False, # ground truth here is a plain text_sequence, so key sorting is unnecessary
)
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(['<s_cord-v2>'])[0]
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=4)
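Before training, a small illustrative sanity check (not part of the original run) that the pipeline produces what the model expects:

# Fetch one example and inspect shapes and special tokens.
pixel_values, labels, target_sequence = train_dataset[0]
print(pixel_values.shape)     # expected torch.Size([3, 1280, 960]), matching image_size
print(labels.shape)           # expected torch.Size([768]), i.e. max_length
print(target_sequence[:100])  # the raw token string the labels were built from

# The decoder start token must resolve to a real vocabulary entry,
# not the unknown token, for generation to be prompted correctly.
assert model.config.decoder_start_token_id != processor.tokenizer.unk_token_id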
class DonutModelPLModule(pl.LightningModule):
def __init__(self, config, processor, model):
super().__init__()
self.config = config
self.processor = processor
self.model = model
def training_step(self, batch, batch_idx):
pixel_values, labels, _ = batch
outputs = self.model(pixel_values, labels=labels)
loss = outputs.loss
self.log_dict({"train_loss": loss}, sync_dist=True)
return loss
def validation_step(self, batch, batch_idx, dataset_idx=0):
pixel_values, labels, answers = batch
batch_size = pixel_values.shape[0]
# we feed the prompt to the model
decoder_input_ids = torch.full((batch_size, 1), self.model.config.decoder_start_token_id, device=self.device)
outputs = self.model.generate(pixel_values,
decoder_input_ids=decoder_input_ids,
max_length=max_length,
early_stopping=True,
pad_token_id=self.processor.tokenizer.pad_token_id,
eos_token_id=self.processor.tokenizer.eos_token_id,
use_cache=True,
num_beams=1,
bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,)
predictions = []
for seq in self.processor.tokenizer.batch_decode(outputs.sequences):
seq = seq.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
predictions.append(seq)
scores = list()
for pred, answer in zip(predictions, answers):
pred = re.sub(r"(?:(?<=>) | (?=</s_))", "", pred)
# NOT NEEDED ANYMORE
# answer = re.sub(r"<.*?>", "", answer, count=1)
answer = answer.replace(self.processor.tokenizer.eos_token, "")
scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))
if self.config.get("verbose", False) and len(scores) == 1:
print(f"Prediction: {pred}")
print(f" Answer: {answer}")
print(f" Normed ED: {scores[0]}")
return scores
def validation_epoch_end(self, validation_step_outputs):
# I set this to 1 manually
# (previously set to len(self.config.dataset_name_or_paths))
num_of_loaders = 1
if num_of_loaders == 1:
validation_step_outputs = [validation_step_outputs]
assert len(validation_step_outputs) == num_of_loaders
cnt = [0] * num_of_loaders
total_metric = [0] * num_of_loaders
val_metric = [0] * num_of_loaders
for i, results in enumerate(validation_step_outputs):
for scores in results:
cnt[i] += len(scores)
total_metric[i] += np.sum(scores)
val_metric[i] = total_metric[i] / cnt[i]
val_metric_name = f"val_metric_{i}th_dataset"
self.log_dict({val_metric_name: val_metric[i]}, sync_dist=True)
self.log_dict({"val_metric": np.sum(total_metric) / np.sum(cnt)}, sync_dist=True)
def configure_optimizers(self):
# TODO add scheduler
optimizer = torch.optim.Adam(self.parameters(), lr=self.config.get("lr"))
return optimizer
def train_dataloader(self):
return train_dataloader
def val_dataloader(self):
return val_dataloader
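For reference, the validation score computed above is the edit distance between prediction and answer, normalized by the longer of the two sequences; a tiny worked example using the same nltk call:

from nltk import edit_distance

pred = "<s_menu>latte</s_menu>"
answer = "<s_menu>late</s_menu>"
# One insertion turns "late" into "latte", so the raw edit distance is 1;
# dividing by the longer length (22) gives roughly 0.045.
print(edit_distance(pred, answer) / max(len(pred), len(answer)))

configure_optimizers leaves the scheduler as a TODO; one way to wire in linear warmup driven by warmup_steps from train_config would be the following sketch (an assumption, not the notebook's actual choice):

from transformers import get_linear_schedule_with_warmup

def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.config.get("lr"))
    steps_per_epoch = self.config.get("num_training_samples_per_epoch") // self.config.get("train_batch_sizes")[0]
    total_steps = steps_per_epoch * self.config.get("max_epochs")
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.config.get("warmup_steps"),
        num_training_steps=total_steps,
    )
    # Step the scheduler every optimizer step, not every epoch.
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]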
class PushToHubCallback(Callback):
def on_train_epoch_end(self, trainer, pl_module):
print(f"Pushing model to the hub, epoch {trainer.current_epoch}")
pl_module.model.push_to_hub(OUTPUT_MODEL_PATH,
commit_message=f"Training in progress, epoch {trainer.current_epoch}")
def on_train_end(self, trainer, pl_module):
print("Pushing model to the hub after training")
pl_module.processor.push_to_hub(OUTPUT_MODEL_PATH,
commit_message="Training done")
pl_module.model.push_to_hub(OUTPUT_MODEL_PATH,
commit_message="Training done")
login(os.environ.get("HUG_TOKEN"))
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /home/pc/.huggingface/token
Login successful
Weights & Biases project: https://wandb.ai/michalkozlowski936/Donut?workspace=user-michalkozlowski936
Hugging Face profile: https://huggingface.co/Zombely
model_module = DonutModelPLModule(train_config, processor, model)
wandb_logger = WandbLogger(project="Donut", name=LOGGING_PATH)
trainer = pl.Trainer(
accelerator="cpu", # change to gpu
devices=1,
max_epochs=train_config.get("max_epochs"),
val_check_interval=train_config.get("val_check_interval"),
check_val_every_n_epoch=train_config.get("check_val_every_n_epoch"),
gradient_clip_val=train_config.get("gradient_clip_val"),
precision=16, # we'll use mixed precision
num_sanity_val_steps=0,
logger=wandb_logger,
callbacks=[PushToHubCallback()],
)
trainer.fit(model_module)
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type                      | Params
----------------------------------------------------
0 | model | VisionEncoderDecoderModel | 213 M
----------------------------------------------------
213 M     Trainable params
0         Non-trainable params
213 M     Total params
854.597   Total estimated model params size (MB)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_294/2569065759.py in <module>
---> 29 trainer.fit(model_module)

~/anaconda3/envs/donut/lib/python3.7/site-packages/pytorch_lightning/... (fit / loop / optimizer-step internals elided)

/tmp/ipykernel_294/1279761003.py in training_step(self, batch, batch_idx)
---> 11 outputs = self.model(pixel_values, labels=labels)

~/anaconda3/envs/donut/lib/python3.7/site-packages/transformers/models/swin/modeling_swin.py in forward(self, pixel_values, bool_masked_pos)
--> 253 embeddings = self.norm(embeddings)

~/anaconda3/envs/donut/lib/python3.7/site-packages/torch/nn/functional.py in layer_norm(input, normalized_shape, weight, bias, eps)
-> 2515 return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)

RuntimeError: expected scalar type BFloat16 but found Float
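The "Using bfloat16 Automatic Mixed Precision (AMP)" line in the trainer output is the clue: with accelerator="cpu" and precision=16, Lightning falls back to bfloat16 AMP on CPU, and the Swin encoder's patch-embedding layer norm ends up mixing float32 and bfloat16 tensors, which raises the dtype error above. A minimal sketch of a more portable Trainer setup (assuming full-precision training is acceptable on CPU):

# Sketch: request mixed precision only when a GPU is actually available.
trainer = pl.Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    max_epochs=train_config.get("max_epochs"),
    val_check_interval=train_config.get("val_check_interval"),
    check_val_every_n_epoch=train_config.get("check_val_every_n_epoch"),
    gradient_clip_val=train_config.get("gradient_clip_val"),
    precision=16 if torch.cuda.is_available() else 32,  # fp16 AMP on GPU, fp32 on CPU
    num_sanity_val_steps=0,
    logger=wandb_logger,
    callbacks=[PushToHubCallback()],
)
trainer.fit(model_module)

On a CUDA machine this keeps the intended fp16 mixed precision; on CPU it simply trains in full precision.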