utils deleted

2023-01-04 09:52:43 +01:00 · 2023-01-04 09:52:43 +01:00 · ac724952a1
commit ac724952a1
parent ccd4090d4b
3 changed files with 0 additions and 153 deletions
--- a/utils/__init__.py
+++ b/utils/__init__.py
--- a/utils/checkpoint.py
+++ b/utils/checkpoint.py
@ -1,56 +0,0 @
-import os
-
-import torch
-from pytorch_lightning.plugins import CheckpointIO
-
-
-class CustomCheckpointIO(CheckpointIO):
-    """Checkpoint IO plugin that stores model weights apart from the checkpoint.
-
-    ``save_checkpoint`` writes the checkpoint without its ``state_dict``, and
-    ``load_checkpoint`` rebuilds one from ``artifacts.ckpt`` plus the weights in
-    ``pytorch_model.bin``.
-
-    NOTE(review): saving writes to ``path`` itself while loading reads
-    ``artifacts.ckpt`` under ``path`` -- confirm callers pass matching paths.
-    """
-
-    def save_checkpoint(self, checkpoint, path, storage_options=None):
-        """Save ``checkpoint`` to ``path`` without its ``state_dict`` entry.
-
-        Args:
-            checkpoint: checkpoint dictionary; it is left unmodified.
-            path: destination handed to ``torch.save``.
-            storage_options: accepted for interface compatibility, unused.
-        """
-        # Build a filtered copy rather than ``del`` so the caller's dict keeps
-        # its weights and a checkpoint lacking "state_dict" raises no KeyError.
-        artifacts = {key: value for key, value in checkpoint.items() if key != "state_dict"}
-        torch.save(artifacts, path)
-
-    def load_checkpoint(self, path, storage_options=None):
-        """Load the checkpoint artifacts and model weights found under ``path``.
-
-        Args:
-            path: directory holding ``artifacts.ckpt`` and ``pytorch_model.bin``.
-            storage_options: accepted for interface compatibility, unused.
-
-        Returns:
-            The dictionary loaded from ``artifacts.ckpt`` with ``state_dict`` set
-            to the weights from ``pytorch_model.bin``, each key prefixed "model.".
-        """
-        # os.path.join matches the former ``path + name`` when ``path`` ends with
-        # a separator and no longer glues the names together when it does not.
-        # NOTE(review): torch.load unpickles data -- only load trusted files.
-        checkpoint = torch.load(os.path.join(path, "artifacts.ckpt"))
-        state_dict = torch.load(os.path.join(path, "pytorch_model.bin"))
-        checkpoint["state_dict"] = {"model." + key: value for key, value in state_dict.items()}
-        return checkpoint
-
-    def remove_checkpoint(self, path) -> None:
-        """Delegate checkpoint removal to the ``CheckpointIO`` base class.
-
-        NOTE(review): if the base method is abstract this call removes nothing
-        -- confirm against the installed pytorch_lightning version.
-        """
-        return super().remove_checkpoint(path)
--- a/utils/donut_dataset.py
+++ b/utils/donut_dataset.py
@ -1,157 +0,0 @
-class DonutDataset(Dataset):
-    """
-    DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
-    Each row provides an image (`image`) and gt data as a JSON string (`ground_truth`),
-    which are converted into pixel_values (vectorized image) and labels (tokenized string).
-
-    NOTE(review): this class reads `processor`, `model` and `added_tokens` as module-level
-    names that are not defined here; presumably the surrounding script provides them -- confirm.
-
-    Args:
-        dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
-        max_length: the max number of tokens for the target sequences
-        split: whether to load "train", "validation" or "test" split
-        ignore_id: ignore_index for torch.nn.CrossEntropyLoss
-        task_start_token: the special token to be fed to the decoder to conduct the target task
-        prompt_end_token: the special token at the end of the sequences (defaults to task_start_token)
-        sort_json_key: whether or not to sort the JSON keys
-    """
-
-    def __init__(
-        self,
-        dataset_name_or_path: str,
-        max_length: int,
-        split: str = "train",
-        ignore_id: int = -100,
-        task_start_token: str = "<s>",
-        prompt_end_token: str = None,
-        sort_json_key: bool = True,
-    ):
-        super().__init__()
-
-        self.max_length = max_length
-        self.split = split
-        self.ignore_id = ignore_id
-        self.task_start_token = task_start_token
-        self.prompt_end_token = prompt_end_token if prompt_end_token else task_start_token
-        self.sort_json_key = sort_json_key
-
-        self.dataset = load_dataset(dataset_name_or_path, split=self.split)
-        self.dataset_length = len(self.dataset)
-
-        # JSON-key tokens are only registered while building the train split.
-        is_train = self.split == "train"
-        eos_token = processor.tokenizer.eos_token  # loop-invariant, looked up once
-
-        # One list of candidate target strings per sample, tokenized lazily in __getitem__.
-        self.gt_token_sequences = []
-        for sample in self.dataset:
-            ground_truth = json.loads(sample["ground_truth"])
-            if "gt_parses" in ground_truth:  # when multiple ground truths are available, e.g., docvqa
-                assert isinstance(ground_truth["gt_parses"], list), "'gt_parses' must be a list"
-                gt_jsons = ground_truth["gt_parses"]
-            else:
-                assert "gt_parse" in ground_truth and isinstance(
-                    ground_truth["gt_parse"], dict
-                ), "ground_truth needs a 'gt_parses' list or a 'gt_parse' dict"
-                gt_jsons = [ground_truth["gt_parse"]]
-
-            self.gt_token_sequences.append(
-                [
-                    self.json2token(
-                        gt_json,
-                        update_special_tokens_for_json_key=is_train,
-                        sort_json_key=self.sort_json_key,
-                    )
-                    + eos_token
-                    for gt_json in gt_jsons  # load json from list of json
-                ]
-            )
-
-        self.add_tokens([self.task_start_token, self.prompt_end_token])
-        self.prompt_end_token_id = processor.tokenizer.convert_tokens_to_ids(self.prompt_end_token)
-
-    def json2token(self, obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
-        """
-        Convert an ordered JSON object into a token sequence
-
-        Args:
-            obj: parsed JSON value (dict, list or scalar)
-            update_special_tokens_for_json_key: whether to register `<s_key>`/`</s_key>` as tokens via add_tokens
-            sort_json_key: whether to emit dict keys in reverse-sorted order instead of insertion order
-        Returns:
-            the flattened string; a dict whose only key is "text_sequence" yields that value unchanged
-        """
-        # isinstance (not `type(obj) ==`) so dict/list subclasses are serialized structurally too
-        if isinstance(obj, dict):
-            if len(obj) == 1 and "text_sequence" in obj:
-                return obj["text_sequence"]
-            keys = sorted(obj, reverse=True) if sort_json_key else obj
-            pieces = []
-            for k in keys:
-                start_token, end_token = f"<s_{k}>", f"</s_{k}>"
-                if update_special_tokens_for_json_key:
-                    self.add_tokens([start_token, end_token])
-                child = self.json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
-                pieces.append(start_token + child + end_token)
-            return "".join(pieces)
-        if isinstance(obj, list):
-            return r"<sep/>".join(
-                [self.json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
-            )
-        obj = str(obj)
-        if f"<{obj}/>" in added_tokens:
-            obj = f"<{obj}/>"  # for categorical special tokens
-        return obj
-
-    def add_tokens(self, list_of_tokens: List[str]):
-        """
-        Add special tokens to tokenizer and resize the token embeddings of the decoder
-
-        Args:
-            list_of_tokens: tokens to register with `processor.tokenizer`
-        """
-        newly_added_num = processor.tokenizer.add_tokens(list_of_tokens)
-        if newly_added_num > 0:
-            model.decoder.resize_token_embeddings(len(processor.tokenizer))
-            # Record each token once: extending with the whole list duplicated tokens that
-            # were already recorded (or repeated in the list) whenever any other token was new.
-            for token in list_of_tokens:
-                if token not in added_tokens:
-                    added_tokens.append(token)
-
-    def __len__(self) -> int:
-        """Return the number of samples in the loaded split."""
-        return self.dataset_length
-
-    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, str]:
-        """
-        Preprocess the image of sample `idx` and tokenize one of its gt sequences
-
-        Returns:
-            pixel_values : preprocessed image with dimension 0 squeezed (presumably the batch axis)
-            labels : tokenized gt data with pad tokens replaced by ignore_id
-            target_sequence : the gt string the labels were tokenized from
-        """
-        sample = self.dataset[idx]
-
-        # inputs
-        pixel_values = processor(sample["image"], random_padding=self.split == "train", return_tensors="pt").pixel_values
-        # squeeze(0) drops only dimension 0, matching input_ids below; a bare squeeze() would also drop any other size-1 dimension
-        pixel_values = pixel_values.squeeze(0)
-
-        # targets
-        target_sequence = random.choice(self.gt_token_sequences[idx])  # can be more than one, e.g., DocVQA Task 1
-        input_ids = processor.tokenizer(
-            target_sequence,
-            add_special_tokens=False,
-            max_length=self.max_length,
-            padding="max_length",
-            truncation=True,
-            return_tensors="pt",
-        )["input_ids"].squeeze(0)
-
-        labels = input_ids.clone()
-        labels[labels == processor.tokenizer.pad_token_id] = self.ignore_id  # model doesn't need to predict pad token
-        # labels[: torch.nonzero(labels == self.prompt_end_token_id).sum() + 1] = self.ignore_id  # model doesn't need to predict prompt (for VQA)
-        return pixel_values, labels, target_sequence