train pure function for map, gitignore added vscode

2023-03-14 20:52:24 +01:00 · 2023-03-14 20:52:24 +01:00 · c474b560aa
commit c474b560aa
parent 2f1176b3c0
3 changed files with 125 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,4 @@ nohup.out
 wandb
 __pycache__/
 checkpoint
 .vscode
--- a/train_stream.py
+++ b/train_stream.py
@ -1,3 +1,4 @@
 from typing import Any, List
 from transformers import VisionEncoderDecoderConfig, DonutProcessor, VisionEncoderDecoderModel
 import torch
 from torch.utils.data import DataLoader
@ -15,7 +16,7 @@ from utils.callbacks import PushToHubCallback
 import warnings
 from datasets import load_dataset, interleave_datasets
 from torchdata.datapipes.iter import IterableWrapper
-
+import json
 def main(config, hug_token):
@ -34,30 +35,128 @@ def main(config, hug_token):
    added_tokens = []
-    dataset = load_dataset(config.dataset_path, split="train[:80%]")
+    ### PROCESS FUNC START ###
    dataset = dataset.train_test_split(test_size=0.1)
-    train_dataset_process = DonutDatasetStream(
+    def add_tokens(list_of_tokens: List[str]):
-                        processor=processor, 
+        """
-                        model=model, 
+        Add special tokens to tokenizer and resize the token embeddings of the decoder
+
+        The decoder embeddings are resized and the enclosing ``added_tokens`` list is
+        extended only when the tokenizer reports at least one newly added token.
+
+        Args:
+            list_of_tokens: token strings passed to ``processor.tokenizer.add_tokens``.
-                        max_length=config.max_length,
+        """
-                        split="train", 
+        newly_added_num = processor.tokenizer.add_tokens(list_of_tokens)
-                        task_start_token="<s_cord-v2>", 
+        if newly_added_num > 0:
-                        prompt_end_token="<s_cord-v2>",
+            model.decoder.resize_token_embeddings(len(processor.tokenizer))
-                        added_tokens=added_tokens,
+            added_tokens.extend(list_of_tokens)
-                        sort_json_key=False,  # cord dataset is preprocessed, so no need for this
+
    def json2token(obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
        """
        Convert an ordered JSON object into a token sequence.

        Args:
            obj: a dict, a list, or a leaf value (leaves are converted with ``str``).
            update_special_tokens_for_json_key: when True, the ``<s_key>`` / ``</s_key>``
                pair of every dict key is registered through ``add_tokens``.
            sort_json_key: when True, dict keys are emitted in reverse-sorted order;
                otherwise the dict's own iteration order is used.

        Returns:
            The flattened token string. A dict whose only key is ``"text_sequence"``
            is returned as that value unchanged; list items are joined with ``<sep/>``.
        """
        # isinstance instead of an exact type comparison, so dict / list subclasses
        # (e.g. OrderedDict) are traversed rather than stringified as a leaf.
        if isinstance(obj, dict):
            if len(obj) == 1 and "text_sequence" in obj:
                return obj["text_sequence"]
            keys = sorted(obj, reverse=True) if sort_json_key else list(obj)
            pieces = []
            for k in keys:
                open_tag, close_tag = fr"<s_{k}>", fr"</s_{k}>"
                if update_special_tokens_for_json_key:
                    # Register the key tokens before descending into the value.
                    add_tokens([open_tag, close_tag])
                pieces.append(
                    open_tag
                    + json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
                    + close_tag
                )
            return "".join(pieces)
        if isinstance(obj, list):
            return r"<sep/>".join(
                [json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
            )
        obj = str(obj)
        if f"<{obj}/>" in added_tokens:
            obj = f"<{obj}/>"  # for categorical special tokens
        return obj
    def process(row, split):
        task_start_token, prompt_end_token = "<s_cord-v2>"
        ground_truth = json.loads(row["ground_truth"])
        if "gt_parses" in ground_truth:  # when multiple ground truths are available, e.g., docvqa
            assert isinstance(ground_truth["gt_parses"], list)
            gt_jsons = ground_truth["gt_parses"]
        else:
            assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict)
            gt_jsons = [ground_truth["gt_parse"]]
        gt_token_sequences = (
            [
                json2token(
                    gt_json,
                    update_special_tokens_for_json_key=split == "train",
                    sort_json_key=False,
                )
                + processor.tokenizer.eos_token
                for gt_json in gt_jsons  # load json from list of json
            ]
        )
-    val_dataset_process = DonutDatasetStream(
+        add_tokens([task_start_token, prompt_end_token])
-                        processor=processor, 
+        prompt_end_token_id = processor.tokenizer.convert_tokens_to_ids(prompt_end_token)
-                        model=model, 
+
        # change if not 3 channels
        if row['image'].mode != "RGB":
            row['image'] = row['image'].convert("RGB")
        # inputs
        pixel_values = processor(row["image"], random_padding=split == "train", return_tensors="pt").pixel_values
        pixel_values = pixel_values.squeeze()
        # targets
        input_ids = processor.tokenizer(
            gt_token_sequences,
            add_special_tokens=False,
            max_length=config.max_length,
-                        split="test", 
+            padding="max_length",
-                        task_start_token="<s_cord-v2>", 
+            truncation=True,
-                        prompt_end_token="<s_cord-v2>",
+            return_tensors="pt",
-                        added_tokens=added_tokens,
+        )["input_ids"].squeeze(0)
-                        sort_json_key=False,  # cord dataset is preprocessed, so no need for this
+
-                    )
+        labels = input_ids.clone()
        labels[labels == processor.tokenizer.pad_token_id] = -100  # model doesn't need to predict pad token
        return {"pixel_values": pixel_values, "labels": labels, 'target_sequence': gt_token_sequences }
    def proces_train(row):
        """Run ``process`` on ``row`` with the 'train' split."""
        processed = process(row, 'train')
        return processed
    def proces_val(row):
        """Run ``process`` on ``row`` with the 'validation' split."""
        processed = process(row, 'validation')
        return processed
    ### PROCESS FUNC END ###
    # train_dataset_process = DonutDatasetStream(
    #                     processor=processor, 
    #                     model=model, 
    #                     max_length=config.max_length,
    #                     split="train", 
    #                     task_start_token="<s_cord-v2>", 
    #                     prompt_end_token="<s_cord-v2>",
    #                     added_tokens=added_tokens,
    #                     sort_json_key=False,  # cord dataset is preprocessed, so no need for this
    #                 )
    # val_dataset_process = DonutDatasetStream(
    #                     processor=processor, 
    #                     model=model, 
    #                     max_length=config.max_length,
    #                     split="validation", 
    #                     task_start_token="<s_cord-v2>", 
    #                     prompt_end_token="<s_cord-v2>",
    #                     added_tokens=added_tokens,
    #                     sort_json_key=False,  # cord dataset is preprocessed, so no need for this
    #                 )
    dataset = load_dataset(config.dataset_path, streaming=True)
    val_dataset = dataset.pop('validation') 
@ -66,8 +165,8 @@ def main(config, hug_token):
    # val_length = list(val_dataset.info.splits.values())[-1].num_examples
-    train_dataset = train_dataset.map(lambda x: train_dataset_process.process(x), remove_columns = ['image', 'ground_truth'])
+    train_dataset = train_dataset.map(proces_train, remove_columns = ['image', 'ground_truth'])
-    val_dataset = val_dataset.map(lambda x: val_dataset_process.process(x), remove_columns = ['image', 'ground_truth'])
+    val_dataset = val_dataset.map(proces_val, remove_columns = ['image', 'ground_truth'])
    # train_dataset = train_dataset.with_format('torch')
    # val_dataset = val_dataset.with_format('torch')
--- a/utils/donut_dataset_stream.py
+++ b/utils/donut_dataset_stream.py
@ -8,19 +8,6 @@ from transformers import DonutProcessor, VisionEncoderDecoderModel
 class DonutDatasetStream:
    """
    DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
    Each row, consists of image path(png/jpg/jpeg) and gt data (json/jsonl/txt),
    and it will be converted into input_tensor(vectorized image) and input_ids(tokenized string).
    Args:
        dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
        max_length: the max number of tokens for the target sequences
        split: whether to load "train", "validation" or "test" split
        ignore_id: ignore_index for torch.nn.CrossEntropyLoss
        task_start_token: the special token to be fed to the decoder to conduct the target task
        prompt_end_token: the special token at the end of the sequences
        sort_json_key: whether or not to sort the JSON keys
    """
    def __init__(
        self,
@ -34,7 +21,6 @@ class DonutDatasetStream:
        sort_json_key: bool = True,
        added_tokens: list = []
    ):
        super().__init__()
        self.split = split
        self.max_length = max_length