diff --git a/.gitignore b/.gitignore
index 5cee8f1..747d90c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ Donut
 nohup.out
 wandb
 __pycache__/
-checkpoint
\ No newline at end of file
+checkpoint
+.vscode
\ No newline at end of file
diff --git a/train_stream.py b/train_stream.py
index 3079ffc..4e21818 100644
--- a/train_stream.py
+++ b/train_stream.py
@@ -1,3 +1,4 @@
+from typing import Any, List
 from transformers import VisionEncoderDecoderConfig, DonutProcessor, VisionEncoderDecoderModel
 import torch
 from torch.utils.data import DataLoader
@@ -15,7 +16,7 @@ from utils.callbacks import PushToHubCallback
 import warnings
 from datasets import load_dataset, interleave_datasets
 from torchdata.datapipes.iter import IterableWrapper
-
+import json
 
 
 def main(config, hug_token):
@@ -34,30 +35,128 @@ def main(config, hug_token):
 
     added_tokens = []
 
-    dataset = load_dataset(config.dataset_path, split="train[:80%]")
-    dataset = dataset.train_test_split(test_size=0.1)
+    ### PROCESS FUNC START ###
 
-    train_dataset_process = DonutDatasetStream(
-                        processor=processor, 
-                        model=model, 
-                        max_length=config.max_length,
-                        split="train", 
-                        task_start_token="<s_cord-v2>", 
-                        prompt_end_token="<s_cord-v2>",
-                        added_tokens=added_tokens,
-                        sort_json_key=False,  # cord dataset is preprocessed, so no need for this
-                    )
+    def add_tokens(list_of_tokens: List[str]):
+        """Register `list_of_tokens` with the tokenizer; when at least one of them is new,
+        resize the decoder token embeddings and append the whole list to `added_tokens`."""
+        tokenizer = processor.tokenizer
+        if tokenizer.add_tokens(list_of_tokens) <= 0:
+            return  # nothing new: embeddings and `added_tokens` stay as they are
+        model.decoder.resize_token_embeddings(len(tokenizer))
+        added_tokens.extend(list_of_tokens)
 
-    val_dataset_process = DonutDatasetStream(
-                        processor=processor, 
-                        model=model, 
-                        max_length=config.max_length,
-                        split="test", 
-                        task_start_token="<s_cord-v2>", 
-                        prompt_end_token="<s_cord-v2>",
-                        added_tokens=added_tokens,
-                        sort_json_key=False,  # cord dataset is preprocessed, so no need for this
+    def json2token(obj: Any, update_special_tokens_for_json_key: bool = True, sort_json_key: bool = True):
+        """Recursively flatten `obj` into one string: each dict key wraps its value as
+        `<s_key>...</s_key>`, list items are joined with `<sep/>`, and any other value is `str()`-ed
+        (then rewritten to `<value/>` when that exact token is already in `added_tokens`)."""
+        if type(obj) == dict:
+            if len(obj) == 1 and "text_sequence" in obj:  # a lone "text_sequence" entry is returned unwrapped
+                return obj["text_sequence"]
+            else:
+                output = ""
+                if sort_json_key:
+                    keys = sorted(obj.keys(), reverse=True)  # reverse-sorted key order
+                else:
+                    keys = obj.keys()  # the dict's own iteration order
+                for k in keys:
+                    if update_special_tokens_for_json_key:
+                        add_tokens([fr"<s_{k}>", fr"</s_{k}>"])  # register the key's opening/closing tags
+                    output += (
+                        fr"<s_{k}>"
+                        + json2token(obj[k], update_special_tokens_for_json_key, sort_json_key)
+                        + fr"</s_{k}>"
                     )
+                return output
+        elif type(obj) == list:
+            return r"<sep/>".join(
+                [json2token(item, update_special_tokens_for_json_key, sort_json_key) for item in obj]
+            )
+        else:
+            obj = str(obj)
+            if f"<{obj}/>" in added_tokens:
+                obj = f"<{obj}/>"  # for categorical special tokens
+            return obj
+
+    def process(row, split):
+        """
+        Convert one dataset row into model inputs for `split` ("train" or "validation").
+        `row["ground_truth"]` is a JSON string holding `gt_parses` (list) or `gt_parse` (dict);
+        only for "train" are JSON keys registered as special tokens and random padding enabled.
+        Returns `pixel_values`, `labels` (pad ids replaced by -100) and the `target_sequence` strings.
+        """
+        # One literal serves as both tokens; use chained assignment, since unpacking a string splits it into characters.
+        task_start_token = prompt_end_token = "<s_cord-v2>"
+        ground_truth = json.loads(row["ground_truth"])
+        if "gt_parses" in ground_truth:  # when multiple ground truths are available, e.g., docvqa
+            assert isinstance(ground_truth["gt_parses"], list)
+            gt_jsons = ground_truth["gt_parses"]
+        else:
+            assert "gt_parse" in ground_truth and isinstance(ground_truth["gt_parse"], dict)
+            gt_jsons = [ground_truth["gt_parse"]]
+
+        gt_token_sequences = [
+            json2token(gt_json, update_special_tokens_for_json_key=split == "train", sort_json_key=False)
+            + processor.tokenizer.eos_token
+            for gt_json in gt_jsons  # load json from list of json
+        ]
+
+        add_tokens([task_start_token, prompt_end_token])
+
+        # change if not 3 channels
+        if row['image'].mode != "RGB":
+            row['image'] = row['image'].convert("RGB")
+
+        # inputs
+        pixel_values = processor(row["image"], random_padding=split == "train", return_tensors="pt").pixel_values
+        pixel_values = pixel_values.squeeze()
+
+        # targets
+        input_ids = processor.tokenizer(
+            gt_token_sequences,
+            add_special_tokens=False,
+            max_length=config.max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        )["input_ids"].squeeze(0)
+
+        labels = input_ids.clone()
+        labels[labels == processor.tokenizer.pad_token_id] = -100  # model doesn't need to predict pad token
+        return {"pixel_values": pixel_values, "labels": labels, 'target_sequence': gt_token_sequences }
+
+    def proces_train(row):  # single-argument adapter so `process` can be handed to `Dataset.map`
+        return process(row, split='train')
+    
+    def proces_val(row):  # single-argument adapter so `process` can be handed to `Dataset.map`
+        return process(row, split='validation')
+
+
+
+
+    ### PROCESS FUNC END ###
+
+    # train_dataset_process = DonutDatasetStream(
+    #                     processor=processor, 
+    #                     model=model, 
+    #                     max_length=config.max_length,
+    #                     split="train", 
+    #                     task_start_token="<s_cord-v2>", 
+    #                     prompt_end_token="<s_cord-v2>",
+    #                     added_tokens=added_tokens,
+    #                     sort_json_key=False,  # cord dataset is preprocessed, so no need for this
+    #                 )
+
+    # val_dataset_process = DonutDatasetStream(
+    #                     processor=processor, 
+    #                     model=model, 
+    #                     max_length=config.max_length,
+    #                     split="validation", 
+    #                     task_start_token="<s_cord-v2>", 
+    #                     prompt_end_token="<s_cord-v2>",
+    #                     added_tokens=added_tokens,
+    #                     sort_json_key=False,  # cord dataset is preprocessed, so no need for this
+    #                 )
     
     dataset = load_dataset(config.dataset_path, streaming=True)
     val_dataset = dataset.pop('validation') 
@@ -66,8 +165,8 @@ def main(config, hug_token):
     # val_length = list(val_dataset.info.splits.values())[-1].num_examples
 
 
-    train_dataset = train_dataset.map(lambda x: train_dataset_process.process(x), remove_columns = ['image', 'ground_truth'])
-    val_dataset = val_dataset.map(lambda x: val_dataset_process.process(x), remove_columns = ['image', 'ground_truth'])
+    train_dataset = train_dataset.map(proces_train, remove_columns = ['image', 'ground_truth'])
+    val_dataset = val_dataset.map(proces_val, remove_columns = ['image', 'ground_truth'])
 
     # train_dataset = train_dataset.with_format('torch')
     # val_dataset = val_dataset.with_format('torch')
diff --git a/utils/donut_dataset_stream.py b/utils/donut_dataset_stream.py
index 3757a4d..62f06ea 100644
--- a/utils/donut_dataset_stream.py
+++ b/utils/donut_dataset_stream.py
@@ -8,19 +8,6 @@ from transformers import DonutProcessor, VisionEncoderDecoderModel
 
 
 class DonutDatasetStream:
-    """
-    DonutDataset which is saved in huggingface datasets format. (see details in https://huggingface.co/docs/datasets)
-    Each row, consists of image path(png/jpg/jpeg) and gt data (json/jsonl/txt),
-    and it will be converted into input_tensor(vectorized image) and input_ids(tokenized string).
-    Args:
-        dataset_name_or_path: name of dataset (available at huggingface.co/datasets) or the path containing image files and metadata.jsonl
-        max_length: the max number of tokens for the target sequences
-        split: whether to load "train", "validation" or "test" split
-        ignore_id: ignore_index for torch.nn.CrossEntropyLoss
-        task_start_token: the special token to be fed to the decoder to conduct the target task
-        prompt_end_token: the special token at the end of the sequences
-        sort_json_key: whether or not to sort the JSON keys
-    """
 
     def __init__(
         self,
@@ -34,7 +21,6 @@ class DonutDatasetStream:
         sort_json_key: bool = True,
         added_tokens: list = []
     ):
-        super().__init__()
 
         self.split = split
         self.max_length = max_length