diff --git a/config-eval.yaml b/config-eval.yaml new file mode 100644 index 0000000..9dc286e --- /dev/null +++ b/config-eval.yaml @@ -0,0 +1,8 @@ +pretrained_processor_path: "Zombely/plwiki-proto-fine-tuned-v2" +pretrained_model_path: "Zombely/plwiki-proto-fine-tuned-v2" +validation_dataset_path: "Zombely/diachronia-ocr" +validation_dataset_split: "train" +has_metadata: False +print_output: True +output_file_dir: "../../gonito-outs" +test_name: "fine-tuned-test" \ No newline at end of file diff --git a/donut-eval.py b/donut-eval.py index c5548e6..5a03d45 100644 --- a/donut-eval.py +++ b/donut-eval.py @@ -1,83 +1,90 @@ #!/usr/bin/env python # coding: utf-8 -# In[1]: - - -from transformers import DonutProcessor, VisionEncoderDecoderModel +from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig from datasets import load_dataset import re import json import torch from tqdm.auto import tqdm import numpy as np - +import pandas as pd from donut import JSONParseEvaluator +import argparse +from sconf import Config +def main(config): -# In[2]: + # image_size = [1920, 2560] + # config_vision = VisionEncoderDecoderConfig.from_pretrained(config.pretrained_model_path) + # config_vision.encoder.image_size = image_size # (height, width) + # config_vision.decoder.max_length = 768 + processor = DonutProcessor.from_pretrained(config.pretrained_processor_path) + model = VisionEncoderDecoderModel.from_pretrained(config.pretrained_model_path) -processor = DonutProcessor.from_pretrained("Zombely/plwiki-proto-fine-tuned") -model = VisionEncoderDecoderModel.from_pretrained("Zombely/plwiki-proto-fine-tuned") + # processor.image_processor.size = image_size[::-1] # should be (width, height) + processor.image_processor.do_align_long_axis = False + dataset = load_dataset(config.validation_dataset_path, split=config.validation_dataset_split) + device = "cuda" if torch.cuda.is_available() else "cpu" + model.eval() + model.to(device) + output_list = [] + accs = [] -# In[3]: + for idx, sample in tqdm(enumerate(dataset), total=len(dataset)): + # prepare encoder inputs + pixel_values = processor(sample['image'].convert("RGB"), return_tensors="pt").pixel_values + pixel_values = pixel_values.to(device) + # prepare decoder inputs + task_prompt = "" + decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids + decoder_input_ids = decoder_input_ids.to(device) + + # autoregressively generate sequence + outputs = model.generate( + pixel_values, + decoder_input_ids=decoder_input_ids, + max_length=model.decoder.config.max_position_embeddings, + early_stopping=True, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + num_beams=1, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + return_dict_in_generate=True, + ) + # turn into JSON + seq = processor.batch_decode(outputs.sequences)[0] + seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") + seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token + seq = processor.token2json(seq) + if config.has_metadata: + ground_truth = json.loads(sample["ground_truth"]) + ground_truth = ground_truth["gt_parse"] + evaluator = JSONParseEvaluator() + score = evaluator.cal_acc(seq, ground_truth) -dataset = load_dataset("Zombely/pl-text-images-5000-whole", split="validation") + accs.append(score) + if config.print_output: + print(seq) + output_list.append(seq) + if config.output_file_dir: + df = pd.DataFrame(map(lambda x: x.get('text_sequence', ''), output_list)) + df.to_csv(f'{config.output_file_dir}/{config.test_name}-out.tsv', sep='\t', header=False, index=False) + if config.has_metadata: + scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)} + print(scores, f"length : {len(accs)}") + print("Mean accuracy:", np.mean(accs)) -# In[4]: - - -device = "cuda" if torch.cuda.is_available() else "cpu" - -model.eval() -model.to(device) - -output_list = [] -accs = [] - - -for idx, sample in tqdm(enumerate(dataset), total=len(dataset)): - # prepare encoder inputs - pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values - pixel_values = pixel_values.to(device) - # prepare decoder inputs - task_prompt = "" - decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids - decoder_input_ids = decoder_input_ids.to(device) - - # autoregressively generate sequence - outputs = model.generate( - pixel_values, - decoder_input_ids=decoder_input_ids, - max_length=model.decoder.config.max_position_embeddings, - early_stopping=True, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - use_cache=True, - num_beams=1, - bad_words_ids=[[processor.tokenizer.unk_token_id]], - return_dict_in_generate=True, - ) - - # turn into JSON - seq = processor.batch_decode(outputs.sequences)[0] - seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") - seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token - seq = processor.token2json(seq) - - ground_truth = json.loads(sample["ground_truth"]) - ground_truth = ground_truth["gt_parse"] - evaluator = JSONParseEvaluator() - score = evaluator.cal_acc(seq, ground_truth) - - accs.append(score) - output_list.append(seq) - -scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)} -print(scores, f"length : {len(accs)}") -print("Mean accuracy:", np.mean(accs)) +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, required=True) + args, left_argv = parser.parse_known_args() + config = Config(args.config) + config.argv_update(left_argv) + main(config) diff --git a/donut-train.py b/donut-train.py index 743c9f4..95dee08 100644 --- a/donut-train.py +++ b/donut-train.py @@ -21,16 +21,16 @@ from pytorch_lightning.plugins import CheckpointIO -DATASET_PATH = "Zombely/pl-text-images-5000-whole" -PRETRAINED_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned" -START_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned" -OUTPUT_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2" -LOGGING_PATH = "plwiki-proto-ft-second-iter" +DATASET_PATH = "Zombely/fiszki-ocr-train" +PRETRAINED_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2" +START_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2" +OUTPUT_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v3" +LOGGING_PATH = "fiszki-ocr-fine-tune" CHECKPOINT_PATH = "./checkpoint" train_config = { - "max_epochs":30, + "max_epochs":1, "val_check_interval":0.5, # how many times we want to validate during an epoch "check_val_every_n_epoch":1, "gradient_clip_val":1.0, @@ -362,7 +362,7 @@ custom_ckpt = CustomCheckpointIO() trainer = pl.Trainer( - accelerator="gpu", # change to gpu + accelerator="gpu" if torch.cuda.is_available() else 'cpu', # change to gpu devices=1, max_epochs=train_config.get("max_epochs"), val_check_interval=train_config.get("val_check_interval"), diff --git a/notepads/dataset_create.ipynb b/notepads/dataset_create.ipynb new file mode 100644 index 0000000..4fbcd73 --- /dev/null +++ b/notepads/dataset_create.ipynb @@ -0,0 +1,822 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from huggingface_hub import login\n", + "from datasets import load_dataset\n", + "import os\n", + "import json\n", + "import shutil" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f0476002f8d14822a24f1376cfe29a07", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='