config and params for donut-eval

This commit is contained in:
Michał Kozłowski 2022-12-16 13:53:03 +01:00
parent d98383197f
commit 8ccd1aabb6
2 changed files with 68 additions and 68 deletions

7
config.yaml Normal file
View File

@ -0,0 +1,7 @@
pretrained_processor_path: "Zombely/plwiki-proto-fine-tuned-v2"
pretrained_model_path: "Zombely/plwiki-proto-fine-tuned-v2"
validation_dataset_path: "Zombely/diachronia-ocr"
validation_dataset_split: "train"
has_metadata: False
print_output: True
output_file_dir: "../../gonito-outs"

View File

@ -1,9 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf-8 # coding: utf-8
# In[1]:
from transformers import DonutProcessor, VisionEncoderDecoderModel from transformers import DonutProcessor, VisionEncoderDecoderModel
from datasets import load_dataset from datasets import load_dataset
import re import re
@ -13,75 +10,71 @@ from tqdm.auto import tqdm
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from donut import JSONParseEvaluator from donut import JSONParseEvaluator
import argparse
from sconf import Config
def main(config):
processor = DonutProcessor.from_pretrained(config.pretrained_processor_path)
model = VisionEncoderDecoderModel.from_pretrained(config.pretrained_model_path)
dataset = load_dataset(config.validation_dataset_path, split=config.validation_dataset_split)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)
output_list = []
accs = []
# In[2]: for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
# prepare encoder inputs
pixel_values = processor(sample['image'].convert("RGB"), return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)
# prepare decoder inputs
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
# autoregressively generate sequence
outputs = model.generate(
pixel_values,
decoder_input_ids=decoder_input_ids,
max_length=model.decoder.config.max_position_embeddings,
early_stopping=True,
pad_token_id=processor.tokenizer.pad_token_id,
eos_token_id=processor.tokenizer.eos_token_id,
use_cache=True,
num_beams=1,
bad_words_ids=[[processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,
)
processor = DonutProcessor.from_pretrained("Zombely/plwiki-proto-fine-tuned-v2") # turn into JSON
model = VisionEncoderDecoderModel.from_pretrained("Zombely/plwiki-proto-fine-tuned-v2") seq = processor.batch_decode(outputs.sequences)[0]
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
seq = processor.token2json(seq)
if config.has_metadata:
ground_truth = json.loads(sample["ground_truth"])
ground_truth = ground_truth["gt_parse"]
evaluator = JSONParseEvaluator()
score = evaluator.cal_acc(seq, ground_truth)
accs.append(score)
if config.print_output:
print(seq)
output_list.append(seq)
if config.output_file_dir:
df = pd.DataFrame(map(lambda x: x.get('text_sequence', ''), output_list))
df.to_csv(f'{config.output_file_dir}/{config.pretrained_processor_path}-out.tsv', sep='\t', header=False, index=False)
# In[3]: if config.has_metadata:
scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")
print("Mean accuracy:", np.mean(accs))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
args, left_argv = parser.parse_known_args()
config = Config(args.config)
config.argv_update(left_argv)
dataset = load_dataset("Zombely/diachronia-ocr", split='train') main(config)
# In[4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)
output_list = []
accs = []
has_metadata = bool(dataset[0].get('ground_truth'))
for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
# prepare encoder inputs
pixel_values = processor(sample['image'].convert("RGB"), return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)
# prepare decoder inputs
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
# autoregressively generate sequence
outputs = model.generate(
pixel_values,
decoder_input_ids=decoder_input_ids,
max_length=model.decoder.config.max_position_embeddings,
early_stopping=True,
pad_token_id=processor.tokenizer.pad_token_id,
eos_token_id=processor.tokenizer.eos_token_id,
use_cache=True,
num_beams=1,
bad_words_ids=[[processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,
)
# turn into JSON
seq = processor.batch_decode(outputs.sequences)[0]
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
seq = processor.token2json(seq)
if has_metadata:
ground_truth = json.loads(sample["ground_truth"])
ground_truth = ground_truth["gt_parse"]
evaluator = JSONParseEvaluator()
score = evaluator.cal_acc(seq, ground_truth)
accs.append(score)
print(seq)
output_list.append(seq)
df = pd.DataFrame(map(lambda x: x.get('text_sequence', ''), output_list))
df.to_csv('out.tsv', sep='\t', header=False, index=False)
if has_metadata:
scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")
print("Mean accuracy:", np.mean(accs))