Merge branch 'vm-changes'
This commit is contained in:
commit
ccd4090d4b
8
config-eval.yaml
Normal file
8
config-eval.yaml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
pretrained_processor_path: "Zombely/plwiki-proto-fine-tuned-v2"
|
||||||
|
pretrained_model_path: "Zombely/plwiki-proto-fine-tuned-v2"
|
||||||
|
validation_dataset_path: "Zombely/diachronia-ocr"
|
||||||
|
validation_dataset_split: "train"
|
||||||
|
has_metadata: False
|
||||||
|
print_output: True
|
||||||
|
output_file_dir: "../../gonito-outs"
|
||||||
|
test_name: "fine-tuned-test"
|
131
donut-eval.py
131
donut-eval.py
@ -1,83 +1,90 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
# In[1]:
|
from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig
|
||||||
|
|
||||||
|
|
||||||
from transformers import DonutProcessor, VisionEncoderDecoderModel
|
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import torch
|
import torch
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
from donut import JSONParseEvaluator
|
from donut import JSONParseEvaluator
|
||||||
|
import argparse
|
||||||
|
from sconf import Config
|
||||||
|
|
||||||
|
def main(config):
|
||||||
|
|
||||||
# In[2]:
|
# image_size = [1920, 2560]
|
||||||
|
# config_vision = VisionEncoderDecoderConfig.from_pretrained(config.pretrained_model_path)
|
||||||
|
# config_vision.encoder.image_size = image_size # (height, width)
|
||||||
|
# config_vision.decoder.max_length = 768
|
||||||
|
|
||||||
|
processor = DonutProcessor.from_pretrained(config.pretrained_processor_path)
|
||||||
|
model = VisionEncoderDecoderModel.from_pretrained(config.pretrained_model_path)
|
||||||
|
|
||||||
processor = DonutProcessor.from_pretrained("Zombely/plwiki-proto-fine-tuned")
|
# processor.image_processor.size = image_size[::-1] # should be (width, height)
|
||||||
model = VisionEncoderDecoderModel.from_pretrained("Zombely/plwiki-proto-fine-tuned")
|
processor.image_processor.do_align_long_axis = False
|
||||||
|
|
||||||
|
dataset = load_dataset(config.validation_dataset_path, split=config.validation_dataset_split)
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
model.eval()
|
||||||
|
model.to(device)
|
||||||
|
output_list = []
|
||||||
|
accs = []
|
||||||
|
|
||||||
# In[3]:
|
for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
|
||||||
|
# prepare encoder inputs
|
||||||
|
pixel_values = processor(sample['image'].convert("RGB"), return_tensors="pt").pixel_values
|
||||||
|
pixel_values = pixel_values.to(device)
|
||||||
|
# prepare decoder inputs
|
||||||
|
task_prompt = "<s_cord-v2>"
|
||||||
|
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
|
||||||
|
decoder_input_ids = decoder_input_ids.to(device)
|
||||||
|
|
||||||
|
# autoregressively generate sequence
|
||||||
|
outputs = model.generate(
|
||||||
|
pixel_values,
|
||||||
|
decoder_input_ids=decoder_input_ids,
|
||||||
|
max_length=model.decoder.config.max_position_embeddings,
|
||||||
|
early_stopping=True,
|
||||||
|
pad_token_id=processor.tokenizer.pad_token_id,
|
||||||
|
eos_token_id=processor.tokenizer.eos_token_id,
|
||||||
|
use_cache=True,
|
||||||
|
num_beams=1,
|
||||||
|
bad_words_ids=[[processor.tokenizer.unk_token_id]],
|
||||||
|
return_dict_in_generate=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# turn into JSON
|
||||||
|
seq = processor.batch_decode(outputs.sequences)[0]
|
||||||
|
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
|
||||||
|
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
|
||||||
|
seq = processor.token2json(seq)
|
||||||
|
if config.has_metadata:
|
||||||
|
ground_truth = json.loads(sample["ground_truth"])
|
||||||
|
ground_truth = ground_truth["gt_parse"]
|
||||||
|
evaluator = JSONParseEvaluator()
|
||||||
|
score = evaluator.cal_acc(seq, ground_truth)
|
||||||
|
|
||||||
dataset = load_dataset("Zombely/pl-text-images-5000-whole", split="validation")
|
accs.append(score)
|
||||||
|
if config.print_output:
|
||||||
|
print(seq)
|
||||||
|
output_list.append(seq)
|
||||||
|
if config.output_file_dir:
|
||||||
|
df = pd.DataFrame(map(lambda x: x.get('text_sequence', ''), output_list))
|
||||||
|
df.to_csv(f'{config.output_file_dir}/{config.test_name}-out.tsv', sep='\t', header=False, index=False)
|
||||||
|
|
||||||
|
if config.has_metadata:
|
||||||
|
scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
|
||||||
|
print(scores, f"length : {len(accs)}")
|
||||||
|
print("Mean accuracy:", np.mean(accs))
|
||||||
|
|
||||||
# In[4]:
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--config", type=str, required=True)
|
||||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
args, left_argv = parser.parse_known_args()
|
||||||
|
config = Config(args.config)
|
||||||
model.eval()
|
config.argv_update(left_argv)
|
||||||
model.to(device)
|
|
||||||
|
|
||||||
output_list = []
|
|
||||||
accs = []
|
|
||||||
|
|
||||||
|
|
||||||
for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
|
|
||||||
# prepare encoder inputs
|
|
||||||
pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
|
|
||||||
pixel_values = pixel_values.to(device)
|
|
||||||
# prepare decoder inputs
|
|
||||||
task_prompt = "<s_cord-v2>"
|
|
||||||
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
|
|
||||||
decoder_input_ids = decoder_input_ids.to(device)
|
|
||||||
|
|
||||||
# autoregressively generate sequence
|
|
||||||
outputs = model.generate(
|
|
||||||
pixel_values,
|
|
||||||
decoder_input_ids=decoder_input_ids,
|
|
||||||
max_length=model.decoder.config.max_position_embeddings,
|
|
||||||
early_stopping=True,
|
|
||||||
pad_token_id=processor.tokenizer.pad_token_id,
|
|
||||||
eos_token_id=processor.tokenizer.eos_token_id,
|
|
||||||
use_cache=True,
|
|
||||||
num_beams=1,
|
|
||||||
bad_words_ids=[[processor.tokenizer.unk_token_id]],
|
|
||||||
return_dict_in_generate=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# turn into JSON
|
|
||||||
seq = processor.batch_decode(outputs.sequences)[0]
|
|
||||||
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
|
|
||||||
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
|
|
||||||
seq = processor.token2json(seq)
|
|
||||||
|
|
||||||
ground_truth = json.loads(sample["ground_truth"])
|
|
||||||
ground_truth = ground_truth["gt_parse"]
|
|
||||||
evaluator = JSONParseEvaluator()
|
|
||||||
score = evaluator.cal_acc(seq, ground_truth)
|
|
||||||
|
|
||||||
accs.append(score)
|
|
||||||
output_list.append(seq)
|
|
||||||
|
|
||||||
scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
|
|
||||||
print(scores, f"length : {len(accs)}")
|
|
||||||
print("Mean accuracy:", np.mean(accs))
|
|
||||||
|
|
||||||
|
main(config)
|
||||||
|
@ -21,16 +21,16 @@ from pytorch_lightning.plugins import CheckpointIO
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
DATASET_PATH = "Zombely/pl-text-images-5000-whole"
|
DATASET_PATH = "Zombely/fiszki-ocr-train"
|
||||||
PRETRAINED_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned"
|
PRETRAINED_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2"
|
||||||
START_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned"
|
START_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2"
|
||||||
OUTPUT_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2"
|
OUTPUT_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v3"
|
||||||
LOGGING_PATH = "plwiki-proto-ft-second-iter"
|
LOGGING_PATH = "fiszki-ocr-fine-tune"
|
||||||
CHECKPOINT_PATH = "./checkpoint"
|
CHECKPOINT_PATH = "./checkpoint"
|
||||||
|
|
||||||
|
|
||||||
train_config = {
|
train_config = {
|
||||||
"max_epochs":30,
|
"max_epochs":1,
|
||||||
"val_check_interval":0.5, # how many times we want to validate during an epoch
|
"val_check_interval":0.5, # how many times we want to validate during an epoch
|
||||||
"check_val_every_n_epoch":1,
|
"check_val_every_n_epoch":1,
|
||||||
"gradient_clip_val":1.0,
|
"gradient_clip_val":1.0,
|
||||||
@ -362,7 +362,7 @@ custom_ckpt = CustomCheckpointIO()
|
|||||||
|
|
||||||
|
|
||||||
trainer = pl.Trainer(
|
trainer = pl.Trainer(
|
||||||
accelerator="gpu", # change to gpu
|
accelerator="gpu" if torch.cuda.is_available() else 'cpu', # change to gpu
|
||||||
devices=1,
|
devices=1,
|
||||||
max_epochs=train_config.get("max_epochs"),
|
max_epochs=train_config.get("max_epochs"),
|
||||||
val_check_interval=train_config.get("val_check_interval"),
|
val_check_interval=train_config.get("val_check_interval"),
|
||||||
|
822
notepads/dataset_create.ipynb
Normal file
822
notepads/dataset_create.ipynb
Normal file
@ -0,0 +1,822 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from huggingface_hub import login\n",
|
||||||
|
"from datasets import load_dataset\n",
|
||||||
|
"import os\n",
|
||||||
|
"import json\n",
|
||||||
|
"import shutil"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "f0476002f8d14822a24f1376cfe29a07",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"login(os.environ.get(\"HUG_TOKKEN\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df_train = pd.read_csv('../fiszki-ocr/train/in.tsv', sep='\\t', header=None, index_col=False)\n",
|
||||||
|
"files = [file[0] for file in df_train.iloc()]\n",
|
||||||
|
"df_train_out = pd.read_csv('../fiszki-ocr/train/expected.tsv', sep='\\t', header=None, index_col=False)\n",
|
||||||
|
"files_out = [file_out[0] for file_out in df_train_out.iloc()]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"whole = []\n",
|
||||||
|
"for file, out in zip(files, files_out):\n",
|
||||||
|
" whole.append({\"file_name\": file, \"ground_truth\": json.dumps({\"gt_parse\": {\"text_sequance\": out}}, ensure_ascii=False)})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train = whole[:85]\n",
|
||||||
|
"validation = whole[85:]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_files = [file.get(\"file_name\") for file in train]\n",
|
||||||
|
"validation_files = [file.get(\"file_name\") for file in validation]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for image in os.listdir(\"../fiszki-ocr/images\"):\n",
|
||||||
|
" if image in train_files:\n",
|
||||||
|
" shutil.copy(f\"/home/pc/work/fiszki-ocr/images/{image}\", f\"./images-split-fiszki/train/{image}\")\n",
|
||||||
|
" if image in validation_files:\n",
|
||||||
|
" shutil.copy(f\"/home/pc/work/fiszki-ocr/images/{image}\", f\"./images-split-fiszki/validation/{image}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"with open('./images-split-fiszki/train/metadata.jsonl', 'w', encoding='utf-8') as f:\n",
|
||||||
|
" for entry in train:\n",
|
||||||
|
" json.dump(entry, f, ensure_ascii=False)\n",
|
||||||
|
" f.write(\"\\n\")\n",
|
||||||
|
"with open('./images-split-fiszki/validation/metadata.jsonl', 'w', encoding='utf-8') as f:\n",
|
||||||
|
" for entry in validation:\n",
|
||||||
|
" json.dump(entry, f, ensure_ascii=False)\n",
|
||||||
|
" f.write(\"\\n\")\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "ca154573c11a44a8a1fa7dede4c54e26",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Resolving data files: 0%| | 0/86 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Using custom data configuration images-split-fiszki-0b6e02834f7867a1\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Downloading and preparing dataset imagefolder/images-split-fiszki to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "2677f9a18a4d40768ebfee41eb5ee208",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #4: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "b742285b54724ef895dc3f1c76510030",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #1: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "27b3c6bbb7fe4220b20a13c6b720b99e",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #15: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "b9961292c96c404582fe522ff8d93e1d",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #10: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d496ade67a244136b1fe5a00e539dc9f",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #6: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "61b0ebdef7814d0ab6f9fa796b67f033",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #14: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "9040a6b8a24f4ab793d0cf459f5f35b3",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #2: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "e4e6f1800d37456ebc095f7a096082fe",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #7: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "badee192d70a4d109cf38b3539876221",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #8: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "05801dc38fd24f4382f488c8a3fa92bc",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #3: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "704b4bd67b044e9c8d3cb009df4be325",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #0: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "ee591babc11e479c8263368893964589",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #9: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "2e6e2f9a00774a6ba35330a0e1104968",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #5: 0%| | 0/6 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "c97fd7b70b544c068d13eef90ad05127",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #11: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "9d9741e14c7945c4aac512ebe6effbba",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #12: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "f0ec07904f434cf7b8d7e98702979c83",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #13: 0%| | 0/5 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "3881282964584fe8906257ca4edb825b",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files: 0it [00:00, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "3d543609550c438c891b36e2406cb1ae",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Extracting data files: 0it [00:00, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d48e39b33fb74375894bff21bd91dd56",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #2: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "3e8add0aead64b06b4b630a9e3cd7614",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #4: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "7e3311255c414944965ac6d19e3520bb",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #3: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "89fe0a3793d0442ab9d91a98e39b05f1",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #0: 0%| | 0/2 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "e89ba8660b684c028d15b5b62f22c3ba",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #5: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "db6d55a219704ffa8f73a31d928fe47e",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #1: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "ca45c0d6589d4b858a9914ef9f8845d4",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #9: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "45ea9e09950d4ec0b0529db382b14d6f",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #10: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "cfeb70d806d344b683aa9e772b468e6e",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #14: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "3dd158c57c8c46b19b86ddd7e31915fd",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #15: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "367863a5314d494f929aae0ca91e0a33",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #7: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "a1a0e3b6a0234188b34c11fae2f6503d",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #6: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d8e9df2e9ace4365b3e6faf80c2b7cbb",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #11: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "6df8b58b7a934f8eaf0422ce9f704d38",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #13: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d1f58d8da7f24d6394e2c2ace1372d92",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #8: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "208c67f7ffb64f548726bbe2443f6930",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files #12: 0%| | 0/1 [00:00<?, ?obj/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "905e84687479471daaadfd9850c52a88",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Downloading data files: 0it [00:00, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "3db88be1336b4ea0b03638761a6d69e7",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Extracting data files: 0it [00:00, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "96090b6f92eb46be8b44dde7d96f225a",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Generating train split: 0 examples [00:00, ? examples/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "419401799c864422b9669c66c44159bd",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Generating validation split: 0 examples [00:00, ? examples/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Dataset imagefolder downloaded and prepared to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "835ebdd301dc469dbb0ad6f1838403a5",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"dataset = load_dataset('./images-split-fiszki')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Pushing split train to the Hub.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "a92c7f7732054b479a26b1f32621cf20",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?ba/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d934bd92d4af41e492ecac230b635903",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Pushing split validation to the Hub.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "dd0c40c587e84870ad9c3089d401b80a",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?ba/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "eeac023e001349a48133da21f7656378",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
"Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"dataset.push_to_hub(\"Zombely/fiszki-ocr-train\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "hug_donut",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.15 (main, Nov 4 2022, 16:13:54) \n[GCC 11.2.0]"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4,
|
||||||
|
"vscode": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "8f1c1b41577d000ca6512e75d22d324bbd1d5e060e99f4f49d98cf0adf636690"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user