Merge branch 'vm-changes'

This commit is contained in:
s444415 2023-01-04 09:52:13 +01:00
commit ccd4090d4b
6 changed files with 906 additions and 69 deletions

8
config-eval.yaml Normal file
View File

@ -0,0 +1,8 @@
# Settings for the Donut evaluation script in this commit, which reads them as
# `config.<key>` (presumably loaded through `--config` with sconf.Config — confirm).

# Identifier passed to DonutProcessor.from_pretrained.
pretrained_processor_path: "Zombely/plwiki-proto-fine-tuned-v2"
# Identifier passed to VisionEncoderDecoderModel.from_pretrained.
pretrained_model_path: "Zombely/plwiki-proto-fine-tuned-v2"
# Dataset path and split handed to datasets.load_dataset for evaluation.
validation_dataset_path: "Zombely/diachronia-ocr"
validation_dataset_split: "train"
# When true, each sample's "ground_truth" JSON is parsed and the prediction is
# scored with JSONParseEvaluator; the accuracies and their mean are printed.
has_metadata: False
# When true, every decoded prediction is printed.
print_output: True
# Predictions are written to <output_file_dir>/<test_name>-out.tsv;
# a falsy output_file_dir skips writing the file.
output_file_dir: "../../gonito-outs"
test_name: "fine-tuned-test"

View File

@ -1,83 +1,90 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf-8 # coding: utf-8
# In[1]: from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig
from transformers import DonutProcessor, VisionEncoderDecoderModel
from datasets import load_dataset from datasets import load_dataset
import re import re
import json import json
import torch import torch
from tqdm.auto import tqdm from tqdm.auto import tqdm
import numpy as np import numpy as np
import pandas as pd
from donut import JSONParseEvaluator from donut import JSONParseEvaluator
import argparse
from sconf import Config
def main(config):
# In[2]: # image_size = [1920, 2560]
# config_vision = VisionEncoderDecoderConfig.from_pretrained(config.pretrained_model_path)
# config_vision.encoder.image_size = image_size # (height, width)
# config_vision.decoder.max_length = 768
processor = DonutProcessor.from_pretrained(config.pretrained_processor_path)
model = VisionEncoderDecoderModel.from_pretrained(config.pretrained_model_path)
processor = DonutProcessor.from_pretrained("Zombely/plwiki-proto-fine-tuned") # processor.image_processor.size = image_size[::-1] # should be (width, height)
model = VisionEncoderDecoderModel.from_pretrained("Zombely/plwiki-proto-fine-tuned") processor.image_processor.do_align_long_axis = False
dataset = load_dataset(config.validation_dataset_path, split=config.validation_dataset_split)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)
output_list = []
accs = []
# In[3]: for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
# prepare encoder inputs
pixel_values = processor(sample['image'].convert("RGB"), return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)
# prepare decoder inputs
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
# autoregressively generate sequence
outputs = model.generate(
pixel_values,
decoder_input_ids=decoder_input_ids,
max_length=model.decoder.config.max_position_embeddings,
early_stopping=True,
pad_token_id=processor.tokenizer.pad_token_id,
eos_token_id=processor.tokenizer.eos_token_id,
use_cache=True,
num_beams=1,
bad_words_ids=[[processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,
)
# turn into JSON
seq = processor.batch_decode(outputs.sequences)[0]
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
seq = processor.token2json(seq)
if config.has_metadata:
ground_truth = json.loads(sample["ground_truth"])
ground_truth = ground_truth["gt_parse"]
evaluator = JSONParseEvaluator()
score = evaluator.cal_acc(seq, ground_truth)
dataset = load_dataset("Zombely/pl-text-images-5000-whole", split="validation") accs.append(score)
if config.print_output:
print(seq)
output_list.append(seq)
if config.output_file_dir:
df = pd.DataFrame(map(lambda x: x.get('text_sequence', ''), output_list))
df.to_csv(f'{config.output_file_dir}/{config.test_name}-out.tsv', sep='\t', header=False, index=False)
if config.has_metadata:
scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")
print("Mean accuracy:", np.mean(accs))
# In[4]: if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
device = "cuda" if torch.cuda.is_available() else "cpu" args, left_argv = parser.parse_known_args()
config = Config(args.config)
model.eval() config.argv_update(left_argv)
model.to(device)
output_list = []
accs = []
for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
# prepare encoder inputs
pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
pixel_values = pixel_values.to(device)
# prepare decoder inputs
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
decoder_input_ids = decoder_input_ids.to(device)
# autoregressively generate sequence
outputs = model.generate(
pixel_values,
decoder_input_ids=decoder_input_ids,
max_length=model.decoder.config.max_position_embeddings,
early_stopping=True,
pad_token_id=processor.tokenizer.pad_token_id,
eos_token_id=processor.tokenizer.eos_token_id,
use_cache=True,
num_beams=1,
bad_words_ids=[[processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,
)
# turn into JSON
seq = processor.batch_decode(outputs.sequences)[0]
seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
seq = re.sub(r"<.*?>", "", seq, count=1).strip() # remove first task start token
seq = processor.token2json(seq)
ground_truth = json.loads(sample["ground_truth"])
ground_truth = ground_truth["gt_parse"]
evaluator = JSONParseEvaluator()
score = evaluator.cal_acc(seq, ground_truth)
accs.append(score)
output_list.append(seq)
scores = {"accuracies": accs, "mean_accuracy": np.mean(accs)}
print(scores, f"length : {len(accs)}")
print("Mean accuracy:", np.mean(accs))
main(config)

View File

@ -21,16 +21,16 @@ from pytorch_lightning.plugins import CheckpointIO
DATASET_PATH = "Zombely/pl-text-images-5000-whole" DATASET_PATH = "Zombely/fiszki-ocr-train"
PRETRAINED_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned" PRETRAINED_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2"
START_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned" START_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2"
OUTPUT_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v2" OUTPUT_MODEL_PATH = "Zombely/plwiki-proto-fine-tuned-v3"
LOGGING_PATH = "plwiki-proto-ft-second-iter" LOGGING_PATH = "fiszki-ocr-fine-tune"
CHECKPOINT_PATH = "./checkpoint" CHECKPOINT_PATH = "./checkpoint"
train_config = { train_config = {
"max_epochs":30, "max_epochs":1,
"val_check_interval":0.5, # how many times we want to validate during an epoch "val_check_interval":0.5, # how many times we want to validate during an epoch
"check_val_every_n_epoch":1, "check_val_every_n_epoch":1,
"gradient_clip_val":1.0, "gradient_clip_val":1.0,
@ -362,7 +362,7 @@ custom_ckpt = CustomCheckpointIO()
trainer = pl.Trainer( trainer = pl.Trainer(
accelerator="gpu", # change to gpu accelerator="gpu" if torch.cuda.is_available() else 'cpu', # use the GPU when CUDA is available, otherwise fall back to CPU
devices=1, devices=1,
max_epochs=train_config.get("max_epochs"), max_epochs=train_config.get("max_epochs"),
val_check_interval=train_config.get("val_check_interval"), val_check_interval=train_config.get("val_check_interval"),

View File

@ -0,0 +1,822 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from huggingface_hub import login\n",
"from datasets import load_dataset\n",
"import os\n",
"import json\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f0476002f8d14822a24f1376cfe29a07",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"login(os.environ.get(\"HUG_TOKKEN\"))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv('../fiszki-ocr/train/in.tsv', sep='\\t', header=None, index_col=False)\n",
"files = [file[0] for file in df_train.iloc()]\n",
"df_train_out = pd.read_csv('../fiszki-ocr/train/expected.tsv', sep='\\t', header=None, index_col=False)\n",
"files_out = [file_out[0] for file_out in df_train_out.iloc()]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"whole = []\n",
"for file, out in zip(files, files_out):\n",
" whole.append({\"file_name\": file, \"ground_truth\": json.dumps({\"gt_parse\": {\"text_sequance\": out}}, ensure_ascii=False)})"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"train = whole[:85]\n",
"validation = whole[85:]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"train_files = [file.get(\"file_name\") for file in train]\n",
"validation_files = [file.get(\"file_name\") for file in validation]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"for image in os.listdir(\"../fiszki-ocr/images\"):\n",
" if image in train_files:\n",
" shutil.copy(f\"/home/pc/work/fiszki-ocr/images/{image}\", f\"./images-split-fiszki/train/{image}\")\n",
" if image in validation_files:\n",
" shutil.copy(f\"/home/pc/work/fiszki-ocr/images/{image}\", f\"./images-split-fiszki/validation/{image}\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"\n",
"with open('./images-split-fiszki/train/metadata.jsonl', 'w', encoding='utf-8') as f:\n",
" for entry in train:\n",
" json.dump(entry, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
"with open('./images-split-fiszki/validation/metadata.jsonl', 'w', encoding='utf-8') as f:\n",
" for entry in validation:\n",
" json.dump(entry, f, ensure_ascii=False)\n",
" f.write(\"\\n\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ca154573c11a44a8a1fa7dede4c54e26",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/86 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using custom data configuration images-split-fiszki-0b6e02834f7867a1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading and preparing dataset imagefolder/images-split-fiszki to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n",
" "
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2677f9a18a4d40768ebfee41eb5ee208",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #4: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b742285b54724ef895dc3f1c76510030",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #1: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "27b3c6bbb7fe4220b20a13c6b720b99e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #15: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b9961292c96c404582fe522ff8d93e1d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #10: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d496ade67a244136b1fe5a00e539dc9f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #6: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "61b0ebdef7814d0ab6f9fa796b67f033",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #14: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9040a6b8a24f4ab793d0cf459f5f35b3",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #2: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e4e6f1800d37456ebc095f7a096082fe",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #7: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "badee192d70a4d109cf38b3539876221",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #8: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "05801dc38fd24f4382f488c8a3fa92bc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #3: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "704b4bd67b044e9c8d3cb009df4be325",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #0: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ee591babc11e479c8263368893964589",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #9: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2e6e2f9a00774a6ba35330a0e1104968",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #5: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c97fd7b70b544c068d13eef90ad05127",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #11: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9d9741e14c7945c4aac512ebe6effbba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #12: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f0ec07904f434cf7b8d7e98702979c83",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #13: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3881282964584fe8906257ca4edb825b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3d543609550c438c891b36e2406cb1ae",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d48e39b33fb74375894bff21bd91dd56",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #2: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3e8add0aead64b06b4b630a9e3cd7614",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #4: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7e3311255c414944965ac6d19e3520bb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #3: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89fe0a3793d0442ab9d91a98e39b05f1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #0: 0%| | 0/2 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e89ba8660b684c028d15b5b62f22c3ba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #5: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "db6d55a219704ffa8f73a31d928fe47e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #1: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ca45c0d6589d4b858a9914ef9f8845d4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #9: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "45ea9e09950d4ec0b0529db382b14d6f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #10: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cfeb70d806d344b683aa9e772b468e6e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #14: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3dd158c57c8c46b19b86ddd7e31915fd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #15: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "367863a5314d494f929aae0ca91e0a33",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #7: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a1a0e3b6a0234188b34c11fae2f6503d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #6: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d8e9df2e9ace4365b3e6faf80c2b7cbb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #11: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6df8b58b7a934f8eaf0422ce9f704d38",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #13: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d1f58d8da7f24d6394e2c2ace1372d92",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #8: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "208c67f7ffb64f548726bbe2443f6930",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #12: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "905e84687479471daaadfd9850c52a88",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3db88be1336b4ea0b03638761a6d69e7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "96090b6f92eb46be8b44dde7d96f225a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "419401799c864422b9669c66c44159bd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating validation split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset imagefolder downloaded and prepared to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "835ebdd301dc469dbb0ad6f1838403a5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset = load_dataset('./images-split-fiszki')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Pushing split train to the Hub.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a92c7f7732054b479a26b1f32621cf20",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d934bd92d4af41e492ecac230b635903",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Pushing split validation to the Hub.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dd0c40c587e84870ad9c3089d401b80a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "eeac023e001349a48133da21f7656378",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset.push_to_hub(\"Zombely/fiszki-ocr-train\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "hug_donut",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15 (main, Nov 4 2022, 16:13:54) \n[GCC 11.2.0]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8f1c1b41577d000ca6512e75d22d324bbd1d5e060e99f4f49d98cf0adf636690"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}