import os
import json
import shutil

import pandas as pd
from datasets import load_dataset
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with a token read from the environment.
login(os.environ.get("HUG_TOKKEN"))
# in.tsv holds one image file name per row; expected.tsv holds the matching transcription.
df_train = pd.read_csv('../fiszki-ocr/train/in.tsv', sep='\t', header=None, index_col=False)
files = df_train[0].tolist()
df_train_out = pd.read_csv('../fiszki-ocr/train/expected.tsv', sep='\t', header=None, index_col=False)
files_out = df_train_out[0].tolist()

# Pair each image with its transcription in the Donut-style ground-truth format.
whole = []
for file, out in zip(files, files_out):
    whole.append({"file_name": file, "ground_truth": json.dumps({"gt_parse": {"text_sequance": out}}, ensure_ascii=False)})
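Each record serializes its transcription as a JSON string under a gt_parse key. A quick look at the first record confirms the layout (a minimal sketch; the file name shown in the comment is illustrative, not from this run):

print(whole[0])
# e.g. {'file_name': '<some-image>.png', 'ground_truth': '{"gt_parse": {"text_sequance": "..."}}'}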
# Hold out everything after the first 85 records for validation.
train = whole[:85]
validation = whole[85:]
train_files = [entry["file_name"] for entry in train]
validation_files = [entry["file_name"] for entry in validation]
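A quick sanity check that the 85-record cutoff produced the expected split sizes and covered every record (a minimal sketch):

print(len(train), len(validation))
assert len(train) + len(validation) == len(whole)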
# Copy each image into the split directory expected by the imagefolder loader,
# using the same relative source path as the TSV reads above.
os.makedirs("./images-split-fiszki/train", exist_ok=True)
os.makedirs("./images-split-fiszki/validation", exist_ok=True)
for image in os.listdir("../fiszki-ocr/images"):
    if image in train_files:
        shutil.copy(f"../fiszki-ocr/images/{image}", f"./images-split-fiszki/train/{image}")
    elif image in validation_files:
        shutil.copy(f"../fiszki-ocr/images/{image}", f"./images-split-fiszki/validation/{image}")
# Write one metadata.jsonl per split; imagefolder joins rows to images via file_name.
with open('./images-split-fiszki/train/metadata.jsonl', 'w', encoding='utf-8') as f:
    for entry in train:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")
with open('./images-split-fiszki/validation/metadata.jsonl', 'w', encoding='utf-8') as f:
    for entry in validation:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")
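Before loading, it is worth confirming that every metadata entry has a matching image on disk, so that rows and images line up in the loader. A minimal sketch:

for split, entries in [("train", train), ("validation", validation)]:
    missing = [e["file_name"] for e in entries
               if not os.path.exists(f"./images-split-fiszki/{split}/{e['file_name']}")]
    assert not missing, f"{split} split is missing images: {missing}"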
# Load both splits with the built-in imagefolder loader.
dataset = load_dataset('./images-split-fiszki')
Using custom data configuration images-split-fiszki-0b6e02834f7867a1
Downloading and preparing dataset imagefolder/images-split-fiszki to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...
Dataset imagefolder downloaded and prepared to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.
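A quick inspection of what was loaded: the split structure, plus one ground_truth payload decoded back from its JSON string (a sketch; the printed contents depend on the data prepared above):

print(dataset)
sample = dataset["train"][0]
gt = json.loads(sample["ground_truth"])
print(gt["gt_parse"]["text_sequance"])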
# Publish both splits to the Hub under the given repo id.
dataset.push_to_hub("Zombely/fiszki-ocr-train")
Pushing split train to the Hub.
Pushing split validation to the Hub.
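Once the push finishes, the dataset can be pulled straight from the Hub by repo id (on this era of the datasets library, pass use_auth_token=True if the repo is private; newer releases use token instead):

dataset = load_dataset("Zombely/fiszki-ocr-train")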