donut/notepads/dataset_create.ipynb

import os
import json
import shutil

import pandas as pd
from datasets import load_dataset
from huggingface_hub import login

# Authenticate against the Hugging Face Hub; the token is read from the
# HUG_TOKKEN environment variable (name as set in this environment).
login(os.environ.get("HUG_TOKKEN"))
# Read the OCR challenge inputs (image file names) and expected outputs
# (ground-truth text); both files are tab-separated with no header row.
df_train = pd.read_csv('../fiszki-ocr/train/in.tsv', sep='\t', header=None, index_col=False)
files = df_train[0].tolist()
df_train_out = pd.read_csv('../fiszki-ocr/train/expected.tsv', sep='\t', header=None, index_col=False)
files_out = df_train_out[0].tolist()
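A quick sanity check, added here for illustration, that inputs and expected outputs line up one-to-one before they are zipped together:
# Every image file should have exactly one ground-truth line.
assert len(files) == len(files_out), (len(files), len(files_out))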
# Pair each image with its Donut-style ground truth: a JSON string holding
# the expected text under gt_parse.
whole = []
for file, out in zip(files, files_out):
    whole.append({"file_name": file, "ground_truth": json.dumps({"gt_parse": {"text_sequence": out}}, ensure_ascii=False)})
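For reference, printing the first record shows the metadata layout this produces (an inspection step added here, not part of the original run):
# Expect something like:
# {"file_name": "...", "ground_truth": "{\"gt_parse\": {\"text_sequence\": ...}}"}
print(whole[0])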
# Hold out everything after the first 85 examples for validation.
train = whole[:85]
validation = whole[85:]
train_files = [file.get("file_name") for file in train]
validation_files = [file.get("file_name") for file in validation]

# Copy each image into the split directory it belongs to.
os.makedirs("./images-split-fiszki/train", exist_ok=True)
os.makedirs("./images-split-fiszki/validation", exist_ok=True)
for image in os.listdir("../fiszki-ocr/images"):
    if image in train_files:
        shutil.copy(f"../fiszki-ocr/images/{image}", f"./images-split-fiszki/train/{image}")
    elif image in validation_files:
        shutil.copy(f"../fiszki-ocr/images/{image}", f"./images-split-fiszki/validation/{image}")
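An optional check, added for illustration, that every image landed in a split directory:
# The copied counts should match the split sizes (85 train + the remainder).
print(len(os.listdir("./images-split-fiszki/train")),
      len(os.listdir("./images-split-fiszki/validation")))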

# Write one JSON line per example; the imagefolder loader picks up
# metadata.jsonl files sitting next to the images.
with open('./images-split-fiszki/train/metadata.jsonl', 'w', encoding='utf-8') as f:
    for entry in train:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")
with open('./images-split-fiszki/validation/metadata.jsonl', 'w', encoding='utf-8') as f:
    for entry in validation:
        json.dump(entry, f, ensure_ascii=False)
        f.write("\n")
dataset = load_dataset('./images-split-fiszki')
Using custom data configuration images-split-fiszki-0b6e02834f7867a1
Downloading and preparing dataset imagefolder/images-split-fiszki to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...
Dataset imagefolder downloaded and prepared to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.
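A quick look at what load_dataset produced, added for illustration (the split sizes assume the 85/remainder split above):
# Expect a DatasetDict with 'train' and 'validation' splits, each example
# carrying an 'image' plus the JSON-encoded 'ground_truth' string.
print(dataset)
print(json.loads(dataset["train"][0]["ground_truth"]))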
dataset.push_to_hub("Zombely/fiszki-ocr-train")
Pushing split train to the Hub.
Pushing split validation to the Hub.
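To verify the upload, the dataset can be re-loaded straight from the Hub; a usage sketch, not part of the original run:
# Round-trip check: both splits should come back from the pushed repo.
check = load_dataset("Zombely/fiszki-ocr-train")
print(check)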