donut/notepads/dataset_create.ipynb

823 lines
21 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from huggingface_hub import login\n",
"from datasets import load_dataset\n",
"import os\n",
"import json\n",
"import shutil"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f0476002f8d14822a24f1376cfe29a07",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"login(os.environ.get(\"HUG_TOKKEN\"))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Column 0 of in.tsv is used downstream as the image file name and column 0 of\n",
"# expected.tsv as its transcription; rows of the two files are paired by position.\n",
"df_train = pd.read_csv('../fiszki-ocr/train/in.tsv', sep='\\t', header=None, index_col=False)\n",
"# With header=None the columns are labelled 0, 1, ...; take the first one directly\n",
"# instead of iterating row by row over the iloc indexer.\n",
"files = df_train[0].tolist()\n",
"df_train_out = pd.read_csv('../fiszki-ocr/train/expected.tsv', sep='\\t', header=None, index_col=False)\n",
"files_out = df_train_out[0].tolist()\n",
"# The pairing is positional, so a length mismatch would silently mislabel or drop rows.\n",
"assert len(files) == len(files_out), 'in.tsv and expected.tsv have different row counts'"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# One metadata record per image: the file name plus a JSON-encoded ground truth.\n",
"# ensure_ascii=False keeps non-ASCII characters readable instead of \\u-escaping them.\n",
"whole = [\n",
"    {\n",
"        \"file_name\": image_name,\n",
"        \"ground_truth\": json.dumps({\"gt_parse\": {\"text_sequance\": transcription}}, ensure_ascii=False),\n",
"    }\n",
"    for image_name, transcription in zip(files, files_out)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Positional split: the first TRAIN_SIZE records go to train, the rest to validation.\n",
"TRAIN_SIZE = 85\n",
"train, validation = whole[:TRAIN_SIZE], whole[TRAIN_SIZE:]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# File names belonging to each split, used to route the image files.\n",
"# Every record built above carries a \"file_name\" key.\n",
"train_files = [record[\"file_name\"] for record in train]\n",
"validation_files = [record[\"file_name\"] for record in validation]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Copy every image into the folder of the split its metadata record belongs to.\n",
"# The copy source is the same directory that is listed, so the cell no longer\n",
"# depends on a machine-specific absolute path.\n",
"images_dir = \"../fiszki-ocr/images\"\n",
"split_members = {\"train\": set(train_files), \"validation\": set(validation_files)}\n",
"\n",
"# shutil.copy does not create missing parent directories.\n",
"for split_name in split_members:\n",
"    os.makedirs(f\"./images-split-fiszki/{split_name}\", exist_ok=True)\n",
"\n",
"for image in os.listdir(images_dir):\n",
"    for split_name, members in split_members.items():\n",
"        if image in members:\n",
"            shutil.copy(os.path.join(images_dir, image), f\"./images-split-fiszki/{split_name}/{image}\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Write one JSON object per line (JSON Lines) next to the images of each split.\n",
"metadata_targets = (\n",
"    ('./images-split-fiszki/train/metadata.jsonl', train),\n",
"    ('./images-split-fiszki/validation/metadata.jsonl', validation),\n",
")\n",
"for metadata_path, entries in metadata_targets:\n",
"    with open(metadata_path, 'w', encoding='utf-8') as f:\n",
"        f.writelines(json.dumps(entry, ensure_ascii=False) + \"\\n\" for entry in entries)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ca154573c11a44a8a1fa7dede4c54e26",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Resolving data files: 0%| | 0/86 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using custom data configuration images-split-fiszki-0b6e02834f7867a1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading and preparing dataset imagefolder/images-split-fiszki to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n",
" "
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2677f9a18a4d40768ebfee41eb5ee208",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #4: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b742285b54724ef895dc3f1c76510030",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #1: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "27b3c6bbb7fe4220b20a13c6b720b99e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #15: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b9961292c96c404582fe522ff8d93e1d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #10: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d496ade67a244136b1fe5a00e539dc9f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #6: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "61b0ebdef7814d0ab6f9fa796b67f033",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #14: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9040a6b8a24f4ab793d0cf459f5f35b3",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #2: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e4e6f1800d37456ebc095f7a096082fe",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #7: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "badee192d70a4d109cf38b3539876221",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #8: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "05801dc38fd24f4382f488c8a3fa92bc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #3: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "704b4bd67b044e9c8d3cb009df4be325",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #0: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ee591babc11e479c8263368893964589",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #9: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2e6e2f9a00774a6ba35330a0e1104968",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #5: 0%| | 0/6 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c97fd7b70b544c068d13eef90ad05127",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #11: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9d9741e14c7945c4aac512ebe6effbba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #12: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f0ec07904f434cf7b8d7e98702979c83",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #13: 0%| | 0/5 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3881282964584fe8906257ca4edb825b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3d543609550c438c891b36e2406cb1ae",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d48e39b33fb74375894bff21bd91dd56",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #2: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3e8add0aead64b06b4b630a9e3cd7614",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #4: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7e3311255c414944965ac6d19e3520bb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #3: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89fe0a3793d0442ab9d91a98e39b05f1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #0: 0%| | 0/2 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e89ba8660b684c028d15b5b62f22c3ba",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #5: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "db6d55a219704ffa8f73a31d928fe47e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #1: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ca45c0d6589d4b858a9914ef9f8845d4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #9: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "45ea9e09950d4ec0b0529db382b14d6f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #10: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cfeb70d806d344b683aa9e772b468e6e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #14: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3dd158c57c8c46b19b86ddd7e31915fd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #15: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "367863a5314d494f929aae0ca91e0a33",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #7: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a1a0e3b6a0234188b34c11fae2f6503d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #6: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d8e9df2e9ace4365b3e6faf80c2b7cbb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #11: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6df8b58b7a934f8eaf0422ce9f704d38",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #13: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d1f58d8da7f24d6394e2c2ace1372d92",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #8: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "208c67f7ffb64f548726bbe2443f6930",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files #12: 0%| | 0/1 [00:00<?, ?obj/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "905e84687479471daaadfd9850c52a88",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3db88be1336b4ea0b03638761a6d69e7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting data files: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "96090b6f92eb46be8b44dde7d96f225a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "419401799c864422b9669c66c44159bd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating validation split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset imagefolder downloaded and prepared to /home/pc/.cache/huggingface/datasets/imagefolder/images-split-fiszki-0b6e02834f7867a1/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "835ebdd301dc469dbb0ad6f1838403a5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset = load_dataset('./images-split-fiszki')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Pushing split train to the Hub.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a92c7f7732054b479a26b1f32621cf20",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d934bd92d4af41e492ecac230b635903",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Pushing split validation to the Hub.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dd0c40c587e84870ad9c3089d401b80a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?ba/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "eeac023e001349a48133da21f7656378",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Pushing dataset shards to the dataset hub: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset.push_to_hub(\"Zombely/fiszki-ocr-train\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "donut",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8 | packaged by conda-forge | (main, Nov 24 2022, 14:07:00) [MSC v.1916 64 bit (AMD64)]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "5f15394fbb90e53eb87c79cee123a308177758b46ab7bd2ba3c7b07360ea775a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}