This commit is contained in:
Michał Kozłowski 2023-01-10 22:57:22 +01:00
parent fe144a70b9
commit 680c3d000c
2 changed files with 82 additions and 7 deletions

View File

@ -31,15 +31,18 @@ def main(args):
print("Starting from checkpoint, index: ", offset)
df = df[offset:]
pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb} MB")
pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
for n, row in enumerate(pbar):
try:
time.sleep(0.2)
r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if r.status_code != 200:
pprint(r.__dict__)
save_state(n, offset)
return
time.sleep(80)
r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if r.status_code != 200:
pprint(r.__dict__)
save_state(n, offset)
return
image = Image.open(r.raw)
if image.mode != "RGB":
image = image.convert("RGB")
@ -47,13 +50,13 @@ def main(args):
image.save(f"{args.output_folder}/{title}.png")
# Append one JSON Lines record for the image that was just saved.
# The record must be serialised with json.dumps: str(dict) emits a Python
# repr with single-quoted keys, which is not valid JSON and makes the
# resulting metadata.jsonl unreadable by JSON parsers
# ("Missing a name for object member").
with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
    record = {
        "file_name": f"{title}.png",
        # ground_truth is deliberately stored as a JSON-encoded string,
        # exactly as before; only the outer record encoding changes.
        "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False),
    }
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2)
pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else ''} MB")
pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
if args.max_folder.size_mb and dir_size > args.max_folder_size_mb:
if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
save_state(n, offset)
return

View File

@ -134,6 +134,78 @@
"source": [
"a.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from huggingface_hub import login"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s]\n",
"Using custom data configuration images-8b1ad802b6988161\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading data files: 0it [00:00, ?it/s]\n",
"Extracting data files: 0it [00:00, ?it/s]\n"
]
},
{
"ename": "ArrowInvalid",
"evalue": "JSON parse error: Missing a name for object member. in row 0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mArrowInvalid\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m dataset \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39;49m\u001b[39m../images\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\load.py:1741\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)\u001b[0m\n\u001b[0;32m 1738\u001b[0m try_from_hf_gcs \u001b[39m=\u001b[39m path \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[0;32m 1740\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[1;32m-> 1741\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[0;32m 1742\u001b[0m download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[0;32m 1743\u001b[0m download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[0;32m 1744\u001b[0m ignore_verifications\u001b[39m=\u001b[39;49mignore_verifications,\n\u001b[0;32m 1745\u001b[0m try_from_hf_gcs\u001b[39m=\u001b[39;49mtry_from_hf_gcs,\n\u001b[0;32m 1746\u001b[0m use_auth_token\u001b[39m=\u001b[39;49muse_auth_token,\n\u001b[0;32m 1747\u001b[0m num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[0;32m 1748\u001b[0m )\n\u001b[0;32m 1750\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[0;32m 1751\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[0;32m 1752\u001b[0m keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[0;32m 1753\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:822\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 820\u001b[0m \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 821\u001b[0m prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[1;32m--> 822\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 823\u001b[0m dl_manager\u001b[39m=\u001b[39mdl_manager,\n\u001b[0;32m 824\u001b[0m verify_infos\u001b[39m=\u001b[39mverify_infos,\n\u001b[0;32m 825\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_split_kwargs,\n\u001b[0;32m 826\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdownload_and_prepare_kwargs,\n\u001b[0;32m 827\u001b[0m )\n\u001b[0;32m 828\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[0;32m 829\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:1555\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1554\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_download_and_prepare\u001b[39m(\u001b[39mself\u001b[39m, dl_manager, verify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs):\n\u001b[1;32m-> 1555\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 1556\u001b[0m dl_manager, verify_infos, check_duplicate_keys\u001b[39m=\u001b[39mverify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs\n\u001b[0;32m 1557\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:891\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 889\u001b[0m split_dict \u001b[39m=\u001b[39m SplitDict(dataset_name\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname)\n\u001b[0;32m 890\u001b[0m split_generators_kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[1;32m--> 891\u001b[0m split_generators \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_split_generators(dl_manager, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39msplit_generators_kwargs)\n\u001b[0;32m 893\u001b[0m \u001b[39m# Checksums verification\u001b[39;00m\n\u001b[0;32m 894\u001b[0m \u001b[39mif\u001b[39;00m verify_infos \u001b[39mand\u001b[39;00m dl_manager\u001b[39m.\u001b[39mrecord_checksums:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:189\u001b[0m, in \u001b[0;36mFolderBasedBuilder._split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 186\u001b[0m metadata_ext \u001b[39m=\u001b[39m metadata_ext\u001b[39m.\u001b[39mpop()\n\u001b[0;32m 188\u001b[0m \u001b[39mfor\u001b[39;00m _, downloaded_metadata_file \u001b[39min\u001b[39;00m itertools\u001b[39m.\u001b[39mchain\u001b[39m.\u001b[39mfrom_iterable(metadata_files\u001b[39m.\u001b[39mvalues()):\n\u001b[1;32m--> 189\u001b[0m pa_metadata_table \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_metadata(downloaded_metadata_file)\n\u001b[0;32m 190\u001b[0m features_per_metadata_file\u001b[39m.\u001b[39mappend(\n\u001b[0;32m 191\u001b[0m (downloaded_metadata_file, datasets\u001b[39m.\u001b[39mFeatures\u001b[39m.\u001b[39mfrom_arrow_schema(pa_metadata_table\u001b[39m.\u001b[39mschema))\n\u001b[0;32m 192\u001b[0m )\n\u001b[0;32m 193\u001b[0m \u001b[39mfor\u001b[39;00m downloaded_metadata_file, metadata_features \u001b[39min\u001b[39;00m features_per_metadata_file:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:260\u001b[0m, in \u001b[0;36mFolderBasedBuilder._read_metadata\u001b[1;34m(self, metadata_file)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 259\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(metadata_file, \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m--> 260\u001b[0m \u001b[39mreturn\u001b[39;00m paj\u001b[39m.\u001b[39;49mread_json(f)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\_json.pyx:259\u001b[0m, in \u001b[0;36mpyarrow._json.read_json\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:144\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:100\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mArrowInvalid\u001b[0m: JSON parse error: Missing a name for object member. in row 0"
]
}
],
"source": [
"dataset = load_dataset(\"../images\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"login('',True)"
]
}
],
"metadata": {