From 680c3d000cb675d160a9bda0fd4e65ef4db13a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Koz=C5=82owski?= Date: Tue, 10 Jan 2023 22:57:22 +0100 Subject: [PATCH] fixing --- image_download.py | 17 ++++---- notebooks/image_download.ipynb | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/image_download.py b/image_download.py index 882edb5..7415b91 100644 --- a/image_download.py +++ b/image_download.py @@ -31,15 +31,18 @@ def main(args): print("Starting from checkpoint, index: ", offset) df = df[offset:] - pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb} MB") + pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB") for n, row in enumerate(pbar): try: time.sleep(0.2) r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers) if r.status_code != 200: - pprint(r.__dict__) - save_state(n, offset) - return + time.sleep(80) + r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers) + if r.status_code != 200: + pprint(r.__dict__) + save_state(n, offset) + return image = Image.open(r.raw) if image.mode != "RGB": image = image.convert("RGB") @@ -47,13 +50,13 @@ def main(args): image.save(f"{args.output_folder}/{title}.png") with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f: - f.write(str({"file_name": title, "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}) + "\n") + f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}) + "\n") dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2) - pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else ''} MB") + pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB") - if args.max_folder.size_mb and dir_size > args.max_folder_size_mb: + if args.max_folder_size_mb and dir_size > args.max_folder_size_mb: print(f"Limit size of: {args.max_folder_size_mb}, exceeded") save_state(n, offset) return diff --git a/notebooks/image_download.ipynb b/notebooks/image_download.ipynb index 45e6de8..fa198a8 100644 --- a/notebooks/image_download.ipynb +++ b/notebooks/image_download.ipynb @@ -134,6 +134,78 @@ "source": [ "a.head()" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from huggingface_hub import login" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Resolving data files: 100%|██████████| 29/29 [00:00 1\u001b[0m dataset \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39;49m\u001b[39m../images\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\load.py:1741\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)\u001b[0m\n\u001b[0;32m 1738\u001b[0m try_from_hf_gcs \u001b[39m=\u001b[39m path \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[0;32m 1740\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[1;32m-> 1741\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[0;32m 1742\u001b[0m download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[0;32m 1743\u001b[0m download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[0;32m 1744\u001b[0m ignore_verifications\u001b[39m=\u001b[39;49mignore_verifications,\n\u001b[0;32m 1745\u001b[0m try_from_hf_gcs\u001b[39m=\u001b[39;49mtry_from_hf_gcs,\n\u001b[0;32m 1746\u001b[0m use_auth_token\u001b[39m=\u001b[39;49muse_auth_token,\n\u001b[0;32m 1747\u001b[0m num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[0;32m 1748\u001b[0m )\n\u001b[0;32m 1750\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[0;32m 1751\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[0;32m 1752\u001b[0m keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[0;32m 1753\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:822\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 820\u001b[0m \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 821\u001b[0m prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[1;32m--> 822\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 823\u001b[0m dl_manager\u001b[39m=\u001b[39mdl_manager,\n\u001b[0;32m 824\u001b[0m verify_infos\u001b[39m=\u001b[39mverify_infos,\n\u001b[0;32m 825\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_split_kwargs,\n\u001b[0;32m 826\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdownload_and_prepare_kwargs,\n\u001b[0;32m 827\u001b[0m )\n\u001b[0;32m 828\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[0;32m 829\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:1555\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1554\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_download_and_prepare\u001b[39m(\u001b[39mself\u001b[39m, dl_manager, verify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs):\n\u001b[1;32m-> 1555\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 1556\u001b[0m dl_manager, verify_infos, check_duplicate_keys\u001b[39m=\u001b[39mverify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs\n\u001b[0;32m 1557\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:891\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 889\u001b[0m split_dict \u001b[39m=\u001b[39m SplitDict(dataset_name\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname)\n\u001b[0;32m 890\u001b[0m split_generators_kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[1;32m--> 891\u001b[0m split_generators \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_split_generators(dl_manager, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39msplit_generators_kwargs)\n\u001b[0;32m 893\u001b[0m \u001b[39m# Checksums verification\u001b[39;00m\n\u001b[0;32m 894\u001b[0m \u001b[39mif\u001b[39;00m verify_infos \u001b[39mand\u001b[39;00m dl_manager\u001b[39m.\u001b[39mrecord_checksums:\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:189\u001b[0m, in \u001b[0;36mFolderBasedBuilder._split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 186\u001b[0m metadata_ext \u001b[39m=\u001b[39m metadata_ext\u001b[39m.\u001b[39mpop()\n\u001b[0;32m 188\u001b[0m \u001b[39mfor\u001b[39;00m _, downloaded_metadata_file \u001b[39min\u001b[39;00m itertools\u001b[39m.\u001b[39mchain\u001b[39m.\u001b[39mfrom_iterable(metadata_files\u001b[39m.\u001b[39mvalues()):\n\u001b[1;32m--> 189\u001b[0m pa_metadata_table \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_metadata(downloaded_metadata_file)\n\u001b[0;32m 190\u001b[0m features_per_metadata_file\u001b[39m.\u001b[39mappend(\n\u001b[0;32m 191\u001b[0m (downloaded_metadata_file, datasets\u001b[39m.\u001b[39mFeatures\u001b[39m.\u001b[39mfrom_arrow_schema(pa_metadata_table\u001b[39m.\u001b[39mschema))\n\u001b[0;32m 192\u001b[0m )\n\u001b[0;32m 193\u001b[0m \u001b[39mfor\u001b[39;00m downloaded_metadata_file, metadata_features \u001b[39min\u001b[39;00m features_per_metadata_file:\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:260\u001b[0m, in \u001b[0;36mFolderBasedBuilder._read_metadata\u001b[1;34m(self, metadata_file)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 259\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(metadata_file, \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m--> 260\u001b[0m \u001b[39mreturn\u001b[39;00m paj\u001b[39m.\u001b[39;49mread_json(f)\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\_json.pyx:259\u001b[0m, in \u001b[0;36mpyarrow._json.read_json\u001b[1;34m()\u001b[0m\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:144\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[1;34m()\u001b[0m\n", + "File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:100\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[1;34m()\u001b[0m\n", + "\u001b[1;31mArrowInvalid\u001b[0m: JSON parse error: Missing a name for object member. in row 0" + ] + } + ], + "source": [ + "dataset = load_dataset(\"../images\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "login('',True)" + ] } ], "metadata": {