wikisource-crawler/notebooks/image_download.ipynb
Michał Kozłowski 680c3d000c fixing
2023-01-10 22:57:22 +01:00

239 lines
15 KiB
Plaintext
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = pd.read_csv(\"../../wikisource-data/yellow-continue-yellow.tsv.tsv\", sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>title</th>\n",
" <th>href</th>\n",
" <th>image_url</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>zmieniła się; piękne oczy są tak samo błyszczą...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>najświetniejszej chociażby sławy... i po piętn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>Chopin gra. Ledwie dostrzegalnie muskają smuk...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>\\nDZIWACZNE MAŁŻEŃSTWO.\\n\\nBył grudzień 1830 ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Strona:Stanisław Antoni Wotowski - George Sand...</td>\n",
" <td>https://pl.wikisource.org//wiki/Strona:Stanis%...</td>\n",
" <td>//upload.wikimedia.org/wikipedia/commons/thumb...</td>\n",
" <td>Ale bliższego związku z panią Sand jakby się ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 title \\\n",
"0 0 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"1 1 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"2 2 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"3 3 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"4 4 Strona:Stanisław Antoni Wotowski - George Sand... \n",
"\n",
" href \\\n",
"0 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"1 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"2 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"3 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"4 https://pl.wikisource.org//wiki/Strona:Stanis%... \n",
"\n",
" image_url \\\n",
"0 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"1 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"2 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"3 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"4 //upload.wikimedia.org/wikipedia/commons/thumb... \n",
"\n",
" text \n",
"0 zmieniła się; piękne oczy są tak samo błyszczą... \n",
"1 najświetniejszej chociażby sławy... i po piętn... \n",
"2 Chopin gra. Ledwie dostrzegalnie muskają smuk... \n",
"3 \\nDZIWACZNE MAŁŻEŃSTWO.\\n\\nBył grudzień 1830 ... \n",
"4 Ale bliższego związku z panią Sand jakby się ... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from huggingface_hub import login"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s]\n",
"Using custom data configuration images-8b1ad802b6988161\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading data files: 0it [00:00, ?it/s]\n",
"Extracting data files: 0it [00:00, ?it/s]\n"
]
},
{
"ename": "ArrowInvalid",
"evalue": "JSON parse error: Missing a name for object member. in row 0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mArrowInvalid\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m dataset \u001b[39m=\u001b[39m load_dataset(\u001b[39m\"\u001b[39;49m\u001b[39m../images\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\load.py:1741\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)\u001b[0m\n\u001b[0;32m 1738\u001b[0m try_from_hf_gcs \u001b[39m=\u001b[39m path \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[0;32m 1740\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[1;32m-> 1741\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[0;32m 1742\u001b[0m download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[0;32m 1743\u001b[0m download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[0;32m 1744\u001b[0m ignore_verifications\u001b[39m=\u001b[39;49mignore_verifications,\n\u001b[0;32m 1745\u001b[0m try_from_hf_gcs\u001b[39m=\u001b[39;49mtry_from_hf_gcs,\n\u001b[0;32m 1746\u001b[0m use_auth_token\u001b[39m=\u001b[39;49muse_auth_token,\n\u001b[0;32m 1747\u001b[0m num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[0;32m 1748\u001b[0m )\n\u001b[0;32m 1750\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[0;32m 1751\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[0;32m 1752\u001b[0m keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[0;32m 1753\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:822\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 820\u001b[0m \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 821\u001b[0m prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[1;32m--> 822\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 823\u001b[0m dl_manager\u001b[39m=\u001b[39mdl_manager,\n\u001b[0;32m 824\u001b[0m verify_infos\u001b[39m=\u001b[39mverify_infos,\n\u001b[0;32m 825\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_split_kwargs,\n\u001b[0;32m 826\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdownload_and_prepare_kwargs,\n\u001b[0;32m 827\u001b[0m )\n\u001b[0;32m 828\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[0;32m 829\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:1555\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1554\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_download_and_prepare\u001b[39m(\u001b[39mself\u001b[39m, dl_manager, verify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs):\n\u001b[1;32m-> 1555\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_download_and_prepare(\n\u001b[0;32m 1556\u001b[0m dl_manager, verify_infos, check_duplicate_keys\u001b[39m=\u001b[39mverify_infos, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mprepare_splits_kwargs\n\u001b[0;32m 1557\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\builder.py:891\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[1;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 889\u001b[0m split_dict \u001b[39m=\u001b[39m SplitDict(dataset_name\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname)\n\u001b[0;32m 890\u001b[0m split_generators_kwargs \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[1;32m--> 891\u001b[0m split_generators \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_split_generators(dl_manager, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39msplit_generators_kwargs)\n\u001b[0;32m 893\u001b[0m \u001b[39m# Checksums verification\u001b[39;00m\n\u001b[0;32m 894\u001b[0m \u001b[39mif\u001b[39;00m verify_infos \u001b[39mand\u001b[39;00m dl_manager\u001b[39m.\u001b[39mrecord_checksums:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:189\u001b[0m, in \u001b[0;36mFolderBasedBuilder._split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 186\u001b[0m metadata_ext \u001b[39m=\u001b[39m metadata_ext\u001b[39m.\u001b[39mpop()\n\u001b[0;32m 188\u001b[0m \u001b[39mfor\u001b[39;00m _, downloaded_metadata_file \u001b[39min\u001b[39;00m itertools\u001b[39m.\u001b[39mchain\u001b[39m.\u001b[39mfrom_iterable(metadata_files\u001b[39m.\u001b[39mvalues()):\n\u001b[1;32m--> 189\u001b[0m pa_metadata_table \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_metadata(downloaded_metadata_file)\n\u001b[0;32m 190\u001b[0m features_per_metadata_file\u001b[39m.\u001b[39mappend(\n\u001b[0;32m 191\u001b[0m (downloaded_metadata_file, datasets\u001b[39m.\u001b[39mFeatures\u001b[39m.\u001b[39mfrom_arrow_schema(pa_metadata_table\u001b[39m.\u001b[39mschema))\n\u001b[0;32m 192\u001b[0m )\n\u001b[0;32m 193\u001b[0m \u001b[39mfor\u001b[39;00m downloaded_metadata_file, metadata_features \u001b[39min\u001b[39;00m features_per_metadata_file:\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\datasets\\packaged_modules\\folder_based_builder\\folder_based_builder.py:260\u001b[0m, in \u001b[0;36mFolderBasedBuilder._read_metadata\u001b[1;34m(self, metadata_file)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 259\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(metadata_file, \u001b[39m\"\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m--> 260\u001b[0m \u001b[39mreturn\u001b[39;00m paj\u001b[39m.\u001b[39;49mread_json(f)\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\_json.pyx:259\u001b[0m, in \u001b[0;36mpyarrow._json.read_json\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:144\u001b[0m, in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\PC\\anaconda3\\envs\\um\\lib\\site-packages\\pyarrow\\error.pxi:100\u001b[0m, in \u001b[0;36mpyarrow.lib.check_status\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mArrowInvalid\u001b[0m: JSON parse error: Missing a name for object member. in row 0"
]
}
],
"source": [
"dataset = load_dataset(\"../images\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"login('',True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "um",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "876e189cbbe99a9a838ece62aae1013186c4bb7e0254a10cfa2f9b2381853efb"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}