wikisource-crawler/notebooks/image_download.ipynb
Michał Kozłowski 680c3d000c fixing
2023-01-10 22:57:22 +01:00

15 KiB
Raw Permalink Blame History

import pandas as pd

# Load the crawled Wikisource page index: one row per scanned page with its
# title, page URL, thumbnail image URL, and OCR/proofread text.
# NOTE(review): the ".tsv.tsv" double extension looks accidental — confirm the
# actual filename on disk before renaming anything.
tsv_path = "../../wikisource-data/yellow-continue-yellow.tsv.tsv"
a = pd.read_csv(tsv_path, sep="\t")
a.head()
Unnamed: 0 title href image_url text
0 0 Strona:Stanisław Antoni Wotowski - George Sand... https://pl.wikisource.org//wiki/Strona:Stanis%... //upload.wikimedia.org/wikipedia/commons/thumb... zmieniła się; piękne oczy są tak samo błyszczą...
1 1 Strona:Stanisław Antoni Wotowski - George Sand... https://pl.wikisource.org//wiki/Strona:Stanis%... //upload.wikimedia.org/wikipedia/commons/thumb... najświetniejszej chociażby sławy... i po piętn...
2 2 Strona:Stanisław Antoni Wotowski - George Sand... https://pl.wikisource.org//wiki/Strona:Stanis%... //upload.wikimedia.org/wikipedia/commons/thumb... Chopin gra. Ledwie dostrzegalnie muskają smuk...
3 3 Strona:Stanisław Antoni Wotowski - George Sand... https://pl.wikisource.org//wiki/Strona:Stanis%... //upload.wikimedia.org/wikipedia/commons/thumb... \nDZIWACZNE MAŁŻEŃSTWO.\n\nBył grudzień 1830 ...
4 4 Strona:Stanisław Antoni Wotowski - George Sand... https://pl.wikisource.org//wiki/Strona:Stanis%... //upload.wikimedia.org/wikipedia/commons/thumb... Ale bliższego związku z panią Sand jakby się ...
from datasets import load_dataset
from huggingface_hub import login
# Build a HF dataset from the local "../images" folder via the imagefolder
# builder (auto-detected from the directory contents).
# NOTE(review): the captured traceback below shows this call failing with
# ArrowInvalid ("JSON parse error: Missing a name for object member") while
# reading the folder's metadata file — presumably a malformed metadata.jsonl
# in ../images; the fix is to repair that file, not this call. Verify.
dataset = load_dataset("../images")
Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s]
Using custom data configuration images-8b1ad802b6988161
Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[6], line 1
----> 1 dataset = load_dataset("../images")

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\load.py:1741, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
   1738 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1740 # Download and prepare data
-> 1741 builder_instance.download_and_prepare(
   1742     download_config=download_config,
   1743     download_mode=download_mode,
   1744     ignore_verifications=ignore_verifications,
   1745     try_from_hf_gcs=try_from_hf_gcs,
   1746     use_auth_token=use_auth_token,
   1747     num_proc=num_proc,
   1748 )
   1750 # Build dataset for splits
   1751 keep_in_memory = (
   1752     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1753 )

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:822, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    820     if num_proc is not None:
    821         prepare_split_kwargs["num_proc"] = num_proc
--> 822     self._download_and_prepare(
    823         dl_manager=dl_manager,
    824         verify_infos=verify_infos,
    825         **prepare_split_kwargs,
    826         **download_and_prepare_kwargs,
    827     )
    828 # Sync info
    829 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:1555, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_splits_kwargs)
   1554 def _download_and_prepare(self, dl_manager, verify_infos, **prepare_splits_kwargs):
-> 1555     super()._download_and_prepare(
   1556         dl_manager, verify_infos, check_duplicate_keys=verify_infos, **prepare_splits_kwargs
   1557     )

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:891, in DatasetBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    889 split_dict = SplitDict(dataset_name=self.name)
    890 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
--> 891 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
    893 # Checksums verification
    894 if verify_infos and dl_manager.record_checksums:

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\packaged_modules\folder_based_builder\folder_based_builder.py:189, in FolderBasedBuilder._split_generators(self, dl_manager)
    186 metadata_ext = metadata_ext.pop()
    188 for _, downloaded_metadata_file in itertools.chain.from_iterable(metadata_files.values()):
--> 189     pa_metadata_table = self._read_metadata(downloaded_metadata_file)
    190     features_per_metadata_file.append(
    191         (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema))
    192     )
    193 for downloaded_metadata_file, metadata_features in features_per_metadata_file:

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\packaged_modules\folder_based_builder\folder_based_builder.py:260, in FolderBasedBuilder._read_metadata(self, metadata_file)
    258 else:
    259     with open(metadata_file, "rb") as f:
--> 260         return paj.read_json(f)

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\_json.pyx:259, in pyarrow._json.read_json()

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File c:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: JSON parse error: Missing a name for object member. in row 0
login('',True)