15 KiB
15 KiB
import pandas as pd
# Load the tab-separated Wikisource data file into a DataFrame.
# NOTE(review): the file name ends in ".tsv.tsv" — confirm this matches the
# file on disk and is not an accidental double extension.
a = pd.read_csv("../../wikisource-data/yellow-continue-yellow.tsv.tsv", sep="\t")
# Preview the first rows. The recorded output shows the columns "Unnamed: 0",
# title, href, image_url and text; "Unnamed: 0" is presumably an index that
# was written into the file — verify before relying on it.
a.head()
| | Unnamed: 0 | title | href | image_url | text |
---|---|---|---|---|---|
0 | 0 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | zmieniła się; piękne oczy są tak samo błyszczą... |
1 | 1 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | najświetniejszej chociażby sławy... i po piętn... |
2 | 2 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | Chopin gra. Ledwie dostrzegalnie muskają smuk... |
3 | 3 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | \nDZIWACZNE MAŁŻEŃSTWO.\n\n Był grudzień 1830 ... |
4 | 4 | Strona:Stanisław Antoni Wotowski - George Sand... | https://pl.wikisource.org//wiki/Strona:Stanis%... | //upload.wikimedia.org/wikipedia/commons/thumb... | Ale bliższego związku z panią Sand jakby się ... |
from datasets import load_dataset
from huggingface_hub import login
# Build a dataset from the local "../images" directory.
# NOTE(review): in the recorded run this call raises pyarrow's ArrowInvalid
# ("JSON parse error: Missing a name for object member. in row 0") while the
# folder-based builder reads a metadata file found in that directory.
# Presumably that metadata file is not valid JSON Lines (one JSON object with
# double-quoted keys per line) — verify and regenerate it before re-running.
dataset = load_dataset("../images")
Resolving data files: 100%|██████████| 29/29 [00:00<?, ?it/s] Using custom data configuration images-8b1ad802b6988161
Downloading and preparing dataset imagefolder/images to C:/Users/PC/.cache/huggingface/datasets/imagefolder/images-8b1ad802b6988161/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...
Downloading data files: 0it [00:00, ?it/s] Extracting data files: 0it [00:00, ?it/s]
[1;31m---------------------------------------------------------------------------[0m [1;31mArrowInvalid[0m Traceback (most recent call last) Cell [1;32mIn[6], line 1[0m [1;32m----> 1[0m dataset [39m=[39m load_dataset([39m"[39;49m[39m../images[39;49m[39m"[39;49m) File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\load.py:1741[0m, in [0;36mload_dataset[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)[0m [0;32m 1738[0m try_from_hf_gcs [39m=[39m path [39mnot[39;00m [39min[39;00m _PACKAGED_DATASETS_MODULES [0;32m 1740[0m [39m# Download and prepare data[39;00m [1;32m-> 1741[0m builder_instance[39m.[39;49mdownload_and_prepare( [0;32m 1742[0m download_config[39m=[39;49mdownload_config, [0;32m 1743[0m download_mode[39m=[39;49mdownload_mode, [0;32m 1744[0m ignore_verifications[39m=[39;49mignore_verifications, [0;32m 1745[0m try_from_hf_gcs[39m=[39;49mtry_from_hf_gcs, [0;32m 1746[0m use_auth_token[39m=[39;49muse_auth_token, [0;32m 1747[0m num_proc[39m=[39;49mnum_proc, [0;32m 1748[0m ) [0;32m 1750[0m [39m# Build dataset for splits[39;00m [0;32m 1751[0m keep_in_memory [39m=[39m ( [0;32m 1752[0m keep_in_memory [39mif[39;00m keep_in_memory [39mis[39;00m [39mnot[39;00m [39mNone[39;00m [39melse[39;00m is_small_dataset(builder_instance[39m.[39minfo[39m.[39mdataset_size) [0;32m 1753[0m ) File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:822[0m, in [0;36mDatasetBuilder.download_and_prepare[1;34m(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)[0m [0;32m 820[0m [39mif[39;00m num_proc [39mis[39;00m [39mnot[39;00m [39mNone[39;00m: [0;32m 821[0m prepare_split_kwargs[[39m"[39m[39mnum_proc[39m[39m"[39m] [39m=[39m 
num_proc [1;32m--> 822[0m [39mself[39m[39m.[39m_download_and_prepare( [0;32m 823[0m dl_manager[39m=[39mdl_manager, [0;32m 824[0m verify_infos[39m=[39mverify_infos, [0;32m 825[0m [39m*[39m[39m*[39mprepare_split_kwargs, [0;32m 826[0m [39m*[39m[39m*[39mdownload_and_prepare_kwargs, [0;32m 827[0m ) [0;32m 828[0m [39m# Sync info[39;00m [0;32m 829[0m [39mself[39m[39m.[39minfo[39m.[39mdataset_size [39m=[39m [39msum[39m(split[39m.[39mnum_bytes [39mfor[39;00m split [39min[39;00m [39mself[39m[39m.[39minfo[39m.[39msplits[39m.[39mvalues()) File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:1555[0m, in [0;36mGeneratorBasedBuilder._download_and_prepare[1;34m(self, dl_manager, verify_infos, **prepare_splits_kwargs)[0m [0;32m 1554[0m [39mdef[39;00m [39m_download_and_prepare[39m([39mself[39m, dl_manager, verify_infos, [39m*[39m[39m*[39mprepare_splits_kwargs): [1;32m-> 1555[0m [39msuper[39m()[39m.[39m_download_and_prepare( [0;32m 1556[0m dl_manager, verify_infos, check_duplicate_keys[39m=[39mverify_infos, [39m*[39m[39m*[39mprepare_splits_kwargs [0;32m 1557[0m ) File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\builder.py:891[0m, in [0;36mDatasetBuilder._download_and_prepare[1;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)[0m [0;32m 889[0m split_dict [39m=[39m SplitDict(dataset_name[39m=[39m[39mself[39m[39m.[39mname) [0;32m 890[0m split_generators_kwargs [39m=[39m [39mself[39m[39m.[39m_make_split_generators_kwargs(prepare_split_kwargs) [1;32m--> 891[0m split_generators [39m=[39m [39mself[39m[39m.[39m_split_generators(dl_manager, [39m*[39m[39m*[39msplit_generators_kwargs) [0;32m 893[0m [39m# Checksums verification[39;00m [0;32m 894[0m [39mif[39;00m verify_infos [39mand[39;00m dl_manager[39m.[39mrecord_checksums: File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\packaged_modules\folder_based_builder\folder_based_builder.py:189[0m, in [0;36mFolderBasedBuilder._split_generators[1;34m(self, dl_manager)[0m [0;32m 
186[0m metadata_ext [39m=[39m metadata_ext[39m.[39mpop() [0;32m 188[0m [39mfor[39;00m _, downloaded_metadata_file [39min[39;00m itertools[39m.[39mchain[39m.[39mfrom_iterable(metadata_files[39m.[39mvalues()): [1;32m--> 189[0m pa_metadata_table [39m=[39m [39mself[39;49m[39m.[39;49m_read_metadata(downloaded_metadata_file) [0;32m 190[0m features_per_metadata_file[39m.[39mappend( [0;32m 191[0m (downloaded_metadata_file, datasets[39m.[39mFeatures[39m.[39mfrom_arrow_schema(pa_metadata_table[39m.[39mschema)) [0;32m 192[0m ) [0;32m 193[0m [39mfor[39;00m downloaded_metadata_file, metadata_features [39min[39;00m features_per_metadata_file: File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\datasets\packaged_modules\folder_based_builder\folder_based_builder.py:260[0m, in [0;36mFolderBasedBuilder._read_metadata[1;34m(self, metadata_file)[0m [0;32m 258[0m [39melse[39;00m: [0;32m 259[0m [39mwith[39;00m [39mopen[39m(metadata_file, [39m"[39m[39mrb[39m[39m"[39m) [39mas[39;00m f: [1;32m--> 260[0m [39mreturn[39;00m paj[39m.[39;49mread_json(f) File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\_json.pyx:259[0m, in [0;36mpyarrow._json.read_json[1;34m()[0m File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\error.pxi:144[0m, in [0;36mpyarrow.lib.pyarrow_internal_check_status[1;34m()[0m File [1;32mc:\Users\PC\anaconda3\envs\um\lib\site-packages\pyarrow\error.pxi:100[0m, in [0;36mpyarrow.lib.check_status[1;34m()[0m [1;31mArrowInvalid[0m: JSON parse error: Missing a name for object member. in row 0
# Authenticate against the Hugging Face Hub and store the token in the git
# credential helper as well.
# The token is taken from the HF_TOKEN environment variable instead of being
# hard-coded: an empty string literal is not None, so it would be submitted as
# a real (invalid) token, and a pasted token would leak a secret into the
# notebook. When the variable is unset or empty, token=None lets login()
# fall back to its interactive prompt.
import os

login(token=os.environ.get("HF_TOKEN") or None, add_to_git_credential=True)