msc-patryk-bartkowiak/code/download_the_stack.ipynb

19 KiB

from huggingface_hub import list_repo_files, hf_hub_download
import os
# Fetch the Python shards of The Stack (deduplicated) from the Hugging Face Hub.

# --- Source: which Hub repository to read and which folder inside it ---
repo_id = "bigcode/the-stack-dedup"
subfolder = "data/python"
repo_type = "dataset"

# --- Destination: created up front so the downloads have somewhere to land ---
local_dir = "/work/s452638/datasets/the-stack-python"
os.makedirs(local_dir, exist_ok=True)

# The listing covers the whole repository; narrowing to the subfolder happens below.
files_list = list_repo_files(repo_id=repo_id, repo_type=repo_type)

# The trailing "/" in the prefix keeps sibling folders that merely share the
# same leading characters out of the selection.
files_to_download = list(
    filter(lambda path: path.startswith(f'{subfolder}/'), files_list)
)

print(f"Found {len(files_to_download)} files in the repository.")

# Fetch the selected files one at a time, announcing each before it starts.
for file_name in files_to_download:
    print(f"Downloading {file_name}...")
    hf_hub_download(
        repo_id=repo_id,
        filename=file_name,
        repo_type=repo_type,
        local_dir=local_dir,
    )

print("All files have been downloaded successfully.")
Found 5144 files in the repository.
['data/python/data-00000-of-00144.parquet', 'data/python/data-00001-of-00144.parquet', 'data/python/data-00002-of-00144.parquet', 'data/python/data-00003-of-00144.parquet', 'data/python/data-00004-of-00144.parquet', 'data/python/data-00005-of-00144.parquet', 'data/python/data-00006-of-00144.parquet', 'data/python/data-00007-of-00144.parquet', 'data/python/data-00008-of-00144.parquet', 'data/python/data-00009-of-00144.parquet', 'data/python/data-00010-of-00144.parquet', 'data/python/data-00011-of-00144.parquet', 'data/python/data-00012-of-00144.parquet', 'data/python/data-00013-of-00144.parquet', 'data/python/data-00014-of-00144.parquet', 'data/python/data-00015-of-00144.parquet', 'data/python/data-00016-of-00144.parquet', 'data/python/data-00017-of-00144.parquet', 'data/python/data-00018-of-00144.parquet', 'data/python/data-00019-of-00144.parquet', 'data/python/data-00020-of-00144.parquet', 'data/python/data-00021-of-00144.parquet', 'data/python/data-00022-of-00144.parquet', 'data/python/data-00023-of-00144.parquet', 'data/python/data-00024-of-00144.parquet', 'data/python/data-00025-of-00144.parquet', 'data/python/data-00026-of-00144.parquet', 'data/python/data-00027-of-00144.parquet', 'data/python/data-00028-of-00144.parquet', 'data/python/data-00029-of-00144.parquet', 'data/python/data-00030-of-00144.parquet', 'data/python/data-00031-of-00144.parquet', 'data/python/data-00032-of-00144.parquet', 'data/python/data-00033-of-00144.parquet', 'data/python/data-00034-of-00144.parquet', 'data/python/data-00035-of-00144.parquet', 'data/python/data-00036-of-00144.parquet', 'data/python/data-00037-of-00144.parquet', 'data/python/data-00038-of-00144.parquet', 'data/python/data-00039-of-00144.parquet', 'data/python/data-00040-of-00144.parquet', 'data/python/data-00041-of-00144.parquet', 'data/python/data-00042-of-00144.parquet', 'data/python/data-00043-of-00144.parquet', 'data/python/data-00044-of-00144.parquet', 'data/python/data-00045-of-00144.parquet', 
'data/python/data-00046-of-00144.parquet', 'data/python/data-00047-of-00144.parquet', 'data/python/data-00048-of-00144.parquet', 'data/python/data-00049-of-00144.parquet', 'data/python/data-00050-of-00144.parquet', 'data/python/data-00051-of-00144.parquet', 'data/python/data-00052-of-00144.parquet', 'data/python/data-00053-of-00144.parquet', 'data/python/data-00054-of-00144.parquet', 'data/python/data-00055-of-00144.parquet', 'data/python/data-00056-of-00144.parquet', 'data/python/data-00057-of-00144.parquet', 'data/python/data-00058-of-00144.parquet', 'data/python/data-00059-of-00144.parquet', 'data/python/data-00060-of-00144.parquet', 'data/python/data-00061-of-00144.parquet', 'data/python/data-00062-of-00144.parquet', 'data/python/data-00063-of-00144.parquet', 'data/python/data-00064-of-00144.parquet', 'data/python/data-00065-of-00144.parquet', 'data/python/data-00066-of-00144.parquet', 'data/python/data-00067-of-00144.parquet', 'data/python/data-00068-of-00144.parquet', 'data/python/data-00069-of-00144.parquet', 'data/python/data-00070-of-00144.parquet', 'data/python/data-00071-of-00144.parquet', 'data/python/data-00072-of-00144.parquet', 'data/python/data-00073-of-00144.parquet', 'data/python/data-00074-of-00144.parquet', 'data/python/data-00075-of-00144.parquet', 'data/python/data-00076-of-00144.parquet', 'data/python/data-00077-of-00144.parquet', 'data/python/data-00078-of-00144.parquet', 'data/python/data-00079-of-00144.parquet', 'data/python/data-00080-of-00144.parquet', 'data/python/data-00081-of-00144.parquet', 'data/python/data-00082-of-00144.parquet', 'data/python/data-00083-of-00144.parquet', 'data/python/data-00084-of-00144.parquet', 'data/python/data-00085-of-00144.parquet', 'data/python/data-00086-of-00144.parquet', 'data/python/data-00087-of-00144.parquet', 'data/python/data-00088-of-00144.parquet', 'data/python/data-00089-of-00144.parquet', 'data/python/data-00090-of-00144.parquet', 'data/python/data-00091-of-00144.parquet', 
'data/python/data-00092-of-00144.parquet', 'data/python/data-00093-of-00144.parquet', 'data/python/data-00094-of-00144.parquet', 'data/python/data-00095-of-00144.parquet', 'data/python/data-00096-of-00144.parquet', 'data/python/data-00097-of-00144.parquet', 'data/python/data-00098-of-00144.parquet', 'data/python/data-00099-of-00144.parquet', 'data/python/data-00100-of-00144.parquet', 'data/python/data-00101-of-00144.parquet', 'data/python/data-00102-of-00144.parquet', 'data/python/data-00103-of-00144.parquet', 'data/python/data-00104-of-00144.parquet', 'data/python/data-00105-of-00144.parquet', 'data/python/data-00106-of-00144.parquet', 'data/python/data-00107-of-00144.parquet', 'data/python/data-00108-of-00144.parquet', 'data/python/data-00109-of-00144.parquet', 'data/python/data-00110-of-00144.parquet', 'data/python/data-00111-of-00144.parquet', 'data/python/data-00112-of-00144.parquet', 'data/python/data-00113-of-00144.parquet', 'data/python/data-00114-of-00144.parquet', 'data/python/data-00115-of-00144.parquet', 'data/python/data-00116-of-00144.parquet', 'data/python/data-00117-of-00144.parquet', 'data/python/data-00118-of-00144.parquet', 'data/python/data-00119-of-00144.parquet', 'data/python/data-00120-of-00144.parquet', 'data/python/data-00121-of-00144.parquet', 'data/python/data-00122-of-00144.parquet', 'data/python/data-00123-of-00144.parquet', 'data/python/data-00124-of-00144.parquet', 'data/python/data-00125-of-00144.parquet', 'data/python/data-00126-of-00144.parquet', 'data/python/data-00127-of-00144.parquet', 'data/python/data-00128-of-00144.parquet', 'data/python/data-00129-of-00144.parquet', 'data/python/data-00130-of-00144.parquet', 'data/python/data-00131-of-00144.parquet', 'data/python/data-00132-of-00144.parquet', 'data/python/data-00133-of-00144.parquet', 'data/python/data-00134-of-00144.parquet', 'data/python/data-00135-of-00144.parquet', 'data/python/data-00136-of-00144.parquet', 'data/python/data-00137-of-00144.parquet', 
'data/python/data-00138-of-00144.parquet', 'data/python/data-00139-of-00144.parquet', 'data/python/data-00140-of-00144.parquet', 'data/python/data-00141-of-00144.parquet', 'data/python/data-00142-of-00144.parquet', 'data/python/data-00143-of-00144.parquet']
Downloading data/python/data-00000-of-00144.parquet...
Downloading data/python/data-00001-of-00144.parquet...
Downloading data/python/data-00002-of-00144.parquet...
Downloading data/python/data-00003-of-00144.parquet...
Downloading data/python/data-00004-of-00144.parquet...
Downloading data/python/data-00005-of-00144.parquet...
Downloading data/python/data-00006-of-00144.parquet...
Downloading data/python/data-00007-of-00144.parquet...
Downloading data/python/data-00008-of-00144.parquet...
Downloading data/python/data-00009-of-00144.parquet...
Downloading data/python/data-00010-of-00144.parquet...
Downloading data/python/data-00011-of-00144.parquet...
Downloading data/python/data-00012-of-00144.parquet...
Downloading data/python/data-00013-of-00144.parquet...
Downloading data/python/data-00014-of-00144.parquet...
Downloading data/python/data-00015-of-00144.parquet...
Downloading data/python/data-00016-of-00144.parquet...
Downloading data/python/data-00017-of-00144.parquet...
Downloading data/python/data-00018-of-00144.parquet...
Downloading data/python/data-00019-of-00144.parquet...
Downloading data/python/data-00020-of-00144.parquet...
Downloading data/python/data-00021-of-00144.parquet...
Downloading data/python/data-00022-of-00144.parquet...
Downloading data/python/data-00023-of-00144.parquet...
Downloading data/python/data-00024-of-00144.parquet...
Downloading data/python/data-00025-of-00144.parquet...
Downloading data/python/data-00026-of-00144.parquet...
Downloading data/python/data-00027-of-00144.parquet...
Downloading data/python/data-00028-of-00144.parquet...
Downloading data/python/data-00029-of-00144.parquet...
Downloading data/python/data-00030-of-00144.parquet...
Downloading data/python/data-00031-of-00144.parquet...
Downloading data/python/data-00032-of-00144.parquet...
Downloading data/python/data-00033-of-00144.parquet...
Downloading data/python/data-00034-of-00144.parquet...
Downloading data/python/data-00035-of-00144.parquet...
Downloading data/python/data-00036-of-00144.parquet...
Downloading data/python/data-00037-of-00144.parquet...
Downloading data/python/data-00038-of-00144.parquet...
Downloading data/python/data-00039-of-00144.parquet...
Downloading data/python/data-00040-of-00144.parquet...
Downloading data/python/data-00041-of-00144.parquet...
Downloading data/python/data-00042-of-00144.parquet...
Downloading data/python/data-00043-of-00144.parquet...
Downloading data/python/data-00044-of-00144.parquet...
Downloading data/python/data-00045-of-00144.parquet...
Downloading data/python/data-00046-of-00144.parquet...
Downloading data/python/data-00047-of-00144.parquet...
Downloading data/python/data-00048-of-00144.parquet...
Downloading data/python/data-00049-of-00144.parquet...
Downloading data/python/data-00050-of-00144.parquet...
Downloading data/python/data-00051-of-00144.parquet...
Downloading data/python/data-00052-of-00144.parquet...
Downloading data/python/data-00053-of-00144.parquet...
Downloading data/python/data-00054-of-00144.parquet...
Downloading data/python/data-00055-of-00144.parquet...
Downloading data/python/data-00056-of-00144.parquet...
Downloading data/python/data-00057-of-00144.parquet...
Downloading data/python/data-00058-of-00144.parquet...
Downloading data/python/data-00059-of-00144.parquet...
Downloading data/python/data-00060-of-00144.parquet...
Downloading data/python/data-00061-of-00144.parquet...
Downloading data/python/data-00062-of-00144.parquet...
Downloading data/python/data-00063-of-00144.parquet...
Downloading data/python/data-00064-of-00144.parquet...
Downloading data/python/data-00065-of-00144.parquet...
Downloading data/python/data-00066-of-00144.parquet...
Downloading data/python/data-00067-of-00144.parquet...
Downloading data/python/data-00068-of-00144.parquet...
Downloading data/python/data-00069-of-00144.parquet...
Downloading data/python/data-00070-of-00144.parquet...
Downloading data/python/data-00071-of-00144.parquet...
Downloading data/python/data-00072-of-00144.parquet...
Downloading data/python/data-00073-of-00144.parquet...
Downloading data/python/data-00074-of-00144.parquet...
Downloading data/python/data-00075-of-00144.parquet...
Downloading data/python/data-00076-of-00144.parquet...
Downloading data/python/data-00077-of-00144.parquet...
Downloading data/python/data-00078-of-00144.parquet...
Downloading data/python/data-00079-of-00144.parquet...
Downloading data/python/data-00080-of-00144.parquet...
Downloading data/python/data-00081-of-00144.parquet...
Downloading data/python/data-00082-of-00144.parquet...
Downloading data/python/data-00083-of-00144.parquet...
Downloading data/python/data-00084-of-00144.parquet...
Downloading data/python/data-00085-of-00144.parquet...
Downloading data/python/data-00086-of-00144.parquet...
Downloading data/python/data-00087-of-00144.parquet...
Downloading data/python/data-00088-of-00144.parquet...
Downloading data/python/data-00089-of-00144.parquet...
Downloading data/python/data-00090-of-00144.parquet...
Downloading data/python/data-00091-of-00144.parquet...
Downloading data/python/data-00092-of-00144.parquet...
Downloading data/python/data-00093-of-00144.parquet...
Downloading data/python/data-00094-of-00144.parquet...
Downloading data/python/data-00095-of-00144.parquet...
Downloading data/python/data-00096-of-00144.parquet...
Downloading data/python/data-00097-of-00144.parquet...
Downloading data/python/data-00098-of-00144.parquet...
Downloading data/python/data-00099-of-00144.parquet...
Downloading data/python/data-00100-of-00144.parquet...
Downloading data/python/data-00101-of-00144.parquet...
Downloading data/python/data-00102-of-00144.parquet...
Downloading data/python/data-00103-of-00144.parquet...
Downloading data/python/data-00104-of-00144.parquet...
Downloading data/python/data-00105-of-00144.parquet...
Downloading data/python/data-00106-of-00144.parquet...
Downloading data/python/data-00107-of-00144.parquet...
Downloading data/python/data-00108-of-00144.parquet...
Downloading data/python/data-00109-of-00144.parquet...
Downloading data/python/data-00110-of-00144.parquet...
Downloading data/python/data-00111-of-00144.parquet...
Downloading data/python/data-00112-of-00144.parquet...
Downloading data/python/data-00113-of-00144.parquet...
Downloading data/python/data-00114-of-00144.parquet...
Downloading data/python/data-00115-of-00144.parquet...
Downloading data/python/data-00116-of-00144.parquet...
Downloading data/python/data-00117-of-00144.parquet...
Downloading data/python/data-00118-of-00144.parquet...
Downloading data/python/data-00119-of-00144.parquet...
Downloading data/python/data-00120-of-00144.parquet...
Downloading data/python/data-00121-of-00144.parquet...
Downloading data/python/data-00122-of-00144.parquet...
Downloading data/python/data-00123-of-00144.parquet...
Downloading data/python/data-00124-of-00144.parquet...
Downloading data/python/data-00125-of-00144.parquet...
Downloading data/python/data-00126-of-00144.parquet...
Downloading data/python/data-00127-of-00144.parquet...
Downloading data/python/data-00128-of-00144.parquet...
Downloading data/python/data-00129-of-00144.parquet...
Downloading data/python/data-00130-of-00144.parquet...
Downloading data/python/data-00131-of-00144.parquet...
Downloading data/python/data-00132-of-00144.parquet...
Downloading data/python/data-00133-of-00144.parquet...
Downloading data/python/data-00134-of-00144.parquet...
Downloading data/python/data-00135-of-00144.parquet...
Downloading data/python/data-00136-of-00144.parquet...
Downloading data/python/data-00137-of-00144.parquet...
Downloading data/python/data-00138-of-00144.parquet...
Downloading data/python/data-00139-of-00144.parquet...
Downloading data/python/data-00140-of-00144.parquet...
Downloading data/python/data-00141-of-00144.parquet...
Downloading data/python/data-00142-of-00144.parquet...
Downloading data/python/data-00143-of-00144.parquet...
All files have been downloaded successfully.
# Download the Python subset of The Stack v2 (deduplicated) from the Hugging Face Hub.

# Define the repository details
repo_id = "bigcode/the-stack-v2-dedup"  # Repository ID for the dataset
subfolder = "data/Python"  # Subfolder path within the repository (note the capital "P")
repo_type = "dataset"  # Specify that it's a dataset

# Specify the local directory to save the files
local_dir = "/work/s452638/datasets/the-stack-v2-python"
os.makedirs(local_dir, exist_ok=True)

# List every file in the repository; the listing is not limited to the subfolder
files_list = list_repo_files(repo_id=repo_id, repo_type=repo_type)

# Keep only the files located under the desired subfolder
files_to_download = [file for file in files_list if file.startswith(f'{subfolder}/')]

print(f"Found {len(files_to_download)} files in the repository.")

# Download every matching file.
# The loop used to run over files_to_download[4:], a hard-coded resume offset
# that silently skipped the first four shards, so a run against an empty
# local_dir produced an incomplete dataset while still reporting success.
# NOTE(review): hf_hub_download is expected to reuse files already present in
# local_dir instead of fetching them again, which makes re-running the full
# loop a safe way to resume - confirm for the installed huggingface_hub version.
for file_name in files_to_download:
    print(f"Downloading {file_name}...")
    hf_hub_download(repo_id=repo_id, repo_type=repo_type, filename=file_name, local_dir=local_dir)

print("All files have been downloaded successfully.")
Found 6 files in the repository.
Downloading data/Python/train-00004-of-00006.parquet...
Downloading data/Python/train-00005-of-00006.parquet...
All files have been downloaded successfully.