data added
This commit is contained in:
commit
b5cf1db41a
@ -0,0 +1,63 @@
|
||||
# HOW TO DOWNLOAD THE CLOUD COVER COMPETITION IMAGES
|
||||
|
||||
======================================
|
||||
|
||||
Welcome to the On Cloud N: Cloud Cover Detection Challenge! These instructions will help you access the satellite image files for this competition. The imagery and labels are hosted in a set of Azure Blob Storage containers in three regions.
|
||||
|
||||
Here are the steps to download the entire training set features and labels to your local machine.
|
||||
|
||||
1. Save the `download_data.py` script from the competition data download page:
|
||||
|
||||
https://www.drivendata.org/competitions/83/cloud-cover/data/
|
||||
|
||||
2. Install the requirements with the command below. You may want to create and activate a Python (>3.6) virtual environment first, for example with conda (https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).
|
||||
|
||||
pip install "cloudpathlib[azure]" loguru tqdm typer
|
||||
|
||||
3. To download from the competition container, you'll need a Shared Access Signature (SAS). The competition data is stored in three Azure Blob Storage containers in different regions — Central US, East Asia, and West Europe. Each region includes identical data, so choose the region closest to the machine you are downloading the data to. If your code is running on the Planetary Computer Hub, use West Europe, which is where the cluster is located. Download the token for your chosen region from the data download page. You should now have a plain-text file named `sas_centralus.txt`, `sas_eastasia.txt`, or `sas_westeurope.txt` containing a SAS URL that starts with `https://cloudcoverdata`.
|
||||
|
||||
4. Run the download script, passing the path to your SAS token with the `--sas-url` argument. By default, this will download the entire competition dataset to a directory named `data` in your current working directory. You can change the destination with the `--local-directory` argument.
|
||||
|
||||
`python download_data.py --sas-url <path to sas file>`
|
||||
|
||||
For example, to download from the West Europe region with an SAS token saved as `sas_westeurope.txt`:
|
||||
|
||||
`python download_data.py --sas-url sas_westeurope.txt`
|
||||
|
||||
You can also download a single directory containing a subset of the data with the `--cloud-directory` flag. For example:
|
||||
|
||||
# train features
|
||||
`python download_data.py --cloud-directory "az://./train_features" --sas-url sas_westeurope.txt`
|
||||
|
||||
# train labels
|
||||
`python download_data.py --cloud-directory "az://./train_labels" --sas-url sas_westeurope.txt`
|
||||
|
||||
# single chip
|
||||
`python download_data.py --cloud-directory "az://./train_features/akny" --sas-url sas_westeurope.txt`
|
||||
|
||||
## Download script documentation
|
||||
|
||||
```
|
||||
$ python download_data.py --help
|
||||
|
||||
Usage: download_data.py [OPTIONS]
|
||||
|
||||
Downloads the challenge dataset to your local machine.
|
||||
|
||||
Options:
|
||||
--sas-url TEXT Shared Access Signature URL that allows you
|
||||
to access the files (starting with
|
||||
https://...). This can be either the SAS URL
|
||||
itself or a path to a file containing the
|
||||
SAS URL, available from the competition
|
||||
datasets page. [required]
|
||||
--cloud-directory TEXT Cloudpathlib URI (`az://./<directory>`) for
|
||||
cloud directory to be downloaded. [default:
|
||||
az://.]
|
||||
--local-directory PATH Directory on your local machine to which the
|
||||
files are downloaded. [default: data]
|
||||
```
|
||||
|
||||
Good luck! If you have any questions you can always visit the user forum:
|
||||
|
||||
https://community.drivendata.org/c/cloud-cover
|
@ -0,0 +1,60 @@
|
||||
from pathlib import Path
|
||||
|
||||
from cloudpathlib import AzureBlobClient, AzureBlobPath
|
||||
from loguru import logger
|
||||
from tqdm.contrib.concurrent import process_map
|
||||
import typer
|
||||
|
||||
# Typer application object; `main` is registered on it via `@app.command()`
# and it is invoked when the file is run as a script.
app = typer.Typer()
|
||||
|
||||
|
||||
def download_path(path: AzureBlobPath):
    """Fetch one cloud path into the local cache and report the outcome.

    Args:
        path: Cloud path to fetch. Anything that is not a file is left
            untouched but still reported as a success.

    Returns:
        A dict with the keys ``"path"`` and ``"status"`` (``"success"`` or
        ``"failed"``). Failed results additionally carry the exception text
        under ``"message"``.
    """
    try:
        if path.is_file():
            # Reading `fspath` is done purely for its side effect: it
            # downloads the cloud asset to local_cache_dir.
            path.fspath
    except Exception as error:
        # Never raise: the failure is reported back to the caller as a
        # result record instead.
        logger.debug(f"Failed to download {path}. {error}")
        return {"path": path, "status": "failed", "message": str(error)}
    return {"path": path, "status": "success"}
|
||||
|
||||
|
||||
@app.command()
def main(
    sas_url: str = typer.Option(
        ...,
        help="Shared Access Signature URL that allows you to access the files (starting with "
        "https://...). This can be either the SAS URL itself or a path to a file containing the "
        "SAS URL, available from the competition datasets page.",
    ),
    cloud_directory: str = typer.Option(
        "az://.",
        help="Cloudpathlib URI (`az://./<directory>`) for cloud directory to be downloaded.",
    ),
    local_directory: Path = typer.Option(
        "data",
        help="Directory on your local machine to which the files are downloaded.",
    ),
):
    """Downloads the challenge dataset to your local machine."""
    # NOTE: the docstring above doubles as the command's `--help` text, so the
    # implementation notes live in comments instead.

    # `--sas-url` accepts either the URL itself or a file holding it; a value
    # that names an existing path is read and replaced by its stripped contents.
    sas_file = Path(sas_url)
    if sas_file.exists():
        logger.info(f"Loading SAS URL from {sas_url}")
        sas_url = sas_file.read_text().strip()

    # Downloads land in `local_directory`, which is used as the client's
    # local cache directory.
    client = AzureBlobClient(account_url=sas_url, local_cache_dir=local_directory)
    directory = client.CloudPath(cloud_directory)

    logger.info("Retrieving path list.")
    # Only imagery (.tif) and label/metadata (.geojson) files are downloaded;
    # the suffix check is case-insensitive and uses the public `suffix` property.
    path_list = [
        path
        for path in directory.rglob("*")
        if path.suffix.lower() in (".tif", ".geojson")
    ]
    if not path_list:
        # Most likely a mistyped --cloud-directory; say so instead of
        # silently "downloading" nothing.
        logger.warning(f"No .tif or .geojson files found under {cloud_directory}.")
        return

    logger.info(f"Downloading {len(path_list)} files.")
    # `download_path` never raises; it returns one status record per path.
    results = process_map(download_path, path_list, total=len(path_list), chunksize=10)

    # Individual failures are logged by `download_path`; only the total is
    # reported here.
    failures = [result for result in results if result["status"] == "failed"]
    if failures:
        logger.warning(f"{len(failures)} files failed to download.")


if __name__ == "__main__":
    # Let Typer parse the command line and dispatch to `main`.
    app()
|
@ -0,0 +1 @@
|
||||
https://cloudcoverdatawesteurope.blob.core.windows.net/public?se=2022-08-01T12%3A00Z&sp=rl&sv=2018-11-09&sr=c&sig=DrqaBLSI9t1nnx1sekyPaMgsqMiO9%2BBzjU/JwDhfQ64%3D
|
11749
data/On_Cloud_N_Cloud_Cover_Detection_Challenge_-_train_metadata.csv.csv
Normal file
11749
data/On_Cloud_N_Cloud_Cover_Detection_Challenge_-_train_metadata.csv.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user