Intelegentny_Pszczelarz/.venv/Lib/site-packages/sklearn/datasets/_covtype.py

213 lines
6.8 KiB
Python
Raw Normal View History

2023-06-19 00:49:18 +02:00
"""Forest covertype dataset.
A classic dataset for classification benchmarks, featuring categorical and
real-valued features.
The dataset page is available from UCI Machine Learning Repository
https://archive.ics.uci.edu/ml/datasets/Covertype
Courtesy of Jock A. Blackard and Colorado State University.
"""
# Author: Lars Buitinck
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# License: BSD 3 clause
from gzip import GzipFile
import logging
from os.path import exists, join
import os
from tempfile import TemporaryDirectory
import numpy as np
import joblib
from . import get_data_home
from ._base import _convert_data_dataframe
from ._base import _fetch_remote
from ._base import RemoteFileMetadata
from ._base import load_descr
from ..utils import Bunch
from ._base import _pkl_filepath
from ..utils import check_random_state
# The original data can be found in:
# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
ARCHIVE = RemoteFileMetadata(
filename="covtype.data.gz",
url="https://ndownloader.figshare.com/files/5976039",
checksum="614360d0257557dd1792834a85a1cdebfadc3c4f30b011d56afee7ffb5b15771",
)
logger = logging.getLogger(__name__)
# Column names reference:
# https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
FEATURE_NAMES = [
"Elevation",
"Aspect",
"Slope",
"Horizontal_Distance_To_Hydrology",
"Vertical_Distance_To_Hydrology",
"Horizontal_Distance_To_Roadways",
"Hillshade_9am",
"Hillshade_Noon",
"Hillshade_3pm",
"Horizontal_Distance_To_Fire_Points",
]
FEATURE_NAMES += [f"Wilderness_Area_{i}" for i in range(4)]
FEATURE_NAMES += [f"Soil_Type_{i}" for i in range(40)]
TARGET_NAMES = ["Cover_Type"]
def fetch_covtype(
*,
data_home=None,
download_if_missing=True,
random_state=None,
shuffle=False,
return_X_y=False,
as_frame=False,
):
"""Load the covertype dataset (classification).
Download it if necessary.
================= ============
Classes 7
Samples total 581012
Dimensionality 54
Features int
================= ============
Read more in the :ref:`User Guide <covtype_dataset>`.
Parameters
----------
data_home : str, default=None
Specify another download and cache folder for the datasets. By default
all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
download_if_missing : bool, default=True
If False, raise a IOError if the data is not locally available
instead of trying to download the data from the source site.
random_state : int, RandomState instance or None, default=None
Determines random number generation for dataset shuffling. Pass an int
for reproducible output across multiple function calls.
See :term:`Glossary <random_state>`.
shuffle : bool, default=False
Whether to shuffle dataset.
return_X_y : bool, default=False
If True, returns ``(data.data, data.target)`` instead of a Bunch
object.
.. versionadded:: 0.20
as_frame : bool, default=False
If True, the data is a pandas DataFrame including columns with
appropriate dtypes (numeric). The target is a pandas DataFrame or
Series depending on the number of target columns. If `return_X_y` is
True, then (`data`, `target`) will be pandas DataFrames or Series as
described below.
.. versionadded:: 0.24
Returns
-------
dataset : :class:`~sklearn.utils.Bunch`
Dictionary-like object, with the following attributes.
data : ndarray of shape (581012, 54)
Each row corresponds to the 54 features in the dataset.
target : ndarray of shape (581012,)
Each value corresponds to one of
the 7 forest covertypes with values
ranging between 1 to 7.
frame : dataframe of shape (581012, 55)
Only present when `as_frame=True`. Contains `data` and `target`.
DESCR : str
Description of the forest covertype dataset.
feature_names : list
The names of the dataset columns.
target_names: list
The names of the target columns.
(data, target) : tuple if ``return_X_y`` is True
A tuple of two ndarray. The first containing a 2D array of
shape (n_samples, n_features) with each row representing one
sample and each column representing the features. The second
ndarray of shape (n_samples,) containing the target samples.
.. versionadded:: 0.20
"""
data_home = get_data_home(data_home=data_home)
covtype_dir = join(data_home, "covertype")
samples_path = _pkl_filepath(covtype_dir, "samples")
targets_path = _pkl_filepath(covtype_dir, "targets")
available = exists(samples_path) and exists(targets_path)
if download_if_missing and not available:
os.makedirs(covtype_dir, exist_ok=True)
# Creating temp_dir as a direct subdirectory of the target directory
# guarantees that both reside on the same filesystem, so that we can use
# os.rename to atomically move the data files to their target location.
with TemporaryDirectory(dir=covtype_dir) as temp_dir:
logger.info(f"Downloading {ARCHIVE.url}")
archive_path = _fetch_remote(ARCHIVE, dirname=temp_dir)
Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",")
X = Xy[:, :-1]
y = Xy[:, -1].astype(np.int32, copy=False)
samples_tmp_path = _pkl_filepath(temp_dir, "samples")
joblib.dump(X, samples_tmp_path, compress=9)
os.rename(samples_tmp_path, samples_path)
targets_tmp_path = _pkl_filepath(temp_dir, "targets")
joblib.dump(y, targets_tmp_path, compress=9)
os.rename(targets_tmp_path, targets_path)
elif not available and not download_if_missing:
raise IOError("Data not found and `download_if_missing` is False")
try:
X, y
except NameError:
X = joblib.load(samples_path)
y = joblib.load(targets_path)
if shuffle:
ind = np.arange(X.shape[0])
rng = check_random_state(random_state)
rng.shuffle(ind)
X = X[ind]
y = y[ind]
fdescr = load_descr("covtype.rst")
frame = None
if as_frame:
frame, X, y = _convert_data_dataframe(
caller_name="fetch_covtype",
data=X,
target=y,
feature_names=FEATURE_NAMES,
target_names=TARGET_NAMES,
)
if return_X_y:
return X, y
return Bunch(
data=X,
target=y,
frame=frame,
target_names=TARGET_NAMES,
feature_names=FEATURE_NAMES,
DESCR=fdescr,
)