3RNN/Lib/site-packages/sklearn/datasets/_california_housing.py

249 lines
7.1 KiB
Python
Raw Permalink Normal View History

2024-05-26 19:49:15 +02:00
"""California housing dataset.
The original database is available from StatLib
http://lib.stat.cmu.edu/datasets/
The data contains 20,640 observations on 9 variables.
This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.
References
----------
Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.
"""
# Authors: Peter Prettenhofer
# License: BSD 3 clause
import logging
import tarfile
from numbers import Integral, Real
from os import PathLike, makedirs, remove
from os.path import exists
import joblib
import numpy as np
from ..utils import Bunch
from ..utils._param_validation import Interval, validate_params
from . import get_data_home
from ._base import (
RemoteFileMetadata,
_convert_data_dataframe,
_fetch_remote,
_pkl_filepath,
load_descr,
)
# The original data can be found at:
# https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz
ARCHIVE = RemoteFileMetadata(
filename="cal_housing.tgz",
url="https://ndownloader.figshare.com/files/5976036",
checksum="aaa5c9a6afe2225cc2aed2723682ae403280c4a3695a2ddda4ffb5d8215ea681",
)
logger = logging.getLogger(__name__)
@validate_params(
{
"data_home": [str, PathLike, None],
"download_if_missing": ["boolean"],
"return_X_y": ["boolean"],
"as_frame": ["boolean"],
"n_retries": [Interval(Integral, 1, None, closed="left")],
"delay": [Interval(Real, 0.0, None, closed="neither")],
},
prefer_skip_nested_validation=True,
)
def fetch_california_housing(
*,
data_home=None,
download_if_missing=True,
return_X_y=False,
as_frame=False,
n_retries=3,
delay=1.0,
):
"""Load the California housing dataset (regression).
============== ==============
Samples total 20640
Dimensionality 8
Features real
Target real 0.15 - 5.
============== ==============
Read more in the :ref:`User Guide <california_housing_dataset>`.
Parameters
----------
data_home : str or path-like, default=None
Specify another download and cache folder for the datasets. By default
all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
download_if_missing : bool, default=True
If False, raise an OSError if the data is not locally available
instead of trying to download the data from the source site.
return_X_y : bool, default=False
If True, returns ``(data.data, data.target)`` instead of a Bunch
object.
.. versionadded:: 0.20
as_frame : bool, default=False
If True, the data is a pandas DataFrame including columns with
appropriate dtypes (numeric, string or categorical). The target is
a pandas DataFrame or Series depending on the number of target_columns.
.. versionadded:: 0.23
n_retries : int, default=3
Number of retries when HTTP errors are encountered.
.. versionadded:: 1.5
delay : float, default=1.0
Number of seconds between retries.
.. versionadded:: 1.5
Returns
-------
dataset : :class:`~sklearn.utils.Bunch`
Dictionary-like object, with the following attributes.
data : ndarray, shape (20640, 8)
Each row corresponding to the 8 feature values in order.
If ``as_frame`` is True, ``data`` is a pandas object.
target : numpy array of shape (20640,)
Each value corresponds to the average
house value in units of 100,000.
If ``as_frame`` is True, ``target`` is a pandas object.
feature_names : list of length 8
Array of ordered feature names used in the dataset.
DESCR : str
Description of the California housing dataset.
frame : pandas DataFrame
Only present when `as_frame=True`. DataFrame with ``data`` and
``target``.
.. versionadded:: 0.23
(data, target) : tuple if ``return_X_y`` is True
A tuple of two ndarray. The first containing a 2D array of
shape (n_samples, n_features) with each row representing one
sample and each column representing the features. The second
ndarray of shape (n_samples,) containing the target samples.
.. versionadded:: 0.20
Notes
-----
This dataset consists of 20,640 samples and 9 features.
Examples
--------
>>> from sklearn.datasets import fetch_california_housing
>>> housing = fetch_california_housing()
>>> print(housing.data.shape, housing.target.shape)
(20640, 8) (20640,)
>>> print(housing.feature_names[0:6])
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
"""
data_home = get_data_home(data_home=data_home)
if not exists(data_home):
makedirs(data_home)
filepath = _pkl_filepath(data_home, "cal_housing.pkz")
if not exists(filepath):
if not download_if_missing:
raise OSError("Data not found and `download_if_missing` is False")
logger.info(
"Downloading Cal. housing from {} to {}".format(ARCHIVE.url, data_home)
)
archive_path = _fetch_remote(
ARCHIVE,
dirname=data_home,
n_retries=n_retries,
delay=delay,
)
with tarfile.open(mode="r:gz", name=archive_path) as f:
cal_housing = np.loadtxt(
f.extractfile("CaliforniaHousing/cal_housing.data"), delimiter=","
)
# Columns are not in the same order compared to the previous
# URL resource on lib.stat.cmu.edu
columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
cal_housing = cal_housing[:, columns_index]
joblib.dump(cal_housing, filepath, compress=6)
remove(archive_path)
else:
cal_housing = joblib.load(filepath)
feature_names = [
"MedInc",
"HouseAge",
"AveRooms",
"AveBedrms",
"Population",
"AveOccup",
"Latitude",
"Longitude",
]
target, data = cal_housing[:, 0], cal_housing[:, 1:]
# avg rooms = total rooms / households
data[:, 2] /= data[:, 5]
# avg bed rooms = total bed rooms / households
data[:, 3] /= data[:, 5]
# avg occupancy = population / households
data[:, 5] = data[:, 4] / data[:, 5]
# target in units of 100,000
target = target / 100000.0
descr = load_descr("california_housing.rst")
X = data
y = target
frame = None
target_names = [
"MedHouseVal",
]
if as_frame:
frame, X, y = _convert_data_dataframe(
"fetch_california_housing", data, target, feature_names, target_names
)
if return_X_y:
return X, y
return Bunch(
data=X,
target=y,
frame=frame,
target_names=target_names,
feature_names=feature_names,
DESCR=descr,
)