1932 lines
61 KiB
Python
1932 lines
61 KiB
Python
"""
|
|
Generate samples of synthetic data sets.
|
|
"""
|
|
|
|
# Authors: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel, O. Grisel,
|
|
# G. Louppe, J. Nothman
|
|
# License: BSD 3 clause
|
|
|
|
import numbers
|
|
import array
|
|
import warnings
|
|
from collections.abc import Iterable
|
|
|
|
import numpy as np
|
|
from scipy import linalg
|
|
import scipy.sparse as sp
|
|
|
|
from ..preprocessing import MultiLabelBinarizer
|
|
from ..utils import check_array, check_random_state
|
|
from ..utils import shuffle as util_shuffle
|
|
from ..utils.random import sample_without_replacement
|
|
|
|
|
|
def _generate_hypercube(samples, dimensions, rng):
|
|
"""Returns distinct binary samples of length dimensions."""
|
|
if dimensions > 30:
|
|
return np.hstack(
|
|
[
|
|
rng.randint(2, size=(samples, dimensions - 30)),
|
|
_generate_hypercube(samples, 30, rng),
|
|
]
|
|
)
|
|
out = sample_without_replacement(2**dimensions, samples, random_state=rng).astype(
|
|
dtype=">u4", copy=False
|
|
)
|
|
out = np.unpackbits(out.view(">u1")).reshape((-1, 32))[:, -dimensions:]
|
|
return out
|
|
|
|
|
|
def make_classification(
|
|
n_samples=100,
|
|
n_features=20,
|
|
*,
|
|
n_informative=2,
|
|
n_redundant=2,
|
|
n_repeated=0,
|
|
n_classes=2,
|
|
n_clusters_per_class=2,
|
|
weights=None,
|
|
flip_y=0.01,
|
|
class_sep=1.0,
|
|
hypercube=True,
|
|
shift=0.0,
|
|
scale=1.0,
|
|
shuffle=True,
|
|
random_state=None,
|
|
):
|
|
"""Generate a random n-class classification problem.
|
|
|
|
This initially creates clusters of points normally distributed (std=1)
|
|
about vertices of an ``n_informative``-dimensional hypercube with sides of
|
|
length ``2*class_sep`` and assigns an equal number of clusters to each
|
|
class. It introduces interdependence between these features and adds
|
|
various types of further noise to the data.
|
|
|
|
Without shuffling, ``X`` horizontally stacks features in the following
|
|
order: the primary ``n_informative`` features, followed by ``n_redundant``
|
|
linear combinations of the informative features, followed by ``n_repeated``
|
|
duplicates, drawn randomly with replacement from the informative and
|
|
redundant features. The remaining features are filled with random noise.
|
|
Thus, without shuffling, all useful features are contained in the columns
|
|
``X[:, :n_informative + n_redundant + n_repeated]``.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
n_features : int, default=20
|
|
The total number of features. These comprise ``n_informative``
|
|
informative features, ``n_redundant`` redundant features,
|
|
``n_repeated`` duplicated features and
|
|
``n_features-n_informative-n_redundant-n_repeated`` useless features
|
|
drawn at random.
|
|
|
|
n_informative : int, default=2
|
|
The number of informative features. Each class is composed of a number
|
|
of gaussian clusters each located around the vertices of a hypercube
|
|
in a subspace of dimension ``n_informative``. For each cluster,
|
|
informative features are drawn independently from N(0, 1) and then
|
|
randomly linearly combined within each cluster in order to add
|
|
covariance. The clusters are then placed on the vertices of the
|
|
hypercube.
|
|
|
|
n_redundant : int, default=2
|
|
The number of redundant features. These features are generated as
|
|
random linear combinations of the informative features.
|
|
|
|
n_repeated : int, default=0
|
|
The number of duplicated features, drawn randomly from the informative
|
|
and the redundant features.
|
|
|
|
n_classes : int, default=2
|
|
The number of classes (or labels) of the classification problem.
|
|
|
|
n_clusters_per_class : int, default=2
|
|
The number of clusters per class.
|
|
|
|
weights : array-like of shape (n_classes,) or (n_classes - 1,),\
|
|
default=None
|
|
The proportions of samples assigned to each class. If None, then
|
|
classes are balanced. Note that if ``len(weights) == n_classes - 1``,
|
|
then the last class weight is automatically inferred.
|
|
More than ``n_samples`` samples may be returned if the sum of
|
|
``weights`` exceeds 1. Note that the actual class proportions will
|
|
not exactly match ``weights`` when ``flip_y`` isn't 0.
|
|
|
|
flip_y : float, default=0.01
|
|
The fraction of samples whose class is assigned randomly. Larger
|
|
values introduce noise in the labels and make the classification
|
|
task harder. Note that the default setting flip_y > 0 might lead
|
|
to less than ``n_classes`` in y in some cases.
|
|
|
|
class_sep : float, default=1.0
|
|
The factor multiplying the hypercube size. Larger values spread
|
|
out the clusters/classes and make the classification task easier.
|
|
|
|
hypercube : bool, default=True
|
|
If True, the clusters are put on the vertices of a hypercube. If
|
|
False, the clusters are put on the vertices of a random polytope.
|
|
|
|
shift : float, ndarray of shape (n_features,) or None, default=0.0
|
|
Shift features by the specified value. If None, then features
|
|
are shifted by a random value drawn in [-class_sep, class_sep].
|
|
|
|
scale : float, ndarray of shape (n_features,) or None, default=1.0
|
|
Multiply features by the specified value. If None, then features
|
|
are scaled by a random value drawn in [1, 100]. Note that scaling
|
|
happens after shifting.
|
|
|
|
shuffle : bool, default=True
|
|
Shuffle the samples and the features.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The generated samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The integer labels for class membership of each sample.
|
|
|
|
See Also
|
|
--------
|
|
make_blobs : Simplified variant.
|
|
make_multilabel_classification : Unrelated generator for multilabel tasks.
|
|
|
|
Notes
|
|
-----
|
|
The algorithm is adapted from Guyon [1] and was designed to generate
|
|
the "Madelon" dataset.
|
|
|
|
References
|
|
----------
|
|
.. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable
|
|
selection benchmark", 2003.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
# Count features, clusters and samples
|
|
if n_informative + n_redundant + n_repeated > n_features:
|
|
raise ValueError(
|
|
"Number of informative, redundant and repeated "
|
|
"features must sum to less than the number of total"
|
|
" features"
|
|
)
|
|
# Use log2 to avoid overflow errors
|
|
if n_informative < np.log2(n_classes * n_clusters_per_class):
|
|
msg = "n_classes({}) * n_clusters_per_class({}) must be"
|
|
msg += " smaller or equal 2**n_informative({})={}"
|
|
raise ValueError(
|
|
msg.format(
|
|
n_classes, n_clusters_per_class, n_informative, 2**n_informative
|
|
)
|
|
)
|
|
|
|
if weights is not None:
|
|
if len(weights) not in [n_classes, n_classes - 1]:
|
|
raise ValueError(
|
|
"Weights specified but incompatible with number of classes."
|
|
)
|
|
if len(weights) == n_classes - 1:
|
|
if isinstance(weights, list):
|
|
weights = weights + [1.0 - sum(weights)]
|
|
else:
|
|
weights = np.resize(weights, n_classes)
|
|
weights[-1] = 1.0 - sum(weights[:-1])
|
|
else:
|
|
weights = [1.0 / n_classes] * n_classes
|
|
|
|
n_useless = n_features - n_informative - n_redundant - n_repeated
|
|
n_clusters = n_classes * n_clusters_per_class
|
|
|
|
# Distribute samples among clusters by weight
|
|
n_samples_per_cluster = [
|
|
int(n_samples * weights[k % n_classes] / n_clusters_per_class)
|
|
for k in range(n_clusters)
|
|
]
|
|
|
|
for i in range(n_samples - sum(n_samples_per_cluster)):
|
|
n_samples_per_cluster[i % n_clusters] += 1
|
|
|
|
# Initialize X and y
|
|
X = np.zeros((n_samples, n_features))
|
|
y = np.zeros(n_samples, dtype=int)
|
|
|
|
# Build the polytope whose vertices become cluster centroids
|
|
centroids = _generate_hypercube(n_clusters, n_informative, generator).astype(
|
|
float, copy=False
|
|
)
|
|
centroids *= 2 * class_sep
|
|
centroids -= class_sep
|
|
if not hypercube:
|
|
centroids *= generator.uniform(size=(n_clusters, 1))
|
|
centroids *= generator.uniform(size=(1, n_informative))
|
|
|
|
# Initially draw informative features from the standard normal
|
|
X[:, :n_informative] = generator.standard_normal(size=(n_samples, n_informative))
|
|
|
|
# Create each cluster; a variant of make_blobs
|
|
stop = 0
|
|
for k, centroid in enumerate(centroids):
|
|
start, stop = stop, stop + n_samples_per_cluster[k]
|
|
y[start:stop] = k % n_classes # assign labels
|
|
X_k = X[start:stop, :n_informative] # slice a view of the cluster
|
|
|
|
A = 2 * generator.uniform(size=(n_informative, n_informative)) - 1
|
|
X_k[...] = np.dot(X_k, A) # introduce random covariance
|
|
|
|
X_k += centroid # shift the cluster to a vertex
|
|
|
|
# Create redundant features
|
|
if n_redundant > 0:
|
|
B = 2 * generator.uniform(size=(n_informative, n_redundant)) - 1
|
|
X[:, n_informative : n_informative + n_redundant] = np.dot(
|
|
X[:, :n_informative], B
|
|
)
|
|
|
|
# Repeat some features
|
|
if n_repeated > 0:
|
|
n = n_informative + n_redundant
|
|
indices = ((n - 1) * generator.uniform(size=n_repeated) + 0.5).astype(np.intp)
|
|
X[:, n : n + n_repeated] = X[:, indices]
|
|
|
|
# Fill useless features
|
|
if n_useless > 0:
|
|
X[:, -n_useless:] = generator.standard_normal(size=(n_samples, n_useless))
|
|
|
|
# Randomly replace labels
|
|
if flip_y >= 0.0:
|
|
flip_mask = generator.uniform(size=n_samples) < flip_y
|
|
y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum())
|
|
|
|
# Randomly shift and scale
|
|
if shift is None:
|
|
shift = (2 * generator.uniform(size=n_features) - 1) * class_sep
|
|
X += shift
|
|
|
|
if scale is None:
|
|
scale = 1 + 100 * generator.uniform(size=n_features)
|
|
X *= scale
|
|
|
|
if shuffle:
|
|
# Randomly permute samples
|
|
X, y = util_shuffle(X, y, random_state=generator)
|
|
|
|
# Randomly permute features
|
|
indices = np.arange(n_features)
|
|
generator.shuffle(indices)
|
|
X[:, :] = X[:, indices]
|
|
|
|
return X, y
|
|
|
|
|
|
def make_multilabel_classification(
|
|
n_samples=100,
|
|
n_features=20,
|
|
*,
|
|
n_classes=5,
|
|
n_labels=2,
|
|
length=50,
|
|
allow_unlabeled=True,
|
|
sparse=False,
|
|
return_indicator="dense",
|
|
return_distributions=False,
|
|
random_state=None,
|
|
):
|
|
"""Generate a random multilabel classification problem.
|
|
|
|
For each sample, the generative process is:
|
|
- pick the number of labels: n ~ Poisson(n_labels)
|
|
- n times, choose a class c: c ~ Multinomial(theta)
|
|
- pick the document length: k ~ Poisson(length)
|
|
- k times, choose a word: w ~ Multinomial(theta_c)
|
|
|
|
In the above process, rejection sampling is used to make sure that
|
|
n is never zero or more than `n_classes`, and that the document length
|
|
is never zero. Likewise, we reject classes which have already been chosen.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
n_features : int, default=20
|
|
The total number of features.
|
|
|
|
n_classes : int, default=5
|
|
The number of classes of the classification problem.
|
|
|
|
n_labels : int, default=2
|
|
The average number of labels per instance. More precisely, the number
|
|
of labels per sample is drawn from a Poisson distribution with
|
|
``n_labels`` as its expected value, but samples are bounded (using
|
|
rejection sampling) by ``n_classes``, and must be nonzero if
|
|
``allow_unlabeled`` is False.
|
|
|
|
length : int, default=50
|
|
The sum of the features (number of words if documents) is drawn from
|
|
a Poisson distribution with this expected value.
|
|
|
|
allow_unlabeled : bool, default=True
|
|
If ``True``, some instances might not belong to any class.
|
|
|
|
sparse : bool, default=False
|
|
If ``True``, return a sparse feature matrix.
|
|
|
|
.. versionadded:: 0.17
|
|
parameter to allow *sparse* output.
|
|
|
|
return_indicator : {'dense', 'sparse'} or False, default='dense'
|
|
If ``'dense'`` return ``Y`` in the dense binary indicator format. If
|
|
``'sparse'`` return ``Y`` in the sparse binary indicator format.
|
|
``False`` returns a list of lists of labels.
|
|
|
|
return_distributions : bool, default=False
|
|
If ``True``, return the prior class probability and conditional
|
|
probabilities of features given classes, from which the data was
|
|
drawn.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The generated samples.
|
|
|
|
Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
|
|
The label sets. Sparse matrix should be of CSR format.
|
|
|
|
p_c : ndarray of shape (n_classes,)
|
|
The probability of each class being drawn. Only returned if
|
|
``return_distributions=True``.
|
|
|
|
p_w_c : ndarray of shape (n_features, n_classes)
|
|
The probability of each feature being drawn given each class.
|
|
Only returned if ``return_distributions=True``.
|
|
"""
|
|
if n_classes < 1:
|
|
raise ValueError(
|
|
"'n_classes' should be an integer greater than 0. Got {} instead.".format(
|
|
n_classes
|
|
)
|
|
)
|
|
if length < 1:
|
|
raise ValueError(
|
|
"'length' should be an integer greater than 0. Got {} instead.".format(
|
|
length
|
|
)
|
|
)
|
|
|
|
generator = check_random_state(random_state)
|
|
p_c = generator.uniform(size=n_classes)
|
|
p_c /= p_c.sum()
|
|
cumulative_p_c = np.cumsum(p_c)
|
|
p_w_c = generator.uniform(size=(n_features, n_classes))
|
|
p_w_c /= np.sum(p_w_c, axis=0)
|
|
|
|
def sample_example():
|
|
_, n_classes = p_w_c.shape
|
|
|
|
# pick a nonzero number of labels per document by rejection sampling
|
|
y_size = n_classes + 1
|
|
while (not allow_unlabeled and y_size == 0) or y_size > n_classes:
|
|
y_size = generator.poisson(n_labels)
|
|
|
|
# pick n classes
|
|
y = set()
|
|
while len(y) != y_size:
|
|
# pick a class with probability P(c)
|
|
c = np.searchsorted(cumulative_p_c, generator.uniform(size=y_size - len(y)))
|
|
y.update(c)
|
|
y = list(y)
|
|
|
|
# pick a non-zero document length by rejection sampling
|
|
n_words = 0
|
|
while n_words == 0:
|
|
n_words = generator.poisson(length)
|
|
|
|
# generate a document of length n_words
|
|
if len(y) == 0:
|
|
# if sample does not belong to any class, generate noise word
|
|
words = generator.randint(n_features, size=n_words)
|
|
return words, y
|
|
|
|
# sample words with replacement from selected classes
|
|
cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum()
|
|
cumulative_p_w_sample /= cumulative_p_w_sample[-1]
|
|
words = np.searchsorted(cumulative_p_w_sample, generator.uniform(size=n_words))
|
|
return words, y
|
|
|
|
X_indices = array.array("i")
|
|
X_indptr = array.array("i", [0])
|
|
Y = []
|
|
for i in range(n_samples):
|
|
words, y = sample_example()
|
|
X_indices.extend(words)
|
|
X_indptr.append(len(X_indices))
|
|
Y.append(y)
|
|
X_data = np.ones(len(X_indices), dtype=np.float64)
|
|
X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features))
|
|
X.sum_duplicates()
|
|
if not sparse:
|
|
X = X.toarray()
|
|
|
|
# return_indicator can be True due to backward compatibility
|
|
if return_indicator in (True, "sparse", "dense"):
|
|
lb = MultiLabelBinarizer(sparse_output=(return_indicator == "sparse"))
|
|
Y = lb.fit([range(n_classes)]).transform(Y)
|
|
elif return_indicator is not False:
|
|
raise ValueError("return_indicator must be either 'sparse', 'dense' or False.")
|
|
if return_distributions:
|
|
return X, Y, p_c, p_w_c
|
|
return X, Y
|
|
|
|
|
|
def make_hastie_10_2(n_samples=12000, *, random_state=None):
|
|
"""Generate data for binary classification used in Hastie et al. 2009, Example 10.2.
|
|
|
|
The ten features are standard independent Gaussian and
|
|
the target ``y`` is defined by::
|
|
|
|
y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=12000
|
|
The number of samples.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 10)
|
|
The input samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The output values.
|
|
|
|
See Also
|
|
--------
|
|
make_gaussian_quantiles : A generalization of this dataset approach.
|
|
|
|
References
|
|
----------
|
|
.. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
|
|
Learning Ed. 2", Springer, 2009.
|
|
"""
|
|
rs = check_random_state(random_state)
|
|
|
|
shape = (n_samples, 10)
|
|
X = rs.normal(size=shape).reshape(shape)
|
|
y = ((X**2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False)
|
|
y[y == 0.0] = -1.0
|
|
|
|
return X, y
|
|
|
|
|
|
def make_regression(
|
|
n_samples=100,
|
|
n_features=100,
|
|
*,
|
|
n_informative=10,
|
|
n_targets=1,
|
|
bias=0.0,
|
|
effective_rank=None,
|
|
tail_strength=0.5,
|
|
noise=0.0,
|
|
shuffle=True,
|
|
coef=False,
|
|
random_state=None,
|
|
):
|
|
"""Generate a random regression problem.
|
|
|
|
The input set can either be well conditioned (by default) or have a low
|
|
rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
|
|
more details.
|
|
|
|
The output is generated by applying a (potentially biased) random linear
|
|
regression model with `n_informative` nonzero regressors to the previously
|
|
generated input and some gaussian centered noise with some adjustable
|
|
scale.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
n_features : int, default=100
|
|
The number of features.
|
|
|
|
n_informative : int, default=10
|
|
The number of informative features, i.e., the number of features used
|
|
to build the linear model used to generate the output.
|
|
|
|
n_targets : int, default=1
|
|
The number of regression targets, i.e., the dimension of the y output
|
|
vector associated with a sample. By default, the output is a scalar.
|
|
|
|
bias : float, default=0.0
|
|
The bias term in the underlying linear model.
|
|
|
|
effective_rank : int, default=None
|
|
If not None:
|
|
The approximate number of singular vectors required to explain most
|
|
of the input data by linear combinations. Using this kind of
|
|
singular spectrum in the input allows the generator to reproduce
|
|
the correlations often observed in practice.
|
|
If None:
|
|
The input set is well conditioned, centered and gaussian with
|
|
unit variance.
|
|
|
|
tail_strength : float, default=0.5
|
|
The relative importance of the fat noisy tail of the singular values
|
|
profile if `effective_rank` is not None. When a float, it should be
|
|
between 0 and 1.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise applied to the output.
|
|
|
|
shuffle : bool, default=True
|
|
Shuffle the samples and the features.
|
|
|
|
coef : bool, default=False
|
|
If True, the coefficients of the underlying linear model are returned.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The input samples.
|
|
|
|
y : ndarray of shape (n_samples,) or (n_samples, n_targets)
|
|
The output values.
|
|
|
|
coef : ndarray of shape (n_features,) or (n_features, n_targets)
|
|
The coefficient of the underlying linear model. It is returned only if
|
|
coef is True.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_regression
|
|
>>> X, y = make_regression(n_samples=5, n_features=2, noise=1, random_state=42)
|
|
>>> X
|
|
array([[ 0.4967..., -0.1382... ],
|
|
[ 0.6476..., 1.523...],
|
|
[-0.2341..., -0.2341...],
|
|
[-0.4694..., 0.5425...],
|
|
[ 1.579..., 0.7674...]])
|
|
>>> y
|
|
array([ 6.737..., 37.79..., -10.27..., 0.4017..., 42.22...])
|
|
"""
|
|
n_informative = min(n_features, n_informative)
|
|
generator = check_random_state(random_state)
|
|
|
|
if effective_rank is None:
|
|
# Randomly generate a well conditioned input set
|
|
X = generator.standard_normal(size=(n_samples, n_features))
|
|
|
|
else:
|
|
# Randomly generate a low rank, fat tail input set
|
|
X = make_low_rank_matrix(
|
|
n_samples=n_samples,
|
|
n_features=n_features,
|
|
effective_rank=effective_rank,
|
|
tail_strength=tail_strength,
|
|
random_state=generator,
|
|
)
|
|
|
|
# Generate a ground truth model with only n_informative features being non
|
|
# zeros (the other features are not correlated to y and should be ignored
|
|
# by a sparsifying regularizers such as L1 or elastic net)
|
|
ground_truth = np.zeros((n_features, n_targets))
|
|
ground_truth[:n_informative, :] = 100 * generator.uniform(
|
|
size=(n_informative, n_targets)
|
|
)
|
|
|
|
y = np.dot(X, ground_truth) + bias
|
|
|
|
# Add noise
|
|
if noise > 0.0:
|
|
y += generator.normal(scale=noise, size=y.shape)
|
|
|
|
# Randomly permute samples and features
|
|
if shuffle:
|
|
X, y = util_shuffle(X, y, random_state=generator)
|
|
|
|
indices = np.arange(n_features)
|
|
generator.shuffle(indices)
|
|
X[:, :] = X[:, indices]
|
|
ground_truth = ground_truth[indices]
|
|
|
|
y = np.squeeze(y)
|
|
|
|
if coef:
|
|
return X, y, np.squeeze(ground_truth)
|
|
|
|
else:
|
|
return X, y
|
|
|
|
|
|
def make_circles(
|
|
n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=0.8
|
|
):
|
|
"""Make a large circle containing a smaller circle in 2d.
|
|
|
|
A simple toy dataset to visualize clustering and classification
|
|
algorithms.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int or tuple of shape (2,), dtype=int, default=100
|
|
If int, it is the total number of points generated.
|
|
For odd numbers, the inner circle will have one point more than the
|
|
outer circle.
|
|
If two-element tuple, number of points in outer circle and inner
|
|
circle.
|
|
|
|
.. versionchanged:: 0.23
|
|
Added two-element tuple.
|
|
|
|
shuffle : bool, default=True
|
|
Whether to shuffle the samples.
|
|
|
|
noise : float, default=None
|
|
Standard deviation of Gaussian noise added to the data.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset shuffling and noise.
|
|
Pass an int for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
factor : float, default=.8
|
|
Scale factor between inner and outer circle in the range `(0, 1)`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 2)
|
|
The generated samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The integer labels (0 or 1) for class membership of each sample.
|
|
"""
|
|
|
|
if factor >= 1 or factor < 0:
|
|
raise ValueError("'factor' has to be between 0 and 1.")
|
|
|
|
if isinstance(n_samples, numbers.Integral):
|
|
n_samples_out = n_samples // 2
|
|
n_samples_in = n_samples - n_samples_out
|
|
else:
|
|
try:
|
|
n_samples_out, n_samples_in = n_samples
|
|
except ValueError as e:
|
|
raise ValueError(
|
|
"`n_samples` can be either an int or a two-element tuple."
|
|
) from e
|
|
|
|
generator = check_random_state(random_state)
|
|
# so as not to have the first point = last point, we set endpoint=False
|
|
linspace_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False)
|
|
linspace_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False)
|
|
outer_circ_x = np.cos(linspace_out)
|
|
outer_circ_y = np.sin(linspace_out)
|
|
inner_circ_x = np.cos(linspace_in) * factor
|
|
inner_circ_y = np.sin(linspace_in) * factor
|
|
|
|
X = np.vstack(
|
|
[np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
|
|
).T
|
|
y = np.hstack(
|
|
[np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)]
|
|
)
|
|
if shuffle:
|
|
X, y = util_shuffle(X, y, random_state=generator)
|
|
|
|
if noise is not None:
|
|
X += generator.normal(scale=noise, size=X.shape)
|
|
|
|
return X, y
|
|
|
|
|
|
def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None):
|
|
"""Make two interleaving half circles.
|
|
|
|
A simple toy dataset to visualize clustering and classification
|
|
algorithms. Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int or tuple of shape (2,), dtype=int, default=100
|
|
If int, the total number of points generated.
|
|
If two-element tuple, number of points in each of two moons.
|
|
|
|
.. versionchanged:: 0.23
|
|
Added two-element tuple.
|
|
|
|
shuffle : bool, default=True
|
|
Whether to shuffle the samples.
|
|
|
|
noise : float, default=None
|
|
Standard deviation of Gaussian noise added to the data.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset shuffling and noise.
|
|
Pass an int for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 2)
|
|
The generated samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The integer labels (0 or 1) for class membership of each sample.
|
|
"""
|
|
|
|
if isinstance(n_samples, numbers.Integral):
|
|
n_samples_out = n_samples // 2
|
|
n_samples_in = n_samples - n_samples_out
|
|
else:
|
|
try:
|
|
n_samples_out, n_samples_in = n_samples
|
|
except ValueError as e:
|
|
raise ValueError(
|
|
"`n_samples` can be either an int or a two-element tuple."
|
|
) from e
|
|
|
|
generator = check_random_state(random_state)
|
|
|
|
outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))
|
|
outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))
|
|
inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))
|
|
inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5
|
|
|
|
X = np.vstack(
|
|
[np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
|
|
).T
|
|
y = np.hstack(
|
|
[np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)]
|
|
)
|
|
|
|
if shuffle:
|
|
X, y = util_shuffle(X, y, random_state=generator)
|
|
|
|
if noise is not None:
|
|
X += generator.normal(scale=noise, size=X.shape)
|
|
|
|
return X, y
|
|
|
|
|
|
def make_blobs(
|
|
n_samples=100,
|
|
n_features=2,
|
|
*,
|
|
centers=None,
|
|
cluster_std=1.0,
|
|
center_box=(-10.0, 10.0),
|
|
shuffle=True,
|
|
random_state=None,
|
|
return_centers=False,
|
|
):
|
|
"""Generate isotropic Gaussian blobs for clustering.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int or array-like, default=100
|
|
If int, it is the total number of points equally divided among
|
|
clusters.
|
|
If array-like, each element of the sequence indicates
|
|
the number of samples per cluster.
|
|
|
|
.. versionchanged:: v0.20
|
|
one can now pass an array-like to the ``n_samples`` parameter
|
|
|
|
n_features : int, default=2
|
|
The number of features for each sample.
|
|
|
|
centers : int or ndarray of shape (n_centers, n_features), default=None
|
|
The number of centers to generate, or the fixed center locations.
|
|
If n_samples is an int and centers is None, 3 centers are generated.
|
|
If n_samples is array-like, centers must be
|
|
either None or an array of length equal to the length of n_samples.
|
|
|
|
cluster_std : float or array-like of float, default=1.0
|
|
The standard deviation of the clusters.
|
|
|
|
center_box : tuple of float (min, max), default=(-10.0, 10.0)
|
|
The bounding box for each cluster center when centers are
|
|
generated at random.
|
|
|
|
shuffle : bool, default=True
|
|
Shuffle the samples.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
return_centers : bool, default=False
|
|
If True, then return the centers of each cluster.
|
|
|
|
.. versionadded:: 0.23
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The generated samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The integer labels for cluster membership of each sample.
|
|
|
|
centers : ndarray of shape (n_centers, n_features)
|
|
The centers of each cluster. Only returned if
|
|
``return_centers=True``.
|
|
|
|
See Also
|
|
--------
|
|
make_classification : A more intricate variant.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_blobs
|
|
>>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
|
|
... random_state=0)
|
|
>>> print(X.shape)
|
|
(10, 2)
|
|
>>> y
|
|
array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
|
|
>>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,
|
|
... random_state=0)
|
|
>>> print(X.shape)
|
|
(10, 2)
|
|
>>> y
|
|
array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
if isinstance(n_samples, numbers.Integral):
|
|
# Set n_centers by looking at centers arg
|
|
if centers is None:
|
|
centers = 3
|
|
|
|
if isinstance(centers, numbers.Integral):
|
|
n_centers = centers
|
|
centers = generator.uniform(
|
|
center_box[0], center_box[1], size=(n_centers, n_features)
|
|
)
|
|
|
|
else:
|
|
centers = check_array(centers)
|
|
n_features = centers.shape[1]
|
|
n_centers = centers.shape[0]
|
|
|
|
else:
|
|
# Set n_centers by looking at [n_samples] arg
|
|
n_centers = len(n_samples)
|
|
if centers is None:
|
|
centers = generator.uniform(
|
|
center_box[0], center_box[1], size=(n_centers, n_features)
|
|
)
|
|
try:
|
|
assert len(centers) == n_centers
|
|
except TypeError as e:
|
|
raise ValueError(
|
|
"Parameter `centers` must be array-like. Got {!r} instead".format(
|
|
centers
|
|
)
|
|
) from e
|
|
except AssertionError as e:
|
|
raise ValueError(
|
|
"Length of `n_samples` not consistent with number of "
|
|
f"centers. Got n_samples = {n_samples} and centers = {centers}"
|
|
) from e
|
|
else:
|
|
centers = check_array(centers)
|
|
n_features = centers.shape[1]
|
|
|
|
# stds: if cluster_std is given as list, it must be consistent
|
|
# with the n_centers
|
|
if hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers:
|
|
raise ValueError(
|
|
"Length of `clusters_std` not consistent with "
|
|
"number of centers. Got centers = {} "
|
|
"and cluster_std = {}".format(centers, cluster_std)
|
|
)
|
|
|
|
if isinstance(cluster_std, numbers.Real):
|
|
cluster_std = np.full(len(centers), cluster_std)
|
|
|
|
if isinstance(n_samples, Iterable):
|
|
n_samples_per_center = n_samples
|
|
else:
|
|
n_samples_per_center = [int(n_samples // n_centers)] * n_centers
|
|
|
|
for i in range(n_samples % n_centers):
|
|
n_samples_per_center[i] += 1
|
|
|
|
cum_sum_n_samples = np.cumsum(n_samples_per_center)
|
|
X = np.empty(shape=(sum(n_samples_per_center), n_features), dtype=np.float64)
|
|
y = np.empty(shape=(sum(n_samples_per_center),), dtype=int)
|
|
|
|
for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
|
|
start_idx = cum_sum_n_samples[i - 1] if i > 0 else 0
|
|
end_idx = cum_sum_n_samples[i]
|
|
X[start_idx:end_idx] = generator.normal(
|
|
loc=centers[i], scale=std, size=(n, n_features)
|
|
)
|
|
y[start_idx:end_idx] = i
|
|
|
|
if shuffle:
|
|
X, y = util_shuffle(X, y, random_state=generator)
|
|
|
|
if return_centers:
|
|
return X, y, centers
|
|
else:
|
|
return X, y
|
|
|
|
|
|
def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None):
|
|
"""Generate the "Friedman #1" regression problem.
|
|
|
|
This dataset is described in Friedman [1] and Breiman [2].
|
|
|
|
Inputs `X` are independent features uniformly distributed on the interval
|
|
[0, 1]. The output `y` is created according to the formula::
|
|
|
|
y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 \
|
|
+ 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).
|
|
|
|
Out of the `n_features` features, only 5 are actually used to compute
|
|
`y`. The remaining features are independent of `y`.
|
|
|
|
The number of features has to be >= 5.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
n_features : int, default=10
|
|
The number of features. Should be at least 5.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise applied to the output.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset noise. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The input samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The output values.
|
|
|
|
References
|
|
----------
|
|
.. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
|
|
of Statistics 19 (1), pages 1-67, 1991.
|
|
|
|
.. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
|
|
pages 123-140, 1996.
|
|
"""
|
|
if n_features < 5:
|
|
raise ValueError("n_features must be at least five.")
|
|
|
|
generator = check_random_state(random_state)
|
|
|
|
X = generator.uniform(size=(n_samples, n_features))
|
|
y = (
|
|
10 * np.sin(np.pi * X[:, 0] * X[:, 1])
|
|
+ 20 * (X[:, 2] - 0.5) ** 2
|
|
+ 10 * X[:, 3]
|
|
+ 5 * X[:, 4]
|
|
+ noise * generator.standard_normal(size=(n_samples))
|
|
)
|
|
|
|
return X, y
|
|
|
|
|
|
def make_friedman2(n_samples=100, *, noise=0.0, random_state=None):
|
|
"""Generate the "Friedman #2" regression problem.
|
|
|
|
This dataset is described in Friedman [1] and Breiman [2].
|
|
|
|
Inputs `X` are 4 independent features uniformly distributed on the
|
|
intervals::
|
|
|
|
0 <= X[:, 0] <= 100,
|
|
40 * pi <= X[:, 1] <= 560 * pi,
|
|
0 <= X[:, 2] <= 1,
|
|
1 <= X[:, 3] <= 11.
|
|
|
|
The output `y` is created according to the formula::
|
|
|
|
y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] \
|
|
- 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise applied to the output.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset noise. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 4)
|
|
The input samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The output values.
|
|
|
|
References
|
|
----------
|
|
.. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
|
|
of Statistics 19 (1), pages 1-67, 1991.
|
|
|
|
.. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
|
|
pages 123-140, 1996.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
X = generator.uniform(size=(n_samples, 4))
|
|
X[:, 0] *= 100
|
|
X[:, 1] *= 520 * np.pi
|
|
X[:, 1] += 40 * np.pi
|
|
X[:, 3] *= 10
|
|
X[:, 3] += 1
|
|
|
|
y = (
|
|
X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2
|
|
) ** 0.5 + noise * generator.standard_normal(size=(n_samples))
|
|
|
|
return X, y
|
|
|
|
|
|
def make_friedman3(n_samples=100, *, noise=0.0, random_state=None):
|
|
"""Generate the "Friedman #3" regression problem.
|
|
|
|
This dataset is described in Friedman [1] and Breiman [2].
|
|
|
|
Inputs `X` are 4 independent features uniformly distributed on the
|
|
intervals::
|
|
|
|
0 <= X[:, 0] <= 100,
|
|
40 * pi <= X[:, 1] <= 560 * pi,
|
|
0 <= X[:, 2] <= 1,
|
|
1 <= X[:, 3] <= 11.
|
|
|
|
The output `y` is created according to the formula::
|
|
|
|
y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) \
|
|
/ X[:, 0]) + noise * N(0, 1).
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise applied to the output.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset noise. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 4)
|
|
The input samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The output values.
|
|
|
|
References
|
|
----------
|
|
.. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
|
|
of Statistics 19 (1), pages 1-67, 1991.
|
|
|
|
.. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
|
|
pages 123-140, 1996.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
X = generator.uniform(size=(n_samples, 4))
|
|
X[:, 0] *= 100
|
|
X[:, 1] *= 520 * np.pi
|
|
X[:, 1] += 40 * np.pi
|
|
X[:, 3] *= 10
|
|
X[:, 3] += 1
|
|
|
|
y = np.arctan(
|
|
(X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]
|
|
) + noise * generator.standard_normal(size=(n_samples))
|
|
|
|
return X, y
|
|
|
|
|
|
def make_low_rank_matrix(
|
|
n_samples=100,
|
|
n_features=100,
|
|
*,
|
|
effective_rank=10,
|
|
tail_strength=0.5,
|
|
random_state=None,
|
|
):
|
|
"""Generate a mostly low rank matrix with bell-shaped singular values.
|
|
|
|
Most of the variance can be explained by a bell-shaped curve of width
|
|
effective_rank: the low rank part of the singular values profile is::
|
|
|
|
(1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)
|
|
|
|
The remaining singular values' tail is fat, decreasing as::
|
|
|
|
tail_strength * exp(-0.1 * i / effective_rank).
|
|
|
|
The low rank part of the profile can be considered the structured
|
|
signal part of the data while the tail can be considered the noisy
|
|
part of the data that cannot be summarized by a low number of linear
|
|
components (singular vectors).
|
|
|
|
This kind of singular profiles is often seen in practice, for instance:
|
|
- gray level pictures of faces
|
|
- TF-IDF vectors of text documents crawled from the web
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
n_features : int, default=100
|
|
The number of features.
|
|
|
|
effective_rank : int, default=10
|
|
The approximate number of singular vectors required to explain most of
|
|
the data by linear combinations.
|
|
|
|
tail_strength : float, default=0.5
|
|
The relative importance of the fat noisy tail of the singular values
|
|
profile. The value should be between 0 and 1.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The matrix.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
n = min(n_samples, n_features)
|
|
|
|
# Random (ortho normal) vectors
|
|
u, _ = linalg.qr(
|
|
generator.standard_normal(size=(n_samples, n)),
|
|
mode="economic",
|
|
check_finite=False,
|
|
)
|
|
v, _ = linalg.qr(
|
|
generator.standard_normal(size=(n_features, n)),
|
|
mode="economic",
|
|
check_finite=False,
|
|
)
|
|
|
|
# Index of the singular values
|
|
singular_ind = np.arange(n, dtype=np.float64)
|
|
|
|
# Build the singular profile by assembling signal and noise components
|
|
low_rank = (1 - tail_strength) * np.exp(-1.0 * (singular_ind / effective_rank) ** 2)
|
|
tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank)
|
|
s = np.identity(n) * (low_rank + tail)
|
|
|
|
return np.dot(np.dot(u, s), v.T)
|
|
|
|
|
|
# TODO(1.3): Change argument `data_transposed` default from True to False.
|
|
# TODO(1.3): Deprecate data_transposed, always return data not transposed.
|
|
def make_sparse_coded_signal(
|
|
n_samples,
|
|
*,
|
|
n_components,
|
|
n_features,
|
|
n_nonzero_coefs,
|
|
random_state=None,
|
|
data_transposed="warn",
|
|
):
|
|
"""Generate a signal as a sparse combination of dictionary elements.
|
|
|
|
Returns a matrix Y = DX, such that D is (n_features, n_components),
|
|
X is (n_components, n_samples) and each column of X has exactly
|
|
n_nonzero_coefs non-zero elements.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int
|
|
Number of samples to generate.
|
|
|
|
n_components : int
|
|
Number of components in the dictionary.
|
|
|
|
n_features : int
|
|
Number of features of the dataset to generate.
|
|
|
|
n_nonzero_coefs : int
|
|
Number of active (non-zero) coefficients in each sample.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
data_transposed : bool, default=True
|
|
By default, Y, D and X are transposed.
|
|
|
|
.. versionadded:: 1.1
|
|
|
|
Returns
|
|
-------
|
|
data : ndarray of shape (n_features, n_samples) or (n_samples, n_features)
|
|
The encoded signal (Y). The shape is `(n_samples, n_features)` if
|
|
`data_transposed` is False, otherwise it's `(n_features, n_samples)`.
|
|
|
|
dictionary : ndarray of shape (n_features, n_components) or \
|
|
(n_components, n_features)
|
|
The dictionary with normalized components (D). The shape is
|
|
`(n_components, n_features)` if `data_transposed` is False, otherwise it's
|
|
`(n_features, n_components)`.
|
|
|
|
code : ndarray of shape (n_components, n_samples) or (n_samples, n_components)
|
|
The sparse code such that each column of this matrix has exactly
|
|
n_nonzero_coefs non-zero items (X). The shape is `(n_samples, n_components)`
|
|
if `data_transposed` is False, otherwise it's `(n_components, n_samples)`.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
# generate dictionary
|
|
D = generator.standard_normal(size=(n_features, n_components))
|
|
D /= np.sqrt(np.sum((D**2), axis=0))
|
|
|
|
# generate code
|
|
X = np.zeros((n_components, n_samples))
|
|
for i in range(n_samples):
|
|
idx = np.arange(n_components)
|
|
generator.shuffle(idx)
|
|
idx = idx[:n_nonzero_coefs]
|
|
X[idx, i] = generator.standard_normal(size=n_nonzero_coefs)
|
|
|
|
# encode signal
|
|
Y = np.dot(D, X)
|
|
|
|
# raise warning if data_transposed is not passed explicitly
|
|
if data_transposed == "warn":
|
|
data_transposed = True
|
|
warnings.warn(
|
|
"The default value of data_transposed will change from True to False in"
|
|
" version 1.3",
|
|
FutureWarning,
|
|
)
|
|
|
|
# transpose if needed
|
|
if not data_transposed:
|
|
Y, D, X = Y.T, D.T, X.T
|
|
|
|
return map(np.squeeze, (Y, D, X))
|
|
|
|
|
|
def make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None):
|
|
"""Generate a random regression problem with sparse uncorrelated design.
|
|
|
|
This dataset is described in Celeux et al [1]. as::
|
|
|
|
X ~ N(0, 1)
|
|
y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]
|
|
|
|
Only the first 4 features are informative. The remaining features are
|
|
useless.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of samples.
|
|
|
|
n_features : int, default=10
|
|
The number of features.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The input samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The output values.
|
|
|
|
References
|
|
----------
|
|
.. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,
|
|
"Regularization in regression: comparing Bayesian and frequentist
|
|
methods in a poorly informative situation", 2009.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
X = generator.normal(loc=0, scale=1, size=(n_samples, n_features))
|
|
y = generator.normal(
|
|
loc=(X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]),
|
|
scale=np.ones(n_samples),
|
|
)
|
|
|
|
return X, y
|
|
|
|
|
|
def make_spd_matrix(n_dim, *, random_state=None):
|
|
"""Generate a random symmetric, positive-definite matrix.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_dim : int
|
|
The matrix dimension.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_dim, n_dim)
|
|
The random symmetric, positive-definite matrix.
|
|
|
|
See Also
|
|
--------
|
|
make_sparse_spd_matrix: Generate a sparse symmetric definite positive matrix.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
A = generator.uniform(size=(n_dim, n_dim))
|
|
U, _, Vt = linalg.svd(np.dot(A.T, A), check_finite=False)
|
|
X = np.dot(np.dot(U, 1.0 + np.diag(generator.uniform(size=n_dim))), Vt)
|
|
|
|
return X
|
|
|
|
|
|
def make_sparse_spd_matrix(
|
|
dim=1,
|
|
*,
|
|
alpha=0.95,
|
|
norm_diag=False,
|
|
smallest_coef=0.1,
|
|
largest_coef=0.9,
|
|
random_state=None,
|
|
):
|
|
"""Generate a sparse symmetric definite positive matrix.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
dim : int, default=1
|
|
The size of the random matrix to generate.
|
|
|
|
alpha : float, default=0.95
|
|
The probability that a coefficient is zero (see notes). Larger values
|
|
enforce more sparsity. The value should be in the range 0 and 1.
|
|
|
|
norm_diag : bool, default=False
|
|
Whether to normalize the output matrix to make the leading diagonal
|
|
elements all 1.
|
|
|
|
smallest_coef : float, default=0.1
|
|
The value of the smallest coefficient between 0 and 1.
|
|
|
|
largest_coef : float, default=0.9
|
|
The value of the largest coefficient between 0 and 1.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
prec : sparse matrix of shape (dim, dim)
|
|
The generated matrix.
|
|
|
|
See Also
|
|
--------
|
|
make_spd_matrix : Generate a random symmetric, positive-definite matrix.
|
|
|
|
Notes
|
|
-----
|
|
The sparsity is actually imposed on the cholesky factor of the matrix.
|
|
Thus alpha does not translate directly into the filling fraction of
|
|
the matrix itself.
|
|
"""
|
|
random_state = check_random_state(random_state)
|
|
|
|
chol = -np.eye(dim)
|
|
aux = random_state.uniform(size=(dim, dim))
|
|
aux[aux < alpha] = 0
|
|
aux[aux > alpha] = smallest_coef + (
|
|
largest_coef - smallest_coef
|
|
) * random_state.uniform(size=np.sum(aux > alpha))
|
|
aux = np.tril(aux, k=-1)
|
|
|
|
# Permute the lines: we don't want to have asymmetries in the final
|
|
# SPD matrix
|
|
permutation = random_state.permutation(dim)
|
|
aux = aux[permutation].T[permutation]
|
|
chol += aux
|
|
prec = np.dot(chol.T, chol)
|
|
|
|
if norm_diag:
|
|
# Form the diagonal vector into a row matrix
|
|
d = np.diag(prec).reshape(1, prec.shape[0])
|
|
d = 1.0 / np.sqrt(d)
|
|
|
|
prec *= d
|
|
prec *= d.T
|
|
|
|
return prec
|
|
|
|
|
|
def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None, hole=False):
|
|
"""Generate a swiss roll dataset.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of sample points on the Swiss Roll.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
hole : bool, default=False
|
|
If True generates the swiss roll with hole dataset.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 3)
|
|
The points.
|
|
|
|
t : ndarray of shape (n_samples,)
|
|
The univariate position of the sample according to the main dimension
|
|
of the points in the manifold.
|
|
|
|
Notes
|
|
-----
|
|
The algorithm is from Marsland [1].
|
|
|
|
References
|
|
----------
|
|
.. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective", 2nd edition,
|
|
Chapter 6, 2014.
|
|
https://homepages.ecs.vuw.ac.nz/~marslast/Code/Ch6/lle.py
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
if not hole:
|
|
t = 1.5 * np.pi * (1 + 2 * generator.uniform(size=n_samples))
|
|
y = 21 * generator.uniform(size=n_samples)
|
|
else:
|
|
corners = np.array(
|
|
[[np.pi * (1.5 + i), j * 7] for i in range(3) for j in range(3)]
|
|
)
|
|
corners = np.delete(corners, 4, axis=0)
|
|
corner_index = generator.choice(8, n_samples)
|
|
parameters = generator.uniform(size=(2, n_samples)) * np.array([[np.pi], [7]])
|
|
t, y = corners[corner_index].T + parameters
|
|
|
|
x = t * np.cos(t)
|
|
z = t * np.sin(t)
|
|
|
|
X = np.vstack((x, y, z))
|
|
X += noise * generator.standard_normal(size=(3, n_samples))
|
|
X = X.T
|
|
t = np.squeeze(t)
|
|
|
|
return X, t
|
|
|
|
|
|
def make_s_curve(n_samples=100, *, noise=0.0, random_state=None):
|
|
"""Generate an S curve dataset.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int, default=100
|
|
The number of sample points on the S curve.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, 3)
|
|
The points.
|
|
|
|
t : ndarray of shape (n_samples,)
|
|
The univariate position of the sample according to the main dimension
|
|
of the points in the manifold.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
t = 3 * np.pi * (generator.uniform(size=(1, n_samples)) - 0.5)
|
|
X = np.empty(shape=(n_samples, 3), dtype=np.float64)
|
|
X[:, 0] = np.sin(t)
|
|
X[:, 1] = 2.0 * generator.uniform(size=n_samples)
|
|
X[:, 2] = np.sign(t) * (np.cos(t) - 1)
|
|
X += noise * generator.standard_normal(size=(3, n_samples)).T
|
|
t = np.squeeze(t)
|
|
|
|
return X, t
|
|
|
|
|
|
def make_gaussian_quantiles(
|
|
*,
|
|
mean=None,
|
|
cov=1.0,
|
|
n_samples=100,
|
|
n_features=2,
|
|
n_classes=3,
|
|
shuffle=True,
|
|
random_state=None,
|
|
):
|
|
r"""Generate isotropic Gaussian and label samples by quantile.
|
|
|
|
This classification dataset is constructed by taking a multi-dimensional
|
|
standard normal distribution and defining classes separated by nested
|
|
concentric multi-dimensional spheres such that roughly equal numbers of
|
|
samples are in each class (quantiles of the :math:`\chi^2` distribution).
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
mean : ndarray of shape (n_features,), default=None
|
|
The mean of the multi-dimensional normal distribution.
|
|
If None then use the origin (0, 0, ...).
|
|
|
|
cov : float, default=1.0
|
|
The covariance matrix will be this value times the unit matrix. This
|
|
dataset only produces symmetric normal distributions.
|
|
|
|
n_samples : int, default=100
|
|
The total number of points equally divided among classes.
|
|
|
|
n_features : int, default=2
|
|
The number of features for each sample.
|
|
|
|
n_classes : int, default=3
|
|
The number of classes.
|
|
|
|
shuffle : bool, default=True
|
|
Shuffle the samples.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_features)
|
|
The generated samples.
|
|
|
|
y : ndarray of shape (n_samples,)
|
|
The integer labels for quantile membership of each sample.
|
|
|
|
Notes
|
|
-----
|
|
The dataset is from Zhu et al [1].
|
|
|
|
References
|
|
----------
|
|
.. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
|
|
"""
|
|
if n_samples < n_classes:
|
|
raise ValueError("n_samples must be at least n_classes")
|
|
|
|
generator = check_random_state(random_state)
|
|
|
|
if mean is None:
|
|
mean = np.zeros(n_features)
|
|
else:
|
|
mean = np.array(mean)
|
|
|
|
# Build multivariate normal distribution
|
|
X = generator.multivariate_normal(mean, cov * np.identity(n_features), (n_samples,))
|
|
|
|
# Sort by distance from origin
|
|
idx = np.argsort(np.sum((X - mean[np.newaxis, :]) ** 2, axis=1))
|
|
X = X[idx, :]
|
|
|
|
# Label by quantile
|
|
step = n_samples // n_classes
|
|
|
|
y = np.hstack(
|
|
[
|
|
np.repeat(np.arange(n_classes), step),
|
|
np.repeat(n_classes - 1, n_samples - step * n_classes),
|
|
]
|
|
)
|
|
|
|
if shuffle:
|
|
X, y = util_shuffle(X, y, random_state=generator)
|
|
|
|
return X, y
|
|
|
|
|
|
def _shuffle(data, random_state=None):
|
|
generator = check_random_state(random_state)
|
|
n_rows, n_cols = data.shape
|
|
row_idx = generator.permutation(n_rows)
|
|
col_idx = generator.permutation(n_cols)
|
|
result = data[row_idx][:, col_idx]
|
|
return result, row_idx, col_idx
|
|
|
|
|
|
def make_biclusters(
|
|
shape,
|
|
n_clusters,
|
|
*,
|
|
noise=0.0,
|
|
minval=10,
|
|
maxval=100,
|
|
shuffle=True,
|
|
random_state=None,
|
|
):
|
|
"""Generate a constant block diagonal structure array for biclustering.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
shape : iterable of shape (n_rows, n_cols)
|
|
The shape of the result.
|
|
|
|
n_clusters : int
|
|
The number of biclusters.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise.
|
|
|
|
minval : int, default=10
|
|
Minimum value of a bicluster.
|
|
|
|
maxval : int, default=100
|
|
Maximum value of a bicluster.
|
|
|
|
shuffle : bool, default=True
|
|
Shuffle the samples.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape `shape`
|
|
The generated array.
|
|
|
|
rows : ndarray of shape (n_clusters, X.shape[0])
|
|
The indicators for cluster membership of each row.
|
|
|
|
cols : ndarray of shape (n_clusters, X.shape[1])
|
|
The indicators for cluster membership of each column.
|
|
|
|
See Also
|
|
--------
|
|
make_checkerboard: Generate an array with block checkerboard structure for
|
|
biclustering.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] Dhillon, I. S. (2001, August). Co-clustering documents and
|
|
words using bipartite spectral graph partitioning. In Proceedings
|
|
of the seventh ACM SIGKDD international conference on Knowledge
|
|
discovery and data mining (pp. 269-274). ACM.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
n_rows, n_cols = shape
|
|
consts = generator.uniform(minval, maxval, n_clusters)
|
|
|
|
# row and column clusters of approximately equal sizes
|
|
row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_clusters, n_clusters))
|
|
col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_clusters, n_clusters))
|
|
|
|
row_labels = np.hstack(
|
|
[np.repeat(val, rep) for val, rep in zip(range(n_clusters), row_sizes)]
|
|
)
|
|
col_labels = np.hstack(
|
|
[np.repeat(val, rep) for val, rep in zip(range(n_clusters), col_sizes)]
|
|
)
|
|
|
|
result = np.zeros(shape, dtype=np.float64)
|
|
for i in range(n_clusters):
|
|
selector = np.outer(row_labels == i, col_labels == i)
|
|
result[selector] += consts[i]
|
|
|
|
if noise > 0:
|
|
result += generator.normal(scale=noise, size=result.shape)
|
|
|
|
if shuffle:
|
|
result, row_idx, col_idx = _shuffle(result, random_state)
|
|
row_labels = row_labels[row_idx]
|
|
col_labels = col_labels[col_idx]
|
|
|
|
rows = np.vstack([row_labels == c for c in range(n_clusters)])
|
|
cols = np.vstack([col_labels == c for c in range(n_clusters)])
|
|
|
|
return result, rows, cols
|
|
|
|
|
|
def make_checkerboard(
|
|
shape,
|
|
n_clusters,
|
|
*,
|
|
noise=0.0,
|
|
minval=10,
|
|
maxval=100,
|
|
shuffle=True,
|
|
random_state=None,
|
|
):
|
|
"""Generate an array with block checkerboard structure for biclustering.
|
|
|
|
Read more in the :ref:`User Guide <sample_generators>`.
|
|
|
|
Parameters
|
|
----------
|
|
shape : tuple of shape (n_rows, n_cols)
|
|
The shape of the result.
|
|
|
|
n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters)
|
|
The number of row and column clusters.
|
|
|
|
noise : float, default=0.0
|
|
The standard deviation of the gaussian noise.
|
|
|
|
minval : int, default=10
|
|
Minimum value of a bicluster.
|
|
|
|
maxval : int, default=100
|
|
Maximum value of a bicluster.
|
|
|
|
shuffle : bool, default=True
|
|
Shuffle the samples.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines random number generation for dataset creation. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape `shape`
|
|
The generated array.
|
|
|
|
rows : ndarray of shape (n_clusters, X.shape[0])
|
|
The indicators for cluster membership of each row.
|
|
|
|
cols : ndarray of shape (n_clusters, X.shape[1])
|
|
The indicators for cluster membership of each column.
|
|
|
|
See Also
|
|
--------
|
|
make_biclusters : Generate an array with constant block diagonal structure
|
|
for biclustering.
|
|
|
|
References
|
|
----------
|
|
.. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).
|
|
Spectral biclustering of microarray data: coclustering genes
|
|
and conditions. Genome research, 13(4), 703-716.
|
|
"""
|
|
generator = check_random_state(random_state)
|
|
|
|
if hasattr(n_clusters, "__len__"):
|
|
n_row_clusters, n_col_clusters = n_clusters
|
|
else:
|
|
n_row_clusters = n_col_clusters = n_clusters
|
|
|
|
# row and column clusters of approximately equal sizes
|
|
n_rows, n_cols = shape
|
|
row_sizes = generator.multinomial(
|
|
n_rows, np.repeat(1.0 / n_row_clusters, n_row_clusters)
|
|
)
|
|
col_sizes = generator.multinomial(
|
|
n_cols, np.repeat(1.0 / n_col_clusters, n_col_clusters)
|
|
)
|
|
|
|
row_labels = np.hstack(
|
|
[np.repeat(val, rep) for val, rep in zip(range(n_row_clusters), row_sizes)]
|
|
)
|
|
col_labels = np.hstack(
|
|
[np.repeat(val, rep) for val, rep in zip(range(n_col_clusters), col_sizes)]
|
|
)
|
|
|
|
result = np.zeros(shape, dtype=np.float64)
|
|
for i in range(n_row_clusters):
|
|
for j in range(n_col_clusters):
|
|
selector = np.outer(row_labels == i, col_labels == j)
|
|
result[selector] += generator.uniform(minval, maxval)
|
|
|
|
if noise > 0:
|
|
result += generator.normal(scale=noise, size=result.shape)
|
|
|
|
if shuffle:
|
|
result, row_idx, col_idx = _shuffle(result, random_state)
|
|
row_labels = row_labels[row_idx]
|
|
col_labels = col_labels[col_idx]
|
|
|
|
rows = np.vstack(
|
|
[
|
|
row_labels == label
|
|
for label in range(n_row_clusters)
|
|
for _ in range(n_col_clusters)
|
|
]
|
|
)
|
|
cols = np.vstack(
|
|
[
|
|
col_labels == label
|
|
for _ in range(n_row_clusters)
|
|
for label in range(n_col_clusters)
|
|
]
|
|
)
|
|
|
|
return result, rows, cols
|