from collections import defaultdict
from functools import partial

import numpy as np
import pytest
import scipy.sparse as sp

from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.datasets import make_classification
from sklearn.datasets import make_multilabel_classification
from sklearn.datasets import make_hastie_10_2
from sklearn.datasets import make_regression
from sklearn.datasets import make_blobs
from sklearn.datasets import make_friedman1
from sklearn.datasets import make_friedman2
from sklearn.datasets import make_friedman3
from sklearn.datasets import make_low_rank_matrix
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles
from sklearn.datasets import make_sparse_coded_signal
from sklearn.datasets import make_sparse_uncorrelated
from sklearn.datasets import make_spd_matrix
from sklearn.datasets import make_swiss_roll
from sklearn.datasets import make_s_curve
from sklearn.datasets import make_biclusters
from sklearn.datasets import make_checkerboard
from sklearn.utils.validation import assert_all_finite


def test_make_classification():
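    # Check output shapes and that `weights` fixes the class proportions
    # (10%/25%, remainder in the last class) without being modified in place.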
weights = [0.1, 0.25]
X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
n_redundant=1, n_repeated=1, n_classes=3,
n_clusters_per_class=1, hypercube=False,
shift=None, scale=None, weights=weights,
random_state=0)
assert weights == [0.1, 0.25]
assert X.shape == (100, 20), "X shape mismatch"
assert y.shape == (100,), "y shape mismatch"
assert np.unique(y).shape == (3,), "Unexpected number of classes"
assert sum(y == 0) == 10, "Unexpected number of samples in class #0"
assert sum(y == 1) == 25, "Unexpected number of samples in class #1"
assert sum(y == 2) == 65, "Unexpected number of samples in class #2"
# Test for n_features > 30
X, y = make_classification(n_samples=2000, n_features=31, n_informative=31,
n_redundant=0, n_repeated=0, hypercube=True,
scale=0.5, random_state=0)
assert X.shape == (2000, 31), "X shape mismatch"
assert y.shape == (2000,), "y shape mismatch"
assert (np.unique(X.view([('', X.dtype)]*X.shape[1])).view(X.dtype)
.reshape(-1, X.shape[1]).shape[0] == 2000), (
"Unexpected number of unique rows")


def test_make_classification_informative_features():
"""Test the construction of informative features in make_classification

    Also tests `n_clusters_per_class`, `n_classes`, `hypercube` and
fully-specified `weights`.
"""
# Create very separate clusters; check that vertices are unique and
# correspond to classes
class_sep = 1e6
make = partial(make_classification, class_sep=class_sep, n_redundant=0,
n_repeated=0, flip_y=0, shift=0, scale=1, shuffle=False)
for n_informative, weights, n_clusters_per_class in [(2, [1], 1),
(2, [1/3] * 3, 1),
(2, [1/4] * 4, 1),
(2, [1/2] * 2, 2),
(2, [3/4, 1/4], 2),
(10, [1/3] * 3, 10),
(int(64), [1], 1)
]:
n_classes = len(weights)
n_clusters = n_classes * n_clusters_per_class
n_samples = n_clusters * 50
for hypercube in (False, True):
X, y = make(n_samples=n_samples, n_classes=n_classes,
weights=weights, n_features=n_informative,
n_informative=n_informative,
n_clusters_per_class=n_clusters_per_class,
hypercube=hypercube, random_state=0)
assert X.shape == (n_samples, n_informative)
assert y.shape == (n_samples,)
# Cluster by sign, viewed as strings to allow uniquing
signs = np.sign(X)
signs = signs.view(dtype='|S{0}'.format(signs.strides[0]))
unique_signs, cluster_index = np.unique(signs,
return_inverse=True)
assert len(unique_signs) == n_clusters, (
"Wrong number of clusters, or not in distinct quadrants")
clusters_by_class = defaultdict(set)
for cluster, cls in zip(cluster_index, y):
clusters_by_class[cls].add(cluster)
for clusters in clusters_by_class.values():
assert len(clusters) == n_clusters_per_class, (
"Wrong number of clusters per class")
assert (len(clusters_by_class) == n_classes), (
"Wrong number of classes")
assert_array_almost_equal(np.bincount(y) / len(y) // weights,
[1] * n_classes,
err_msg="Wrong number of samples "
"per class")
# Ensure on vertices of hypercube
for cluster in range(len(unique_signs)):
centroid = X[cluster_index == cluster].mean(axis=0)
if hypercube:
assert_array_almost_equal(np.abs(centroid) / class_sep,
np.ones(n_informative),
decimal=5,
err_msg="Clusters are not "
"centered on hypercube "
"vertices")
else:
with pytest.raises(AssertionError):
assert_array_almost_equal(np.abs(centroid) / class_sep,
np.ones(n_informative),
decimal=5,
err_msg="Clusters should "
"not be centered "
"on hypercube "
"vertices")
with pytest.raises(ValueError):
make(n_features=2, n_informative=2, n_classes=5,
n_clusters_per_class=1)
with pytest.raises(ValueError):
make(n_features=2, n_informative=2, n_classes=3,
n_clusters_per_class=2)


@pytest.mark.parametrize(
'weights, err_type, err_msg',
[
([], ValueError,
"Weights specified but incompatible with number of classes."),
([.25, .75, .1], ValueError,
"Weights specified but incompatible with number of classes."),
(np.array([]), ValueError,
"Weights specified but incompatible with number of classes."),
(np.array([.25, .75, .1]), ValueError,
"Weights specified but incompatible with number of classes."),
(np.random.random(3), ValueError,
"Weights specified but incompatible with number of classes.")
]
)
def test_make_classification_weights_type(weights, err_type, err_msg):
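    # A `weights` sequence whose length does not match `n_classes` should
    # raise a ValueError with an explicit message.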
with pytest.raises(err_type, match=err_msg):
make_classification(weights=weights)


@pytest.mark.parametrize("kwargs", [{}, {"n_classes": 3, "n_informative": 3}])
def test_make_classification_weights_array_or_list_ok(kwargs):
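    # Passing `weights` as a list or as an ndarray must give identical data.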
X1, y1 = make_classification(weights=[.1, .9],
random_state=0, **kwargs)
X2, y2 = make_classification(weights=np.array([.1, .9]),
random_state=0, **kwargs)
assert_almost_equal(X1, X2)
assert_almost_equal(y1, y2)


def test_make_multilabel_classification_return_sequences():
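    # With return_indicator=False, Y is a list of label sequences; check
    # label values and per-sample label counts for both allow_unlabeled modes.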
for allow_unlabeled, min_length in zip((True, False), (0, 1)):
X, Y = make_multilabel_classification(n_samples=100, n_features=20,
n_classes=3, random_state=0,
return_indicator=False,
allow_unlabeled=allow_unlabeled)
assert X.shape == (100, 20), "X shape mismatch"
if not allow_unlabeled:
assert max([max(y) for y in Y]) == 2
assert min([len(y) for y in Y]) == min_length
assert max([len(y) for y in Y]) <= 3


def test_make_multilabel_classification_return_indicator():
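    # The default dense indicator output is an (n_samples, n_classes) matrix;
    # return_distributions=True additionally returns p_c and p_w_c.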
for allow_unlabeled, min_length in zip((True, False), (0, 1)):
X, Y = make_multilabel_classification(n_samples=25, n_features=20,
n_classes=3, random_state=0,
allow_unlabeled=allow_unlabeled)
assert X.shape == (25, 20), "X shape mismatch"
assert Y.shape == (25, 3), "Y shape mismatch"
assert np.all(np.sum(Y, axis=0) > min_length)
# Also test return_distributions and return_indicator with True
X2, Y2, p_c, p_w_c = make_multilabel_classification(
n_samples=25, n_features=20, n_classes=3, random_state=0,
allow_unlabeled=allow_unlabeled, return_distributions=True)
assert_array_almost_equal(X, X2)
assert_array_equal(Y, Y2)
assert p_c.shape == (3,)
assert_almost_equal(p_c.sum(), 1)
assert p_w_c.shape == (20, 3)
assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)


def test_make_multilabel_classification_return_indicator_sparse():
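    # return_indicator='sparse' should yield a scipy.sparse indicator matrix.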
for allow_unlabeled, min_length in zip((True, False), (0, 1)):
X, Y = make_multilabel_classification(n_samples=25, n_features=20,
n_classes=3, random_state=0,
return_indicator='sparse',
allow_unlabeled=allow_unlabeled)
assert X.shape == (25, 20), "X shape mismatch"
assert Y.shape == (25, 3), "Y shape mismatch"
assert sp.issparse(Y)


@pytest.mark.parametrize(
"params, err_msg",
[
({"n_classes": 0}, "'n_classes' should be an integer"),
({"length": 0}, "'length' should be an integer")
]
)
def test_make_multilabel_classification_valid_arguments(params, err_msg):
with pytest.raises(ValueError, match=err_msg):
make_multilabel_classification(**params)


def test_make_hastie_10_2():
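    # Shape checks and a two-class target for the Hastie et al. 10.2 problem.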
X, y = make_hastie_10_2(n_samples=100, random_state=0)
assert X.shape == (100, 10), "X shape mismatch"
assert y.shape == (100,), "y shape mismatch"
assert np.unique(y).shape == (2,), "Unexpected number of classes"


def test_make_regression():
X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3,
effective_rank=5, coef=True, bias=0.0,
noise=1.0, random_state=0)
assert X.shape == (100, 10), "X shape mismatch"
assert y.shape == (100,), "y shape mismatch"
assert c.shape == (10,), "coef shape mismatch"
assert sum(c != 0.0) == 3, "Unexpected number of informative features"
# Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)
# Test with small number of features.
X, y = make_regression(n_samples=100, n_features=1) # n_informative=3
assert X.shape == (100, 1)


def test_make_regression_multitarget():
X, y, c = make_regression(n_samples=100, n_features=10, n_informative=3,
n_targets=3, coef=True, noise=1., random_state=0)
assert X.shape == (100, 10), "X shape mismatch"
assert y.shape == (100, 3), "y shape mismatch"
assert c.shape == (10, 3), "coef shape mismatch"
assert_array_equal(sum(c != 0.0), 3,
"Unexpected number of informative features")
# Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
assert_almost_equal(np.std(y - np.dot(X, c)), 1.0, decimal=1)


def test_make_blobs():
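    # Blobs must respect the requested centers and per-cluster standard
    # deviations.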
cluster_stds = np.array([0.05, 0.2, 0.4])
cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
X, y = make_blobs(random_state=0, n_samples=50, n_features=2,
centers=cluster_centers, cluster_std=cluster_stds)
assert X.shape == (50, 2), "X shape mismatch"
assert y.shape == (50,), "y shape mismatch"
assert np.unique(y).shape == (3,), "Unexpected number of blobs"
for i, (ctr, std) in enumerate(zip(cluster_centers, cluster_stds)):
assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std")


def test_make_blobs_n_samples_list():
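    # A list of per-blob sample counts must be honoured exactly.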
n_samples = [50, 30, 20]
X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0)
assert X.shape == (sum(n_samples), 2), "X shape mismatch"
assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \
"Incorrect number of samples per blob"


def test_make_blobs_n_samples_list_with_centers():
n_samples = [20, 20, 20]
centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
cluster_stds = np.array([0.05, 0.2, 0.4])
X, y = make_blobs(n_samples=n_samples, centers=centers,
cluster_std=cluster_stds, random_state=0)
assert X.shape == (sum(n_samples), 2), "X shape mismatch"
assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \
"Incorrect number of samples per blob"
for i, (ctr, std) in enumerate(zip(centers, cluster_stds)):
assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std")


@pytest.mark.parametrize(
"n_samples",
[[5, 3, 0],
np.array([5, 3, 0]),
tuple([5, 3, 0])]
)
def test_make_blobs_n_samples_centers_none(n_samples):
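    # With centers=None the number of blobs is inferred from len(n_samples),
    # including entries that request zero samples.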
centers = None
X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=0)
assert X.shape == (sum(n_samples), 2), "X shape mismatch"
assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \
"Incorrect number of samples per blob"


def test_make_blobs_return_centers():
n_samples = [10, 20]
n_features = 3
X, y, centers = make_blobs(n_samples=n_samples, n_features=n_features,
return_centers=True, random_state=0)
assert centers.shape == (len(n_samples), n_features)


def test_make_blobs_error():
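    # Inconsistent lengths of n_samples/centers/cluster_std and a non-array
    # `centers` must raise informative ValueErrors.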
n_samples = [20, 20, 20]
centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
cluster_stds = np.array([0.05, 0.2, 0.4])
wrong_centers_msg = ("Length of `n_samples` not consistent "
"with number of centers. Got n_samples = {} "
"and centers = {}".format(n_samples, centers[:-1]))
assert_raise_message(ValueError, wrong_centers_msg,
make_blobs, n_samples, centers=centers[:-1])
wrong_std_msg = ("Length of `clusters_std` not consistent with "
"number of centers. Got centers = {} "
"and cluster_std = {}".format(centers, cluster_stds[:-1]))
assert_raise_message(ValueError, wrong_std_msg,
make_blobs, n_samples,
centers=centers, cluster_std=cluster_stds[:-1])
wrong_type_msg = ("Parameter `centers` must be array-like. "
"Got {!r} instead".format(3))
assert_raise_message(ValueError, wrong_type_msg,
make_blobs, n_samples, centers=3)


def test_make_friedman1():
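    # With noise=0.0 the target must equal the Friedman #1 formula exactly.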
X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0,
random_state=0)
assert X.shape == (5, 10), "X shape mismatch"
assert y.shape == (5,), "y shape mismatch"
assert_array_almost_equal(y,
10 * np.sin(np.pi * X[:, 0] * X[:, 1])
+ 20 * (X[:, 2] - 0.5) ** 2
+ 10 * X[:, 3] + 5 * X[:, 4])


def test_make_friedman2():
X, y = make_friedman2(n_samples=5, noise=0.0, random_state=0)
assert X.shape == (5, 4), "X shape mismatch"
assert y.shape == (5,), "y shape mismatch"
assert_array_almost_equal(y,
(X[:, 0] ** 2
+ (X[:, 1] * X[:, 2] - 1
/ (X[:, 1] * X[:, 3])) ** 2) ** 0.5)


def test_make_friedman3():
X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)
assert X.shape == (5, 4), "X shape mismatch"
assert y.shape == (5,), "y shape mismatch"
assert_array_almost_equal(y, np.arctan((X[:, 1] * X[:, 2]
- 1 / (X[:, 1] * X[:, 3]))
/ X[:, 0]))


def test_make_low_rank_matrix():
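    # The singular values should sum to approximately effective_rank (5).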
X = make_low_rank_matrix(n_samples=50, n_features=25, effective_rank=5,
tail_strength=0.01, random_state=0)
assert X.shape == (50, 25), "X shape mismatch"
from numpy.linalg import svd
u, s, v = svd(X)
assert sum(s) - 5 < 0.1, "X rank is not approximately 5"


def test_make_sparse_coded_signal():
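    # Y must factor as np.dot(D, X) with unit-norm dictionary atoms and
    # exactly n_nonzero_coefs non-zeros per column of X.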
Y, D, X = make_sparse_coded_signal(n_samples=5, n_components=8,
n_features=10, n_nonzero_coefs=3,
random_state=0)
assert Y.shape == (10, 5), "Y shape mismatch"
assert D.shape == (10, 8), "D shape mismatch"
assert X.shape == (8, 5), "X shape mismatch"
for col in X.T:
assert len(np.flatnonzero(col)) == 3, 'Non-zero coefs mismatch'
assert_array_almost_equal(np.dot(D, X), Y)
assert_array_almost_equal(np.sqrt((D ** 2).sum(axis=0)),
np.ones(D.shape[1]))


def test_make_sparse_uncorrelated():
X, y = make_sparse_uncorrelated(n_samples=5, n_features=10, random_state=0)
assert X.shape == (5, 10), "X shape mismatch"
assert y.shape == (5,), "y shape mismatch"


def test_make_spd_matrix():
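    # The result must be symmetric with strictly positive eigenvalues.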
X = make_spd_matrix(n_dim=5, random_state=0)
assert X.shape == (5, 5), "X shape mismatch"
assert_array_almost_equal(X, X.T)
from numpy.linalg import eig
eigenvalues, _ = eig(X)
assert_array_equal(eigenvalues > 0, np.array([True] * 5),
"X is not positive-definite")


def test_make_swiss_roll():
X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0)
assert X.shape == (5, 3), "X shape mismatch"
assert t.shape == (5,), "t shape mismatch"
assert_array_almost_equal(X[:, 0], t * np.cos(t))
assert_array_almost_equal(X[:, 2], t * np.sin(t))


def test_make_s_curve():
X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)
assert X.shape == (5, 3), "X shape mismatch"
assert t.shape == (5,), "t shape mismatch"
assert_array_almost_equal(X[:, 0], np.sin(t))
assert_array_almost_equal(X[:, 2], np.sign(t) * (np.cos(t) - 1))


def test_make_biclusters():
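    # Check indicator shapes, finiteness and reproducibility for a fixed
    # random_state.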
X, rows, cols = make_biclusters(
shape=(100, 100), n_clusters=4, shuffle=True, random_state=0)
assert X.shape == (100, 100), "X shape mismatch"
assert rows.shape == (4, 100), "rows shape mismatch"
assert cols.shape == (4, 100,), "columns shape mismatch"
assert_all_finite(X)
assert_all_finite(rows)
assert_all_finite(cols)
X2, _, _ = make_biclusters(shape=(100, 100), n_clusters=4,
shuffle=True, random_state=0)
assert_array_almost_equal(X, X2)


def test_make_checkerboard():
X, rows, cols = make_checkerboard(
shape=(100, 100), n_clusters=(20, 5),
shuffle=True, random_state=0)
assert X.shape == (100, 100), "X shape mismatch"
assert rows.shape == (100, 100), "rows shape mismatch"
assert cols.shape == (100, 100,), "columns shape mismatch"
X, rows, cols = make_checkerboard(
shape=(100, 100), n_clusters=2, shuffle=True, random_state=0)
assert_all_finite(X)
assert_all_finite(rows)
assert_all_finite(cols)
X1, _, _ = make_checkerboard(shape=(100, 100), n_clusters=2,
shuffle=True, random_state=0)
X2, _, _ = make_checkerboard(shape=(100, 100), n_clusters=2,
shuffle=True, random_state=0)
assert_array_almost_equal(X1, X2)


def test_make_moons():
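    # Without noise, every point lies at unit distance from its moon's
    # centre, (0, 0) or (1, 0.5).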
X, y = make_moons(3, shuffle=False)
for x, label in zip(X, y):
center = [0.0, 0.0] if label == 0 else [1.0, 0.5]
dist_sqr = ((x - center) ** 2).sum()
assert_almost_equal(dist_sqr, 1.0,
err_msg="Point is not on expected unit circle")


def test_make_moons_unbalanced():
X, y = make_moons(n_samples=(7, 5))
assert np.sum(y == 0) == 7 and np.sum(y == 1) == 5, \
'Number of samples in a moon is wrong'
assert X.shape == (12, 2), "X shape mismatch"
assert y.shape == (12,), "y shape mismatch"
with pytest.raises(ValueError, match=r'`n_samples` can be either an int '
r'or a two-element tuple.'):
make_moons(n_samples=[1, 2, 3])
with pytest.raises(ValueError, match=r'`n_samples` can be either an int '
r'or a two-element tuple.'):
make_moons(n_samples=(10,))


def test_make_circles():
factor = 0.3
for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]:
# Testing odd and even case, because in the past make_circles always
# created an even number of samples.
X, y = make_circles(n_samples, shuffle=False, noise=None,
factor=factor)
assert X.shape == (n_samples, 2), "X shape mismatch"
assert y.shape == (n_samples,), "y shape mismatch"
center = [0.0, 0.0]
for x, label in zip(X, y):
dist_sqr = ((x - center) ** 2).sum()
            dist_exp = 1.0 if label == 0 else factor ** 2
assert_almost_equal(dist_sqr, dist_exp,
err_msg="Point is not on expected circle")
assert X[y == 0].shape == (n_outer, 2), (
"Samples not correctly distributed across circles.")
assert X[y == 1].shape == (n_inner, 2), (
"Samples not correctly distributed across circles.")
with pytest.raises(ValueError):
make_circles(factor=-0.01)
with pytest.raises(ValueError):
make_circles(factor=1.)


def test_make_circles_unbalanced():
X, y = make_circles(n_samples=(2, 8))
assert np.sum(y == 0) == 2, 'Number of samples in inner circle is wrong'
assert np.sum(y == 1) == 8, 'Number of samples in outer circle is wrong'
assert X.shape == (10, 2), "X shape mismatch"
assert y.shape == (10,), "y shape mismatch"
with pytest.raises(ValueError, match=r'`n_samples` can be either an int '
r'or a two-element tuple.'):
make_circles(n_samples=[1, 2, 3])
with pytest.raises(ValueError, match=r'`n_samples` can be either an int '
r'or a two-element tuple.'):
make_circles(n_samples=(10,))