from itertools import product import pytest import numpy as np from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) from sklearn import metrics from sklearn import neighbors, datasets from sklearn.base import clone from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn.neighbors import VALID_METRICS_SPARSE, VALID_METRICS from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_raises_regex from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version import joblib rng = np.random.RandomState(0) # load and shuffle iris dataset iris = datasets.load_iris() perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] # load and shuffle digits digits = datasets.load_digits() perm = rng.permutation(digits.target.size) digits.data = digits.data[perm] digits.target = digits.target[perm] SPARSE_TYPES = (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix) SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ('ball_tree', 'brute', 'kd_tree', 'auto') P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) # Filter deprecation warnings. neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph) neighbors.radius_neighbors_graph = ignore_warnings( neighbors.radius_neighbors_graph) def _weight_func(dist): """ Weight function to replace lambda d: d ** -2. The lambda function is not valid because: if d==0 then 0^-2 is not valid. """ # Dist could be multidimensional, flatten it so all values # can be looped with np.errstate(divide='ignore'): retval = 1. / dist return retval ** 2 def test_unsupervised_kneighbors(n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5): # Test unsupervised neighbors methods X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) for p in P: results_nodist = [] results = [] for algorithm in ALGORITHMS: neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, p=p) neigh.fit(X) results_nodist.append(neigh.kneighbors(test, return_distance=False)) results.append(neigh.kneighbors(test, return_distance=True)) for i in range(len(results) - 1): assert_array_almost_equal(results_nodist[i], results[i][1]) assert_array_almost_equal(results[i][0], results[i + 1][0]) assert_array_almost_equal(results[i][1], results[i + 1][1]) @pytest.mark.parametrize("NearestNeighbors", [neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors]) def test_unsupervised_inputs(NearestNeighbors): # Test unsupervised inputs for neighbors estimators X = rng.random_sample((10, 3)) y = rng.randint(3, size=10) nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1) nbrs_fid.fit(X) dist1, ind1 = nbrs_fid.kneighbors(X) nbrs = NearestNeighbors(n_neighbors=1) for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): nbrs.fit(data, y) dist2, ind2 = nbrs.kneighbors(X) assert_array_almost_equal(dist1, dist2) assert_array_almost_equal(ind1, ind2) def test_n_neighbors_datatype(): # Test to check whether n_neighbors is integer X = [[1, 1], [1, 1], [1, 1]] expected_msg = "n_neighbors does not take .*float.* " \ "value, enter integer value" msg = "Expected n_neighbors > 0. Got -3" neighbors_ = neighbors.NearestNeighbors(n_neighbors=3.) assert_raises_regex(TypeError, expected_msg, neighbors_.fit, X) assert_raises_regex(ValueError, msg, neighbors_.kneighbors, X=X, n_neighbors=-3) assert_raises_regex(TypeError, expected_msg, neighbors_.kneighbors, X=X, n_neighbors=3.) def test_not_fitted_error_gets_raised(): X = [[1]] neighbors_ = neighbors.NearestNeighbors() assert_raises(NotFittedError, neighbors_.kneighbors_graph, X) assert_raises(NotFittedError, neighbors_.radius_neighbors_graph, X) @ignore_warnings(category=EfficiencyWarning) def check_precomputed(make_train_test, estimators): """Tests unsupervised NearestNeighbors with a distance matrix.""" # Note: smaller samples may result in spurious test success rng = np.random.RandomState(42) X = rng.random_sample((10, 4)) Y = rng.random_sample((3, 4)) DXX, DYX = make_train_test(X, Y) for method in ['kneighbors', ]: # TODO: also test radius_neighbors, but requires different assertion # As a feature matrix (n_samples by n_features) nbrs_X = neighbors.NearestNeighbors(n_neighbors=3) nbrs_X.fit(X) dist_X, ind_X = getattr(nbrs_X, method)(Y) # As a dense distance matrix (n_samples by n_samples) nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute', metric='precomputed') nbrs_D.fit(DXX) dist_D, ind_D = getattr(nbrs_D, method)(DYX) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Check auto works too nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric='precomputed') nbrs_D.fit(DXX) dist_D, ind_D = getattr(nbrs_D, method)(DYX) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Check X=None in prediction dist_X, ind_X = getattr(nbrs_X, method)(None) dist_D, ind_D = getattr(nbrs_D, method)(None) assert_array_almost_equal(dist_X, dist_D) assert_array_almost_equal(ind_X, ind_D) # Must raise a ValueError if the matrix is not of correct shape assert_raises(ValueError, getattr(nbrs_D, method), X) target = np.arange(X.shape[0]) for Est in estimators: est = Est(metric='euclidean') est.radius = est.n_neighbors = 1 pred_X = est.fit(X, target).predict(Y) est.metric = 'precomputed' pred_D = est.fit(DXX, target).predict(DYX) assert_array_almost_equal(pred_X, pred_D) def test_precomputed_dense(): def make_train_test(X_train, X_test): return (metrics.pairwise_distances(X_train), metrics.pairwise_distances(X_test, X_train)) estimators = [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsClassifier, neighbors.RadiusNeighborsRegressor ] check_precomputed(make_train_test, estimators) @pytest.mark.parametrize('fmt', ['csr', 'lil']) def test_precomputed_sparse_knn(fmt): def make_train_test(X_train, X_test): nn = neighbors.NearestNeighbors(n_neighbors=3 + 1).fit(X_train) return (nn.kneighbors_graph(X_train, mode='distance').asformat(fmt), nn.kneighbors_graph(X_test, mode='distance').asformat(fmt)) # We do not test RadiusNeighborsClassifier and RadiusNeighborsRegressor # since the precomputed neighbors graph is built with k neighbors only. estimators = [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, ] check_precomputed(make_train_test, estimators) @pytest.mark.parametrize('fmt', ['csr', 'lil']) def test_precomputed_sparse_radius(fmt): def make_train_test(X_train, X_test): nn = neighbors.NearestNeighbors(radius=1).fit(X_train) return (nn.radius_neighbors_graph(X_train, mode='distance').asformat(fmt), nn.radius_neighbors_graph(X_test, mode='distance').asformat(fmt)) # We do not test KNeighborsClassifier and KNeighborsRegressor # since the precomputed neighbors graph is built with a radius. estimators = [ neighbors.RadiusNeighborsClassifier, neighbors.RadiusNeighborsRegressor, ] check_precomputed(make_train_test, estimators) def test_is_sorted_by_data(): # Test that _is_sorted_by_data works as expected. In CSR sparse matrix, # entries in each row can be sorted by indices, by data, or unsorted. # _is_sorted_by_data should return True when entries are sorted by data, # and False in all other cases. # Test with sorted 1D array X = csr_matrix(np.arange(10)) assert _is_sorted_by_data(X) # Test with unsorted 1D array X[0, 2] = 5 assert not _is_sorted_by_data(X) # Test when the data is sorted in each sample, but not necessarily # between samples X = csr_matrix([[0, 1, 2], [3, 0, 0], [3, 4, 0], [1, 0, 2]]) assert _is_sorted_by_data(X) # Test with duplicates entries in X.indptr data, indices, indptr = [0, 4, 2, 2], [0, 1, 1, 1], [0, 2, 2, 4] X = csr_matrix((data, indices, indptr), shape=(3, 3)) assert _is_sorted_by_data(X) @ignore_warnings(category=EfficiencyWarning) def test_check_precomputed(): # Test that _check_precomputed returns a graph sorted by data X = csr_matrix(np.abs(np.random.RandomState(42).randn(10, 10))) assert not _is_sorted_by_data(X) Xt = _check_precomputed(X) assert _is_sorted_by_data(Xt) # est with a different number of nonzero entries for each sample mask = np.random.RandomState(42).randint(2, size=(10, 10)) X = X.toarray() X[mask == 1] = 0 X = csr_matrix(X) assert not _is_sorted_by_data(X) Xt = _check_precomputed(X) assert _is_sorted_by_data(Xt) @ignore_warnings(category=EfficiencyWarning) def test_precomputed_sparse_invalid(): dist = np.array([[0., 2., 1.], [2., 0., 3.], [1., 3., 0.]]) dist_csr = csr_matrix(dist) neigh = neighbors.NearestNeighbors(n_neighbors=1, metric="precomputed") neigh.fit(dist_csr) neigh.kneighbors(None, n_neighbors=1) neigh.kneighbors(np.array([[0., 0., 0.]]), n_neighbors=2) # Ensures enough number of nearest neighbors dist = np.array([[0., 2., 0.], [2., 0., 3.], [0., 3., 0.]]) dist_csr = csr_matrix(dist) neigh.fit(dist_csr) msg = "2 neighbors per samples are required, but some samples have only 1" assert_raises_regex(ValueError, msg, neigh.kneighbors, None, n_neighbors=1) # Checks error with inconsistent distance matrix dist = np.array([[5., 2., 1.], [-2., 0., 3.], [1., 3., 0.]]) dist_csr = csr_matrix(dist) msg = "Negative values in data passed to precomputed distance matrix." assert_raises_regex(ValueError, msg, neigh.kneighbors, dist_csr, n_neighbors=1) def test_precomputed_cross_validation(): # Ensure array is split correctly rng = np.random.RandomState(0) X = rng.rand(20, 2) D = pairwise_distances(X, metric='euclidean') y = rng.randint(3, size=20) for Est in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): metric_score = cross_val_score(Est(), X, y) precomp_score = cross_val_score(Est(metric='precomputed'), D, y) assert_array_equal(metric_score, precomp_score) def test_unsupervised_radius_neighbors(n_samples=20, n_features=5, n_query_pts=2, radius=0.5, random_state=0): # Test unsupervised radius-based query rng = np.random.RandomState(random_state) X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) for p in P: results = [] for algorithm in ALGORITHMS: neigh = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm, p=p) neigh.fit(X) ind1 = neigh.radius_neighbors(test, return_distance=False) # sort the results: this is not done automatically for # radius searches dist, ind = neigh.radius_neighbors(test, return_distance=True) for (d, i, i1) in zip(dist, ind, ind1): j = d.argsort() d[:] = d[j] i[:] = i[j] i1[:] = i1[j] results.append((dist, ind)) assert_array_almost_equal(np.concatenate(list(ind)), np.concatenate(list(ind1))) for i in range(len(results) - 1): assert_array_almost_equal(np.concatenate(list(results[i][0])), np.concatenate(list(results[i + 1][0]))), assert_array_almost_equal(np.concatenate(list(results[i][1])), np.concatenate(list(results[i + 1][1]))) def test_kneighbors_classifier(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0): # Test k-neighbors classification rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < .5).astype(int) y_str = y.astype(str) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm) knn.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) # Test prediction with y_str knn.fit(X, y_str) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y_str[:n_test_pts]) def test_kneighbors_classifier_float_labels(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0): # Test k-neighbors classification rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < .5).astype(int) knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors) knn.fit(X, y.astype(float)) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) def test_kneighbors_classifier_predict_proba(): # Test KNeighborsClassifier.predict_proba() method X = np.array([[0, 2, 0], [0, 2, 1], [2, 0, 0], [2, 2, 0], [0, 0, 2], [0, 0, 1]]) y = np.array([4, 4, 5, 5, 1, 1]) cls = neighbors.KNeighborsClassifier(n_neighbors=3, p=1) # cityblock dist cls.fit(X, y) y_prob = cls.predict_proba(X) real_prob = np.array([[0, 2. / 3, 1. / 3], [1. / 3, 2. / 3, 0], [1. / 3, 0, 2. / 3], [0, 1. / 3, 2. / 3], [2. / 3, 1. / 3, 0], [2. / 3, 1. / 3, 0]]) assert_array_equal(real_prob, y_prob) # Check that it also works with non integer labels cls.fit(X, y.astype(str)) y_prob = cls.predict_proba(X) assert_array_equal(real_prob, y_prob) # Check that it works with weights='distance' cls = neighbors.KNeighborsClassifier( n_neighbors=2, p=1, weights='distance') cls.fit(X, y) y_prob = cls.predict_proba(np.array([[0, 2, 0], [2, 2, 2]])) real_prob = np.array([[0, 1, 0], [0, 0.4, 0.6]]) assert_array_almost_equal(real_prob, y_prob) def test_radius_neighbors_classifier(n_samples=40, n_features=5, n_test_pts=10, radius=0.5, random_state=0): # Test radius-based classification rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < .5).astype(int) y_str = y.astype(str) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: neigh = neighbors.RadiusNeighborsClassifier(radius=radius, weights=weights, algorithm=algorithm) neigh.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) neigh.fit(X, y_str) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y_str[:n_test_pts]) def test_radius_neighbors_classifier_when_no_neighbors(): # Test radius-based classifier when no neighbors found. # In this case it should rise an informative exception X = np.array([[1.0, 1.0], [2.0, 2.0]]) y = np.array([1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.01, 2.01]]) # no outliers z2 = np.array([[1.01, 1.01], [1.4, 1.4]]) # one outlier weight_func = _weight_func for outlier_label in [0, -1, None]: for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: rnc = neighbors.RadiusNeighborsClassifier clf = rnc(radius=radius, weights=weights, algorithm=algorithm, outlier_label=outlier_label) clf.fit(X, y) assert_array_equal(np.array([1, 2]), clf.predict(z1)) if outlier_label is None: assert_raises(ValueError, clf.predict, z2) def test_radius_neighbors_classifier_outlier_labeling(): # Test radius-based classifier when no neighbors found and outliers # are labeled. X = np.array([[1.0, 1.0], [2.0, 2.0], [0.99, 0.99], [0.98, 0.98], [2.01, 2.01]]) y = np.array([1, 2, 1, 1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.01, 2.01]]) # no outliers z2 = np.array([[1.4, 1.4], [1.01, 1.01], [2.01, 2.01]]) # one outlier correct_labels1 = np.array([1, 2]) correct_labels2 = np.array([-1, 1, 2]) outlier_proba = np.array([0, 0]) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: clf = neighbors.RadiusNeighborsClassifier(radius=radius, weights=weights, algorithm=algorithm, outlier_label=-1) clf.fit(X, y) assert_array_equal(correct_labels1, clf.predict(z1)) assert_array_equal(correct_labels2, clf.predict(z2)) assert_array_equal(outlier_proba, clf.predict_proba(z2)[0]) # test outlier_labeling of using predict_proba() RNC = neighbors.RadiusNeighborsClassifier X = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]) y = np.array([0, 2, 2, 1, 1, 1, 3, 3, 3, 3]) # test outlier_label scalar verification def check_array_exception(): clf = RNC(radius=1, outlier_label=[[5]]) clf.fit(X, y) assert_raises(TypeError, check_array_exception) # test invalid outlier_label dtype def check_dtype_exception(): clf = RNC(radius=1, outlier_label='a') clf.fit(X, y) assert_raises(TypeError, check_dtype_exception) # test most frequent clf = RNC(radius=1, outlier_label='most_frequent') clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) assert_array_equal(proba[1, :], [0, 0, 0, 1]) # test manual label in y clf = RNC(radius=1, outlier_label=1) clf.fit(X, y) proba = clf.predict_proba([[1], [15]]) assert_array_equal(proba[1, :], [0, 1, 0, 0]) pred = clf.predict([[1], [15]]) assert_array_equal(pred, [2, 1]) # test manual label out of y warning def check_warning(): clf = RNC(radius=1, outlier_label=4) clf.fit(X, y) clf.predict_proba([[1], [15]]) assert_warns(UserWarning, check_warning) # test multi output same outlier label y_multi = [[0, 1], [2, 1], [2, 2], [1, 2], [1, 2], [1, 3], [3, 3], [3, 3], [3, 0], [3, 0]] clf = RNC(radius=1, outlier_label=1) clf.fit(X, y_multi) proba = clf.predict_proba([[7], [15]]) assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) pred = clf.predict([[7], [15]]) assert_array_equal(pred[1, :], [1, 1]) # test multi output different outlier label y_multi = [[0, 0], [2, 2], [2, 2], [1, 1], [1, 1], [1, 1], [3, 3], [3, 3], [3, 3], [3, 3]] clf = RNC(radius=1, outlier_label=[0, 1]) clf.fit(X, y_multi) proba = clf.predict_proba([[7], [15]]) assert_array_equal(proba[0][1, :], [1, 0, 0, 0]) assert_array_equal(proba[1][1, :], [0, 1, 0, 0]) pred = clf.predict([[7], [15]]) assert_array_equal(pred[1, :], [0, 1]) # test inconsistent outlier label list length def check_exception(): clf = RNC(radius=1, outlier_label=[0, 1, 2]) clf.fit(X, y_multi) assert_raises(ValueError, check_exception) def test_radius_neighbors_classifier_zero_distance(): # Test radius-based classifier, when distance to a sample is zero. X = np.array([[1.0, 1.0], [2.0, 2.0]]) y = np.array([1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.0, 2.0]]) correct_labels1 = np.array([1, 2]) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: clf = neighbors.RadiusNeighborsClassifier(radius=radius, weights=weights, algorithm=algorithm) clf.fit(X, y) assert_array_equal(correct_labels1, clf.predict(z1)) def test_neighbors_regressors_zero_distance(): # Test radius-based regressor, when distance to a sample is zero. X = np.array([[1.0, 1.0], [1.0, 1.0], [2.0, 2.0], [2.5, 2.5]]) y = np.array([1.0, 1.5, 2.0, 0.0]) radius = 0.2 z = np.array([[1.1, 1.1], [2.0, 2.0]]) rnn_correct_labels = np.array([1.25, 2.0]) knn_correct_unif = np.array([1.25, 1.0]) knn_correct_dist = np.array([1.25, 2.0]) for algorithm in ALGORITHMS: # we don't test for weights=_weight_func since user will be expected # to handle zero distances themselves in the function. for weights in ['uniform', 'distance']: rnn = neighbors.RadiusNeighborsRegressor(radius=radius, weights=weights, algorithm=algorithm) rnn.fit(X, y) assert_array_almost_equal(rnn_correct_labels, rnn.predict(z)) for weights, corr_labels in zip(['uniform', 'distance'], [knn_correct_unif, knn_correct_dist]): knn = neighbors.KNeighborsRegressor(n_neighbors=2, weights=weights, algorithm=algorithm) knn.fit(X, y) assert_array_almost_equal(corr_labels, knn.predict(z)) def test_radius_neighbors_boundary_handling(): """Test whether points lying on boundary are handled consistently Also ensures that even with only one query point, an object array is returned rather than a 2d array. """ X = np.array([[1.5], [3.0], [3.01]]) radius = 3.0 for algorithm in ALGORITHMS: nbrs = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm).fit(X) results = nbrs.radius_neighbors([[0.0]], return_distance=False) assert results.shape == (1,) assert results.dtype == object assert_array_equal(results[0], [0, 1]) def test_radius_neighbors_returns_array_of_objects(): # check that we can pass precomputed distances to # NearestNeighbors.radius_neighbors() # non-regression test for # https://github.com/scikit-learn/scikit-learn/issues/16036 X = csr_matrix(np.ones((4, 4))) X.setdiag([0, 0, 0, 0]) nbrs = neighbors.NearestNeighbors(radius=0.5, algorithm='auto', leaf_size=30, metric='precomputed').fit(X) neigh_dist, neigh_ind = nbrs.radius_neighbors(X, return_distance=True) expected_dist = np.empty(X.shape[0], dtype=object) expected_dist[:] = [np.array([0]), np.array([0]), np.array([0]), np.array([0])] expected_ind = np.empty(X.shape[0], dtype=object) expected_ind[:] = [np.array([0]), np.array([1]), np.array([2]), np.array([3])] assert_array_equal(neigh_dist, expected_dist) assert_array_equal(neigh_ind, expected_ind) @pytest.mark.parametrize(["algorithm", "metric"], [("ball_tree", "euclidean"), ("kd_tree", "euclidean"), ("brute", "euclidean"), ("brute", "precomputed")]) def test_radius_neighbors_sort_results(algorithm, metric): # Test radius_neighbors[_graph] output when sort_result is True n_samples = 10 rng = np.random.RandomState(42) X = rng.random_sample((n_samples, 4)) if metric == "precomputed": X = neighbors.radius_neighbors_graph(X, radius=np.inf, mode="distance") model = neighbors.NearestNeighbors(algorithm=algorithm, metric=metric) model.fit(X) # self.radius_neighbors distances, indices = model.radius_neighbors(X=X, radius=np.inf, sort_results=True) for ii in range(n_samples): assert_array_equal(distances[ii], np.sort(distances[ii])) # sort_results=True and return_distance=False if metric != "precomputed": # no need to raise with precomputed graph with pytest.raises(ValueError, match="return_distance must be True"): model.radius_neighbors(X=X, radius=np.inf, sort_results=True, return_distance=False) # self.radius_neighbors_graph graph = model.radius_neighbors_graph(X=X, radius=np.inf, mode="distance", sort_results=True) assert _is_sorted_by_data(graph) def test_RadiusNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) n_features = 2 n_samples = 40 n_output = 3 X = rng.rand(n_samples, n_features) y = rng.randint(0, 3, (n_samples, n_output)) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) weights = [None, 'uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): # Stack single output prediction y_pred_so = [] for o in range(n_output): rnn = neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train[:, o]) y_pred_so.append(rnn.predict(X_test)) y_pred_so = np.vstack(y_pred_so).T assert y_pred_so.shape == y_test.shape # Multioutput prediction rnn_mo = neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm) rnn_mo.fit(X_train, y_train) y_pred_mo = rnn_mo.predict(X_test) assert y_pred_mo.shape == y_test.shape assert_array_almost_equal(y_pred_mo, y_pred_so) def test_kneighbors_classifier_sparse(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0): # Test k-NN classifier on sparse matrices # Like the above, but with various types of sparse matrices rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 X *= X > .2 y = ((X ** 2).sum(axis=1) < .5).astype(int) for sparsemat in SPARSE_TYPES: knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='auto') knn.fit(sparsemat(X), y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) for sparsev in SPARSE_TYPES + (np.asarray,): X_eps = sparsev(X[:n_test_pts] + epsilon) y_pred = knn.predict(X_eps) assert_array_equal(y_pred, y[:n_test_pts]) def test_KNeighborsClassifier_multioutput(): # Test k-NN classifier on multioutput data rng = check_random_state(0) n_features = 5 n_samples = 50 n_output = 3 X = rng.rand(n_samples, n_features) y = rng.randint(0, 3, (n_samples, n_output)) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) weights = [None, 'uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): # Stack single output prediction y_pred_so = [] y_pred_proba_so = [] for o in range(n_output): knn = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) knn.fit(X_train, y_train[:, o]) y_pred_so.append(knn.predict(X_test)) y_pred_proba_so.append(knn.predict_proba(X_test)) y_pred_so = np.vstack(y_pred_so).T assert y_pred_so.shape == y_test.shape assert len(y_pred_proba_so) == n_output # Multioutput prediction knn_mo = neighbors.KNeighborsClassifier(weights=weights, algorithm=algorithm) knn_mo.fit(X_train, y_train) y_pred_mo = knn_mo.predict(X_test) assert y_pred_mo.shape == y_test.shape assert_array_almost_equal(y_pred_mo, y_pred_so) # Check proba y_pred_proba_mo = knn_mo.predict_proba(X_test) assert len(y_pred_proba_mo) == n_output for proba_mo, proba_so in zip(y_pred_proba_mo, y_pred_proba_so): assert_array_almost_equal(proba_mo, proba_so) def test_kneighbors_regressor(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0): # Test k-neighbors regression rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y_target = y[:n_test_pts] weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm) knn.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert np.all(abs(y_pred - y_target) < 0.3) def test_KNeighborsRegressor_multioutput_uniform_weight(): # Test k-neighbors in multi-output regression with uniform weight rng = check_random_state(0) n_features = 5 n_samples = 40 n_output = 4 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples, n_output) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for algorithm, weights in product(ALGORITHMS, [None, 'uniform']): knn = neighbors.KNeighborsRegressor(weights=weights, algorithm=algorithm) knn.fit(X_train, y_train) neigh_idx = knn.kneighbors(X_test, return_distance=False) y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) y_pred = knn.predict(X_test) assert y_pred.shape == y_test.shape assert y_pred_idx.shape == y_test.shape assert_array_almost_equal(y_pred, y_pred_idx) def test_kneighbors_regressor_multioutput(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0): # Test k-neighbors in multi-output regression rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y = np.vstack([y, y]).T y_target = y[:n_test_pts] weights = ['uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm) knn.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) assert y_pred.shape == y_target.shape assert np.all(np.abs(y_pred - y_target) < 0.3) def test_radius_neighbors_regressor(n_samples=40, n_features=3, n_test_pts=10, radius=0.5, random_state=0): # Test radius-based neighbors regression rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y_target = y[:n_test_pts] weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: neigh = neighbors.RadiusNeighborsRegressor(radius=radius, weights=weights, algorithm=algorithm) neigh.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert np.all(abs(y_pred - y_target) < radius / 2) # test that nan is returned when no nearby observations for weights in ['uniform', 'distance']: neigh = neighbors.RadiusNeighborsRegressor(radius=radius, weights=weights, algorithm='auto') neigh.fit(X, y) X_test_nan = np.full((1, n_features), -1.) empty_warning_msg = ("One or more samples have no neighbors " "within specified radius; predicting NaN.") pred = assert_warns_message(UserWarning, empty_warning_msg, neigh.predict, X_test_nan) assert np.all(np.isnan(pred)) def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(): # Test radius neighbors in multi-output regression (uniform weight) rng = check_random_state(0) n_features = 5 n_samples = 40 n_output = 4 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples, n_output) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for algorithm, weights in product(ALGORITHMS, [None, 'uniform']): rnn = neighbors. RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train) neigh_idx = rnn.radius_neighbors(X_test, return_distance=False) y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) y_pred_idx = np.array(y_pred_idx) y_pred = rnn.predict(X_test) assert y_pred_idx.shape == y_test.shape assert y_pred.shape == y_test.shape assert_array_almost_equal(y_pred, y_pred_idx) def test_RadiusNeighborsRegressor_multioutput(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0): # Test k-neighbors in multi-output regression with various weight rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y = np.vstack([y, y]).T y_target = y[:n_test_pts] weights = ['uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): rnn = neighbors.RadiusNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm) rnn.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) y_pred = rnn.predict(X[:n_test_pts] + epsilon) assert y_pred.shape == y_target.shape assert np.all(np.abs(y_pred - y_target) < 0.3) @ignore_warnings(category=EfficiencyWarning) def test_kneighbors_regressor_sparse(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=5, random_state=0): # Test radius-based regression on sparse matrices # Like the above, but with various types of sparse matrices rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < .25).astype(int) for sparsemat in SPARSE_TYPES: knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, algorithm='auto') knn.fit(sparsemat(X), y) knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, metric='precomputed') knn_pre.fit(pairwise_distances(X, metric='euclidean'), y) for sparsev in SPARSE_OR_DENSE: X2 = sparsev(X) assert np.mean(knn.predict(X2).round() == y) > 0.95 X2_pre = sparsev(pairwise_distances(X, metric='euclidean')) assert np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95 def test_neighbors_iris(): # Sanity checks on the iris dataset # Puts three points of each label in the plane and performs a # nearest neighbor query on points near the decision boundary. for algorithm in ALGORITHMS: clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm=algorithm) clf.fit(iris.data, iris.target) assert_array_equal(clf.predict(iris.data), iris.target) clf.set_params(n_neighbors=9, algorithm=algorithm) clf.fit(iris.data, iris.target) assert np.mean(clf.predict(iris.data) == iris.target) > 0.95 rgs = neighbors.KNeighborsRegressor(n_neighbors=5, algorithm=algorithm) rgs.fit(iris.data, iris.target) assert (np.mean(rgs.predict(iris.data).round() == iris.target) > 0.95) def test_neighbors_digits(): # Sanity check on the digits dataset # the 'brute' algorithm has been observed to fail if the input # dtype is uint8 due to overflow in distance calculations. X = digits.data.astype('uint8') Y = digits.target (n_samples, n_features) = X.shape train_test_boundary = int(n_samples * 0.8) train = np.arange(0, train_test_boundary) test = np.arange(train_test_boundary, n_samples) (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test] clf = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='brute') score_uint8 = clf.fit(X_train, Y_train).score(X_test, Y_test) score_float = clf.fit(X_train.astype(float, copy=False), Y_train).score( X_test.astype(float, copy=False), Y_test) assert score_uint8 == score_float def test_kneighbors_graph(): # Test kneighbors_graph to build the k-Nearest Neighbor graph. X = np.array([[0, 1], [1.01, 1.], [2, 0]]) # n_neighbors = 1 A = neighbors.kneighbors_graph(X, 1, mode='connectivity', include_self=True) assert_array_equal(A.toarray(), np.eye(A.shape[0])) A = neighbors.kneighbors_graph(X, 1, mode='distance') assert_array_almost_equal( A.toarray(), [[0.00, 1.01, 0.], [1.01, 0., 0.], [0.00, 1.40716026, 0.]]) # n_neighbors = 2 A = neighbors.kneighbors_graph(X, 2, mode='connectivity', include_self=True) assert_array_equal( A.toarray(), [[1., 1., 0.], [1., 1., 0.], [0., 1., 1.]]) A = neighbors.kneighbors_graph(X, 2, mode='distance') assert_array_almost_equal( A.toarray(), [[0., 1.01, 2.23606798], [1.01, 0., 1.40716026], [2.23606798, 1.40716026, 0.]]) # n_neighbors = 3 A = neighbors.kneighbors_graph(X, 3, mode='connectivity', include_self=True) assert_array_almost_equal( A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) def test_kneighbors_graph_sparse(seed=36): # Test kneighbors_graph to build the k-Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) for n_neighbors in [1, 2, 3]: for mode in ["connectivity", "distance"]: assert_array_almost_equal( neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray()) def test_radius_neighbors_graph(): # Test radius_neighbors_graph to build the Nearest Neighbor graph. X = np.array([[0, 1], [1.01, 1.], [2, 0]]) A = neighbors.radius_neighbors_graph(X, 1.5, mode='connectivity', include_self=True) assert_array_equal( A.toarray(), [[1., 1., 0.], [1., 1., 1.], [0., 1., 1.]]) A = neighbors.radius_neighbors_graph(X, 1.5, mode='distance') assert_array_almost_equal( A.toarray(), [[0., 1.01, 0.], [1.01, 0., 1.40716026], [0., 1.40716026, 0.]]) def test_radius_neighbors_graph_sparse(seed=36): # Test radius_neighbors_graph to build the Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) for n_neighbors in [1, 2, 3]: for mode in ["connectivity", "distance"]: assert_array_almost_equal( neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray()) def test_neighbors_badargs(): # Test bad argument values: these should all raise ValueErrors assert_raises(ValueError, neighbors.NearestNeighbors, algorithm='blah') X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) X3 = rng.random_sample((10, 3)) y = np.ones(10) for cls in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): assert_raises(ValueError, cls, weights='blah') assert_raises(ValueError, cls, p=-1) assert_raises(ValueError, cls, algorithm='blah') nbrs = cls(algorithm='ball_tree', metric='haversine') assert_raises(ValueError, nbrs.predict, X) assert_raises(ValueError, ignore_warnings(nbrs.fit), Xsparse, y) nbrs = cls(metric='haversine', algorithm='brute') nbrs.fit(X3, y) assert_raise_message(ValueError, "Haversine distance only valid in 2 dimensions", nbrs.predict, X3) nbrs = cls() assert_raises(ValueError, nbrs.fit, np.ones((0, 2)), np.ones(0)) assert_raises(ValueError, nbrs.fit, X[:, :, None], y) nbrs.fit(X, y) assert_raises(ValueError, nbrs.predict, [[]]) if (issubclass(cls, neighbors.KNeighborsClassifier) or issubclass(cls, neighbors.KNeighborsRegressor)): nbrs = cls(n_neighbors=-1) assert_raises(ValueError, nbrs.fit, X, y) nbrs = neighbors.NearestNeighbors().fit(X) assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah') assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah') def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): # Test computing the neighbors for various metrics # create a symmetric matrix V = rng.rand(n_features, n_features) VI = np.dot(V, V.T) metrics = [('euclidean', {}), ('manhattan', {}), ('minkowski', dict(p=1)), ('minkowski', dict(p=2)), ('minkowski', dict(p=3)), ('minkowski', dict(p=np.inf)), ('chebyshev', {}), ('seuclidean', dict(V=rng.rand(n_features))), ('wminkowski', dict(p=3, w=rng.rand(n_features))), ('mahalanobis', dict(VI=VI)), ('haversine', {})] algorithms = ['brute', 'ball_tree', 'kd_tree'] X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) for metric, metric_params in metrics: if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): # wminkowski will be removed in SciPy 1.8.0 continue results = {} p = metric_params.pop('p', 2) for algorithm in algorithms: # KD tree doesn't support all metrics if (algorithm == 'kd_tree' and metric not in neighbors.KDTree.valid_metrics): assert_raises(ValueError, neighbors.NearestNeighbors, algorithm=algorithm, metric=metric, metric_params=metric_params) continue neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm, metric=metric, p=p, metric_params=metric_params) # Haversine distance only accepts 2D data feature_sl = (slice(None, 2) if metric == 'haversine' else slice(None)) neigh.fit(X[:, feature_sl]) # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 ExceptionToAssert = None if (metric == "wminkowski" and algorithm == 'brute' and sp_version >= parse_version("1.6.0")): ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): results[algorithm] = neigh.kneighbors(test[:, feature_sl], return_distance=True) assert_array_almost_equal(results['brute'][0], results['ball_tree'][0]) assert_array_almost_equal(results['brute'][1], results['ball_tree'][1]) if 'kd_tree' in results: assert_array_almost_equal(results['brute'][0], results['kd_tree'][0]) assert_array_almost_equal(results['brute'][1], results['kd_tree'][1]) def test_callable_metric(): def custom_metric(x1, x2): return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) X = np.random.RandomState(42).rand(20, 2) nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=custom_metric) nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute', metric=custom_metric) nbrs1.fit(X) nbrs2.fit(X) dist1, ind1 = nbrs1.kneighbors(X) dist2, ind2 = nbrs2.kneighbors(X) assert_array_almost_equal(dist1, dist2) def test_valid_brute_metric_for_auto_algorithm(): X = rng.rand(12, 12) Xcsr = csr_matrix(X) # check that there is a metric that is valid for brute # but not ball_tree (so we actually test something) assert "cosine" in VALID_METRICS['brute'] assert "cosine" not in VALID_METRICS['ball_tree'] # Metric which don't required any additional parameter require_params = ['mahalanobis', 'wminkowski', 'seuclidean'] for metric in VALID_METRICS['brute']: if metric != 'precomputed' and metric not in require_params: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric) if metric != 'haversine': nn.fit(X) nn.kneighbors(X) else: nn.fit(X[:, :2]) nn.kneighbors(X[:, :2]) elif metric == 'precomputed': X_precomputed = rng.random_sample((10, 4)) Y_precomputed = rng.random_sample((3, 4)) DXX = metrics.pairwise_distances(X_precomputed, metric='euclidean') DYX = metrics.pairwise_distances(Y_precomputed, X_precomputed, metric='euclidean') nb_p = neighbors.NearestNeighbors(n_neighbors=3) nb_p.fit(DXX) nb_p.kneighbors(DYX) for metric in VALID_METRICS_SPARSE['brute']: if metric != 'precomputed' and metric not in require_params: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric).fit(Xcsr) nn.kneighbors(Xcsr) # Metric with parameter VI = np.dot(X, X.T) list_metrics = [('seuclidean', dict(V=rng.rand(12))), ('wminkowski', dict(w=rng.rand(12))), ('mahalanobis', dict(VI=VI))] for metric, params in list_metrics: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric, metric_params=params).fit(X) nn.kneighbors(X) def test_metric_params_interface(): assert_warns(SyntaxWarning, neighbors.KNeighborsClassifier, metric_params={'p': 3}) def test_predict_sparse_ball_kd_tree(): rng = np.random.RandomState(0) X = rng.rand(5, 5) y = rng.randint(0, 2, 5) nbrs1 = neighbors.KNeighborsClassifier(1, algorithm='kd_tree') nbrs2 = neighbors.KNeighborsRegressor(1, algorithm='ball_tree') for model in [nbrs1, nbrs2]: model.fit(X, y) assert_raises(ValueError, model.predict, csr_matrix(X)) def test_non_euclidean_kneighbors(): rng = np.random.RandomState(0) X = rng.rand(5, 5) # Find a reasonable radius. dist_array = pairwise_distances(X).flatten() np.sort(dist_array) radius = dist_array[15] # Test kneighbors_graph for metric in ['manhattan', 'chebyshev']: nbrs_graph = neighbors.kneighbors_graph( X, 3, metric=metric, mode='connectivity', include_self=True).toarray() nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph for metric in ['manhattan', 'chebyshev']: nbrs_graph = neighbors.radius_neighbors_graph( X, radius, metric=metric, mode='connectivity', include_self=True).toarray() nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X) assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A) # Raise error when wrong parameters are supplied, X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, metric='euclidean') X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs, radius, metric='euclidean') def check_object_arrays(nparray, list_check): for ind, ele in enumerate(nparray): assert_array_equal(ele, list_check[ind]) def test_k_and_radius_neighbors_train_is_not_query(): # Test kneighbors et.al when query is not training data for algorithm in ALGORITHMS: nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) X = [[0], [1]] nn.fit(X) test_data = [[2], [1]] # Test neighbors. dist, ind = nn.kneighbors(test_data) assert_array_equal(dist, [[1], [0]]) assert_array_equal(ind, [[1], [1]]) dist, ind = nn.radius_neighbors([[2], [1]], radius=1.5) check_object_arrays(dist, [[1], [1, 0]]) check_object_arrays(ind, [[1], [0, 1]]) # Test the graph variants. assert_array_equal( nn.kneighbors_graph(test_data).A, [[0., 1.], [0., 1.]]) assert_array_equal( nn.kneighbors_graph([[2], [1]], mode='distance').A, np.array([[0., 1.], [0., 0.]])) rng = nn.radius_neighbors_graph([[2], [1]], radius=1.5) assert_array_equal(rng.A, [[0, 1], [1, 1]]) def test_k_and_radius_neighbors_X_None(): # Test kneighbors et.al when query is None for algorithm in ALGORITHMS: nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) X = [[0], [1]] nn.fit(X) dist, ind = nn.kneighbors() assert_array_equal(dist, [[1], [1]]) assert_array_equal(ind, [[1], [0]]) dist, ind = nn.radius_neighbors(None, radius=1.5) check_object_arrays(dist, [[1], [1]]) check_object_arrays(ind, [[1], [0]]) # Test the graph variants. rng = nn.radius_neighbors_graph(None, radius=1.5) kng = nn.kneighbors_graph(None) for graph in [rng, kng]: assert_array_equal(graph.A, [[0, 1], [1, 0]]) assert_array_equal(graph.data, [1, 1]) assert_array_equal(graph.indices, [1, 0]) X = [[0, 1], [0, 1], [1, 1]] nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) nn.fit(X) assert_array_equal( nn.kneighbors_graph().A, np.array([[0., 1., 1.], [1., 0., 1.], [1., 1., 0]])) def test_k_and_radius_neighbors_duplicates(): # Test behavior of kneighbors when duplicates are present in query for algorithm in ALGORITHMS: nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) nn.fit([[0], [1]]) # Do not do anything special to duplicates. kng = nn.kneighbors_graph([[0], [1]], mode='distance') assert_array_equal( kng.A, np.array([[0., 0.], [0., 0.]])) assert_array_equal(kng.data, [0., 0.]) assert_array_equal(kng.indices, [0, 1]) dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) check_object_arrays(dist, [[0, 1], [1, 0]]) check_object_arrays(ind, [[0, 1], [0, 1]]) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) assert_array_equal(rng.A, np.ones((2, 2))) rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode='distance') rng.sort_indices() assert_array_equal(rng.A, [[0, 1], [1, 0]]) assert_array_equal(rng.indices, [0, 1, 0, 1]) assert_array_equal(rng.data, [0, 1, 1, 0]) # Mask the first duplicates when n_duplicates > n_neighbors. X = np.ones((3, 1)) nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm='brute') nn.fit(X) dist, ind = nn.kneighbors() assert_array_equal(dist, np.zeros((3, 1))) assert_array_equal(ind, [[1], [0], [1]]) # Test that zeros are explicitly marked in kneighbors_graph. kng = nn.kneighbors_graph(mode='distance') assert_array_equal( kng.A, np.zeros((3, 3))) assert_array_equal(kng.data, np.zeros(3)) assert_array_equal(kng.indices, [1., 0., 1.]) assert_array_equal( nn.kneighbors_graph().A, np.array([[0., 1., 0.], [1., 0., 0.], [0., 1., 0.]])) def test_include_self_neighbors_graph(): # Test include_self parameter in neighbors_graph X = [[2, 3], [4, 5]] kng = neighbors.kneighbors_graph(X, 1, include_self=True).A kng_not_self = neighbors.kneighbors_graph(X, 1, include_self=False).A assert_array_equal(kng, [[1., 0.], [0., 1.]]) assert_array_equal(kng_not_self, [[0., 1.], [1., 0.]]) rng = neighbors.radius_neighbors_graph(X, 5.0, include_self=True).A rng_not_self = neighbors.radius_neighbors_graph( X, 5.0, include_self=False).A assert_array_equal(rng, [[1., 1.], [1., 1.]]) assert_array_equal(rng_not_self, [[0., 1.], [1., 0.]]) @pytest.mark.parametrize('algorithm', ALGORITHMS) def test_same_knn_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) clf = neighbors.KNeighborsClassifier(n_neighbors=3, algorithm=algorithm) clf.fit(X_train, y_train) y = clf.predict(X_test) dist, ind = clf.kneighbors(X_test) graph = clf.kneighbors_graph(X_test, mode='distance').toarray() clf.set_params(n_jobs=3) clf.fit(X_train, y_train) y_parallel = clf.predict(X_test) dist_parallel, ind_parallel = clf.kneighbors(X_test) graph_parallel = \ clf.kneighbors_graph(X_test, mode='distance').toarray() assert_array_equal(y, y_parallel) assert_array_almost_equal(dist, dist_parallel) assert_array_equal(ind, ind_parallel) assert_array_almost_equal(graph, graph_parallel) @pytest.mark.parametrize('algorithm', ALGORITHMS) def test_same_radius_neighbors_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) clf = neighbors.RadiusNeighborsClassifier(radius=10, algorithm=algorithm) clf.fit(X_train, y_train) y = clf.predict(X_test) dist, ind = clf.radius_neighbors(X_test) graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() clf.set_params(n_jobs=3) clf.fit(X_train, y_train) y_parallel = clf.predict(X_test) dist_parallel, ind_parallel = clf.radius_neighbors(X_test) graph_parallel = \ clf.radius_neighbors_graph(X_test, mode='distance').toarray() assert_array_equal(y, y_parallel) for i in range(len(dist)): assert_array_almost_equal(dist[i], dist_parallel[i]) assert_array_equal(ind[i], ind_parallel[i]) assert_array_almost_equal(graph, graph_parallel) @pytest.mark.parametrize('backend', JOBLIB_BACKENDS) @pytest.mark.parametrize('algorithm', ALGORITHMS) def test_knn_forcing_backend(backend, algorithm): # Non-regression test which ensure the knn methods are properly working # even when forcing the global joblib backend. with joblib.parallel_backend(backend): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) clf = neighbors.KNeighborsClassifier(n_neighbors=3, algorithm=algorithm, n_jobs=3) clf.fit(X_train, y_train) clf.predict(X_test) clf.kneighbors(X_test) clf.kneighbors_graph(X_test, mode='distance').toarray() def test_dtype_convert(): classifier = neighbors.KNeighborsClassifier(n_neighbors=1) CLASSES = 15 X = np.eye(CLASSES) y = [ch for ch in 'ABCDEFGHIJKLMNOPQRSTU'[:CLASSES]] result = classifier.fit(X, y).predict(X) assert_array_equal(result, y) def test_sparse_metric_callable(): def sparse_metric(x, y): # Metric accepting sparse matrix input (only) assert issparse(x) and issparse(y) return x.dot(y.T).A.item() X = csr_matrix([ # Population matrix [1, 1, 1, 1, 1], [1, 0, 1, 0, 1], [0, 0, 1, 0, 0] ]) Y = csr_matrix([ # Query matrix [1, 1, 0, 1, 1], [1, 0, 0, 0, 1] ]) nn = neighbors.NearestNeighbors(algorithm='brute', n_neighbors=2, metric=sparse_metric).fit(X) N = nn.kneighbors(Y, return_distance=False) # GS indices of nearest neighbours in `X` for `sparse_metric` gold_standard_nn = np.array([ [2, 1], [2, 1] ]) assert_array_equal(N, gold_standard_nn) # ignore conversion to boolean in pairwise_distances @ignore_warnings(category=DataConversionWarning) def test_pairwise_boolean_distance(): # Non-regression test for #4523 # 'brute': uses scipy.spatial.distance through pairwise_distances # 'ball_tree': uses sklearn.neighbors._dist_metrics rng = np.random.RandomState(0) X = rng.uniform(size=(6, 5)) NN = neighbors.NearestNeighbors nn1 = NN(metric="jaccard", algorithm='brute').fit(X) nn2 = NN(metric="jaccard", algorithm='ball_tree').fit(X) assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) def test_radius_neighbors_predict_proba(): for seed in range(5): X, y = datasets.make_classification(n_samples=50, n_features=5, n_informative=3, n_redundant=0, n_classes=3, random_state=seed) X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0) outlier_label = int(2 - seed) clf = neighbors.RadiusNeighborsClassifier(radius=2, outlier_label=outlier_label) clf.fit(X_tr, y_tr) pred = clf.predict(X_te) proba = clf.predict_proba(X_te) proba_label = proba.argmax(axis=1) proba_label = np.where(proba.sum(axis=1) == 0, outlier_label, proba_label) assert_array_equal(pred, proba_label) def test_pipeline_with_nearest_neighbors_transformer(): # Test chaining KNeighborsTransformer and classifiers/regressors rng = np.random.RandomState(0) X = 2 * rng.rand(40, 5) - 1 X2 = 2 * rng.rand(40, 5) - 1 y = rng.rand(40, 1) n_neighbors = 12 radius = 1.5 # We precompute more neighbors than necessary, to have equivalence between # k-neighbors estimator after radius-neighbors transformer, and vice-versa. factor = 2 k_trans = neighbors.KNeighborsTransformer( n_neighbors=n_neighbors, mode='distance') k_trans_factor = neighbors.KNeighborsTransformer( n_neighbors=int(n_neighbors * factor), mode='distance') r_trans = neighbors.RadiusNeighborsTransformer( radius=radius, mode='distance') r_trans_factor = neighbors.RadiusNeighborsTransformer( radius=int(radius * factor), mode='distance') k_reg = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors) r_reg = neighbors.RadiusNeighborsRegressor(radius=radius) test_list = [(k_trans, k_reg), (k_trans_factor, r_reg), (r_trans, r_reg), (r_trans_factor, k_reg), ] for trans, reg in test_list: # compare the chained version and the compact version reg_compact = clone(reg) reg_precomp = clone(reg) reg_precomp.set_params(metric='precomputed') reg_chain = make_pipeline(clone(trans), reg_precomp) y_pred_chain = reg_chain.fit(X, y).predict(X2) y_pred_compact = reg_compact.fit(X, y).predict(X2) assert_array_almost_equal(y_pred_chain, y_pred_compact) @pytest.mark.parametrize('X, metric, metric_params, expected_algo', [ (np.random.randint(10, size=(10, 10)), 'precomputed', None, 'brute'), (np.random.randn(10, 20), 'euclidean', None, 'brute'), (np.random.randn(8, 5), 'euclidean', None, 'brute'), (np.random.randn(10, 5), 'euclidean', None, 'kd_tree'), (np.random.randn(10, 5), 'seuclidean', {'V': [2]*5}, 'ball_tree'), (np.random.randn(10, 5), 'correlation', None, 'brute'), ]) def test_auto_algorithm(X, metric, metric_params, expected_algo): model = neighbors.NearestNeighbors( n_neighbors=4, algorithm='auto', metric=metric, metric_params=metric_params ) model.fit(X) assert model._fit_method == expected_algo # TODO: Remove in 1.1 @pytest.mark.parametrize("NearestNeighbors", [neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors]) def test_pairwise_deprecated(NearestNeighbors): nn = NearestNeighbors(metric='precomputed') msg = r"Attribute _pairwise was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): nn._pairwise