429 lines
15 KiB
Python
429 lines
15 KiB
Python
"""Isomap for manifold learning"""
|
|
|
|
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
|
|
# License: BSD 3 clause (C) 2011
|
|
import warnings
|
|
|
|
import numpy as np
|
|
from numbers import Integral, Real
|
|
|
|
from scipy.sparse import issparse
|
|
from scipy.sparse.csgraph import shortest_path
|
|
from scipy.sparse.csgraph import connected_components
|
|
|
|
from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
|
|
from ..neighbors import NearestNeighbors, kneighbors_graph
|
|
from ..neighbors import radius_neighbors_graph
|
|
from ..utils.validation import check_is_fitted
|
|
from ..decomposition import KernelPCA
|
|
from ..preprocessing import KernelCenterer
|
|
from ..utils.graph import _fix_connected_components
|
|
from ..utils._param_validation import Interval, StrOptions
|
|
from ..metrics.pairwise import _VALID_METRICS
|
|
|
|
|
|
class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
|
|
"""Isomap Embedding.
|
|
|
|
Non-linear dimensionality reduction through Isometric Mapping
|
|
|
|
Read more in the :ref:`User Guide <isomap>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_neighbors : int or None, default=5
|
|
Number of neighbors to consider for each point. If `n_neighbors` is an int,
|
|
then `radius` must be `None`.
|
|
|
|
radius : float or None, default=None
|
|
Limiting distance of neighbors to return. If `radius` is a float,
|
|
then `n_neighbors` must be set to `None`.
|
|
|
|
.. versionadded:: 1.1
|
|
|
|
n_components : int, default=2
|
|
Number of coordinates for the manifold.
|
|
|
|
eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
|
|
'auto' : Attempt to choose the most efficient solver
|
|
for the given problem.
|
|
|
|
'arpack' : Use Arnoldi decomposition to find the eigenvalues
|
|
and eigenvectors.
|
|
|
|
'dense' : Use a direct solver (i.e. LAPACK)
|
|
for the eigenvalue decomposition.
|
|
|
|
tol : float, default=0
|
|
Convergence tolerance passed to arpack or lobpcg.
|
|
not used if eigen_solver == 'dense'.
|
|
|
|
max_iter : int, default=None
|
|
Maximum number of iterations for the arpack solver.
|
|
not used if eigen_solver == 'dense'.
|
|
|
|
path_method : {'auto', 'FW', 'D'}, default='auto'
|
|
Method to use in finding shortest path.
|
|
|
|
'auto' : attempt to choose the best algorithm automatically.
|
|
|
|
'FW' : Floyd-Warshall algorithm.
|
|
|
|
'D' : Dijkstra's algorithm.
|
|
|
|
neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
|
|
default='auto'
|
|
Algorithm to use for nearest neighbors search,
|
|
passed to neighbors.NearestNeighbors instance.
|
|
|
|
n_jobs : int or None, default=None
|
|
The number of parallel jobs to run.
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
metric : str, or callable, default="minkowski"
|
|
The metric to use when calculating distance between instances in a
|
|
feature array. If metric is a string or callable, it must be one of
|
|
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
|
|
its metric parameter.
|
|
If metric is "precomputed", X is assumed to be a distance matrix and
|
|
must be square. X may be a :term:`Glossary <sparse graph>`.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
p : int, default=2
|
|
Parameter for the Minkowski metric from
|
|
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
|
equivalent to using manhattan_distance (l1), and euclidean_distance
|
|
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
metric_params : dict, default=None
|
|
Additional keyword arguments for the metric function.
|
|
|
|
.. versionadded:: 0.22
|
|
|
|
Attributes
|
|
----------
|
|
embedding_ : array-like, shape (n_samples, n_components)
|
|
Stores the embedding vectors.
|
|
|
|
kernel_pca_ : object
|
|
:class:`~sklearn.decomposition.KernelPCA` object used to implement the
|
|
embedding.
|
|
|
|
nbrs_ : sklearn.neighbors.NearestNeighbors instance
|
|
Stores nearest neighbors instance, including BallTree or KDtree
|
|
if applicable.
|
|
|
|
dist_matrix_ : array-like, shape (n_samples, n_samples)
|
|
Stores the geodesic distance matrix of training data.
|
|
|
|
n_features_in_ : int
|
|
Number of features seen during :term:`fit`.
|
|
|
|
.. versionadded:: 0.24
|
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
|
Names of features seen during :term:`fit`. Defined only when `X`
|
|
has feature names that are all strings.
|
|
|
|
.. versionadded:: 1.0
|
|
|
|
See Also
|
|
--------
|
|
sklearn.decomposition.PCA : Principal component analysis that is a linear
|
|
dimensionality reduction method.
|
|
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
|
|
kernels and PCA.
|
|
MDS : Manifold learning using multidimensional scaling.
|
|
TSNE : T-distributed Stochastic Neighbor Embedding.
|
|
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
|
|
SpectralEmbedding : Spectral embedding for non-linear dimensionality.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
|
|
framework for nonlinear dimensionality reduction. Science 290 (5500)
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_digits
|
|
>>> from sklearn.manifold import Isomap
|
|
>>> X, _ = load_digits(return_X_y=True)
|
|
>>> X.shape
|
|
(1797, 64)
|
|
>>> embedding = Isomap(n_components=2)
|
|
>>> X_transformed = embedding.fit_transform(X[:100])
|
|
>>> X_transformed.shape
|
|
(100, 2)
|
|
"""
|
|
|
|
_parameter_constraints: dict = {
|
|
"n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
|
|
"radius": [Interval(Real, 0, None, closed="both"), None],
|
|
"n_components": [Interval(Integral, 1, None, closed="left")],
|
|
"eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
|
|
"tol": [Interval(Real, 0, None, closed="left")],
|
|
"max_iter": [Interval(Integral, 1, None, closed="left"), None],
|
|
"path_method": [StrOptions({"auto", "FW", "D"})],
|
|
"neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
|
|
"n_jobs": [Integral, None],
|
|
"p": [Interval(Real, 1, None, closed="left")],
|
|
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
|
|
"metric_params": [dict, None],
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
*,
|
|
n_neighbors=5,
|
|
radius=None,
|
|
n_components=2,
|
|
eigen_solver="auto",
|
|
tol=0,
|
|
max_iter=None,
|
|
path_method="auto",
|
|
neighbors_algorithm="auto",
|
|
n_jobs=None,
|
|
metric="minkowski",
|
|
p=2,
|
|
metric_params=None,
|
|
):
|
|
self.n_neighbors = n_neighbors
|
|
self.radius = radius
|
|
self.n_components = n_components
|
|
self.eigen_solver = eigen_solver
|
|
self.tol = tol
|
|
self.max_iter = max_iter
|
|
self.path_method = path_method
|
|
self.neighbors_algorithm = neighbors_algorithm
|
|
self.n_jobs = n_jobs
|
|
self.metric = metric
|
|
self.p = p
|
|
self.metric_params = metric_params
|
|
|
|
def _fit_transform(self, X):
|
|
if self.n_neighbors is not None and self.radius is not None:
|
|
raise ValueError(
|
|
"Both n_neighbors and radius are provided. Use"
|
|
f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
|
|
" radius-based neighbors"
|
|
)
|
|
|
|
self.nbrs_ = NearestNeighbors(
|
|
n_neighbors=self.n_neighbors,
|
|
radius=self.radius,
|
|
algorithm=self.neighbors_algorithm,
|
|
metric=self.metric,
|
|
p=self.p,
|
|
metric_params=self.metric_params,
|
|
n_jobs=self.n_jobs,
|
|
)
|
|
self.nbrs_.fit(X)
|
|
self.n_features_in_ = self.nbrs_.n_features_in_
|
|
if hasattr(self.nbrs_, "feature_names_in_"):
|
|
self.feature_names_in_ = self.nbrs_.feature_names_in_
|
|
|
|
self.kernel_pca_ = KernelPCA(
|
|
n_components=self.n_components,
|
|
kernel="precomputed",
|
|
eigen_solver=self.eigen_solver,
|
|
tol=self.tol,
|
|
max_iter=self.max_iter,
|
|
n_jobs=self.n_jobs,
|
|
)
|
|
|
|
if self.n_neighbors is not None:
|
|
nbg = kneighbors_graph(
|
|
self.nbrs_,
|
|
self.n_neighbors,
|
|
metric=self.metric,
|
|
p=self.p,
|
|
metric_params=self.metric_params,
|
|
mode="distance",
|
|
n_jobs=self.n_jobs,
|
|
)
|
|
else:
|
|
nbg = radius_neighbors_graph(
|
|
self.nbrs_,
|
|
radius=self.radius,
|
|
metric=self.metric,
|
|
p=self.p,
|
|
metric_params=self.metric_params,
|
|
mode="distance",
|
|
n_jobs=self.n_jobs,
|
|
)
|
|
|
|
# Compute the number of connected components, and connect the different
|
|
# components to be able to compute a shortest path between all pairs
|
|
# of samples in the graph.
|
|
# Similar fix to cluster._agglomerative._fix_connectivity.
|
|
n_connected_components, labels = connected_components(nbg)
|
|
if n_connected_components > 1:
|
|
if self.metric == "precomputed" and issparse(X):
|
|
raise RuntimeError(
|
|
"The number of connected components of the neighbors graph"
|
|
f" is {n_connected_components} > 1. The graph cannot be "
|
|
"completed with metric='precomputed', and Isomap cannot be"
|
|
"fitted. Increase the number of neighbors to avoid this "
|
|
"issue, or precompute the full distance matrix instead "
|
|
"of passing a sparse neighbors graph."
|
|
)
|
|
warnings.warn(
|
|
"The number of connected components of the neighbors graph "
|
|
f"is {n_connected_components} > 1. Completing the graph to fit"
|
|
" Isomap might be slow. Increase the number of neighbors to "
|
|
"avoid this issue.",
|
|
stacklevel=2,
|
|
)
|
|
|
|
# use array validated by NearestNeighbors
|
|
nbg = _fix_connected_components(
|
|
X=self.nbrs_._fit_X,
|
|
graph=nbg,
|
|
n_connected_components=n_connected_components,
|
|
component_labels=labels,
|
|
mode="distance",
|
|
metric=self.nbrs_.effective_metric_,
|
|
**self.nbrs_.effective_metric_params_,
|
|
)
|
|
|
|
self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)
|
|
|
|
if self.nbrs_._fit_X.dtype == np.float32:
|
|
self.dist_matrix_ = self.dist_matrix_.astype(
|
|
self.nbrs_._fit_X.dtype, copy=False
|
|
)
|
|
|
|
G = self.dist_matrix_**2
|
|
G *= -0.5
|
|
|
|
self.embedding_ = self.kernel_pca_.fit_transform(G)
|
|
self._n_features_out = self.embedding_.shape[1]
|
|
|
|
def reconstruction_error(self):
|
|
"""Compute the reconstruction error for the embedding.
|
|
|
|
Returns
|
|
-------
|
|
reconstruction_error : float
|
|
Reconstruction error.
|
|
|
|
Notes
|
|
-----
|
|
The cost function of an isomap embedding is
|
|
|
|
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
|
|
|
|
Where D is the matrix of distances for the input data X,
|
|
D_fit is the matrix of distances for the output embedding X_fit,
|
|
and K is the isomap kernel:
|
|
|
|
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
|
|
"""
|
|
G = -0.5 * self.dist_matrix_**2
|
|
G_center = KernelCenterer().fit_transform(G)
|
|
evals = self.kernel_pca_.eigenvalues_
|
|
return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]
|
|
|
|
def fit(self, X, y=None):
|
|
"""Compute the embedding vectors for data X.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
|
|
Sample data, shape = (n_samples, n_features), in the form of a
|
|
numpy array, sparse matrix, precomputed tree, or NearestNeighbors
|
|
object.
|
|
|
|
y : Ignored
|
|
Not used, present for API consistency by convention.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
Returns a fitted instance of self.
|
|
"""
|
|
self._validate_params()
|
|
self._fit_transform(X)
|
|
return self
|
|
|
|
def fit_transform(self, X, y=None):
|
|
"""Fit the model from data in X and transform X.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix, BallTree, KDTree}
|
|
Training vector, where `n_samples` is the number of samples
|
|
and `n_features` is the number of features.
|
|
|
|
y : Ignored
|
|
Not used, present for API consistency by convention.
|
|
|
|
Returns
|
|
-------
|
|
X_new : array-like, shape (n_samples, n_components)
|
|
X transformed in the new space.
|
|
"""
|
|
self._validate_params()
|
|
self._fit_transform(X)
|
|
return self.embedding_
|
|
|
|
def transform(self, X):
|
|
"""Transform X.
|
|
|
|
This is implemented by linking the points X into the graph of geodesic
|
|
distances of the training data. First the `n_neighbors` nearest
|
|
neighbors of X are found in the training data, and from these the
|
|
shortest geodesic distances from each point in X to each point in
|
|
the training data are computed in order to construct the kernel.
|
|
The embedding of X is the projection of this kernel onto the
|
|
embedding vectors of the training set.
|
|
|
|
Parameters
|
|
----------
|
|
X : {array-like, sparse matrix}, shape (n_queries, n_features)
|
|
If neighbors_algorithm='precomputed', X is assumed to be a
|
|
distance matrix or a sparse graph of shape
|
|
(n_queries, n_samples_fit).
|
|
|
|
Returns
|
|
-------
|
|
X_new : array-like, shape (n_queries, n_components)
|
|
X transformed in the new space.
|
|
"""
|
|
check_is_fitted(self)
|
|
if self.n_neighbors is not None:
|
|
distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
|
|
else:
|
|
distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)
|
|
|
|
# Create the graph of shortest distances from X to
|
|
# training data via the nearest neighbors of X.
|
|
# This can be done as a single array operation, but it potentially
|
|
# takes a lot of memory. To avoid that, use a loop:
|
|
|
|
n_samples_fit = self.nbrs_.n_samples_fit_
|
|
n_queries = distances.shape[0]
|
|
|
|
if hasattr(X, "dtype") and X.dtype == np.float32:
|
|
dtype = np.float32
|
|
else:
|
|
dtype = np.float64
|
|
|
|
G_X = np.zeros((n_queries, n_samples_fit), dtype)
|
|
for i in range(n_queries):
|
|
G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)
|
|
|
|
G_X **= 2
|
|
G_X *= -0.5
|
|
|
|
return self.kernel_pca_.transform(G_X)
|
|
|
|
def _more_tags(self):
|
|
return {"preserves_dtype": [np.float64, np.float32]}
|