629 lines
22 KiB
Python
629 lines
22 KiB
Python
"""
|
|
Multi-dimensional Scaling (MDS).
|
|
"""
|
|
|
|
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
|
|
# License: BSD
|
|
|
|
from numbers import Integral, Real
|
|
|
|
import numpy as np
|
|
from joblib import effective_n_jobs
|
|
|
|
import warnings
|
|
|
|
from ..base import BaseEstimator
|
|
from ..metrics import euclidean_distances
|
|
from ..utils import check_random_state, check_array, check_symmetric
|
|
from ..isotonic import IsotonicRegression
|
|
from ..utils._param_validation import Interval, StrOptions, Hidden
|
|
from ..utils.parallel import delayed, Parallel
|
|
|
|
|
|
def _smacof_single(
|
|
dissimilarities,
|
|
metric=True,
|
|
n_components=2,
|
|
init=None,
|
|
max_iter=300,
|
|
verbose=0,
|
|
eps=1e-3,
|
|
random_state=None,
|
|
normalized_stress=False,
|
|
):
|
|
"""Computes multidimensional scaling using SMACOF algorithm.
|
|
|
|
Parameters
|
|
----------
|
|
dissimilarities : ndarray of shape (n_samples, n_samples)
|
|
Pairwise dissimilarities between the points. Must be symmetric.
|
|
|
|
metric : bool, default=True
|
|
Compute metric or nonmetric SMACOF algorithm.
|
|
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
|
|
missing values.
|
|
|
|
n_components : int, default=2
|
|
Number of dimensions in which to immerse the dissimilarities. If an
|
|
``init`` array is provided, this option is overridden and the shape of
|
|
``init`` is used to determine the dimensionality of the embedding
|
|
space.
|
|
|
|
init : ndarray of shape (n_samples, n_components), default=None
|
|
Starting configuration of the embedding to initialize the algorithm. By
|
|
default, the algorithm is initialized with a randomly chosen array.
|
|
|
|
max_iter : int, default=300
|
|
Maximum number of iterations of the SMACOF algorithm for a single run.
|
|
|
|
verbose : int, default=0
|
|
Level of verbosity.
|
|
|
|
eps : float, default=1e-3
|
|
Relative tolerance with respect to stress at which to declare
|
|
convergence. The value of `eps` should be tuned separately depending
|
|
on whether or not `normalized_stress` is being used.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines the random number generator used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
normalized_stress : bool, default=False
|
|
Whether use and return normed stress value (Stress-1) instead of raw
|
|
stress calculated by default. Only supported in non-metric MDS. The
|
|
caller must ensure that if `normalized_stress=True` then `metric=False`
|
|
|
|
.. versionadded:: 1.2
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_components)
|
|
Coordinates of the points in a ``n_components``-space.
|
|
|
|
stress : float
|
|
The final value of the stress (sum of squared distance of the
|
|
disparities and the distances for all constrained points).
|
|
If `normalized_stress=True`, and `metric=False` returns Stress-1.
|
|
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
|
|
0.1 fair, and 0.2 poor [1]_.
|
|
|
|
n_iter : int
|
|
The number of iterations corresponding to the best stress.
|
|
|
|
References
|
|
----------
|
|
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
|
Psychometrika, 29 (1964)
|
|
|
|
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
|
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
|
|
|
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
|
Groenen P. Springer Series in Statistics (1997)
|
|
"""
|
|
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
|
|
|
|
n_samples = dissimilarities.shape[0]
|
|
random_state = check_random_state(random_state)
|
|
|
|
sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
|
|
sim_flat_w = sim_flat[sim_flat != 0]
|
|
if init is None:
|
|
# Randomly choose initial configuration
|
|
X = random_state.uniform(size=n_samples * n_components)
|
|
X = X.reshape((n_samples, n_components))
|
|
else:
|
|
# overrides the parameter p
|
|
n_components = init.shape[1]
|
|
if n_samples != init.shape[0]:
|
|
raise ValueError(
|
|
"init matrix should be of shape (%d, %d)" % (n_samples, n_components)
|
|
)
|
|
X = init
|
|
|
|
old_stress = None
|
|
ir = IsotonicRegression()
|
|
for it in range(max_iter):
|
|
# Compute distance and monotonic regression
|
|
dis = euclidean_distances(X)
|
|
|
|
if metric:
|
|
disparities = dissimilarities
|
|
else:
|
|
dis_flat = dis.ravel()
|
|
# dissimilarities with 0 are considered as missing values
|
|
dis_flat_w = dis_flat[sim_flat != 0]
|
|
|
|
# Compute the disparities using a monotonic regression
|
|
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
|
|
disparities = dis_flat.copy()
|
|
disparities[sim_flat != 0] = disparities_flat
|
|
disparities = disparities.reshape((n_samples, n_samples))
|
|
disparities *= np.sqrt(
|
|
(n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
|
|
)
|
|
|
|
# Compute stress
|
|
stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
|
|
if normalized_stress:
|
|
stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2))
|
|
# Update X using the Guttman transform
|
|
dis[dis == 0] = 1e-5
|
|
ratio = disparities / dis
|
|
B = -ratio
|
|
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
|
|
X = 1.0 / n_samples * np.dot(B, X)
|
|
|
|
dis = np.sqrt((X**2).sum(axis=1)).sum()
|
|
if verbose >= 2:
|
|
print("it: %d, stress %s" % (it, stress))
|
|
if old_stress is not None:
|
|
if (old_stress - stress / dis) < eps:
|
|
if verbose:
|
|
print("breaking at iteration %d with stress %s" % (it, stress))
|
|
break
|
|
old_stress = stress / dis
|
|
|
|
return X, stress, it + 1
|
|
|
|
|
|
def smacof(
|
|
dissimilarities,
|
|
*,
|
|
metric=True,
|
|
n_components=2,
|
|
init=None,
|
|
n_init=8,
|
|
n_jobs=None,
|
|
max_iter=300,
|
|
verbose=0,
|
|
eps=1e-3,
|
|
random_state=None,
|
|
return_n_iter=False,
|
|
normalized_stress="warn",
|
|
):
|
|
"""Compute multidimensional scaling using the SMACOF algorithm.
|
|
|
|
The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
|
|
multidimensional scaling algorithm which minimizes an objective function
|
|
(the *stress*) using a majorization technique. Stress majorization, also
|
|
known as the Guttman Transform, guarantees a monotone convergence of
|
|
stress, and is more powerful than traditional techniques such as gradient
|
|
descent.
|
|
|
|
The SMACOF algorithm for metric MDS can be summarized by the following
|
|
steps:
|
|
|
|
1. Set an initial start configuration, randomly or not.
|
|
2. Compute the stress
|
|
3. Compute the Guttman Transform
|
|
4. Iterate 2 and 3 until convergence.
|
|
|
|
The nonmetric algorithm adds a monotonic regression step before computing
|
|
the stress.
|
|
|
|
Parameters
|
|
----------
|
|
dissimilarities : ndarray of shape (n_samples, n_samples)
|
|
Pairwise dissimilarities between the points. Must be symmetric.
|
|
|
|
metric : bool, default=True
|
|
Compute metric or nonmetric SMACOF algorithm.
|
|
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
|
|
missing values.
|
|
|
|
n_components : int, default=2
|
|
Number of dimensions in which to immerse the dissimilarities. If an
|
|
``init`` array is provided, this option is overridden and the shape of
|
|
``init`` is used to determine the dimensionality of the embedding
|
|
space.
|
|
|
|
init : ndarray of shape (n_samples, n_components), default=None
|
|
Starting configuration of the embedding to initialize the algorithm. By
|
|
default, the algorithm is initialized with a randomly chosen array.
|
|
|
|
n_init : int, default=8
|
|
Number of times the SMACOF algorithm will be run with different
|
|
initializations. The final results will be the best output of the runs,
|
|
determined by the run with the smallest final stress. If ``init`` is
|
|
provided, this option is overridden and a single run is performed.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to use for the computation. If multiple
|
|
initializations are used (``n_init``), each run of the algorithm is
|
|
computed in parallel.
|
|
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
max_iter : int, default=300
|
|
Maximum number of iterations of the SMACOF algorithm for a single run.
|
|
|
|
verbose : int, default=0
|
|
Level of verbosity.
|
|
|
|
eps : float, default=1e-3
|
|
Relative tolerance with respect to stress at which to declare
|
|
convergence. The value of `eps` should be tuned separately depending
|
|
on whether or not `normalized_stress` is being used.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines the random number generator used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
return_n_iter : bool, default=False
|
|
Whether or not to return the number of iterations.
|
|
|
|
normalized_stress : bool or "auto" default=False
|
|
Whether use and return normed stress value (Stress-1) instead of raw
|
|
stress calculated by default. Only supported in non-metric MDS.
|
|
|
|
.. versionadded:: 1.2
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_components)
|
|
Coordinates of the points in a ``n_components``-space.
|
|
|
|
stress : float
|
|
The final value of the stress (sum of squared distance of the
|
|
disparities and the distances for all constrained points).
|
|
If `normalized_stress=True`, and `metric=False` returns Stress-1.
|
|
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
|
|
0.1 fair, and 0.2 poor [1]_.
|
|
|
|
n_iter : int
|
|
The number of iterations corresponding to the best stress. Returned
|
|
only if ``return_n_iter`` is set to ``True``.
|
|
|
|
References
|
|
----------
|
|
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
|
Psychometrika, 29 (1964)
|
|
|
|
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
|
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
|
|
|
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
|
Groenen P. Springer Series in Statistics (1997)
|
|
"""
|
|
|
|
dissimilarities = check_array(dissimilarities)
|
|
random_state = check_random_state(random_state)
|
|
|
|
# TODO(1.4): Remove
|
|
if normalized_stress == "warn":
|
|
warnings.warn(
|
|
"The default value of `normalized_stress` will change to `'auto'` in"
|
|
" version 1.4. To suppress this warning, manually set the value of"
|
|
" `normalized_stress`.",
|
|
FutureWarning,
|
|
)
|
|
normalized_stress = False
|
|
|
|
if normalized_stress == "auto":
|
|
normalized_stress = not metric
|
|
|
|
if normalized_stress and metric:
|
|
raise ValueError(
|
|
"Normalized stress is not supported for metric MDS. Either set"
|
|
" `normalized_stress=False` or use `metric=False`."
|
|
)
|
|
if hasattr(init, "__array__"):
|
|
init = np.asarray(init).copy()
|
|
if not n_init == 1:
|
|
warnings.warn(
|
|
"Explicit initial positions passed: "
|
|
"performing only one init of the MDS instead of %d" % n_init
|
|
)
|
|
n_init = 1
|
|
|
|
best_pos, best_stress = None, None
|
|
|
|
if effective_n_jobs(n_jobs) == 1:
|
|
for it in range(n_init):
|
|
pos, stress, n_iter_ = _smacof_single(
|
|
dissimilarities,
|
|
metric=metric,
|
|
n_components=n_components,
|
|
init=init,
|
|
max_iter=max_iter,
|
|
verbose=verbose,
|
|
eps=eps,
|
|
random_state=random_state,
|
|
normalized_stress=normalized_stress,
|
|
)
|
|
if best_stress is None or stress < best_stress:
|
|
best_stress = stress
|
|
best_pos = pos.copy()
|
|
best_iter = n_iter_
|
|
else:
|
|
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
|
|
results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
|
|
delayed(_smacof_single)(
|
|
dissimilarities,
|
|
metric=metric,
|
|
n_components=n_components,
|
|
init=init,
|
|
max_iter=max_iter,
|
|
verbose=verbose,
|
|
eps=eps,
|
|
random_state=seed,
|
|
normalized_stress=normalized_stress,
|
|
)
|
|
for seed in seeds
|
|
)
|
|
positions, stress, n_iters = zip(*results)
|
|
best = np.argmin(stress)
|
|
best_stress = stress[best]
|
|
best_pos = positions[best]
|
|
best_iter = n_iters[best]
|
|
|
|
if return_n_iter:
|
|
return best_pos, best_stress, best_iter
|
|
else:
|
|
return best_pos, best_stress
|
|
|
|
|
|
class MDS(BaseEstimator):
|
|
"""Multidimensional scaling.
|
|
|
|
Read more in the :ref:`User Guide <multidimensional_scaling>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_components : int, default=2
|
|
Number of dimensions in which to immerse the dissimilarities.
|
|
|
|
metric : bool, default=True
|
|
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
|
|
When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
|
|
missing values.
|
|
|
|
n_init : int, default=4
|
|
Number of times the SMACOF algorithm will be run with different
|
|
initializations. The final results will be the best output of the runs,
|
|
determined by the run with the smallest final stress.
|
|
|
|
max_iter : int, default=300
|
|
Maximum number of iterations of the SMACOF algorithm for a single run.
|
|
|
|
verbose : int, default=0
|
|
Level of verbosity.
|
|
|
|
eps : float, default=1e-3
|
|
Relative tolerance with respect to stress at which to declare
|
|
convergence. The value of `eps` should be tuned separately depending
|
|
on whether or not `normalized_stress` is being used.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to use for the computation. If multiple
|
|
initializations are used (``n_init``), each run of the algorithm is
|
|
computed in parallel.
|
|
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines the random number generator used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term:`Glossary <random_state>`.
|
|
|
|
dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
|
|
Dissimilarity measure to use:
|
|
|
|
- 'euclidean':
|
|
Pairwise Euclidean distances between points in the dataset.
|
|
|
|
- 'precomputed':
|
|
Pre-computed dissimilarities are passed directly to ``fit`` and
|
|
``fit_transform``.
|
|
|
|
normalized_stress : bool or "auto" default=False
|
|
Whether use and return normed stress value (Stress-1) instead of raw
|
|
stress calculated by default. Only supported in non-metric MDS.
|
|
|
|
.. versionadded:: 1.2
|
|
|
|
Attributes
|
|
----------
|
|
embedding_ : ndarray of shape (n_samples, n_components)
|
|
Stores the position of the dataset in the embedding space.
|
|
|
|
stress_ : float
|
|
The final value of the stress (sum of squared distance of the
|
|
disparities and the distances for all constrained points).
|
|
If `normalized_stress=True`, and `metric=False` returns Stress-1.
|
|
A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
|
|
0.1 fair, and 0.2 poor [1]_.
|
|
|
|
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
|
|
Pairwise dissimilarities between the points. Symmetric matrix that:
|
|
|
|
- either uses a custom dissimilarity matrix by setting `dissimilarity`
|
|
to 'precomputed';
|
|
- or constructs a dissimilarity matrix from data using
|
|
Euclidean distances.
|
|
|
|
n_features_in_ : int
|
|
Number of features seen during :term:`fit`.
|
|
|
|
.. versionadded:: 0.24
|
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
|
Names of features seen during :term:`fit`. Defined only when `X`
|
|
has feature names that are all strings.
|
|
|
|
.. versionadded:: 1.0
|
|
|
|
n_iter_ : int
|
|
The number of iterations corresponding to the best stress.
|
|
|
|
See Also
|
|
--------
|
|
sklearn.decomposition.PCA : Principal component analysis that is a linear
|
|
dimensionality reduction method.
|
|
sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
|
|
kernels and PCA.
|
|
TSNE : T-distributed Stochastic Neighbor Embedding.
|
|
Isomap : Manifold learning based on Isometric Mapping.
|
|
LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
|
|
SpectralEmbedding : Spectral embedding for non-linear dimensionality.
|
|
|
|
References
|
|
----------
|
|
.. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
|
Psychometrika, 29 (1964)
|
|
|
|
.. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
|
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
|
|
|
.. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
|
Groenen P. Springer Series in Statistics (1997)
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_digits
|
|
>>> from sklearn.manifold import MDS
|
|
>>> X, _ = load_digits(return_X_y=True)
|
|
>>> X.shape
|
|
(1797, 64)
|
|
>>> embedding = MDS(n_components=2, normalized_stress='auto')
|
|
>>> X_transformed = embedding.fit_transform(X[:100])
|
|
>>> X_transformed.shape
|
|
(100, 2)
|
|
"""
|
|
|
|
_parameter_constraints: dict = {
|
|
"n_components": [Interval(Integral, 1, None, closed="left")],
|
|
"metric": ["boolean"],
|
|
"n_init": [Interval(Integral, 1, None, closed="left")],
|
|
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
|
"verbose": ["verbose"],
|
|
"eps": [Interval(Real, 0.0, None, closed="left")],
|
|
"n_jobs": [None, Integral],
|
|
"random_state": ["random_state"],
|
|
"dissimilarity": [StrOptions({"euclidean", "precomputed"})],
|
|
"normalized_stress": [
|
|
"boolean",
|
|
StrOptions({"auto"}),
|
|
Hidden(StrOptions({"warn"})),
|
|
],
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
n_components=2,
|
|
*,
|
|
metric=True,
|
|
n_init=4,
|
|
max_iter=300,
|
|
verbose=0,
|
|
eps=1e-3,
|
|
n_jobs=None,
|
|
random_state=None,
|
|
dissimilarity="euclidean",
|
|
normalized_stress="warn",
|
|
):
|
|
self.n_components = n_components
|
|
self.dissimilarity = dissimilarity
|
|
self.metric = metric
|
|
self.n_init = n_init
|
|
self.max_iter = max_iter
|
|
self.eps = eps
|
|
self.verbose = verbose
|
|
self.n_jobs = n_jobs
|
|
self.random_state = random_state
|
|
self.normalized_stress = normalized_stress
|
|
|
|
def _more_tags(self):
|
|
return {"pairwise": self.dissimilarity == "precomputed"}
|
|
|
|
def fit(self, X, y=None, init=None):
|
|
"""
|
|
Compute the position of the points in the embedding space.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or \
|
|
(n_samples, n_samples)
|
|
Input data. If ``dissimilarity=='precomputed'``, the input should
|
|
be the dissimilarity matrix.
|
|
|
|
y : Ignored
|
|
Not used, present for API consistency by convention.
|
|
|
|
init : ndarray of shape (n_samples, n_components), default=None
|
|
Starting configuration of the embedding to initialize the SMACOF
|
|
algorithm. By default, the algorithm is initialized with a randomly
|
|
chosen array.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
Fitted estimator.
|
|
"""
|
|
# parameter will be validated in `fit_transform` call
|
|
self.fit_transform(X, init=init)
|
|
return self
|
|
|
|
def fit_transform(self, X, y=None, init=None):
|
|
"""
|
|
Fit the data from `X`, and returns the embedded coordinates.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or \
|
|
(n_samples, n_samples)
|
|
Input data. If ``dissimilarity=='precomputed'``, the input should
|
|
be the dissimilarity matrix.
|
|
|
|
y : Ignored
|
|
Not used, present for API consistency by convention.
|
|
|
|
init : ndarray of shape (n_samples, n_components), default=None
|
|
Starting configuration of the embedding to initialize the SMACOF
|
|
algorithm. By default, the algorithm is initialized with a randomly
|
|
chosen array.
|
|
|
|
Returns
|
|
-------
|
|
X_new : ndarray of shape (n_samples, n_components)
|
|
X transformed in the new space.
|
|
"""
|
|
self._validate_params()
|
|
X = self._validate_data(X)
|
|
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
|
|
warnings.warn(
|
|
"The MDS API has changed. ``fit`` now constructs an"
|
|
" dissimilarity matrix from data. To use a custom "
|
|
"dissimilarity matrix, set "
|
|
"``dissimilarity='precomputed'``."
|
|
)
|
|
|
|
if self.dissimilarity == "precomputed":
|
|
self.dissimilarity_matrix_ = X
|
|
elif self.dissimilarity == "euclidean":
|
|
self.dissimilarity_matrix_ = euclidean_distances(X)
|
|
|
|
self.embedding_, self.stress_, self.n_iter_ = smacof(
|
|
self.dissimilarity_matrix_,
|
|
metric=self.metric,
|
|
n_components=self.n_components,
|
|
init=init,
|
|
n_init=self.n_init,
|
|
n_jobs=self.n_jobs,
|
|
max_iter=self.max_iter,
|
|
verbose=self.verbose,
|
|
eps=self.eps,
|
|
random_state=self.random_state,
|
|
return_n_iter=True,
|
|
normalized_stress=self.normalized_stress,
|
|
)
|
|
|
|
return self.embedding_
|