462 lines
17 KiB
Python
462 lines
17 KiB
Python
"""
|
|
Multi-dimensional Scaling (MDS).
|
|
"""
|
|
|
|
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
|
|
# License: BSD
|
|
|
|
import numpy as np
|
|
from joblib import Parallel, effective_n_jobs
|
|
|
|
import warnings
|
|
|
|
from ..base import BaseEstimator
|
|
from ..metrics import euclidean_distances
|
|
from ..utils import check_random_state, check_array, check_symmetric
|
|
from ..isotonic import IsotonicRegression
|
|
from ..utils.validation import _deprecate_positional_args
|
|
from ..utils.deprecation import deprecated
|
|
from ..utils.fixes import delayed
|
|
|
|
|
|
def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
|
|
max_iter=300, verbose=0, eps=1e-3, random_state=None):
|
|
"""Computes multidimensional scaling using SMACOF algorithm.
|
|
|
|
Parameters
|
|
----------
|
|
dissimilarities : ndarray of shape (n_samples, n_samples)
|
|
Pairwise dissimilarities between the points. Must be symmetric.
|
|
|
|
metric : bool, default=True
|
|
Compute metric or nonmetric SMACOF algorithm.
|
|
|
|
n_components : int, default=2
|
|
Number of dimensions in which to immerse the dissimilarities. If an
|
|
``init`` array is provided, this option is overridden and the shape of
|
|
``init`` is used to determine the dimensionality of the embedding
|
|
space.
|
|
|
|
init : ndarray of shape (n_samples, n_components), default=None
|
|
Starting configuration of the embedding to initialize the algorithm. By
|
|
default, the algorithm is initialized with a randomly chosen array.
|
|
|
|
max_iter : int, default=300
|
|
Maximum number of iterations of the SMACOF algorithm for a single run.
|
|
|
|
verbose : int, default=0
|
|
Level of verbosity.
|
|
|
|
eps : float, default=1e-3
|
|
Relative tolerance with respect to stress at which to declare
|
|
convergence.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines the random number generator used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term: `Glossary <random_state>`.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_components)
|
|
Coordinates of the points in a ``n_components``-space.
|
|
|
|
stress : float
|
|
The final value of the stress (sum of squared distance of the
|
|
disparities and the distances for all constrained points).
|
|
|
|
n_iter : int
|
|
The number of iterations corresponding to the best stress.
|
|
"""
|
|
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
|
|
|
|
n_samples = dissimilarities.shape[0]
|
|
random_state = check_random_state(random_state)
|
|
|
|
sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
|
|
sim_flat_w = sim_flat[sim_flat != 0]
|
|
if init is None:
|
|
# Randomly choose initial configuration
|
|
X = random_state.rand(n_samples * n_components)
|
|
X = X.reshape((n_samples, n_components))
|
|
else:
|
|
# overrides the parameter p
|
|
n_components = init.shape[1]
|
|
if n_samples != init.shape[0]:
|
|
raise ValueError("init matrix should be of shape (%d, %d)" %
|
|
(n_samples, n_components))
|
|
X = init
|
|
|
|
old_stress = None
|
|
ir = IsotonicRegression()
|
|
for it in range(max_iter):
|
|
# Compute distance and monotonic regression
|
|
dis = euclidean_distances(X)
|
|
|
|
if metric:
|
|
disparities = dissimilarities
|
|
else:
|
|
dis_flat = dis.ravel()
|
|
# dissimilarities with 0 are considered as missing values
|
|
dis_flat_w = dis_flat[sim_flat != 0]
|
|
|
|
# Compute the disparities using a monotonic regression
|
|
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
|
|
disparities = dis_flat.copy()
|
|
disparities[sim_flat != 0] = disparities_flat
|
|
disparities = disparities.reshape((n_samples, n_samples))
|
|
disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
|
|
(disparities ** 2).sum())
|
|
|
|
# Compute stress
|
|
stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
|
|
|
|
# Update X using the Guttman transform
|
|
dis[dis == 0] = 1e-5
|
|
ratio = disparities / dis
|
|
B = - ratio
|
|
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
|
|
X = 1. / n_samples * np.dot(B, X)
|
|
|
|
dis = np.sqrt((X ** 2).sum(axis=1)).sum()
|
|
if verbose >= 2:
|
|
print('it: %d, stress %s' % (it, stress))
|
|
if old_stress is not None:
|
|
if(old_stress - stress / dis) < eps:
|
|
if verbose:
|
|
print('breaking at iteration %d with stress %s' % (it,
|
|
stress))
|
|
break
|
|
old_stress = stress / dis
|
|
|
|
return X, stress, it + 1
|
|
|
|
|
|
@_deprecate_positional_args
|
|
def smacof(dissimilarities, *, metric=True, n_components=2, init=None,
|
|
n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3,
|
|
random_state=None, return_n_iter=False):
|
|
"""Computes multidimensional scaling using the SMACOF algorithm.
|
|
|
|
The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
|
|
multidimensional scaling algorithm which minimizes an objective function
|
|
(the *stress*) using a majorization technique. Stress majorization, also
|
|
known as the Guttman Transform, guarantees a monotone convergence of
|
|
stress, and is more powerful than traditional techniques such as gradient
|
|
descent.
|
|
|
|
The SMACOF algorithm for metric MDS can summarized by the following steps:
|
|
|
|
1. Set an initial start configuration, randomly or not.
|
|
2. Compute the stress
|
|
3. Compute the Guttman Transform
|
|
4. Iterate 2 and 3 until convergence.
|
|
|
|
The nonmetric algorithm adds a monotonic regression step before computing
|
|
the stress.
|
|
|
|
Parameters
|
|
----------
|
|
dissimilarities : ndarray of shape (n_samples, n_samples)
|
|
Pairwise dissimilarities between the points. Must be symmetric.
|
|
|
|
metric : bool, default=True
|
|
Compute metric or nonmetric SMACOF algorithm.
|
|
|
|
n_components : int, default=2
|
|
Number of dimensions in which to immerse the dissimilarities. If an
|
|
``init`` array is provided, this option is overridden and the shape of
|
|
``init`` is used to determine the dimensionality of the embedding
|
|
space.
|
|
|
|
init : ndarray of shape (n_samples, n_components), default=None
|
|
Starting configuration of the embedding to initialize the algorithm. By
|
|
default, the algorithm is initialized with a randomly chosen array.
|
|
|
|
n_init : int, default=8
|
|
Number of times the SMACOF algorithm will be run with different
|
|
initializations. The final results will be the best output of the runs,
|
|
determined by the run with the smallest final stress. If ``init`` is
|
|
provided, this option is overridden and a single run is performed.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to use for the computation. If multiple
|
|
initializations are used (``n_init``), each run of the algorithm is
|
|
computed in parallel.
|
|
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
max_iter : int, default=300
|
|
Maximum number of iterations of the SMACOF algorithm for a single run.
|
|
|
|
verbose : int, default=0
|
|
Level of verbosity.
|
|
|
|
eps : float, default=1e-3
|
|
Relative tolerance with respect to stress at which to declare
|
|
convergence.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines the random number generator used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term: `Glossary <random_state>`.
|
|
|
|
return_n_iter : bool, default=False
|
|
Whether or not to return the number of iterations.
|
|
|
|
Returns
|
|
-------
|
|
X : ndarray of shape (n_samples, n_components)
|
|
Coordinates of the points in a ``n_components``-space.
|
|
|
|
stress : float
|
|
The final value of the stress (sum of squared distance of the
|
|
disparities and the distances for all constrained points).
|
|
|
|
n_iter : int
|
|
The number of iterations corresponding to the best stress. Returned
|
|
only if ``return_n_iter`` is set to ``True``.
|
|
|
|
Notes
|
|
-----
|
|
"Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
|
Groenen P. Springer Series in Statistics (1997)
|
|
|
|
"Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
|
Psychometrika, 29 (1964)
|
|
|
|
"Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
|
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
|
"""
|
|
|
|
dissimilarities = check_array(dissimilarities)
|
|
random_state = check_random_state(random_state)
|
|
|
|
if hasattr(init, '__array__'):
|
|
init = np.asarray(init).copy()
|
|
if not n_init == 1:
|
|
warnings.warn(
|
|
'Explicit initial positions passed: '
|
|
'performing only one init of the MDS instead of %d'
|
|
% n_init)
|
|
n_init = 1
|
|
|
|
best_pos, best_stress = None, None
|
|
|
|
if effective_n_jobs(n_jobs) == 1:
|
|
for it in range(n_init):
|
|
pos, stress, n_iter_ = _smacof_single(
|
|
dissimilarities, metric=metric,
|
|
n_components=n_components, init=init,
|
|
max_iter=max_iter, verbose=verbose,
|
|
eps=eps, random_state=random_state)
|
|
if best_stress is None or stress < best_stress:
|
|
best_stress = stress
|
|
best_pos = pos.copy()
|
|
best_iter = n_iter_
|
|
else:
|
|
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
|
|
results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
|
|
delayed(_smacof_single)(
|
|
dissimilarities, metric=metric, n_components=n_components,
|
|
init=init, max_iter=max_iter, verbose=verbose, eps=eps,
|
|
random_state=seed)
|
|
for seed in seeds)
|
|
positions, stress, n_iters = zip(*results)
|
|
best = np.argmin(stress)
|
|
best_stress = stress[best]
|
|
best_pos = positions[best]
|
|
best_iter = n_iters[best]
|
|
|
|
if return_n_iter:
|
|
return best_pos, best_stress, best_iter
|
|
else:
|
|
return best_pos, best_stress
|
|
|
|
|
|
class MDS(BaseEstimator):
|
|
"""Multidimensional scaling.
|
|
|
|
Read more in the :ref:`User Guide <multidimensional_scaling>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_components : int, default=2
|
|
Number of dimensions in which to immerse the dissimilarities.
|
|
|
|
metric : bool, default=True
|
|
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
|
|
|
|
n_init : int, default=4
|
|
Number of times the SMACOF algorithm will be run with different
|
|
initializations. The final results will be the best output of the runs,
|
|
determined by the run with the smallest final stress.
|
|
|
|
max_iter : int, default=300
|
|
Maximum number of iterations of the SMACOF algorithm for a single run.
|
|
|
|
verbose : int, default=0
|
|
Level of verbosity.
|
|
|
|
eps : float, default=1e-3
|
|
Relative tolerance with respect to stress at which to declare
|
|
convergence.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to use for the computation. If multiple
|
|
initializations are used (``n_init``), each run of the algorithm is
|
|
computed in parallel.
|
|
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
random_state : int, RandomState instance or None, default=None
|
|
Determines the random number generator used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term: `Glossary <random_state>`.
|
|
|
|
dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
|
|
Dissimilarity measure to use:
|
|
|
|
- 'euclidean':
|
|
Pairwise Euclidean distances between points in the dataset.
|
|
|
|
- 'precomputed':
|
|
Pre-computed dissimilarities are passed directly to ``fit`` and
|
|
``fit_transform``.
|
|
|
|
Attributes
|
|
----------
|
|
embedding_ : ndarray of shape (n_samples, n_components)
|
|
Stores the position of the dataset in the embedding space.
|
|
|
|
stress_ : float
|
|
The final value of the stress (sum of squared distance of the
|
|
disparities and the distances for all constrained points).
|
|
|
|
dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
|
|
Pairwise dissimilarities between the points. Symmetric matrix that:
|
|
|
|
- either uses a custom dissimilarity matrix by setting `dissimilarity`
|
|
to 'precomputed';
|
|
- or constructs a dissimilarity matrix from data using
|
|
Euclidean distances.
|
|
|
|
n_iter_ : int
|
|
The number of iterations corresponding to the best stress.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_digits
|
|
>>> from sklearn.manifold import MDS
|
|
>>> X, _ = load_digits(return_X_y=True)
|
|
>>> X.shape
|
|
(1797, 64)
|
|
>>> embedding = MDS(n_components=2)
|
|
>>> X_transformed = embedding.fit_transform(X[:100])
|
|
>>> X_transformed.shape
|
|
(100, 2)
|
|
|
|
References
|
|
----------
|
|
"Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
|
Groenen P. Springer Series in Statistics (1997)
|
|
|
|
"Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
|
Psychometrika, 29 (1964)
|
|
|
|
"Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
|
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
|
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self, n_components=2, *, metric=True, n_init=4,
|
|
max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
|
|
random_state=None, dissimilarity="euclidean"):
|
|
self.n_components = n_components
|
|
self.dissimilarity = dissimilarity
|
|
self.metric = metric
|
|
self.n_init = n_init
|
|
self.max_iter = max_iter
|
|
self.eps = eps
|
|
self.verbose = verbose
|
|
self.n_jobs = n_jobs
|
|
self.random_state = random_state
|
|
|
|
def _more_tags(self):
|
|
return {'pairwise': self.dissimilarity == 'precomputed'}
|
|
|
|
# TODO: Remove in 1.1
|
|
# mypy error: Decorated property not supported
|
|
@deprecated("Attribute _pairwise was deprecated in " # type: ignore
|
|
"version 0.24 and will be removed in 1.1 (renaming of 0.26).")
|
|
@property
|
|
def _pairwise(self):
|
|
return self.dissimilarity == "precomputed"
|
|
|
|
def fit(self, X, y=None, init=None):
|
|
"""
|
|
Computes the position of the points in the embedding space.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or \
|
|
(n_samples, n_samples)
|
|
Input data. If ``dissimilarity=='precomputed'``, the input should
|
|
be the dissimilarity matrix.
|
|
|
|
y : Ignored
|
|
|
|
init : ndarray of shape (n_samples,), default=None
|
|
Starting configuration of the embedding to initialize the SMACOF
|
|
algorithm. By default, the algorithm is initialized with a randomly
|
|
chosen array.
|
|
"""
|
|
self.fit_transform(X, init=init)
|
|
return self
|
|
|
|
def fit_transform(self, X, y=None, init=None):
|
|
"""
|
|
Fit the data from X, and returns the embedded coordinates.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or \
|
|
(n_samples, n_samples)
|
|
Input data. If ``dissimilarity=='precomputed'``, the input should
|
|
be the dissimilarity matrix.
|
|
|
|
y : Ignored
|
|
|
|
init : ndarray of shape (n_samples,), default=None
|
|
Starting configuration of the embedding to initialize the SMACOF
|
|
algorithm. By default, the algorithm is initialized with a randomly
|
|
chosen array.
|
|
"""
|
|
X = self._validate_data(X)
|
|
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
|
|
warnings.warn("The MDS API has changed. ``fit`` now constructs an"
|
|
" dissimilarity matrix from data. To use a custom "
|
|
"dissimilarity matrix, set "
|
|
"``dissimilarity='precomputed'``.")
|
|
|
|
if self.dissimilarity == "precomputed":
|
|
self.dissimilarity_matrix_ = X
|
|
elif self.dissimilarity == "euclidean":
|
|
self.dissimilarity_matrix_ = euclidean_distances(X)
|
|
else:
|
|
raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
|
|
" Got %s instead" % str(self.dissimilarity))
|
|
|
|
self.embedding_, self.stress_, self.n_iter_ = smacof(
|
|
self.dissimilarity_matrix_, metric=self.metric,
|
|
n_components=self.n_components, init=init, n_init=self.n_init,
|
|
n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
|
|
eps=self.eps, random_state=self.random_state,
|
|
return_n_iter=True)
|
|
|
|
return self.embedding_
|