""" Non-negative matrix factorization.
"""
# Author: Vlad Niculae
# Lars Buitinck
# Mathieu Blondel <mathieu@mblondel.org>
# Tom Dupre la Tour
# License: BSD 3 clause
import numbers
import numpy as np
import scipy.sparse as sp
import time
import warnings
from math import sqrt
from ._cdnmf_fast import _update_cdnmf_fast
from .._config import config_context
from ..base import BaseEstimator, TransformerMixin
from ..exceptions import ConvergenceWarning
from ..utils import check_random_state, check_array
from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm
from ..utils.validation import check_is_fitted, check_non_negative
from ..utils.validation import _deprecate_positional_args
EPSILON = np.finfo(np.float32).eps
def norm(x):
"""Dot product-based Euclidean norm implementation.
See: http://fseoane.net/blog/2011/computing-the-vector-norm/
Parameters
----------
x : array-like
Vector for which to compute the norm.
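Examples
--------
A trivial sanity check (illustrative values only):
>>> import numpy as np
>>> norm(np.array([3., 4.]))
5.0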
"""
return sqrt(squared_norm(x))
def trace_dot(X, Y):
"""Trace of np.dot(X, Y.T).
Parameters
----------
X : array-like
First matrix.
Y : array-like
Second matrix.
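Examples
--------
Illustrative check of the identity with ``np.trace`` (arbitrary values):
>>> import numpy as np
>>> X = np.arange(6.).reshape(2, 3)
>>> Y = np.ones((2, 3))
>>> np.allclose(trace_dot(X, Y), np.trace(np.dot(X, Y.T)))
True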
"""
return np.dot(X.ravel(), Y.ravel())
def _check_init(A, shape, whom):
A = check_array(A)
if np.shape(A) != shape:
raise ValueError('Array with wrong shape passed to %s. Expected %s, '
'but got %s ' % (whom, shape, np.shape(A)))
check_non_negative(A, whom)
if np.max(A) == 0:
raise ValueError('Array passed to %s is full of zeros.' % whom)
def _beta_divergence(X, W, H, beta, square_root=False):
"""Compute the beta-divergence of X and dot(W, H).
Parameters
----------
X : float or array-like of shape (n_samples, n_features)
W : float or array-like of shape (n_samples, n_components)
H : float or array-like of shape (n_components, n_features)
beta : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}
Parameter of the beta-divergence.
If beta == 2, this is half the Frobenius *squared* norm.
If beta == 1, this is the generalized Kullback-Leibler divergence.
If beta == 0, this is the Itakura-Saito divergence.
Else, this is the general beta-divergence.
square_root : bool, default=False
If True, return np.sqrt(2 * res)
For beta == 2, it corresponds to the Frobenius norm.
Returns
-------
res : float
Beta divergence of X and np.dot(W, H).
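Examples
--------
Illustrative check (arbitrary non-negative values): for ``beta == 2`` the
result is half the squared Frobenius norm of the residual.
>>> import numpy as np
>>> rng = np.random.RandomState(0)
>>> X = np.abs(rng.randn(4, 5))
>>> W = np.abs(rng.randn(4, 2))
>>> H = np.abs(rng.randn(2, 5))
>>> np.allclose(_beta_divergence(X, W, H, beta=2),
...             0.5 * np.linalg.norm(X - np.dot(W, H)) ** 2)
True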
"""
beta = _beta_loss_to_float(beta)
# The method can be called with scalars
if not sp.issparse(X):
X = np.atleast_2d(X)
W = np.atleast_2d(W)
H = np.atleast_2d(H)
# Frobenius norm
if beta == 2:
# Avoid the creation of the dense np.dot(W, H) if X is sparse.
if sp.issparse(X):
norm_X = np.dot(X.data, X.data)
norm_WH = trace_dot(np.linalg.multi_dot([W.T, W, H]), H)
cross_prod = trace_dot((X * H.T), W)
res = (norm_X + norm_WH - 2. * cross_prod) / 2.
else:
res = squared_norm(X - np.dot(W, H)) / 2.
if square_root:
return np.sqrt(res * 2)
else:
return res
if sp.issparse(X):
# compute np.dot(W, H) only where X is nonzero
WH_data = _special_sparse_dot(W, H, X).data
X_data = X.data
else:
WH = np.dot(W, H)
WH_data = WH.ravel()
X_data = X.ravel()
# do not affect the zeros: here 0 ** (-1) = 0 and not infinity
indices = X_data > EPSILON
WH_data = WH_data[indices]
X_data = X_data[indices]
# used to avoid division by zero
WH_data[WH_data == 0] = EPSILON
# generalized Kullback-Leibler divergence
if beta == 1:
# fast and memory efficient computation of np.sum(np.dot(W, H))
sum_WH = np.dot(np.sum(W, axis=0), np.sum(H, axis=1))
# computes np.sum(X * log(X / WH)) only where X is nonzero
div = X_data / WH_data
res = np.dot(X_data, np.log(div))
# add full np.sum(np.dot(W, H)) - np.sum(X)
res += sum_WH - X_data.sum()
# Itakura-Saito divergence
elif beta == 0:
div = X_data / WH_data
res = np.sum(div) - np.prod(X.shape) - np.sum(np.log(div))
# beta-divergence, beta not in (0, 1, 2)
else:
if sp.issparse(X):
# slow loop, but memory efficient computation of :
# np.sum(np.dot(W, H) ** beta)
sum_WH_beta = 0
for i in range(X.shape[1]):
sum_WH_beta += np.sum(np.dot(W, H[:, i]) ** beta)
else:
sum_WH_beta = np.sum(WH ** beta)
sum_X_WH = np.dot(X_data, WH_data ** (beta - 1))
res = (X_data ** beta).sum() - beta * sum_X_WH
res += sum_WH_beta * (beta - 1)
res /= beta * (beta - 1)
if square_root:
return np.sqrt(2 * res)
else:
return res
def _special_sparse_dot(W, H, X):
"""Computes np.dot(W, H), only where X is non zero."""
if sp.issparse(X):
ii, jj = X.nonzero()
n_vals = ii.shape[0]
dot_vals = np.empty(n_vals)
n_components = W.shape[1]
batch_size = max(n_components, n_vals // n_components)
for start in range(0, n_vals, batch_size):
batch = slice(start, start + batch_size)
dot_vals[batch] = np.multiply(W[ii[batch], :],
H.T[jj[batch], :]).sum(axis=1)
WH = sp.coo_matrix((dot_vals, (ii, jj)), shape=X.shape)
return WH.tocsr()
else:
return np.dot(W, H)
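# Illustrative sanity check for _special_sparse_dot (shown as a sketch only,
# not executed at import time): on the support of a sparse X, the result
# matches the corresponding entries of the dense product np.dot(W, H).
#
#     >>> import numpy as np
#     >>> import scipy.sparse as sp
#     >>> rng = np.random.RandomState(0)
#     >>> W = np.abs(rng.randn(6, 3))
#     >>> H = np.abs(rng.randn(3, 4))
#     >>> X = sp.random(6, 4, density=0.3, format='csr', random_state=0)
#     >>> ii, jj = X.nonzero()
#     >>> WH = _special_sparse_dot(W, H, X)
#     >>> np.allclose(np.asarray(WH[ii, jj]).ravel(), np.dot(W, H)[ii, jj])
#     True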
def _compute_regularization(alpha, l1_ratio, regularization):
"""Compute L1 and L2 regularization coefficients for W and H."""
alpha_H = 0.
alpha_W = 0.
if regularization in ('both', 'components'):
alpha_H = float(alpha)
if regularization in ('both', 'transformation'):
alpha_W = float(alpha)
l1_reg_W = alpha_W * l1_ratio
l1_reg_H = alpha_H * l1_ratio
l2_reg_W = alpha_W * (1. - l1_ratio)
l2_reg_H = alpha_H * (1. - l1_ratio)
return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H
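# For example (purely illustrative numbers), alpha=1.0 with l1_ratio=0.5 and
# regularization='both' splits the penalty evenly between L1 and L2 on both
# factors:
#
#     >>> _compute_regularization(1.0, 0.5, 'both')
#     (0.5, 0.5, 0.5, 0.5)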
def _check_string_param(solver, regularization, beta_loss, init):
allowed_solver = ('cd', 'mu')
if solver not in allowed_solver:
raise ValueError(
'Invalid solver parameter: got %r instead of one of %r' %
(solver, allowed_solver))
allowed_regularization = ('both', 'components', 'transformation', None)
if regularization not in allowed_regularization:
raise ValueError(
'Invalid regularization parameter: got %r instead of one of %r' %
(regularization, allowed_regularization))
# 'mu' is the only solver that handles other beta losses than 'frobenius'
if solver != 'mu' and beta_loss not in (2, 'frobenius'):
raise ValueError(
'Invalid beta_loss parameter: solver %r does not handle beta_loss'
' = %r' % (solver, beta_loss))
if solver == 'mu' and init == 'nndsvd':
warnings.warn("The multiplicative update ('mu') solver cannot update "
"zeros present in the initialization, and so leads to "
"poorer results when used jointly with init='nndsvd'. "
"You may try init='nndsvda' or init='nndsvdar' instead.",
UserWarning)
beta_loss = _beta_loss_to_float(beta_loss)
return beta_loss
def _beta_loss_to_float(beta_loss):
"""Convert string beta_loss to float."""
allowed_beta_loss = {'frobenius': 2,
'kullback-leibler': 1,
'itakura-saito': 0}
if isinstance(beta_loss, str) and beta_loss in allowed_beta_loss:
beta_loss = allowed_beta_loss[beta_loss]
if not isinstance(beta_loss, numbers.Number):
raise ValueError('Invalid beta_loss parameter: got %r instead '
'of one of %r, or a float.' %
(beta_loss, allowed_beta_loss.keys()))
return beta_loss
def _initialize_nmf(X, n_components, init='warn', eps=1e-6,
random_state=None):
"""Algorithms for NMF initialization.
Computes an initial guess for the non-negative
rank k matrix approximation for X: X = WH.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix to be decomposed.
n_components : int
The number of components desired in the approximation.
init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None
Method used to initialize the procedure.
Default: None.
Valid options:
- None: 'nndsvd' if n_components <= min(n_samples, n_features),
otherwise 'random'.
- 'random': non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- 'nndsvda': NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- 'nndsvdar': NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- 'custom': use custom matrices W and H
eps : float, default=1e-6
Truncate all values less than this in output to zero.
random_state : int, RandomState instance or None, default=None
Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for
reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Returns
-------
W : array-like of shape (n_samples, n_components)
Initial guesses for solving X ~= WH.
H : array-like of shape (n_components, n_features)
Initial guesses for solving X ~= WH.
References
----------
C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
nonnegative matrix factorization - Pattern Recognition, 2008
http://tinyurl.com/nndsvd
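Examples
--------
A minimal usage sketch (shapes and values chosen arbitrarily):
>>> import numpy as np
>>> X = np.abs(np.random.RandomState(0).randn(6, 5))
>>> W, H = _initialize_nmf(X, n_components=2, init='nndsvd')
>>> W.shape, H.shape
((6, 2), (2, 5))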
"""
if init == 'warn':
warnings.warn(("The 'init' value, when 'init=None' and "
"n_components is less than n_samples and "
"n_features, will be changed from 'nndsvd' to "
"'nndsvda' in 1.1 (renaming of 0.26)."), FutureWarning)
init = None
check_non_negative(X, "NMF initialization")
n_samples, n_features = X.shape
if (init is not None and init != 'random'
and n_components > min(n_samples, n_features)):
raise ValueError("init = '{}' can only be used when "
"n_components <= min(n_samples, n_features)"
.format(init))
if init is None:
if n_components <= min(n_samples, n_features):
init = 'nndsvd'
else:
init = 'random'
# Random initialization
if init == 'random':
avg = np.sqrt(X.mean() / n_components)
rng = check_random_state(random_state)
H = avg * rng.randn(n_components, n_features).astype(X.dtype,
copy=False)
W = avg * rng.randn(n_samples, n_components).astype(X.dtype,
copy=False)
np.abs(H, out=H)
np.abs(W, out=W)
return W, H
# NNDSVD initialization
U, S, V = randomized_svd(X, n_components, random_state=random_state)
W = np.zeros_like(U)
H = np.zeros_like(V)
# The leading singular triplet is non-negative
# so it can be used as is for initialization.
W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])
for j in range(1, n_components):
x, y = U[:, j], V[j, :]
# extract positive and negative parts of column vectors
x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))
# and their norms
x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)
m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm
# choose update
if m_p > m_n:
u = x_p / x_p_nrm
v = y_p / y_p_nrm
sigma = m_p
else:
u = x_n / x_n_nrm
v = y_n / y_n_nrm
sigma = m_n
lbd = np.sqrt(S[j] * sigma)
W[:, j] = lbd * u
H[j, :] = lbd * v
W[W < eps] = 0
H[H < eps] = 0
if init == "nndsvd":
pass
elif init == "nndsvda":
avg = X.mean()
W[W == 0] = avg
H[H == 0] = avg
elif init == "nndsvdar":
rng = check_random_state(random_state)
avg = X.mean()
W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100)
H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100)
else:
raise ValueError(
'Invalid init parameter: got %r instead of one of %r' %
(init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar')))
return W, H
def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle,
random_state):
"""Helper function for _fit_coordinate_descent.
Update W to minimize the objective function, iterating once over all
coordinates. By symmetry, to update H, one can call
_update_coordinate_descent(X.T, Ht, W, ...).
"""
n_components = Ht.shape[1]
HHt = np.dot(Ht.T, Ht)
XHt = safe_sparse_dot(X, Ht)
# L2 regularization corresponds to increase of the diagonal of HHt
if l2_reg != 0.:
# adds l2_reg only on the diagonal
HHt.flat[::n_components + 1] += l2_reg
# L1 regularization corresponds to decrease of each element of XHt
if l1_reg != 0.:
XHt -= l1_reg
if shuffle:
permutation = random_state.permutation(n_components)
else:
permutation = np.arange(n_components)
# The following seems to be required on 64-bit Windows w/ Python 3.5.
permutation = np.asarray(permutation, dtype=np.intp)
return _update_cdnmf_fast(W, HHt, XHt, permutation)
def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0,
l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True,
verbose=0, shuffle=False, random_state=None):
"""Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent
The objective function is minimized with an alternating minimization of W
and H. Each minimization is done with a cyclic (up to a permutation of the
features) Coordinate Descent.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Constant matrix.
W : array-like of shape (n_samples, n_components)
Initial guess for the solution.
H : array-like of shape (n_components, n_features)
Initial guess for the solution.
tol : float, default=1e-4
Tolerance of the stopping condition.
max_iter : int, default=200
Maximum number of iterations before timing out.
l1_reg_W : float, default=0.
L1 regularization parameter for W.
l1_reg_H : float, default=0.
L1 regularization parameter for H.
l2_reg_W : float, default=0.
L2 regularization parameter for W.
l2_reg_H : float, default=0.
L2 regularization parameter for H.
update_H : bool, default=True
If True, both W and H will be estimated from initial guesses.
If False, only W will be estimated.
verbose : int, default=0
The verbosity level.
shuffle : bool, default=False
If true, randomize the order of coordinates in the CD solver.
random_state : int, RandomState instance or None, default=None
Used to randomize the coordinates in the CD solver, when
``shuffle`` is set to ``True``. Pass an int for reproducible
results across multiple function calls.
See :term:`Glossary <random_state>`.
Returns
-------
W : ndarray of shape (n_samples, n_components)
Solution to the non-negative least squares problem.
H : ndarray of shape (n_components, n_features)
Solution to the non-negative least squares problem.
n_iter : int
The number of iterations done by the algorithm.
References
----------
Cichocki, Andrzej, and Phan, Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
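Examples
--------
A minimal sketch of how this solver is typically driven, with initial
factors obtained from ``_initialize_nmf`` (values are illustrative only):
>>> import numpy as np
>>> X = np.abs(np.random.RandomState(0).randn(6, 5))
>>> W0, H0 = _initialize_nmf(X, n_components=2, init='nndsvda')
>>> W, H, n_iter = _fit_coordinate_descent(X, W0, H0, random_state=0)
>>> W.shape, H.shape
((6, 2), (2, 5))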
"""
# so W and Ht are both in C order in memory
Ht = check_array(H.T, order='C')
X = check_array(X, accept_sparse='csr')
rng = check_random_state(random_state)
for n_iter in range(1, max_iter + 1):
violation = 0.
# Update W
violation += _update_coordinate_descent(X, W, Ht, l1_reg_W,
l2_reg_W, shuffle, rng)
# Update H
if update_H:
violation += _update_coordinate_descent(X.T, Ht, W, l1_reg_H,
l2_reg_H, shuffle, rng)
if n_iter == 1:
violation_init = violation
if violation_init == 0:
break
if verbose:
print("violation:", violation / violation_init)
if violation / violation_init <= tol:
if verbose:
print("Converged at iteration", n_iter + 1)
break
return W, Ht.T, n_iter
def _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma,
H_sum=None, HHt=None, XHt=None, update_H=True):
"""Update W in Multiplicative Update NMF."""
if beta_loss == 2:
# Numerator
if XHt is None:
XHt = safe_sparse_dot(X, H.T)
if update_H:
# avoid a copy of XHt, which will be re-computed (update_H=True)
numerator = XHt
else:
# preserve the XHt, which is not re-computed (update_H=False)
numerator = XHt.copy()
# Denominator
if HHt is None:
HHt = np.dot(H, H.T)
denominator = np.dot(W, HHt)
else:
# Numerator
# if X is sparse, compute WH only where X is non zero
WH_safe_X = _special_sparse_dot(W, H, X)
if sp.issparse(X):
WH_safe_X_data = WH_safe_X.data
X_data = X.data
else:
WH_safe_X_data = WH_safe_X
X_data = X
# copy used in the Denominator
WH = WH_safe_X.copy()
if beta_loss - 1. < 0:
WH[WH == 0] = EPSILON
# to avoid taking a negative power of zero
if beta_loss - 2. < 0:
WH_safe_X_data[WH_safe_X_data == 0] = EPSILON
if beta_loss == 1:
np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data)
elif beta_loss == 0:
# speeds up computation time
# refer to /numpy/numpy/issues/9363
WH_safe_X_data **= -1
WH_safe_X_data **= 2
# element-wise multiplication
WH_safe_X_data *= X_data
else:
WH_safe_X_data **= beta_loss - 2
# element-wise multiplication
WH_safe_X_data *= X_data
# here numerator = dot(X * (dot(W, H) ** (beta_loss - 2)), H.T)
numerator = safe_sparse_dot(WH_safe_X, H.T)
# Denominator
if beta_loss == 1:
if H_sum is None:
H_sum = np.sum(H, axis=1) # shape(n_components, )
denominator = H_sum[np.newaxis, :]
else:
# computation of WHHt = dot(dot(W, H) ** beta_loss - 1, H.T)
if sp.issparse(X):
# memory efficient computation
# (compute row by row, avoiding the dense matrix WH)
WHHt = np.empty(W.shape)
for i in range(X.shape[0]):
WHi = np.dot(W[i, :], H)
if beta_loss - 1 < 0:
WHi[WHi == 0] = EPSILON
WHi **= beta_loss - 1
WHHt[i, :] = np.dot(WHi, H.T)
else:
WH **= beta_loss - 1
WHHt = np.dot(WH, H.T)
denominator = WHHt
# Add L1 and L2 regularization
if l1_reg_W > 0:
denominator += l1_reg_W
if l2_reg_W > 0:
denominator = denominator + l2_reg_W * W
denominator[denominator == 0] = EPSILON
numerator /= denominator
delta_W = numerator
# gamma is in ]0, 1]
if gamma != 1:
delta_W **= gamma
return delta_W, H_sum, HHt, XHt
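# Note: for the Frobenius case (beta_loss == 2) and without regularization,
# the factor returned above reduces to the classical Lee-Seung multiplicative
# update (a sketch, ignoring the gamma exponent):
#
#     delta_W = safe_sparse_dot(X, H.T) / np.dot(W, np.dot(H, H.T))
#     W *= delta_W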
def _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma):
"""Update H in Multiplicative Update NMF."""
if beta_loss == 2:
numerator = safe_sparse_dot(W.T, X)
denominator = np.linalg.multi_dot([W.T, W, H])
else:
# Numerator
WH_safe_X = _special_sparse_dot(W, H, X)
if sp.issparse(X):
WH_safe_X_data = WH_safe_X.data
X_data = X.data
else:
WH_safe_X_data = WH_safe_X
X_data = X
# copy used in the Denominator
WH = WH_safe_X.copy()
if beta_loss - 1. < 0:
WH[WH == 0] = EPSILON
# to avoid division by zero
if beta_loss - 2. < 0:
WH_safe_X_data[WH_safe_X_data == 0] = EPSILON
if beta_loss == 1:
np.divide(X_data, WH_safe_X_data, out=WH_safe_X_data)
elif beta_loss == 0:
# speeds up computation time
# refer to /numpy/numpy/issues/9363
WH_safe_X_data **= -1
WH_safe_X_data **= 2
# element-wise multiplication
WH_safe_X_data *= X_data
else:
WH_safe_X_data **= beta_loss - 2
# element-wise multiplication
WH_safe_X_data *= X_data
# here numerator = dot(W.T, (dot(W, H) ** (beta_loss - 2)) * X)
numerator = safe_sparse_dot(W.T, WH_safe_X)
# Denominator
if beta_loss == 1:
W_sum = np.sum(W, axis=0) # shape(n_components, )
W_sum[W_sum == 0] = 1.
denominator = W_sum[:, np.newaxis]
# beta_loss not in (1, 2)
else:
# computation of WtWH = dot(W.T, dot(W, H) ** beta_loss - 1)
if sp.issparse(X):
# memory efficient computation
# (compute column by column, avoiding the dense matrix WH)
WtWH = np.empty(H.shape)
for i in range(X.shape[1]):
WHi = np.dot(W, H[:, i])
if beta_loss - 1 < 0:
WHi[WHi == 0] = EPSILON
WHi **= beta_loss - 1
WtWH[:, i] = np.dot(W.T, WHi)
else:
WH **= beta_loss - 1
WtWH = np.dot(W.T, WH)
denominator = WtWH
# Add L1 and L2 regularization
if l1_reg_H > 0:
denominator += l1_reg_H
if l2_reg_H > 0:
denominator = denominator + l2_reg_H * H
denominator[denominator == 0] = EPSILON
numerator /= denominator
delta_H = numerator
# gamma is in ]0, 1]
if gamma != 1:
delta_H **= gamma
return delta_H
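# Note: symmetrically to _multiplicative_update_w, for beta_loss == 2 and no
# regularization the factor returned above is (a sketch, ignoring gamma):
#
#     delta_H = safe_sparse_dot(W.T, X) / np.linalg.multi_dot([W.T, W, H])
#     H *= delta_H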
def _fit_multiplicative_update(X, W, H, beta_loss='frobenius',
max_iter=200, tol=1e-4,
l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0,
update_H=True, verbose=0):
"""Compute Non-negative Matrix Factorization with Multiplicative Update.
The objective function is _beta_divergence(X, WH) and is minimized with an
alternating minimization of W and H. Each minimization is done with a
Multiplicative Update.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Constant input matrix.
W : array-like of shape (n_samples, n_components)
Initial guess for the solution.
H : array-like of shape (n_components, n_features)
Initial guess for the solution.
beta_loss : float or {'frobenius', 'kullback-leibler', \
'itakura-saito'}, default='frobenius'
String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}.
Beta divergence to be minimized, measuring the distance between X
and the dot product WH. Note that values different from 'frobenius'
(or 2) and 'kullback-leibler' (or 1) lead to significantly slower
fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input
matrix X cannot contain zeros.
max_iter : int, default=200
Number of iterations.
tol : float, default=1e-4
Tolerance of the stopping condition.
l1_reg_W : float, default=0.
L1 regularization parameter for W.
l1_reg_H : float, default=0.
L1 regularization parameter for H.
l2_reg_W : float, default=0.
L2 regularization parameter for W.
l2_reg_H : float, default=0.
L2 regularization parameter for H.
update_H : bool, default=True
If True, both W and H will be estimated from initial guesses.
If False, only W will be estimated.
verbose : int, default=0
The verbosity level.
Returns
-------
W : ndarray of shape (n_samples, n_components)
Solution to the non-negative least squares problem.
H : ndarray of shape (n_components, n_features)
Solution to the non-negative least squares problem.
n_iter : int
The number of iterations done by the algorithm.
References
----------
Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix
factorization with the beta-divergence. Neural Computation, 23(9).
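Examples
--------
A minimal usage sketch (values are illustrative; zeros in the initial
factors are never updated by 'mu', hence init='nndsvda' here):
>>> import numpy as np
>>> X = np.abs(np.random.RandomState(0).randn(6, 5))
>>> W0, H0 = _initialize_nmf(X, n_components=2, init='nndsvda')
>>> W, H, n_iter = _fit_multiplicative_update(X, W0, H0)
>>> W.shape, H.shape
((6, 2), (2, 5))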
"""
start_time = time.time()
beta_loss = _beta_loss_to_float(beta_loss)
# gamma for Maximization-Minimization (MM) algorithm [Fevotte 2011]
if beta_loss < 1:
gamma = 1. / (2. - beta_loss)
elif beta_loss > 2:
gamma = 1. / (beta_loss - 1.)
else:
gamma = 1.
# used for the convergence criterion
error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True)
previous_error = error_at_init
H_sum, HHt, XHt = None, None, None
for n_iter in range(1, max_iter + 1):
# update W
# H_sum, HHt and XHt are saved and reused if not update_H
delta_W, H_sum, HHt, XHt = _multiplicative_update_w(
X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma,
H_sum, HHt, XHt, update_H)
W *= delta_W
# necessary for stability with beta_loss < 1
if beta_loss < 1:
W[W < np.finfo(np.float64).eps] = 0.
# update H
if update_H:
delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H,
l2_reg_H, gamma)
H *= delta_H
# These values will be recomputed since H changed
H_sum, HHt, XHt = None, None, None
# necessary for stability with beta_loss < 1
if beta_loss <= 1:
H[H < np.finfo(np.float64).eps] = 0.
# test convergence criterion every 10 iterations
if tol > 0 and n_iter % 10 == 0:
error = _beta_divergence(X, W, H, beta_loss, square_root=True)
if verbose:
iter_time = time.time()
print("Epoch %02d reached after %.3f seconds, error: %f" %
(n_iter, iter_time - start_time, error))
if (previous_error - error) / error_at_init < tol:
break
previous_error = error
# do not print if we have already printed in the convergence test
if verbose and (tol == 0 or n_iter % 10 != 0):
end_time = time.time()
print("Epoch %02d reached after %.3f seconds." %
(n_iter, end_time - start_time))
return W, H, n_iter
@_deprecate_positional_args
def non_negative_factorization(X, W=None, H=None, n_components=None, *,
init='warn', update_H=True, solver='cd',
beta_loss='frobenius', tol=1e-4,
max_iter=200, alpha=0., l1_ratio=0.,
regularization=None, random_state=None,
verbose=0, shuffle=False):
"""Compute Non-negative Matrix Factorization (NMF).
Find two non-negative matrices (W, H) whose product approximates the non-
negative matrix X. This factorization can be used for example for
dimensionality reduction, source separation or topic extraction.
The objective function is:
.. math::
0.5 * ||X - WH||_{loss}^2 + alpha * l1_{ratio} * ||vec(W)||_1
+ alpha * l1_{ratio} * ||vec(H)||_1
+ 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2
+ 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2
Where:
:math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)
:math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)
The generic norm :math:`||X - WH||_{loss}^2` may represent
the Frobenius norm or another supported beta-divergence loss.
The choice between options is controlled by the `beta_loss` parameter.
The objective function is minimized with an alternating minimization of W
and H. If H is given and update_H=False, it solves for W only.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Constant matrix.
W : array-like of shape (n_samples, n_components), default=None
If init='custom', it is used as initial guess for the solution.
H : array-like of shape (n_components, n_features), default=None
If init='custom', it is used as initial guess for the solution.
If update_H=False, it is used as a constant, to solve for W only.
n_components : int, default=None
Number of components; if n_components is not set, all features
are kept.
init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None
Method used to initialize the procedure.
Valid options:
- None: 'nndsvd' if n_components <= min(n_samples, n_features),
otherwise 'random'.
- 'random': non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- 'nndsvda': NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- 'nndsvdar': NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- 'custom': use custom matrices W and H if `update_H=True`. If
`update_H=False`, then only custom matrix H is used.
.. versionchanged:: 0.23
The default value of `init` changed from 'random' to None in 0.23.
update_H : bool, default=True
If True, both W and H will be estimated from initial guesses.
If False, only W will be estimated.
solver : {'cd', 'mu'}, default='cd'
Numerical solver to use:
- 'cd' is a Coordinate Descent solver that uses Fast Hierarchical
Alternating Least Squares (Fast HALS).
- 'mu' is a Multiplicative Update solver.
.. versionadded:: 0.17
Coordinate Descent solver.
.. versionadded:: 0.19
Multiplicative Update solver.
beta_loss : float or {'frobenius', 'kullback-leibler', \
'itakura-saito'}, default='frobenius'
Beta divergence to be minimized, measuring the distance between X
and the dot product WH. Note that values different from 'frobenius'
(or 2) and 'kullback-leibler' (or 1) lead to significantly slower
fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input
matrix X cannot contain zeros. Used only in 'mu' solver.
.. versionadded:: 0.19
tol : float, default=1e-4
Tolerance of the stopping condition.
max_iter : int, default=200
Maximum number of iterations before timing out.
alpha : float, default=0.
Constant that multiplies the regularization terms.
l1_ratio : float, default=0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an elementwise L2 penalty
(aka Frobenius Norm).
For l1_ratio = 1 it is an elementwise L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
regularization : {'both', 'components', 'transformation'}, default=None
Select whether the regularization affects the components (H), the
transformation (W), both or none of them.
random_state : int, RandomState instance or None, default=None
Used for NMF initialisation (when ``init`` == 'nndsvdar' or
'random'), and in Coordinate Descent. Pass an int for reproducible
results across multiple function calls.
See :term:`Glossary <random_state>`.
verbose : int, default=0
The verbosity level.
shuffle : bool, default=False
If true, randomize the order of coordinates in the CD solver.
Returns
-------
W : ndarray of shape (n_samples, n_components)
Solution to the non-negative least squares problem.
H : ndarray of shape (n_components, n_features)
Solution to the non-negative least squares problem.
n_iter : int
Actual number of iterations.
Examples
--------
>>> import numpy as np
>>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
>>> from sklearn.decomposition import non_negative_factorization
>>> W, H, n_iter = non_negative_factorization(X, n_components=2,
... init='random', random_state=0)
References
----------
Cichocki, Andrzej, and Phan, Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix
factorization with the beta-divergence. Neural Computation, 23(9).
"""
X = check_array(X, accept_sparse=('csr', 'csc'),
dtype=[np.float64, np.float32])
check_non_negative(X, "NMF (input X)")
beta_loss = _check_string_param(solver, regularization, beta_loss, init)
if X.min() == 0 and beta_loss <= 0:
raise ValueError("When beta_loss <= 0 and X contains zeros, "
"the solver may diverge. Please add small values to "
"X, or use a positive beta_loss.")
n_samples, n_features = X.shape
if n_components is None:
n_components = n_features
if not isinstance(n_components, numbers.Integral) or n_components <= 0:
raise ValueError("Number of components must be a positive integer;"
" got (n_components=%r)" % n_components)
if not isinstance(max_iter, numbers.Integral) or max_iter < 0:
raise ValueError("Maximum number of iterations must be a positive "
"integer; got (max_iter=%r)" % max_iter)
if not isinstance(tol, numbers.Number) or tol < 0:
raise ValueError("Tolerance for stopping criteria must be "
"positive; got (tol=%r)" % tol)
# check W and H, or initialize them
if init == 'custom' and update_H:
_check_init(H, (n_components, n_features), "NMF (input H)")
_check_init(W, (n_samples, n_components), "NMF (input W)")
if H.dtype != X.dtype or W.dtype != X.dtype:
raise TypeError("H and W should have the same dtype as X. Got "
"H.dtype = {} and W.dtype = {}."
.format(H.dtype, W.dtype))
elif not update_H:
_check_init(H, (n_components, n_features), "NMF (input H)")
if H.dtype != X.dtype:
raise TypeError("H should have the same dtype as X. Got H.dtype = "
"{}.".format(H.dtype))
# 'mu' solver should not be initialized by zeros
if solver == 'mu':
avg = np.sqrt(X.mean() / n_components)
W = np.full((n_samples, n_components), avg, dtype=X.dtype)
else:
W = np.zeros((n_samples, n_components), dtype=X.dtype)
else:
W, H = _initialize_nmf(X, n_components, init=init,
random_state=random_state)
l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H = _compute_regularization(
alpha, l1_ratio, regularization)
if solver == 'cd':
W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter,
l1_reg_W, l1_reg_H,
l2_reg_W, l2_reg_H,
update_H=update_H,
verbose=verbose,
shuffle=shuffle,
random_state=random_state)
elif solver == 'mu':
W, H, n_iter = _fit_multiplicative_update(X, W, H, beta_loss, max_iter,
tol, l1_reg_W, l1_reg_H,
l2_reg_W, l2_reg_H, update_H,
verbose)
else:
raise ValueError("Invalid solver parameter '%s'." % solver)
if n_iter == max_iter and tol > 0:
warnings.warn("Maximum number of iterations %d reached. Increase it to"
" improve convergence." % max_iter, ConvergenceWarning)
return W, H, n_iter
class NMF(TransformerMixin, BaseEstimator):
"""Non-Negative Matrix Factorization (NMF).
Find two non-negative matrices (W, H) whose product approximates the non-
negative matrix X. This factorization can be used for example for
dimensionality reduction, source separation or topic extraction.
The objective function is:
.. math::
0.5 * ||X - WH||_{loss}^2 + alpha * l1_{ratio} * ||vec(W)||_1
+ alpha * l1_{ratio} * ||vec(H)||_1
+ 0.5 * alpha * (1 - l1_{ratio}) * ||W||_{Fro}^2
+ 0.5 * alpha * (1 - l1_{ratio}) * ||H||_{Fro}^2
Where:
:math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)
:math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)
The generic norm :math:`||X - WH||_{loss}` may represent
the Frobenius norm or another supported beta-divergence loss.
The choice between options is controlled by the `beta_loss` parameter.
The objective function is minimized with an alternating minimization of W
and H.
Read more in the :ref:`User Guide <NMF>`.
Parameters
----------
n_components : int, default=None
Number of components; if n_components is not set, all features
are kept.
init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None
Method used to initialize the procedure.
Default: None.
Valid options:
- `None`: 'nndsvd' if n_components <= min(n_samples, n_features),
otherwise random.
- `'random'`: non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- `'nndsvda'`: NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- `'nndsvdar'`: NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- `'custom'`: use custom matrices W and H
solver : {'cd', 'mu'}, default='cd'
Numerical solver to use:
'cd' is a Coordinate Descent solver.
'mu' is a Multiplicative Update solver.
.. versionadded:: 0.17
Coordinate Descent solver.
.. versionadded:: 0.19
Multiplicative Update solver.
beta_loss : float or {'frobenius', 'kullback-leibler', \
'itakura-saito'}, default='frobenius'
Beta divergence to be minimized, measuring the distance between X
and the dot product WH. Note that values different from 'frobenius'
(or 2) and 'kullback-leibler' (or 1) lead to significantly slower
fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input
matrix X cannot contain zeros. Used only in 'mu' solver.
.. versionadded:: 0.19
tol : float, default=1e-4
Tolerance of the stopping condition.
max_iter : int, default=200
Maximum number of iterations before timing out.
random_state : int, RandomState instance or None, default=None
Used for initialisation (when ``init`` == 'nndsvdar' or
'random'), and in Coordinate Descent. Pass an int for reproducible
results across multiple function calls.
See :term:`Glossary <random_state>`.
alpha : float, default=0.
Constant that multiplies the regularization terms. Set it to zero to
have no regularization.
.. versionadded:: 0.17
*alpha* used in the Coordinate Descent solver.
l1_ratio : float, default=0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an elementwise L2 penalty
(aka Frobenius Norm).
For l1_ratio = 1 it is an elementwise L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
.. versionadded:: 0.17
Regularization parameter *l1_ratio* used in the Coordinate Descent
solver.
verbose : int, default=0
Whether to be verbose.
shuffle : bool, default=False
If true, randomize the order of coordinates in the CD solver.
.. versionadded:: 0.17
*shuffle* parameter used in the Coordinate Descent solver.
regularization : {'both', 'components', 'transformation', None}, \
default='both'
Select whether the regularization affects the components (H), the
transformation (W), both or none of them.
.. versionadded:: 0.24
Attributes
----------
components_ : ndarray of shape (n_components, n_features)
Factorization matrix, sometimes called 'dictionary'.
n_components_ : int
The number of components. It is the same as the `n_components`
parameter if it was given. Otherwise, it is the same as the number of
features.
reconstruction_err_ : float
Frobenius norm of the matrix difference, or beta-divergence, between
the training data ``X`` and the reconstructed data ``WH`` from
the fitted model.
n_iter_ : int
Actual number of iterations.
Examples
--------
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
>>> from sklearn.decomposition import NMF
>>> model = NMF(n_components=2, init='random', random_state=0)
>>> W = model.fit_transform(X)
>>> H = model.components_
References
----------
Cichocki, Andrzej, and Phan, Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix
factorization with the beta-divergence. Neural Computation, 23(9).
"""
@_deprecate_positional_args
def __init__(self, n_components=None, *, init='warn', solver='cd',
beta_loss='frobenius', tol=1e-4, max_iter=200,
random_state=None, alpha=0., l1_ratio=0., verbose=0,
shuffle=False, regularization='both'):
self.n_components = n_components
self.init = init
self.solver = solver
self.beta_loss = beta_loss
self.tol = tol
self.max_iter = max_iter
self.random_state = random_state
self.alpha = alpha
self.l1_ratio = l1_ratio
self.verbose = verbose
self.shuffle = shuffle
self.regularization = regularization
def _more_tags(self):
return {'requires_positive_X': True}
def fit_transform(self, X, y=None, W=None, H=None):
"""Learn a NMF model for the data X and returns the transformed data.
This is more efficient than calling fit followed by transform.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Data matrix to be decomposed
y : Ignored
W : array-like of shape (n_samples, n_components)
If init='custom', it is used as initial guess for the solution.
H : array-like of shape (n_components, n_features)
If init='custom', it is used as initial guess for the solution.
Returns
-------
W : ndarray of shape (n_samples, n_components)
Transformed data.
"""
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
dtype=[np.float64, np.float32])
with config_context(assume_finite=True):
W, H, n_iter_ = non_negative_factorization(
X=X, W=W, H=H, n_components=self.n_components, init=self.init,
update_H=True, solver=self.solver, beta_loss=self.beta_loss,
tol=self.tol, max_iter=self.max_iter, alpha=self.alpha,
l1_ratio=self.l1_ratio, regularization=self.regularization,
random_state=self.random_state, verbose=self.verbose,
shuffle=self.shuffle)
self.reconstruction_err_ = _beta_divergence(X, W, H, self.beta_loss,
square_root=True)
self.n_components_ = H.shape[0]
self.components_ = H
self.n_iter_ = n_iter_
return W
def fit(self, X, y=None, **params):
"""Learn a NMF model for the data X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Data matrix to be decomposed
y : Ignored
Returns
-------
self
"""
self.fit_transform(X, **params)
return self
def transform(self, X):
"""Transform the data X according to the fitted NMF model.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Data matrix to be transformed by the model.
Returns
-------
W : ndarray of shape (n_samples, n_components)
Transformed data.
"""
check_is_fitted(self)
X = self._validate_data(X, accept_sparse=('csr', 'csc'),
dtype=[np.float64, np.float32],
reset=False)
with config_context(assume_finite=True):
W, _, n_iter_ = non_negative_factorization(
X=X, W=None, H=self.components_,
n_components=self.n_components_,
init=self.init, update_H=False, solver=self.solver,
beta_loss=self.beta_loss, tol=self.tol, max_iter=self.max_iter,
alpha=self.alpha, l1_ratio=self.l1_ratio,
regularization=self.regularization,
random_state=self.random_state,
verbose=self.verbose, shuffle=self.shuffle)
return W
def inverse_transform(self, W):
"""Transform data back to its original space.
Parameters
----------
W : {ndarray, sparse matrix} of shape (n_samples, n_components)
Transformed data matrix.
Returns
-------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
Data matrix of original shape.
.. versionadded:: 0.18
"""
check_is_fitted(self)
return np.dot(W, self.components_)