""" Ridge regression """ # Author: Mathieu Blondel # Reuben Fletcher-Costin # Fabian Pedregosa # Michael Eickenberg # License: BSD 3 clause from abc import ABCMeta, abstractmethod import warnings import numpy as np from scipy import linalg from scipy import sparse from scipy.sparse import linalg as sp_linalg from ._base import LinearClassifierMixin, LinearModel, _rescale_data from ._sag import sag_solver from ..base import RegressorMixin, MultiOutputMixin, is_classifier from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_array from ..utils import check_consistent_length from ..utils import compute_sample_weight from ..utils import column_or_1d from ..utils.validation import _check_sample_weight from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics import check_scoring from ..exceptions import ConvergenceWarning from ..utils.sparsefuncs import mean_variance_axis def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0, X_offset=None, X_scale=None): def _get_rescaled_operator(X): X_offset_scale = X_offset / X_scale def matvec(b): return X.dot(b) - b.dot(X_offset_scale) def rmatvec(b): return X.T.dot(b) - X_offset_scale * np.sum(b) X1 = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec) return X1 n_samples, n_features = X.shape if X_offset is None or X_scale is None: X1 = sp_linalg.aslinearoperator(X) else: X1 = _get_rescaled_operator(X) coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) if n_features > n_samples: def create_mv(curr_alpha): def _mv(x): return X1.matvec(X1.rmatvec(x)) + curr_alpha * x return _mv else: def create_mv(curr_alpha): def _mv(x): return X1.rmatvec(X1.matvec(x)) + curr_alpha * x return _mv for i in range(y.shape[1]): y_column = y[:, i] mv = create_mv(alpha[i]) if n_features > n_samples: # kernel ridge # w = X.T * inv(X X^t + alpha*Id) y C = sp_linalg.LinearOperator( (n_samples, n_samples), matvec=mv, dtype=X.dtype) # FIXME atol try: coef, info = sp_linalg.cg(C, y_column, tol=tol, atol='legacy') except TypeError: # old scipy coef, info = sp_linalg.cg(C, y_column, tol=tol) coefs[i] = X1.rmatvec(coef) else: # linear ridge # w = inv(X^t X + alpha*Id) * X.T y y_column = X1.rmatvec(y_column) C = sp_linalg.LinearOperator( (n_features, n_features), matvec=mv, dtype=X.dtype) # FIXME atol try: coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter, tol=tol, atol='legacy') except TypeError: # old scipy coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter, tol=tol) if info < 0: raise ValueError("Failed with error code %d" % info) if max_iter is None and info > 0 and verbose: warnings.warn("sparse_cg did not converge after %d iterations." % info, ConvergenceWarning) return coefs def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3): n_samples, n_features = X.shape coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) n_iter = np.empty(y.shape[1], dtype=np.int32) # According to the lsqr documentation, alpha = damp^2. sqrt_alpha = np.sqrt(alpha) for i in range(y.shape[1]): y_column = y[:, i] info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter) coefs[i] = info[0] n_iter[i] = info[2] return coefs, n_iter def _solve_cholesky(X, y, alpha): # w = inv(X^t X + alpha*Id) * X.T y n_features = X.shape[1] n_targets = y.shape[1] A = safe_sparse_dot(X.T, X, dense_output=True) Xy = safe_sparse_dot(X.T, y, dense_output=True) one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]]) if one_alpha: A.flat[::n_features + 1] += alpha[0] return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T else: coefs = np.empty([n_targets, n_features], dtype=X.dtype) for coef, target, current_alpha in zip(coefs, Xy.T, alpha): A.flat[::n_features + 1] += current_alpha coef[:] = linalg.solve(A, target, sym_pos=True, overwrite_a=False).ravel() A.flat[::n_features + 1] -= current_alpha return coefs def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False): # dual_coef = inv(X X^t + alpha*Id) y n_samples = K.shape[0] n_targets = y.shape[1] if copy: K = K.copy() alpha = np.atleast_1d(alpha) one_alpha = (alpha == alpha[0]).all() has_sw = isinstance(sample_weight, np.ndarray) \ or sample_weight not in [1.0, None] if has_sw: # Unlike other solvers, we need to support sample_weight directly # because K might be a pre-computed kernel. sw = np.sqrt(np.atleast_1d(sample_weight)) y = y * sw[:, np.newaxis] K *= np.outer(sw, sw) if one_alpha: # Only one penalty, we can solve multi-target problems in one time. K.flat[::n_samples + 1] += alpha[0] try: # Note: we must use overwrite_a=False in order to be able to # use the fall-back solution below in case a LinAlgError # is raised dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False) except np.linalg.LinAlgError: warnings.warn("Singular matrix in solving dual problem. Using " "least-squares solution instead.") dual_coef = linalg.lstsq(K, y)[0] # K is expensive to compute and store in memory so change it back in # case it was user-given. K.flat[::n_samples + 1] -= alpha[0] if has_sw: dual_coef *= sw[:, np.newaxis] return dual_coef else: # One penalty per target. We need to solve each target separately. dual_coefs = np.empty([n_targets, n_samples], K.dtype) for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha): K.flat[::n_samples + 1] += current_alpha dual_coef[:] = linalg.solve(K, target, sym_pos=True, overwrite_a=False).ravel() K.flat[::n_samples + 1] -= current_alpha if has_sw: dual_coefs *= sw[np.newaxis, :] return dual_coefs.T def _solve_svd(X, y, alpha): U, s, Vt = linalg.svd(X, full_matrices=False) idx = s > 1e-15 # same default value as scipy.linalg.pinv s_nnz = s[idx][:, np.newaxis] UTy = np.dot(U.T, y) d = np.zeros((s.size, alpha.size), dtype=X.dtype) d[idx] = s_nnz / (s_nnz ** 2 + alpha) d_UT_y = d * UTy return np.dot(Vt.T, d_UT_y).T def _get_valid_accept_sparse(is_X_sparse, solver): if is_X_sparse and solver in ['auto', 'sag', 'saga']: return 'csr' else: return ['csr', 'csc', 'coo'] @_deprecate_positional_args def ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, check_input=True): """Solve the ridge equation by the method of normal equations. Read more in the :ref:`User Guide `. Parameters ---------- X : {ndarray, sparse matrix, LinearOperator} of shape \ (n_samples, n_features) Training data y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values alpha : float or array-like of shape (n_targets,) Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``1 / (2C)`` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. sample_weight : float or array-like of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. If sample_weight is not None and solver='auto', the solver will be set to 'cholesky'. .. versionadded:: 0.17 solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution via a Cholesky decomposition of dot(X.T, X) - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its improved, unbiased version named SAGA. Both methods also use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. All last five solvers support both dense and sparse data. However, only 'sag' and 'sparse_cg' supports sparse input when `fit_intercept` is True. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 SAGA solver. max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For the 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' and saga solver, the default value is 1000. tol : float, default=1e-3 Precision of the solution. verbose : int, default=0 Verbosity level. Setting verbose > 0 will display additional information depending on the solver used. random_state : int, RandomState instance, default=None Used when ``solver`` == 'sag' or 'saga' to shuffle the data. See :term:`Glossary ` for details. return_n_iter : bool, default=False If True, the method also returns `n_iter`, the actual number of iteration performed by the solver. .. versionadded:: 0.17 return_intercept : bool, default=False If True and if X is sparse, the method also returns the intercept, and the solver is automatically changed to 'sag'. This is only a temporary fix for fitting the intercept with sparse data. For dense data, use sklearn.linear_model._preprocess_data before your regression. .. versionadded:: 0.17 check_input : bool, default=True If False, the input arrays X and y will not be checked. .. versionadded:: 0.21 Returns ------- coef : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). n_iter : int, optional The actual number of iteration performed by the solver. Only returned if `return_n_iter` is True. intercept : float or ndarray of shape (n_targets,) The intercept of the model. Only returned if `return_intercept` is True and if X is a scipy sparse array. Notes ----- This function won't compute the intercept. """ return _ridge_regression(X, y, alpha, sample_weight=sample_weight, solver=solver, max_iter=max_iter, tol=tol, verbose=verbose, random_state=random_state, return_n_iter=return_n_iter, return_intercept=return_intercept, X_scale=None, X_offset=None, check_input=check_input) def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, X_scale=None, X_offset=None, check_input=True): has_sw = sample_weight is not None if solver == 'auto': if return_intercept: # only sag supports fitting intercept directly solver = "sag" elif not sparse.issparse(X): solver = "cholesky" else: solver = "sparse_cg" if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag', 'saga'): raise ValueError("Known solvers are 'sparse_cg', 'cholesky', 'svd'" " 'lsqr', 'sag' or 'saga'. Got %s." % solver) if return_intercept and solver != 'sag': raise ValueError("In Ridge, only 'sag' solver can directly fit the " "intercept. Please change solver to 'sag' or set " "return_intercept=False.") if check_input: _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, order="C") y = check_array(y, dtype=X.dtype, ensure_2d=False, order=None) check_consistent_length(X, y) n_samples, n_features = X.shape if y.ndim > 2: raise ValueError("Target y has the wrong shape %s" % str(y.shape)) ravel = False if y.ndim == 1: y = y.reshape(-1, 1) ravel = True n_samples_, n_targets = y.shape if n_samples != n_samples_: raise ValueError("Number of samples in X and y does not correspond:" " %d != %d" % (n_samples, n_samples_)) if has_sw: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) if solver not in ['sag', 'saga']: # SAG supports sample_weight directly. For other solvers, # we implement sample_weight via a simple rescaling. X, y = _rescale_data(X, y, sample_weight) # There should be either 1 or n_targets penalties alpha = np.asarray(alpha, dtype=X.dtype).ravel() if alpha.size not in [1, n_targets]: raise ValueError("Number of targets and number of penalties " "do not correspond: %d != %d" % (alpha.size, n_targets)) if alpha.size == 1 and n_targets > 1: alpha = np.repeat(alpha, n_targets) n_iter = None if solver == 'sparse_cg': coef = _solve_sparse_cg(X, y, alpha, max_iter=max_iter, tol=tol, verbose=verbose, X_offset=X_offset, X_scale=X_scale) elif solver == 'lsqr': coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol) elif solver == 'cholesky': if n_features > n_samples: K = safe_sparse_dot(X, X.T, dense_output=True) try: dual_coef = _solve_cholesky_kernel(K, y, alpha) coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' else: try: coef = _solve_cholesky(X, y, alpha) except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' elif solver in ['sag', 'saga']: # precompute max_squared_sum for all targets max_squared_sum = row_norms(X, squared=True).max() coef = np.empty((y.shape[1], n_features), dtype=X.dtype) n_iter = np.empty(y.shape[1], dtype=np.int32) intercept = np.zeros((y.shape[1], ), dtype=X.dtype) for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): init = {'coef': np.zeros((n_features + int(return_intercept), 1), dtype=X.dtype)} coef_, n_iter_, _ = sag_solver( X, target.ravel(), sample_weight, 'squared', alpha_i, 0, max_iter, tol, verbose, random_state, False, max_squared_sum, init, is_saga=solver == 'saga') if return_intercept: coef[i] = coef_[:-1] intercept[i] = coef_[-1] else: coef[i] = coef_ n_iter[i] = n_iter_ if intercept.shape[0] == 1: intercept = intercept[0] coef = np.asarray(coef) if solver == 'svd': if sparse.issparse(X): raise TypeError('SVD solver does not support sparse' ' inputs currently') coef = _solve_svd(X, y, alpha) if ravel: # When y was passed as a 1d-array, we flatten the coefficients. coef = coef.ravel() if return_n_iter and return_intercept: return coef, n_iter, intercept elif return_intercept: return coef, intercept elif return_n_iter: return coef, n_iter else: return coef class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.normalize = normalize self.copy_X = copy_X self.max_iter = max_iter self.tol = tol self.solver = solver self.random_state = random_state def fit(self, X, y, sample_weight=None): # all other solvers work at both float precision levels _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, dtype=_dtype, multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( "solver='{}' does not support fitting the intercept " "on sparse data. Please set the solver to 'auto' or " "'sparse_cg', 'sag', or set `fit_intercept=False`" .format(self.solver)) if (self.solver == 'sag' and self.max_iter is None and self.tol > 1e-4): warnings.warn( '"sag" solver requires many iterations to fit ' 'an intercept with sparse inputs. Either set the ' 'solver to "auto" or "sparse_cg", or set a low ' '"tol" and a high "max_iter" (especially if inputs are ' 'not standardized).') solver = 'sag' else: solver = 'sparse_cg' else: solver = self.solver if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) # when X is sparse we only remove offset from y X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight, return_mean=True) if solver == 'sag' and sparse.issparse(X) and self.fit_intercept: self.coef_, self.n_iter_, self.intercept_ = _ridge_regression( X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver='sag', random_state=self.random_state, return_n_iter=True, return_intercept=True, check_input=False) # add the offset which was subtracted by _preprocess_data self.intercept_ += y_offset else: if sparse.issparse(X) and self.fit_intercept: # required to fit intercept with sparse_cg solver params = {'X_offset': X_offset, 'X_scale': X_scale} else: # for dense matrices or when intercept is set to 0 params = {} self.coef_, self.n_iter_ = _ridge_regression( X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver=solver, random_state=self.random_state, return_n_iter=True, return_intercept=False, check_input=False, **params) self._set_intercept(X_offset, y_offset, X_scale) return self class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): """Linear least squares with l2 regularization. Minimizes the objective function:: ||y - Xw||^2_2 + alpha * ||w||^2_2 This model solves a regression model where the loss function is the linear least squares function and regularization is given by the l2-norm. Also known as Ridge Regression or Tikhonov regularization. This estimator has built-in support for multi-variate regression (i.e., when y is a 2d-array of shape (n_samples, n_targets)). Read more in the :ref:`User Guide `. Parameters ---------- alpha : {float, ndarray of shape (n_targets,)}, default=1.0 Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``1 / (2C)`` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. fit_intercept : bool, default=True Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations (i.e. ``X`` and ``y`` are expected to be centered). normalize : bool, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. tol : float, default=1e-3 Precision of the solution. solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution. - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its improved, unbiased version named SAGA. Both methods also use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. All last five solvers support both dense and sparse data. However, only 'sag' and 'sparse_cg' supports sparse input when `fit_intercept` is True. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 SAGA solver. random_state : int, RandomState instance, default=None Used when ``solver`` == 'sag' or 'saga' to shuffle the data. See :term:`Glossary ` for details. .. versionadded:: 0.17 `random_state` to support Stochastic Average Gradient. Attributes ---------- coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Weight vector(s). intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. .. versionadded:: 0.17 See Also -------- RidgeClassifier : Ridge classifier. RidgeCV : Ridge regression with built-in cross validation. :class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression combines ridge regression with the kernel trick. Examples -------- >>> from sklearn.linear_model import Ridge >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> rng = np.random.RandomState(0) >>> y = rng.randn(n_samples) >>> X = rng.randn(n_samples, n_features) >>> clf = Ridge(alpha=1.0) >>> clf.fit(X, y) Ridge() """ @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): super().__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, random_state=random_state) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model. Parameters ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. Returns ------- self : returns an instance of self. """ return super().fit(X, y, sample_weight=sample_weight) class RidgeClassifier(LinearClassifierMixin, _BaseRidge): """Classifier using Ridge regression. This classifier first converts the target values into ``{-1, 1}`` and then treats the problem as a regression task (multi-output regression in the multiclass case). Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1.0 Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``1 / (2C)`` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). normalize : bool, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. copy_X : bool, default=True If True, X will be copied; else, it may be overwritten. max_iter : int, default=None Maximum number of iterations for conjugate gradient solver. The default value is determined by scipy.sparse.linalg. tol : float, default=1e-3 Precision of the solution. class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}, \ default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution. - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its unbiased and more flexible version named SAGA. Both methods use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 SAGA solver. random_state : int, RandomState instance, default=None Used when ``solver`` == 'sag' or 'saga' to shuffle the data. See :term:`Glossary ` for details. Attributes ---------- coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. n_iter_ : None or ndarray of shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. classes_ : ndarray of shape (n_classes,) The classes labels. See Also -------- Ridge : Ridge regression. RidgeClassifierCV : Ridge classifier with built-in cross validation. Notes ----- For multi-class classification, n_class classifiers are trained in a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. Examples -------- >>> from sklearn.datasets import load_breast_cancer >>> from sklearn.linear_model import RidgeClassifier >>> X, y = load_breast_cancer(return_X_y=True) >>> clf = RidgeClassifier().fit(X, y) >>> clf.score(X, y) 0.9595... """ @_deprecate_positional_args def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", random_state=None): super().__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, random_state=random_state) self.class_weight = class_weight def fit(self, X, y, sample_weight=None): """Fit Ridge classifier model. Parameters ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. y : ndarray of shape (n_samples,) Target values. sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. .. versionadded:: 0.17 *sample_weight* support to Classifier. Returns ------- self : object Instance of the estimator. """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) if not self._label_binarizer.y_type_.startswith('multilabel'): y = column_or_1d(y, warn=True) else: # we don't (yet) support multi-label classification in Ridge raise ValueError( "%s doesn't support multi-label classification" % ( self.__class__.__name__)) if self.class_weight: # modify the sample weights with the corresponding class weight sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) super().fit(X, Y, sample_weight=sample_weight) return self @property def classes_(self): return self._label_binarizer.classes_ def _check_gcv_mode(X, gcv_mode): possible_gcv_modes = [None, 'auto', 'svd', 'eigen'] if gcv_mode not in possible_gcv_modes: raise ValueError( "Unknown value for 'gcv_mode'. " "Got {} instead of one of {}" .format( gcv_mode, possible_gcv_modes)) if gcv_mode in ['eigen', 'svd']: return gcv_mode # if X has more rows than columns, use decomposition of X^T.X, # otherwise X.X^T if X.shape[0] > X.shape[1]: return 'svd' return 'eigen' def _find_smallest_angle(query, vectors): """Find the column of vectors that is most aligned with the query. Both query and the columns of vectors must have their l2 norm equal to 1. Parameters ---------- query : ndarray of shape (n_samples,) Normalized query vector. vectors : ndarray of shape (n_samples, n_features) Vectors to which we compare query, as columns. Must be normalized. """ abs_cosine = np.abs(query.dot(vectors)) index = np.argmax(abs_cosine) return index class _X_CenterStackOp(sparse.linalg.LinearOperator): """Behaves as centered and scaled X with an added intercept column. This operator behaves as np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]) """ def __init__(self, X, X_mean, sqrt_sw): n_samples, n_features = X.shape super().__init__(X.dtype, (n_samples, n_features + 1)) self.X = X self.X_mean = X_mean self.sqrt_sw = sqrt_sw def _matvec(self, v): v = v.ravel() return safe_sparse_dot( self.X, v[:-1], dense_output=True ) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw def _matmat(self, v): return ( safe_sparse_dot(self.X, v[:-1], dense_output=True) - self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw[:, None]) def _transpose(self): return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw) class _XT_CenterStackOp(sparse.linalg.LinearOperator): """Behaves as transposed centered and scaled X with an intercept column. This operator behaves as np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T """ def __init__(self, X, X_mean, sqrt_sw): n_samples, n_features = X.shape super().__init__(X.dtype, (n_features + 1, n_samples)) self.X = X self.X_mean = X_mean self.sqrt_sw = sqrt_sw def _matvec(self, v): v = v.ravel() n_features = self.shape[0] res = np.empty(n_features, dtype=self.X.dtype) res[:-1] = ( safe_sparse_dot(self.X.T, v, dense_output=True) - (self.X_mean * self.sqrt_sw.dot(v)) ) res[-1] = np.dot(v, self.sqrt_sw) return res def _matmat(self, v): n_features = self.shape[0] res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype) res[:-1] = ( safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[:, None] * self.sqrt_sw.dot(v) ) res[-1] = np.dot(self.sqrt_sw, v) return res class _IdentityRegressor: """Fake regressor which will directly output the prediction.""" def decision_function(self, y_predict): return y_predict def predict(self, y_predict): return y_predict class _IdentityClassifier(LinearClassifierMixin): """Fake classifier which will directly output the prediction. We inherit from LinearClassifierMixin to get the proper shape for the output `y`. """ def __init__(self, classes): self.classes_ = classes def decision_function(self, y_predict): return y_predict class _RidgeGCV(LinearModel): """Ridge regression with built-in Leave-one-out Cross-Validation. This class is not intended to be used directly. Use RidgeCV instead. Notes ----- We want to solve (K + alpha*Id)c = y, where K = X X^T is the kernel matrix. Let G = (K + alpha*Id). Dual solution: c = G^-1y Primal solution: w = X^T c Compute eigendecomposition K = Q V Q^T. Then G^-1 = Q (V + alpha*Id)^-1 Q^T, where (V + alpha*Id) is diagonal. It is thus inexpensive to inverse for many alphas. Let loov be the vector of prediction values for each example when the model was fitted with all examples but this example. loov = (KG^-1Y - diag(KG^-1)Y) / diag(I-KG^-1) Let looe be the vector of prediction errors for each example when the model was fitted with all examples but this example. looe = y - loov = c / diag(G^-1) The best score (negative mean squared error or user-provided scoring) is stored in the `best_score_` attribute, and the selected hyperparameter in `alpha_`. References ---------- http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, is_clf=False, alpha_per_target=False): self.alphas = np.asarray(alphas) self.fit_intercept = fit_intercept self.normalize = normalize self.scoring = scoring self.copy_X = copy_X self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values self.is_clf = is_clf self.alpha_per_target = alpha_per_target @staticmethod def _decomp_diag(v_prime, Q): # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) return (v_prime * Q ** 2).sum(axis=-1) @staticmethod def _diag_dot(D, B): # compute dot(diag(D), B) if len(B.shape) > 1: # handle case where B is > 1-d D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)] return D * B def _compute_gram(self, X, sqrt_sw): """Computes the Gram matrix XX^T with possible centering. Parameters ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) The preprocessed design matrix. sqrt_sw : ndarray of shape (n_samples,) square roots of sample weights Returns ------- gram : ndarray of shape (n_samples, n_samples) The Gram matrix. X_mean : ndarray of shape (n_feature,) The weighted mean of ``X`` for each feature. Notes ----- When X is dense the centering has been done in preprocessing so the mean is 0 and we just compute XX^T. When X is sparse it has not been centered in preprocessing, but it has been scaled by sqrt(sample weights). When self.fit_intercept is False no centering is done. The centered X is never actually computed because centering would break the sparsity of X. """ center = self.fit_intercept and sparse.issparse(X) if not center: # in this case centering has been done in preprocessing # or we are not fitting an intercept. X_mean = np.zeros(X.shape[1], dtype=X.dtype) return safe_sparse_dot(X, X.T, dense_output=True), X_mean # X is sparse n_samples = X.shape[0] sample_weight_matrix = sparse.dia_matrix( (sqrt_sw, 0), shape=(n_samples, n_samples)) X_weighted = sample_weight_matrix.dot(X) X_mean, _ = mean_variance_axis(X_weighted, axis=0) X_mean *= n_samples / sqrt_sw.dot(sqrt_sw) X_mX = sqrt_sw[:, None] * safe_sparse_dot( X_mean, X.T, dense_output=True) X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean) return (safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m - X_mX - X_mX.T, X_mean) def _compute_covariance(self, X, sqrt_sw): """Computes covariance matrix X^TX with possible centering. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) The preprocessed design matrix. sqrt_sw : ndarray of shape (n_samples,) square roots of sample weights Returns ------- covariance : ndarray of shape (n_features, n_features) The covariance matrix. X_mean : ndarray of shape (n_feature,) The weighted mean of ``X`` for each feature. Notes ----- Since X is sparse it has not been centered in preprocessing, but it has been scaled by sqrt(sample weights). When self.fit_intercept is False no centering is done. The centered X is never actually computed because centering would break the sparsity of X. """ if not self.fit_intercept: # in this case centering has been done in preprocessing # or we are not fitting an intercept. X_mean = np.zeros(X.shape[1], dtype=X.dtype) return safe_sparse_dot(X.T, X, dense_output=True), X_mean # this function only gets called for sparse X n_samples = X.shape[0] sample_weight_matrix = sparse.dia_matrix( (sqrt_sw, 0), shape=(n_samples, n_samples)) X_weighted = sample_weight_matrix.dot(X) X_mean, _ = mean_variance_axis(X_weighted, axis=0) X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw) weight_sum = sqrt_sw.dot(sqrt_sw) return (safe_sparse_dot(X.T, X, dense_output=True) - weight_sum * np.outer(X_mean, X_mean), X_mean) def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) without explicitely centering X nor computing X.dot(A) when X is sparse. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) A : ndarray of shape (n_features, n_features) X_mean : ndarray of shape (n_features,) sqrt_sw : ndarray of shape (n_features,) square roots of sample weights Returns ------- diag : np.ndarray, shape (n_samples,) The computed diagonal. """ intercept_col = scale = sqrt_sw batch_size = X.shape[1] diag = np.empty(X.shape[0], dtype=X.dtype) for start in range(0, X.shape[0], batch_size): batch = slice(start, min(X.shape[0], start + batch_size), 1) X_batch = np.empty( (X[batch].shape[0], X.shape[1] + self.fit_intercept), dtype=X.dtype ) if self.fit_intercept: X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None] X_batch[:, -1] = intercept_col[batch] else: X_batch = X[batch].A diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1) return diag def _eigen_decompose_gram(self, X, y, sqrt_sw): """Eigendecomposition of X.X^T, used when n_samples <= n_features.""" # if X is dense it has already been centered in preprocessing K, X_mean = self._compute_gram(X, sqrt_sw) if self.fit_intercept: # to emulate centering X with sample weights, # ie removing the weighted average, we add a column # containing the square roots of the sample weights. # by centering, it is orthogonal to the other columns K += np.outer(sqrt_sw, sqrt_sw) eigvals, Q = linalg.eigh(K) QT_y = np.dot(Q.T, y) return X_mean, eigvals, Q, QT_y def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y): """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X.X^T (n_samples <= n_features). """ w = 1. / (eigvals + alpha) if self.fit_intercept: # the vector containing the square roots of the sample weights (1 # when no sample weights) is the eigenvector of XX^T which # corresponds to the intercept; we cancel the regularization on # this dimension. the corresponding eigenvalue is # sum(sample_weight). normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) intercept_dim = _find_smallest_angle(normalized_sw, Q) w[intercept_dim] = 0 # cancel regularization for the intercept c = np.dot(Q, self._diag_dot(w, QT_y)) G_inverse_diag = self._decomp_diag(w, Q) # handle case where y is 2-d if len(y.shape) != 1: G_inverse_diag = G_inverse_diag[:, np.newaxis] return G_inverse_diag, c def _eigen_decompose_covariance(self, X, y, sqrt_sw): """Eigendecomposition of X^T.X, used when n_samples > n_features and X is sparse. """ n_samples, n_features = X.shape cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype) cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw) if not self.fit_intercept: cov = cov[:-1, :-1] # to emulate centering X with sample weights, # ie removing the weighted average, we add a column # containing the square roots of the sample weights. # by centering, it is orthogonal to the other columns # when all samples have the same weight we add a column of 1 else: cov[-1] = 0 cov[:, -1] = 0 cov[-1, -1] = sqrt_sw.dot(sqrt_sw) nullspace_dim = max(0, n_features - n_samples) eigvals, V = linalg.eigh(cov) # remove eigenvalues and vectors in the null space of X^T.X eigvals = eigvals[nullspace_dim:] V = V[:, nullspace_dim:] return X_mean, eigvals, V, X def _solve_eigen_covariance_no_intercept( self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X^T.X (n_samples > n_features and X is sparse), and not fitting an intercept. """ w = 1 / (eigvals + alpha) A = (V * w).dot(V.T) AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) y_hat = safe_sparse_dot(X, AXy, dense_output=True) hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) if len(y.shape) != 1: # handle case where y is 2-d hat_diag = hat_diag[:, np.newaxis] return (1 - hat_diag) / alpha, (y - y_hat) / alpha def _solve_eigen_covariance_intercept( self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X^T.X (n_samples > n_features and X is sparse), and we are fitting an intercept. """ # the vector [0, 0, ..., 0, 1] # is the eigenvector of X^TX which # corresponds to the intercept; we cancel the regularization on # this dimension. the corresponding eigenvalue is # sum(sample_weight), e.g. n when uniform sample weights. intercept_sv = np.zeros(V.shape[0]) intercept_sv[-1] = 1 intercept_dim = _find_smallest_angle(intercept_sv, V) w = 1 / (eigvals + alpha) w[intercept_dim] = 1 / eigvals[intercept_dim] A = (V * w).dot(V.T) # add a column to X containing the square roots of sample weights X_op = _X_CenterStackOp(X, X_mean, sqrt_sw) AXy = A.dot(X_op.T.dot(y)) y_hat = X_op.dot(AXy) hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) # return (1 - hat_diag), (y - y_hat) if len(y.shape) != 1: # handle case where y is 2-d hat_diag = hat_diag[:, np.newaxis] return (1 - hat_diag) / alpha, (y - y_hat) / alpha def _solve_eigen_covariance( self, alpha, y, sqrt_sw, X_mean, eigvals, V, X): """Compute dual coefficients and diagonal of G^-1. Used when we have a decomposition of X^T.X (n_samples > n_features and X is sparse). """ if self.fit_intercept: return self._solve_eigen_covariance_intercept( alpha, y, sqrt_sw, X_mean, eigvals, V, X) return self._solve_eigen_covariance_no_intercept( alpha, y, sqrt_sw, X_mean, eigvals, V, X) def _svd_decompose_design_matrix(self, X, y, sqrt_sw): # X already centered X_mean = np.zeros(X.shape[1], dtype=X.dtype) if self.fit_intercept: # to emulate fit_intercept=True situation, add a column # containing the square roots of the sample weights # by centering, the other columns are orthogonal to that one intercept_column = sqrt_sw[:, None] X = np.hstack((X, intercept_column)) U, singvals, _ = linalg.svd(X, full_matrices=0) singvals_sq = singvals ** 2 UT_y = np.dot(U.T, y) return X_mean, singvals_sq, U, UT_y def _solve_svd_design_matrix( self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y): """Compute dual coefficients and diagonal of G^-1. Used when we have an SVD decomposition of X (n_samples > n_features and X is dense). """ w = ((singvals_sq + alpha) ** -1) - (alpha ** -1) if self.fit_intercept: # detect intercept column normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) intercept_dim = _find_smallest_angle(normalized_sw, U) # cancel the regularization for the intercept w[intercept_dim] = - (alpha ** -1) c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y G_inverse_diag = self._decomp_diag(w, U) + (alpha ** -1) if len(y.shape) != 1: # handle case where y is 2-d G_inverse_diag = G_inverse_diag[:, np.newaxis] return G_inverse_diag, c def fit(self, X, y, sample_weight=None): """Fit Ridge regression model with gcv. Parameters ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data. Will be cast to float64 if necessary. y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to float64 if necessary. sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. Returns ------- self : object """ X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64], multi_output=True, y_numeric=True) # alpha_per_target cannot be used in classifier mode. All subclasses # of _RidgeGCV that are classifiers keep alpha_per_target at its # default value: False, so the condition below should never happen. assert not (self.is_clf and self.alpha_per_target) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) if np.any(self.alphas <= 0): raise ValueError( "alphas must be positive. Got {} containing some " "negative or null value instead.".format(self.alphas)) X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) gcv_mode = _check_gcv_mode(X, self.gcv_mode) if gcv_mode == 'eigen': decompose = self._eigen_decompose_gram solve = self._solve_eigen_gram elif gcv_mode == 'svd': if sparse.issparse(X): decompose = self._eigen_decompose_covariance solve = self._solve_eigen_covariance else: decompose = self._svd_decompose_design_matrix solve = self._solve_svd_design_matrix n_samples = X.shape[0] if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) sqrt_sw = np.sqrt(sample_weight) else: sqrt_sw = np.ones(n_samples, dtype=X.dtype) X_mean, *decomposition = decompose(X, y, sqrt_sw) scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None n_y = 1 if len(y.shape) == 1 else y.shape[1] n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas) if self.store_cv_values: self.cv_values_ = np.empty( (n_samples * n_y, n_alphas), dtype=X.dtype) best_coef, best_score, best_alpha = None, None, None for i, alpha in enumerate(np.atleast_1d(self.alphas)): G_inverse_diag, c = solve( float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: squared_errors = (c / G_inverse_diag) ** 2 if self.alpha_per_target: alpha_score = -squared_errors.mean(axis=0) else: alpha_score = -squared_errors.mean() if self.store_cv_values: self.cv_values_[:, i] = squared_errors.ravel() else: predictions = y - (c / G_inverse_diag) if self.store_cv_values: self.cv_values_[:, i] = predictions.ravel() if self.is_clf: identity_estimator = _IdentityClassifier( classes=np.arange(n_y) ) alpha_score = scorer(identity_estimator, predictions, y.argmax(axis=1)) else: identity_estimator = _IdentityRegressor() if self.alpha_per_target: alpha_score = np.array([ scorer(identity_estimator, predictions[:, j], y[:, j]) for j in range(n_y) ]) else: alpha_score = scorer(identity_estimator, predictions.ravel(), y.ravel()) # Keep track of the best model if best_score is None: # initialize if self.alpha_per_target and n_y > 1: best_coef = c best_score = np.atleast_1d(alpha_score) best_alpha = np.full(n_y, alpha) else: best_coef = c best_score = alpha_score best_alpha = alpha else: # update if self.alpha_per_target and n_y > 1: to_update = alpha_score > best_score best_coef[:, to_update] = c[:, to_update] best_score[to_update] = alpha_score[to_update] best_alpha[to_update] = alpha elif alpha_score > best_score: best_coef, best_score, best_alpha = c, alpha_score, alpha self.alpha_ = best_alpha self.best_score_ = best_score self.dual_coef_ = best_coef self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) X_offset += X_mean * X_scale self._set_intercept(X_offset, y_offset, X_scale) if self.store_cv_values: if len(y.shape) == 1: cv_values_shape = n_samples, n_alphas else: cv_values_shape = n_samples, n_y, n_alphas self.cv_values_ = self.cv_values_.reshape(cv_values_shape) return self class _BaseRidgeCV(LinearModel): @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False, alpha_per_target=False): self.alphas = np.asarray(alphas) self.fit_intercept = fit_intercept self.normalize = normalize self.scoring = scoring self.cv = cv self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values self.alpha_per_target = alpha_per_target def fit(self, X, y, sample_weight=None): """Fit Ridge regression model with cv. Parameters ---------- X : ndarray of shape (n_samples, n_features) Training data. If using GCV, will be cast to float64 if necessary. y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target values. Will be cast to X's dtype if necessary. sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. Returns ------- self : object Notes ----- When sample_weight is provided, the selected hyperparameter may depend on whether we use leave-one-out cross-validation (cv=None or cv='auto') or another form of cross-validation, because only leave-one-out cross-validation takes the sample weights into account when computing the validation score. """ cv = self.cv if cv is None: estimator = _RidgeGCV(self.alphas, fit_intercept=self.fit_intercept, normalize=self.normalize, scoring=self.scoring, gcv_mode=self.gcv_mode, store_cv_values=self.store_cv_values, is_clf=is_classifier(self), alpha_per_target=self.alpha_per_target) estimator.fit(X, y, sample_weight=sample_weight) self.alpha_ = estimator.alpha_ self.best_score_ = estimator.best_score_ if self.store_cv_values: self.cv_values_ = estimator.cv_values_ else: if self.store_cv_values: raise ValueError("cv!=None and store_cv_values=True" " are incompatible") if self.alpha_per_target: raise ValueError("cv!=None and alpha_per_target=True" " are incompatible") parameters = {'alpha': self.alphas} solver = 'sparse_cg' if sparse.issparse(X) else 'auto' model = RidgeClassifier if is_classifier(self) else Ridge gs = GridSearchCV(model(fit_intercept=self.fit_intercept, normalize=self.normalize, solver=solver), parameters, cv=cv, scoring=self.scoring) gs.fit(X, y, sample_weight=sample_weight) estimator = gs.best_estimator_ self.alpha_ = gs.best_estimator_.alpha self.best_score_ = gs.best_score_ self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ self.n_features_in_ = estimator.n_features_in_ return self class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): """Ridge regression with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. By default, it performs efficient Leave-One-Out Cross-Validation. Read more in the :ref:`User Guide `. Parameters ---------- alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0) Array of alpha values to try. Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``1 / (2C)`` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. If using Leave-One-Out cross-validation, alphas must be positive. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). normalize : bool, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. If None, the negative mean squared error if cv is 'auto' or None (i.e. when using leave-one-out cross-validation), and r2 score otherwise. cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the efficient Leave-One-Out cross-validation - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if ``y`` is binary or multiclass, :class:`~sklearn.model_selection.StratifiedKFold` is used, else, :class:`~sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. gcv_mode : {'auto', 'svd', eigen'}, default='auto' Flag indicating which strategy to use when performing Leave-One-Out Cross-Validation. Options are:: 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen' 'svd' : force use of singular value decomposition of X when X is dense, eigenvalue decomposition of X^T.X when X is sparse. 'eigen' : force computation via eigendecomposition of X.X^T The 'auto' mode is the default and is intended to pick the cheaper option of the two depending on the shape of the training data. store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using Leave-One-Out Cross-Validation). alpha_per_target : bool, default=False Flag indicating whether to optimize the alpha value (picked from the `alphas` parameter list) for each target separately (for multi-output settings: multiple prediction targets). When set to `True`, after fitting, the `alpha_` attribute will contain a value for each target. When set to `False`, a single alpha is used for all targets. .. versionadded:: 0.24 Attributes ---------- cv_values_ : ndarray of shape (n_samples, n_alphas) or \ shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (only available if ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been called, this attribute will contain the mean squared errors (by default) or the values of the ``{loss,score}_func`` function (if provided in the constructor). coef_ : ndarray of shape (n_features) or (n_targets, n_features) Weight vector(s). intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. alpha_ : float or ndarray of shape (n_targets,) Estimated regularization parameter, or, if ``alpha_per_target=True``, the estimated regularization parameter for each target. best_score_ : float or ndarray of shape (n_targets,) Score of base estimator with best alpha, or, if ``alpha_per_target=True``, a score for each target. .. versionadded:: 0.23 Examples -------- >>> from sklearn.datasets import load_diabetes >>> from sklearn.linear_model import RidgeCV >>> X, y = load_diabetes(return_X_y=True) >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) >>> clf.score(X, y) 0.5166... See Also -------- Ridge : Ridge regression. RidgeClassifier : Ridge classifier. RidgeClassifierCV : Ridge classifier with built-in cross validation. """ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): """Ridge classifier with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. By default, it performs Leave-One-Out Cross-Validation. Currently, only the n_features > n_samples case is handled efficiently. Read more in the :ref:`User Guide `. Parameters ---------- alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0) Array of alpha values to try. Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``1 / (2C)`` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). normalize : bool, default=False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. scoring : string, callable, default=None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the efficient Leave-One-Out cross-validation - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` store_cv_values : bool, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the ``cv_values_`` attribute (see below). This flag is only compatible with ``cv=None`` (i.e. using Leave-One-Out Cross-Validation). Attributes ---------- cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional Cross-validation values for each alpha (if ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been called, this attribute will contain the mean squared errors (by default) or the values of the ``{loss,score}_func`` function (if provided in the constructor). This attribute exists only when ``store_cv_values`` is True. coef_ : ndarray of shape (1, n_features) or (n_targets, n_features) Coefficient of the features in the decision function. ``coef_`` is of shape (1, n_features) when the given problem is binary. intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. alpha_ : float Estimated regularization parameter. best_score_ : float Score of base estimator with best alpha. .. versionadded:: 0.23 classes_ : ndarray of shape (n_classes,) The classes labels. Examples -------- >>> from sklearn.datasets import load_breast_cancer >>> from sklearn.linear_model import RidgeClassifierCV >>> X, y = load_breast_cancer(return_X_y=True) >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y) >>> clf.score(X, y) 0.9630... See Also -------- Ridge : Ridge regression. RidgeClassifier : Ridge classifier. RidgeCV : Ridge regression with built-in cross validation. Notes ----- For multi-class classification, n_class classifiers are trained in a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. """ @_deprecate_positional_args def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, class_weight=None, store_cv_values=False): super().__init__( alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv, store_cv_values=store_cv_values) self.class_weight = class_weight def fit(self, X, y, sample_weight=None): """Fit Ridge classifier with cv. Parameters ---------- X : ndarray of shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. When using GCV, will be cast to float64 if necessary. y : ndarray of shape (n_samples,) Target values. Will be cast to X's dtype if necessary. sample_weight : float or ndarray of shape (n_samples,), default=None Individual weights for each sample. If given a float, every sample will have the same weight. Returns ------- self : object """ X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) if not self._label_binarizer.y_type_.startswith('multilabel'): y = column_or_1d(y, warn=True) if self.class_weight: # modify the sample weights with the corresponding class weight sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) target = Y if self.cv is None else y _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight) return self @property def classes_(self): return self._label_binarizer.classes_ def _more_tags(self): return { '_xfail_checks': { 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples', } }