785 lines
27 KiB
Python
785 lines
27 KiB
Python
|
"""
|
||
|
Various bayesian regression
|
||
|
"""
|
||
|
|
||
|
# Authors: V. Michel, F. Pedregosa, A. Gramfort
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
from math import log
|
||
|
from numbers import Integral, Real
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy import linalg
|
||
|
from scipy.linalg import pinvh
|
||
|
|
||
|
from ..base import RegressorMixin, _fit_context
|
||
|
from ..utils import _safe_indexing
|
||
|
from ..utils._param_validation import Interval
|
||
|
from ..utils.extmath import fast_logdet
|
||
|
from ..utils.validation import _check_sample_weight
|
||
|
from ._base import LinearModel, _preprocess_data, _rescale_data
|
||
|
|
||
|
###############################################################################
|
||
|
# BayesianRidge regression
|
||
|
|
||
|
|
||
|
class BayesianRidge(RegressorMixin, LinearModel):
|
||
|
"""Bayesian ridge regression.
|
||
|
|
||
|
Fit a Bayesian ridge model. See the Notes section for details on this
|
||
|
implementation and the optimization of the regularization parameters
|
||
|
lambda (precision of the weights) and alpha (precision of the noise).
|
||
|
|
||
|
Read more in the :ref:`User Guide <bayesian_regression>`.
|
||
|
For an intuitive visualization of how the sinusoid is approximated by
|
||
|
a polynomial using different pairs of initial values, see
|
||
|
:ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
max_iter : int, default=300
|
||
|
Maximum number of iterations over the complete dataset before
|
||
|
stopping independently of any early stopping criterion.
|
||
|
|
||
|
.. versionchanged:: 1.3
|
||
|
|
||
|
tol : float, default=1e-3
|
||
|
Stop the algorithm if w has converged.
|
||
|
|
||
|
alpha_1 : float, default=1e-6
|
||
|
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||
|
over the alpha parameter.
|
||
|
|
||
|
alpha_2 : float, default=1e-6
|
||
|
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||
|
Gamma distribution prior over the alpha parameter.
|
||
|
|
||
|
lambda_1 : float, default=1e-6
|
||
|
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||
|
over the lambda parameter.
|
||
|
|
||
|
lambda_2 : float, default=1e-6
|
||
|
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||
|
Gamma distribution prior over the lambda parameter.
|
||
|
|
||
|
alpha_init : float, default=None
|
||
|
Initial value for alpha (precision of the noise).
|
||
|
If not set, alpha_init is 1/Var(y).
|
||
|
|
||
|
.. versionadded:: 0.22
|
||
|
|
||
|
lambda_init : float, default=None
|
||
|
Initial value for lambda (precision of the weights).
|
||
|
If not set, lambda_init is 1.
|
||
|
|
||
|
.. versionadded:: 0.22
|
||
|
|
||
|
compute_score : bool, default=False
|
||
|
If True, compute the log marginal likelihood at each iteration of the
|
||
|
optimization.
|
||
|
|
||
|
fit_intercept : bool, default=True
|
||
|
Whether to calculate the intercept for this model.
|
||
|
The intercept is not treated as a probabilistic parameter
|
||
|
and thus has no associated variance. If set
|
||
|
to False, no intercept will be used in calculations
|
||
|
(i.e. data is expected to be centered).
|
||
|
|
||
|
copy_X : bool, default=True
|
||
|
If True, X will be copied; else, it may be overwritten.
|
||
|
|
||
|
verbose : bool, default=False
|
||
|
Verbose mode when fitting the model.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
coef_ : array-like of shape (n_features,)
|
||
|
Coefficients of the regression model (mean of distribution)
|
||
|
|
||
|
intercept_ : float
|
||
|
Independent term in decision function. Set to 0.0 if
|
||
|
`fit_intercept = False`.
|
||
|
|
||
|
alpha_ : float
|
||
|
Estimated precision of the noise.
|
||
|
|
||
|
lambda_ : float
|
||
|
Estimated precision of the weights.
|
||
|
|
||
|
sigma_ : array-like of shape (n_features, n_features)
|
||
|
Estimated variance-covariance matrix of the weights
|
||
|
|
||
|
scores_ : array-like of shape (n_iter_+1,)
|
||
|
If `compute_score` is True, value of the log marginal likelihood (to be
|
||
|
maximized) at each iteration of the optimization. The array starts
|
||
|
with the value of the log marginal likelihood obtained for the initial
|
||
|
values of alpha and lambda and ends with the value obtained for the
|
||
|
estimated alpha and lambda.
|
||
|
|
||
|
n_iter_ : int
|
||
|
The actual number of iterations to reach the stopping criterion.
|
||
|
|
||
|
X_offset_ : ndarray of shape (n_features,)
|
||
|
If `fit_intercept=True`, offset subtracted for centering data to a
|
||
|
zero mean. Set to np.zeros(n_features) otherwise.
|
||
|
|
||
|
X_scale_ : ndarray of shape (n_features,)
|
||
|
Set to np.ones(n_features).
|
||
|
|
||
|
n_features_in_ : int
|
||
|
Number of features seen during :term:`fit`.
|
||
|
|
||
|
.. versionadded:: 0.24
|
||
|
|
||
|
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||
|
Names of features seen during :term:`fit`. Defined only when `X`
|
||
|
has feature names that are all strings.
|
||
|
|
||
|
.. versionadded:: 1.0
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
ARDRegression : Bayesian ARD regression.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
There exist several strategies to perform Bayesian ridge regression. This
|
||
|
implementation is based on the algorithm described in Appendix A of
|
||
|
(Tipping, 2001) where updates of the regularization parameters are done as
|
||
|
suggested in (MacKay, 1992). Note that according to A New
|
||
|
View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
|
||
|
update rules do not guarantee that the marginal likelihood is increasing
|
||
|
between two consecutive iterations of the optimization.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
|
||
|
Vol. 4, No. 3, 1992.
|
||
|
|
||
|
M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
|
||
|
Journal of Machine Learning Research, Vol. 1, 2001.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> from sklearn import linear_model
|
||
|
>>> clf = linear_model.BayesianRidge()
|
||
|
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
|
||
|
BayesianRidge()
|
||
|
>>> clf.predict([[1, 1]])
|
||
|
array([1.])
|
||
|
"""
|
||
|
|
||
|
_parameter_constraints: dict = {
|
||
|
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
||
|
"tol": [Interval(Real, 0, None, closed="neither")],
|
||
|
"alpha_1": [Interval(Real, 0, None, closed="left")],
|
||
|
"alpha_2": [Interval(Real, 0, None, closed="left")],
|
||
|
"lambda_1": [Interval(Real, 0, None, closed="left")],
|
||
|
"lambda_2": [Interval(Real, 0, None, closed="left")],
|
||
|
"alpha_init": [None, Interval(Real, 0, None, closed="left")],
|
||
|
"lambda_init": [None, Interval(Real, 0, None, closed="left")],
|
||
|
"compute_score": ["boolean"],
|
||
|
"fit_intercept": ["boolean"],
|
||
|
"copy_X": ["boolean"],
|
||
|
"verbose": ["verbose"],
|
||
|
}
|
||
|
|
||
|
def __init__(
    self,
    *,
    max_iter=300,
    tol=1.0e-3,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    alpha_init=None,
    lambda_init=None,
    compute_score=False,
    fit_intercept=True,
    copy_X=True,
    verbose=False,
):
    """Record the constructor arguments as same-named attributes.

    No validation or computation is performed here; every argument is
    keyword-only and stored unchanged.
    """
    # Stopping criteria of the iterative optimisation.
    self.max_iter, self.tol = max_iter, tol
    # Hyper-parameters of the Gamma priors over alpha and lambda.
    self.alpha_1, self.alpha_2 = alpha_1, alpha_2
    self.lambda_1, self.lambda_2 = lambda_1, lambda_2
    # Optional starting values for the two precisions.
    self.alpha_init, self.lambda_init = alpha_init, lambda_init
    # Remaining behaviour switches.
    self.compute_score = compute_score
    self.fit_intercept, self.copy_X = fit_intercept, copy_X
    self.verbose = verbose
|
||
|
|
||
|
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y, sample_weight=None):
    """Fit the model.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training data.
    y : ndarray of shape (n_samples,)
        Target values. Will be cast to X's dtype if necessary.

    sample_weight : ndarray of shape (n_samples,), default=None
        Individual weights for each sample.

        .. versionadded:: 0.20
           parameter *sample_weight* support to BayesianRidge.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)
    dtype = X.dtype

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype)

    X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
        X,
        y,
        fit_intercept=self.fit_intercept,
        copy=self.copy_X,
        sample_weight=sample_weight,
    )

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y, _ = _rescale_data(X, y, sample_weight)

    self.X_offset_ = X_offset_
    self.X_scale_ = X_scale_
    n_samples, n_features = X.shape

    # Initialization of the values of the parameters
    eps = np.finfo(np.float64).eps
    alpha_ = self.alpha_init
    lambda_ = self.lambda_init
    if alpha_ is None:
        # Add `eps` in the denominator to omit division by zero if
        # `np.var(y)` is zero
        alpha_ = 1.0 / (np.var(y) + eps)
    if lambda_ is None:
        lambda_ = 1.0

    # Avoid unintended type promotion to float64 with numpy 2
    alpha_ = np.asarray(alpha_, dtype=dtype)
    lambda_ = np.asarray(lambda_, dtype=dtype)

    verbose = self.verbose
    lambda_1 = self.lambda_1
    lambda_2 = self.lambda_2
    alpha_1 = self.alpha_1
    alpha_2 = self.alpha_2

    self.scores_ = list()
    coef_old_ = None

    # Quantities that do not depend on alpha_/lambda_ are computed once and
    # reused by every coefficient update.
    XT_y = np.dot(X.T, y)
    U, S, Vh = linalg.svd(X, full_matrices=False)
    eigen_vals_ = S**2

    # Convergence loop of the bayesian ridge regression
    for iter_ in range(self.max_iter):
        # update posterior mean coef_ based on alpha_ and lambda_ and
        # compute corresponding rmse
        coef_, rmse_ = self._update_coef_(
            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
        )
        if self.compute_score:
            # compute the log marginal likelihood
            s = self._log_marginal_likelihood(
                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
            )
            self.scores_.append(s)

        # Update alpha and lambda according to (MacKay, 1992)
        gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
        lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
        alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)

        # Check for convergence
        if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
            if verbose:
                print("Convergence after ", str(iter_), " iterations")
            break
        coef_old_ = np.copy(coef_)

    self.n_iter_ = iter_ + 1

    # return regularization parameters and corresponding posterior mean,
    # log marginal likelihood and posterior covariance
    self.alpha_ = alpha_
    self.lambda_ = lambda_
    self.coef_, rmse_ = self._update_coef_(
        X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
    )
    if self.compute_score:
        # compute the log marginal likelihood for the final alpha_/lambda_.
        # The posterior mean passed here must be the one just recomputed
        # (`self.coef_`), so that it is consistent with `rmse_`; the loop
        # variable `coef_` still holds the previous iteration's estimate.
        s = self._log_marginal_likelihood(
            n_samples, n_features, eigen_vals_, alpha_, lambda_, self.coef_, rmse_
        )
        self.scores_.append(s)
        self.scores_ = np.array(self.scores_)

    # posterior covariance is given by 1/alpha_ * scaled_sigma_
    scaled_sigma_ = np.dot(
        Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
    )
    self.sigma_ = (1.0 / alpha_) * scaled_sigma_

    self._set_intercept(X_offset_, y_offset_, X_scale_)

    return self
|
||
|
|
||
|
def predict(self, X, return_std=False):
    """Predict using the linear model.

    Besides the mean of the predictive distribution, its standard
    deviation can optionally be returned as well.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.

    return_std : bool, default=False
        Whether to return the standard deviation of posterior prediction.

    Returns
    -------
    y_mean : array-like of shape (n_samples,)
        Mean of predictive distribution of query points.

    y_std : array-like of shape (n_samples,)
        Standard deviation of predictive distribution of query points.
        Only returned when `return_std` is truthy.
    """
    y_mean = self._decision_function(X)
    if not return_std:
        return y_mean

    # Row-wise quadratic form x^T sigma_ x: the variance contributed by the
    # uncertainty on the weights.
    projected = np.dot(X, self.sigma_)
    weight_variance = (projected * X).sum(axis=1)
    # 1 / alpha_ is the variance of the observation noise.
    noise_variance = 1.0 / self.alpha_
    return y_mean, np.sqrt(weight_variance + noise_variance)
|
||
|
|
||
|
def _update_coef_(
    self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
):
    """Update posterior mean and compute the matching squared error.

    Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
    scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                     + np.dot(X.T, X))^-1

    The inverse is never formed explicitly: the factors `U`, `Vh` and
    `eigen_vals_` passed in by the caller are combined instead.

    Returns
    -------
    coef_ : ndarray
        Updated posterior mean of the weights.
    rmse_ : float
        Sum of squared residuals ``np.sum((y - X @ coef_) ** 2)`` (despite
        the name, no mean or square root is taken).
    """
    # Eigenvalues shifted by the ratio of the two precisions.
    damped = eigen_vals_ + lambda_ / alpha_

    if n_samples > n_features:
        factors = [Vh.T, Vh / damped[:, np.newaxis], XT_y]
    else:
        factors = [X.T, U / damped[None, :], U.T, y]
    coef_ = np.linalg.multi_dot(factors)

    residual = y - np.dot(X, coef_)
    return coef_, np.sum(residual**2)
|
||
|
|
||
|
def _log_marginal_likelihood(
    self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
):
    """Log marginal likelihood.

    Combines the log Gamma-prior terms on `lambda_` and `alpha_` (using the
    hyper-parameters stored on the estimator) with the data-dependent term
    built from `rmse`, `coef` and the log-determinant of the posterior
    covariance.
    """
    # The posterior covariance is
    # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
    # so its log-determinant is minus the sum of the logs of the diagonal
    # entries of sigma^-1 in the eigenbasis.
    if n_samples > n_features:
        precisions = lambda_ + alpha_ * eigen_vals
    else:
        # Only the first `n_samples` directions carry a data term; the
        # remaining ones keep the bare prior precision.
        precisions = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)
        precisions[:n_samples] += alpha_ * eigen_vals
    logdet_sigma = -np.sum(np.log(precisions))

    log_lambda = log(lambda_)
    log_alpha = log(alpha_)

    lambda_prior = self.lambda_1 * log_lambda - self.lambda_2 * lambda_
    alpha_prior = self.alpha_1 * log_alpha - self.alpha_2 * alpha_
    evidence = (
        n_features * log_lambda
        + n_samples * log_alpha
        - alpha_ * rmse
        - lambda_ * np.sum(coef**2)
        + logdet_sigma
        - n_samples * log(2 * np.pi)
    )

    return lambda_prior + alpha_prior + 0.5 * evidence
|
||
|
|
||
|
|
||
|
###############################################################################
|
||
|
# ARD (Automatic Relevance Determination) regression
|
||
|
|
||
|
|
||
|
class ARDRegression(RegressorMixin, LinearModel):
|
||
|
"""Bayesian ARD regression.
|
||
|
|
||
|
Fit the weights of a regression model, using an ARD prior. The weights of
|
||
|
the regression model are assumed to be in Gaussian distributions.
|
||
|
Also estimate the parameters lambda (precisions of the distributions of the
|
||
|
weights) and alpha (precision of the distribution of the noise).
|
||
|
The estimation is done by an iterative procedure (Evidence Maximization).
|
||
|
|
||
|
Read more in the :ref:`User Guide <bayesian_regression>`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
max_iter : int, default=300
|
||
|
Maximum number of iterations.
|
||
|
|
||
|
.. versionchanged:: 1.3
|
||
|
|
||
|
tol : float, default=1e-3
|
||
|
Stop the algorithm if w has converged.
|
||
|
|
||
|
alpha_1 : float, default=1e-6
|
||
|
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||
|
over the alpha parameter.
|
||
|
|
||
|
alpha_2 : float, default=1e-6
|
||
|
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||
|
Gamma distribution prior over the alpha parameter.
|
||
|
|
||
|
lambda_1 : float, default=1e-6
|
||
|
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||
|
over the lambda parameter.
|
||
|
|
||
|
lambda_2 : float, default=1e-6
|
||
|
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||
|
Gamma distribution prior over the lambda parameter.
|
||
|
|
||
|
compute_score : bool, default=False
|
||
|
If True, compute the objective function at each step of the model.
|
||
|
|
||
|
threshold_lambda : float, default=10 000
|
||
|
Threshold for removing (pruning) weights with high precision from
|
||
|
the computation.
|
||
|
|
||
|
fit_intercept : bool, default=True
|
||
|
Whether to calculate the intercept for this model. If set
|
||
|
to false, no intercept will be used in calculations
|
||
|
(i.e. data is expected to be centered).
|
||
|
|
||
|
copy_X : bool, default=True
|
||
|
If True, X will be copied; else, it may be overwritten.
|
||
|
|
||
|
verbose : bool, default=False
|
||
|
Verbose mode when fitting the model.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
coef_ : array-like of shape (n_features,)
|
||
|
Coefficients of the regression model (mean of distribution)
|
||
|
|
||
|
alpha_ : float
|
||
|
estimated precision of the noise.
|
||
|
|
||
|
lambda_ : array-like of shape (n_features,)
|
||
|
estimated precisions of the weights.
|
||
|
|
||
|
sigma_ : array-like of shape (n_features, n_features)
|
||
|
estimated variance-covariance matrix of the weights
|
||
|
|
||
|
scores_ : list of float
|
||
|
if computed, value of the objective function (to be maximized)
|
||
|
|
||
|
n_iter_ : int
|
||
|
The actual number of iterations to reach the stopping criterion.
|
||
|
|
||
|
.. versionadded:: 1.3
|
||
|
|
||
|
intercept_ : float
|
||
|
Independent term in decision function. Set to 0.0 if
|
||
|
``fit_intercept = False``.
|
||
|
|
||
|
X_offset_ : ndarray of shape (n_features,)
|
||
|
If `fit_intercept=True`, offset subtracted for centering data to a
|
||
|
zero mean. Set to np.zeros(n_features) otherwise.
|
||
|
|
||
|
X_scale_ : ndarray of shape (n_features,)
|
||
|
Set to np.ones(n_features).
|
||
|
|
||
|
n_features_in_ : int
|
||
|
Number of features seen during :term:`fit`.
|
||
|
|
||
|
.. versionadded:: 0.24
|
||
|
|
||
|
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||
|
Names of features seen during :term:`fit`. Defined only when `X`
|
||
|
has feature names that are all strings.
|
||
|
|
||
|
.. versionadded:: 1.0
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
BayesianRidge : Bayesian ridge regression.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
For an example, see :ref:`examples/linear_model/plot_ard.py
|
||
|
<sphx_glr_auto_examples_linear_model_plot_ard.py>`.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
|
||
|
competition, ASHRAE Transactions, 1994.
|
||
|
|
||
|
R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
|
||
|
http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
|
||
|
Their beta is our ``self.alpha_``
|
||
|
Their alpha is our ``self.lambda_``
|
||
|
ARD is a little different than the slide: only dimensions/features for
|
||
|
which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
|
||
|
discarded.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> from sklearn import linear_model
|
||
|
>>> clf = linear_model.ARDRegression()
|
||
|
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
|
||
|
ARDRegression()
|
||
|
>>> clf.predict([[1, 1]])
|
||
|
array([1.])
|
||
|
"""
|
||
|
|
||
|
_parameter_constraints: dict = {
|
||
|
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
||
|
"tol": [Interval(Real, 0, None, closed="left")],
|
||
|
"alpha_1": [Interval(Real, 0, None, closed="left")],
|
||
|
"alpha_2": [Interval(Real, 0, None, closed="left")],
|
||
|
"lambda_1": [Interval(Real, 0, None, closed="left")],
|
||
|
"lambda_2": [Interval(Real, 0, None, closed="left")],
|
||
|
"compute_score": ["boolean"],
|
||
|
"threshold_lambda": [Interval(Real, 0, None, closed="left")],
|
||
|
"fit_intercept": ["boolean"],
|
||
|
"copy_X": ["boolean"],
|
||
|
"verbose": ["verbose"],
|
||
|
}
|
||
|
|
||
|
def __init__(
    self,
    *,
    max_iter=300,
    tol=1.0e-3,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    compute_score=False,
    threshold_lambda=1.0e4,
    fit_intercept=True,
    copy_X=True,
    verbose=False,
):
    """Record the constructor arguments as same-named attributes.

    No validation or computation is performed here; every argument is
    keyword-only and stored unchanged.
    """
    # Stopping criteria of the iterative optimisation.
    self.max_iter, self.tol = max_iter, tol
    self.fit_intercept = fit_intercept
    # Hyper-parameters of the Gamma priors over alpha and lambda.
    self.alpha_1, self.alpha_2 = alpha_1, alpha_2
    self.lambda_1, self.lambda_2 = lambda_1, lambda_2
    self.compute_score = compute_score
    # Precision above which a weight is pruned during fitting.
    self.threshold_lambda = threshold_lambda
    self.copy_X, self.verbose = copy_X, verbose
|
||
|
|
||
|
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
    """Fit the model according to the given training data and parameters.

    Iterative procedure to maximize the evidence: the posterior covariance
    and mean of the weights are recomputed, then the noise precision and
    the per-feature weight precisions are updated, and features whose
    precision reaches `threshold_lambda` are pruned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.
    y : array-like of shape (n_samples,)
        Target values. Will be cast to X's dtype if necessary.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    X, y = self._validate_data(
        X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
    )
    dtype = X.dtype
    n_samples, n_features = X.shape
    coef_ = np.zeros(n_features, dtype=dtype)

    X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
        X, y, fit_intercept=self.fit_intercept, copy=self.copy_X
    )
    self.X_offset_, self.X_scale_ = X_offset_, X_scale_

    # Every feature starts out active; pruning happens inside the loop.
    active = np.ones(n_features, dtype=bool)

    lambda_1, lambda_2 = self.lambda_1, self.lambda_2
    alpha_1, alpha_2 = self.alpha_1, self.alpha_2
    verbose = self.verbose

    # Initial precisions. `eps` in the denominator guards against a division
    # by zero when `np.var(y)` is zero, and the explicit dtype avoids
    # unintended type promotion with numpy 2.
    eps = np.finfo(np.float64).eps
    alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype)
    lambda_ = np.ones(n_features, dtype=dtype)

    self.scores_ = []
    previous_coef = None

    # Pick the routine that inverts the smaller of the two possible matrices.
    if n_samples >= n_features:
        compute_sigma = self._update_sigma
    else:
        compute_sigma = self._update_sigma_woodbury

    for iter_ in range(self.max_iter):
        # Posterior covariance and mean restricted to the active features;
        # pruned entries of `coef_` are left untouched (they are zero).
        sigma_ = compute_sigma(X, alpha_, lambda_, active)
        coef_[active] = alpha_ * np.linalg.multi_dot([sigma_, X[:, active].T, y])

        # Update alpha and lambda
        residual = y - np.dot(X, coef_)
        sse = np.sum(residual**2)
        gamma_ = 1.0 - lambda_[active] * np.diag(sigma_)
        lambda_[active] = (gamma_ + 2.0 * lambda_1) / (
            coef_[active] ** 2 + 2.0 * lambda_2
        )
        alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (sse + 2.0 * alpha_2)

        # Prune the weights with a precision over a threshold
        active = lambda_ < self.threshold_lambda
        coef_[~active] = 0

        # Compute the objective function
        if self.compute_score:
            score = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
            score += alpha_1 * log(alpha_) - alpha_2 * alpha_
            score += 0.5 * (
                fast_logdet(sigma_)
                + n_samples * log(alpha_)
                + np.sum(np.log(lambda_))
            )
            score -= 0.5 * (alpha_ * sse + (lambda_ * coef_**2).sum())
            self.scores_.append(score)

        # Check for convergence (needs a previous estimate to compare with)
        converged = iter_ > 0 and np.sum(np.abs(previous_coef - coef_)) < self.tol
        if converged:
            if verbose:
                print("Converged after %s iterations" % iter_)
            break
        previous_coef = np.copy(coef_)

        # Nothing left to estimate once every feature has been pruned.
        if not active.any():
            break

    self.n_iter_ = iter_ + 1

    if active.any():
        # update sigma and mu using updated params from the last iteration
        sigma_ = compute_sigma(X, alpha_, lambda_, active)
        coef_[active] = alpha_ * np.linalg.multi_dot([sigma_, X[:, active].T, y])
    else:
        sigma_ = np.array([]).reshape(0, 0)

    self.coef_ = coef_
    self.alpha_ = alpha_
    self.sigma_ = sigma_
    self.lambda_ = lambda_
    self._set_intercept(X_offset_, y_offset_, X_scale_)
    return self
|
||
|
|
||
|
def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
    """Posterior covariance of the kept weights via the Woodbury identity.

    Intended for the case n_samples < n_features: only a matrix of shape
    (n_samples, n_samples) is inverted. See
    https://en.wikipedia.org/wiki/Woodbury_matrix_identity

    Returns an array of shape (n_kept, n_kept), where n_kept is the number
    of True entries in `keep_lambda`.
    """
    kept_X = X[:, keep_lambda]
    kept_inv_lambda = 1 / lambda_[keep_lambda]
    # Columns of X scaled by the prior variances: X_keep @ diag(1 / lambda).
    scaled_X = kept_X * kept_inv_lambda.reshape(1, -1)

    # The only inversion: an (n_samples, n_samples) symmetric matrix.
    small_inverse = pinvh(
        np.eye(X.shape[0], dtype=X.dtype) / alpha_ + np.dot(scaled_X, kept_X.T)
    )

    # Low-rank correction term of the identity ...
    right = np.dot(small_inverse, scaled_X)
    sigma_ = -np.dot(kept_inv_lambda.reshape(-1, 1) * kept_X.T, right)
    # ... added to the diagonal prior covariance diag(1 / lambda).
    diagonal = np.diag_indices(sigma_.shape[1])
    sigma_[diagonal] += 1.0 / lambda_[keep_lambda]
    return sigma_
|
||
|
|
||
|
def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
    """Posterior covariance of the kept weights by direct inversion.

    Intended for the case n_samples >= n_features: the matrix
    ``diag(lambda_kept) + alpha_ * X_kept.T @ X_kept`` of shape
    (n_kept, n_kept) is inverted with `pinvh`.
    """
    kept_X = X[:, keep_lambda]
    gram = np.dot(kept_X.T, kept_X)
    n_kept = gram.shape[0]
    # Broadcasting the kept precisions against the identity places them on
    # the diagonal.
    prior_precision = lambda_[keep_lambda] * np.eye(n_kept, dtype=X.dtype)
    return pinvh(prior_precision + alpha_ * gram)
|
||
|
|
||
|
def predict(self, X, return_std=False):
    """Predict using the linear model.

    In addition to the mean of the predictive distribution, also its
    standard deviation can be returned.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.

    return_std : bool, default=False
        Whether to return the standard deviation of posterior prediction.

    Returns
    -------
    y_mean : array-like of shape (n_samples,)
        Mean of predictive distribution of query points.

    y_std : array-like of shape (n_samples,)
        Standard deviation of predictive distribution of query points.
        Only returned when `return_std` is truthy.
    """
    y_mean = self._decision_function(X)
    # Use truthiness rather than `is False`: an identity test sends falsy
    # non-singleton values such as `np.False_` or `0` down the std branch.
    # This also matches `BayesianRidge.predict`.
    if not return_std:
        return y_mean

    # `sigma_` only covers the features that survived pruning, so restrict
    # X to the same columns before forming the quadratic form.
    col_index = self.lambda_ < self.threshold_lambda
    X = _safe_indexing(X, indices=col_index, axis=1)
    sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
    # 1 / alpha_ is the variance of the observation noise.
    y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
    return y_mean, y_std
|