"""
Newton solver for Generalized Linear Models
"""

# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
# License: BSD 3 clause

import warnings
from abc import ABC, abstractmethod

import numpy as np
import scipy.linalg
import scipy.optimize

from ..._loss.loss import HalfSquaredError
from ...exceptions import ConvergenceWarning
from ...utils.optimize import _check_optimize_result
from .._linear_loss import LinearModelLoss

class NewtonSolver(ABC):
    """Newton solver for GLMs.

    This class implements Newton/2nd-order optimization routines for GLMs. Each Newton
    iteration aims at finding the Newton step which is done by the inner solver. With
    Hessian H, gradient g and coefficients coef, one step solves:

        H @ coef_newton = -g

    For our GLM / LinearModelLoss, we have gradient g and Hessian H:

        g = X.T @ loss.gradient + l2_reg_strength * coef
        H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity

    Backtracking line search updates coef = coef_old + t * coef_newton for some t in
    (0, 1].

    This is a base class, actual implementations (child classes) may deviate from the
    above pattern and use structure specific tricks.

    Usage pattern:
        - initialize solver: sol = NewtonSolver(...)
        - solve the problem: sol.solve(X, y, sample_weight)
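
    A minimal illustrative sketch (assuming dense float64 arrays X and y; the
    concrete NewtonCholeskySolver defined below is used because NewtonSolver
    itself is abstract):

        loss = LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True)
        sol = NewtonCholeskySolver(coef=np.zeros(X.shape[1] + 1), linear_loss=loss)
        coef = sol.solve(X, y, sample_weight=None)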

    References
    ----------
    - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization"
      2nd edition
      https://doi.org/10.1007/978-0-387-40065-5

    - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization."
      Cambridge University Press, 2004.
      https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Initial coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").

    linear_loss : LinearModelLoss
        The loss to be minimized.

    l2_reg_strength : float, default=0.0
        L2 regularization strength.

    tol : float, default=1e-4
        The optimization problem is solved when each of the following conditions is
        fulfilled:
        1. maximum |gradient| <= tol
        2. Newton decrement d: 1/2 * d^2 <= tol

    max_iter : int, default=100
        Maximum number of Newton steps allowed.

    n_threads : int, default=1
        Number of OpenMP threads to use for the computation of the Hessian and gradient
        of the loss function.

    Attributes
    ----------
    coef_old : ndarray of shape coef.shape
        Coefficient of previous iteration.

    coef_newton : ndarray of shape coef.shape
        Newton step.

    gradient : ndarray of shape coef.shape
        Gradient of the loss w.r.t. the coefficients.

    gradient_old : ndarray of shape coef.shape
        Gradient of previous iteration.

    loss_value : float
        Value of objective function = loss + penalty.

    loss_value_old : float
        Value of objective function of previous iteration.

    raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes)
        Raw prediction of the linear model for the current coefficients, i.e.
        X @ weights + intercept.

    converged : bool
        Indicator for convergence of the solver.

    iteration : int
        Number of Newton steps, i.e. calls to inner_solve.

    use_fallback_lbfgs_solve : bool
        If set to True, the solver will resort to calling LBFGS to finish the
        optimization procedure in case of convergence issues.

    gradient_times_newton : float
        gradient @ coef_newton, set in inner_solve and used by line_search. If the
        Newton step is a descent direction, this is negative.
    """

    def __init__(
        self,
        *,
        coef,
        linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True),
        l2_reg_strength=0.0,
        tol=1e-4,
        max_iter=100,
        n_threads=1,
        verbose=0,
    ):
        self.coef = coef
        self.linear_loss = linear_loss
        self.l2_reg_strength = l2_reg_strength
        self.tol = tol
        self.max_iter = max_iter
        self.n_threads = n_threads
        self.verbose = verbose

    def setup(self, X, y, sample_weight):
        """Precomputations

        If None, initializes:
            - self.coef
        Sets:
            - self.raw_prediction
            - self.loss_value
        """
        _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X)
        self.loss_value = self.linear_loss.loss(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            raw_prediction=self.raw_prediction,
        )

    @abstractmethod
    def update_gradient_hessian(self, X, y, sample_weight):
        """Update gradient and Hessian."""

    @abstractmethod
    def inner_solve(self, X, y, sample_weight):
        """Compute Newton step.

        Sets:
            - self.coef_newton
            - self.gradient_times_newton
        """

    def fallback_lbfgs_solve(self, X, y, sample_weight):
        """Fallback solver in case of emergency.

        If a solver detects convergence problems, it may fall back to this method in
        the hope of exiting with success instead of raising an error.

        Sets:
            - self.coef
            - self.converged
        """
        opt_res = scipy.optimize.minimize(
            self.linear_loss.loss_gradient,
            self.coef,
            method="L-BFGS-B",
            jac=True,
            options={
                "maxiter": self.max_iter,
                "maxls": 50,  # default is 20
                "iprint": self.verbose - 1,
                "gtol": self.tol,
                "ftol": 64 * np.finfo(np.float64).eps,
            },
            args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads),
        )
        self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
        self.coef = opt_res.x
        self.converged = opt_res.status == 0

    def line_search(self, X, y, sample_weight):
        """Backtracking line search.

        Sets:
            - self.coef_old
            - self.coef
            - self.loss_value_old
            - self.loss_value
            - self.gradient_old
            - self.gradient
            - self.raw_prediction
        """
        # line search parameters
        beta, sigma = 0.5, 0.00048828125  # 1/2, 1/2**11
        eps = 16 * np.finfo(self.loss_value.dtype).eps
        t = 1  # step size

        # gradient_times_newton = self.gradient @ self.coef_newton
        # was computed in inner_solve.
        armijo_term = sigma * self.gradient_times_newton
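        # The sufficient decrease (Armijo) condition checked in the loop below is
        #     loss(coef_old + t * coef_newton)
        #         <= loss(coef_old) + t * sigma * gradient @ coef_newton,
        # i.e. loss_improvement <= t * armijo_term, with armijo_term < 0 whenever
        # coef_newton is a descent direction.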
        _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw(
            self.coef_newton, X
        )

        self.coef_old = self.coef
        self.loss_value_old = self.loss_value
        self.gradient_old = self.gradient

        # np.sum(np.abs(self.gradient_old))
        sum_abs_grad_old = -1

        is_verbose = self.verbose >= 2
        if is_verbose:
            print("  Backtracking Line Search")
            print(f"    eps=16 * finfo.eps={eps}")

        for i in range(21):  # until and including t = beta**20 ~ 1e-6
            self.coef = self.coef_old + t * self.coef_newton
            raw = self.raw_prediction + t * raw_prediction_newton
            self.loss_value, self.gradient = self.linear_loss.loss_gradient(
                coef=self.coef,
                X=X,
                y=y,
                sample_weight=sample_weight,
                l2_reg_strength=self.l2_reg_strength,
                n_threads=self.n_threads,
                raw_prediction=raw,
            )
            # Note: If coef_newton is too large, loss_gradient may produce inf values,
            # potentially accompanied by a RuntimeWarning.
            # This case will be captured by the Armijo condition.

            # 1. Check Armijo / sufficient decrease condition.
            # The smaller (more negative) the better.
            loss_improvement = self.loss_value - self.loss_value_old
            check = loss_improvement <= t * armijo_term
            if is_verbose:
                print(
                    f"    line search iteration={i+1}, step size={t}\n"
                    f"      check loss improvement <= armijo term: {loss_improvement} "
                    f"<= {t * armijo_term} {check}"
                )
            if check:
                break
            # 2. Deal with relative loss differences around machine precision.
            tiny_loss = np.abs(self.loss_value_old * eps)
            check = np.abs(loss_improvement) <= tiny_loss
            if is_verbose:
                print(
                    "      check loss |improvement| <= eps * |loss_old|:"
                    f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
                )
            if check:
                if sum_abs_grad_old < 0:
                    sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1)
                # 2.1 Check sum of absolute gradients as alternative condition.
                sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1)
                check = sum_abs_grad < sum_abs_grad_old
                if is_verbose:
                    print(
                        "      check sum(|gradient|) < sum(|gradient_old|): "
                        f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
                    )
                if check:
                    break

            t *= beta
        else:
            warnings.warn(
                f"Line search of Newton solver {self.__class__.__name__} at iteration "
                f"#{self.iteration} did not converge after 21 line search refinement "
                "iterations. It will now resort to lbfgs instead.",
                ConvergenceWarning,
            )
            if self.verbose:
                print("  Line search did not converge and resorts to lbfgs instead.")
            self.use_fallback_lbfgs_solve = True
            return

        self.raw_prediction = raw

    def check_convergence(self, X, y, sample_weight):
        """Check for convergence.

        Sets self.converged.
        """
        if self.verbose:
            print("  Check Convergence")
        # Note: Checking maximum relative change of coefficient <= tol is a bad
        # convergence criterion because even a large step could have brought us close
        # to the true minimum.
        # coef_step = self.coef - self.coef_old
        # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old)))

        # 1. Criterion: maximum |gradient| <= tol
        #    The gradient was already updated in line_search()
        check = np.max(np.abs(self.gradient))
        if self.verbose:
            print(f"  1. max |gradient| {check} <= {self.tol}")
        if check > self.tol:
            return

        # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol
        #       d = sqrt(grad @ hessian^-1 @ grad)
        #         = sqrt(coef_newton @ hessian @ coef_newton)
        #    See Boyd, Vandenberghe (2004) "Convex Optimization" Chapter 9.5.1.
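        #    The two expressions for d coincide because the Newton step satisfies
        #    hessian @ coef_newton = -grad, hence
        #    grad @ hessian^-1 @ grad = coef_newton @ hessian @ coef_newton.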
        d2 = self.coef_newton @ self.hessian @ self.coef_newton
        if self.verbose:
            print(f"  2. Newton decrement {0.5 * d2} <= {self.tol}")
        if 0.5 * d2 > self.tol:
            return

        if self.verbose:
            loss_value = self.linear_loss.loss(
                coef=self.coef,
                X=X,
                y=y,
                sample_weight=sample_weight,
                l2_reg_strength=self.l2_reg_strength,
                n_threads=self.n_threads,
            )
            print(f"  Solver did converge at loss = {loss_value}.")
        self.converged = True

    def finalize(self, X, y, sample_weight):
        """Finalize the solver's results.

        Some solvers may need this, others not.
        """
        pass

    def solve(self, X, y, sample_weight):
        """Solve the optimization problem.

        This is the main routine.

        Order of calls:
            self.setup()
            while iteration:
                self.update_gradient_hessian()
                self.inner_solve()
                self.line_search()
                self.check_convergence()
            self.finalize()

        Returns
        -------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Solution of the optimization problem.
        """
        # setup usually:
        #   - initializes self.coef if needed
        #   - initializes and calculates self.raw_prediction, self.loss_value
        self.setup(X=X, y=y, sample_weight=sample_weight)

        self.iteration = 1
        self.converged = False

        while self.iteration <= self.max_iter and not self.converged:
            if self.verbose:
                print(f"Newton iter={self.iteration}")

            self.use_fallback_lbfgs_solve = False  # Fallback solver.

            # 1. Update Hessian and gradient
            self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight)

            # TODO:
            # if iteration == 1:
            # We might stop early, e.g. we already are close to the optimum,
            # usually detected by zero gradients at this stage.

            # 2. Inner solver
            #    Calculate Newton step/direction
            #    This usually sets self.coef_newton and self.gradient_times_newton.
            self.inner_solve(X=X, y=y, sample_weight=sample_weight)
            if self.use_fallback_lbfgs_solve:
                break

            # 3. Backtracking line search
            #    This usually sets self.coef_old, self.coef, self.loss_value_old,
            #    self.loss_value, self.gradient_old, self.gradient,
            #    self.raw_prediction.
            self.line_search(X=X, y=y, sample_weight=sample_weight)
            if self.use_fallback_lbfgs_solve:
                break

            # 4. Check convergence
            #    Sets self.converged.
            self.check_convergence(X=X, y=y, sample_weight=sample_weight)

            # 5. Next iteration
            self.iteration += 1

        if not self.converged:
            if self.use_fallback_lbfgs_solve:
                # Note: The fallback solver circumvents check_convergence and relies on
                # the convergence checks of lbfgs instead. Enough warnings have been
                # raised on the way.
                self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight)
            else:
                warnings.warn(
                    f"Newton solver did not converge after {self.iteration - 1} "
                    "iterations.",
                    ConvergenceWarning,
                )

        self.iteration -= 1
        self.finalize(X=X, y=y, sample_weight=sample_weight)
        return self.coef


class NewtonCholeskySolver(NewtonSolver):
    """Cholesky based Newton solver.

    Inner solver for finding the Newton step H w_newton = -g uses a Cholesky based
    linear solver.
    """

    def setup(self, X, y, sample_weight):
        super().setup(X=X, y=y, sample_weight=sample_weight)
        n_dof = X.shape[1]
        if self.linear_loss.fit_intercept:
            n_dof += 1
        self.gradient = np.empty_like(self.coef)
        self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof))

    def update_gradient_hessian(self, X, y, sample_weight):
        _, _, self.hessian_warning = self.linear_loss.gradient_hessian(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            gradient_out=self.gradient,
            hessian_out=self.hessian,
            raw_prediction=self.raw_prediction,  # this was updated in line_search
        )

    def inner_solve(self, X, y, sample_weight):
        if self.hessian_warning:
            warnings.warn(
                f"The inner solver of {self.__class__.__name__} detected a "
                "pointwise hessian with many negative values at iteration "
                f"#{self.iteration}. It will now resort to lbfgs instead.",
                ConvergenceWarning,
            )
            if self.verbose:
                print(
                    "  The inner solver detected a pointwise Hessian with many "
                    "negative values and resorts to lbfgs instead."
                )
            self.use_fallback_lbfgs_solve = True
            return

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
                self.coef_newton = scipy.linalg.solve(
                    self.hessian, -self.gradient, check_finite=False, assume_a="sym"
                )
                self.gradient_times_newton = self.gradient @ self.coef_newton
                if self.gradient_times_newton > 0:
                    if self.verbose:
                        print(
                            "  The inner solver found a Newton step that is not a "
                            "descent direction and resorts to LBFGS steps instead."
                        )
                    self.use_fallback_lbfgs_solve = True
                    return
        except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e:
            warnings.warn(
                f"The inner solver of {self.__class__.__name__} stumbled upon a "
                "singular or very ill-conditioned Hessian matrix at iteration "
                f"#{self.iteration}. It will now resort to lbfgs instead.\n"
                "Further options are to use another solver or to avoid such a "
                "situation in the first place. Possible remedies are removing "
                "collinear features of X or increasing the penalization strengths.\n"
                "The original Linear Algebra message was:\n"
                + str(e),
                scipy.linalg.LinAlgWarning,
            )
            # Possible causes:
            # 1. hess_pointwise is negative. But this is already taken care of in
            #    LinearModelLoss.gradient_hessian.
            # 2. X is singular or ill-conditioned
            #    This might be the most probable cause.
            #
            # There are many possible ways to deal with this situation. Most of them
            # add, explicitly or implicitly, a matrix to the hessian to make it
            # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed.
            # Instead, we resort to lbfgs.
            if self.verbose:
                print(
                    "  The inner solver stumbled upon a singular or ill-conditioned "
                    "Hessian matrix and resorts to LBFGS instead."
                )
            self.use_fallback_lbfgs_solve = True
            return