"""
Newton solver for Generalized Linear Models
"""

# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
# License: BSD 3 clause

import warnings
from abc import ABC, abstractmethod

import numpy as np
import scipy.linalg
import scipy.optimize

from ..._loss.loss import HalfSquaredError
from ...exceptions import ConvergenceWarning
from ...utils.optimize import _check_optimize_result
from .._linear_loss import LinearModelLoss

class NewtonSolver(ABC):
    """Newton solver for GLMs.

    This class implements Newton/2nd-order optimization routines for GLMs. Each Newton
    iteration aims at finding the Newton step which is done by the inner solver. With
    Hessian H, gradient g and coefficients coef, one step solves:

        H @ coef_newton = -g

    For our GLM / LinearModelLoss, we have gradient g and Hessian H:

        g = X.T @ loss.gradient + l2_reg_strength * coef
        H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity

    Backtracking line search updates coef = coef_old + t * coef_newton for some t in
    (0, 1].

    This is a base class, actual implementations (child classes) may deviate from the
    above pattern and use structure specific tricks.

    Usage pattern:
        - initialize solver: sol = NewtonSolver(...)
        - solve the problem: sol.solve(X, y, sample_weight)
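
    A minimal illustrative sketch (assuming dense float64 arrays X and y; the
    concrete NewtonCholeskySolver defined below is used because NewtonSolver
    itself is abstract):

        loss = LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True)
        sol = NewtonCholeskySolver(coef=np.zeros(X.shape[1] + 1), linear_loss=loss)
        coef = sol.solve(X, y, sample_weight=None)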

    References
    ----------
    - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization"
      2nd edition
      https://doi.org/10.1007/978-0-387-40065-5

    - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization."
      Cambridge University Press, 2004.
      https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Initial coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").

    linear_loss : LinearModelLoss
        The loss to be minimized.

    l2_reg_strength : float, default=0.0
        L2 regularization strength.

    tol : float, default=1e-4
        The optimization problem is solved when each of the following conditions is
        fulfilled:
        1. maximum |gradient| <= tol
        2. Newton decrement d: 1/2 * d^2 <= tol

    max_iter : int, default=100
        Maximum number of Newton steps allowed.

    n_threads : int, default=1
        Number of OpenMP threads to use for the computation of the Hessian and gradient
        of the loss function.

    Attributes
    ----------
    coef_old : ndarray of shape coef.shape
        Coefficient of previous iteration.

    coef_newton : ndarray of shape coef.shape
        Newton step.

    gradient : ndarray of shape coef.shape
        Gradient of the loss w.r.t. the coefficients.

    gradient_old : ndarray of shape coef.shape
        Gradient of previous iteration.

    loss_value : float
        Value of objective function = loss + penalty.

    loss_value_old : float
        Value of objective function of previous iteration.

    raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes)
        Raw prediction of the linear model for the current coefficients, i.e.
        X @ weights + intercept.

    converged : bool
        Indicator for convergence of the solver.

    iteration : int
        Number of Newton steps, i.e. calls to inner_solve.

    use_fallback_lbfgs_solve : bool
        If set to True, the solver will resort to calling LBFGS to finish the
        optimization procedure in case of convergence issues.

    gradient_times_newton : float
        gradient @ coef_newton, set in inner_solve and used by line_search. If the
        Newton step is a descent direction, this is negative.
    """

    def __init__(
        self,
        *,
        coef,
        linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True),
        l2_reg_strength=0.0,
        tol=1e-4,
        max_iter=100,
        n_threads=1,
        verbose=0,
    ):
        self.coef = coef
        self.linear_loss = linear_loss
        self.l2_reg_strength = l2_reg_strength
        self.tol = tol
        self.max_iter = max_iter
        self.n_threads = n_threads
        self.verbose = verbose

    def setup(self, X, y, sample_weight):
        """Precomputations

        If None, initializes:
            - self.coef
        Sets:
            - self.raw_prediction
            - self.loss_value
        """
        _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X)
        self.loss_value = self.linear_loss.loss(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            raw_prediction=self.raw_prediction,
        )

    @abstractmethod
    def update_gradient_hessian(self, X, y, sample_weight):
        """Update gradient and Hessian."""

    @abstractmethod
    def inner_solve(self, X, y, sample_weight):
        """Compute Newton step.

        Sets:
            - self.coef_newton
            - self.gradient_times_newton
        """

    def fallback_lbfgs_solve(self, X, y, sample_weight):
        """Fallback solver in case of emergency.

        If a solver detects convergence problems, it may fall back to this method in
        the hope of exiting with success instead of raising an error.

        Sets:
            - self.coef
            - self.converged
        """
        opt_res = scipy.optimize.minimize(
            self.linear_loss.loss_gradient,
            self.coef,
            method="L-BFGS-B",
            jac=True,
            options={
                "maxiter": self.max_iter,
                "maxls": 50,  # default is 20
                "iprint": self.verbose - 1,
                "gtol": self.tol,
                "ftol": 64 * np.finfo(np.float64).eps,
            },
            args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads),
        )
        self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
        self.coef = opt_res.x
        self.converged = opt_res.status == 0

    def line_search(self, X, y, sample_weight):
        """Backtracking line search.

        Sets:
            - self.coef_old
            - self.coef
            - self.loss_value_old
            - self.loss_value
            - self.gradient_old
            - self.gradient
            - self.raw_prediction
        """
        # line search parameters
        beta, sigma = 0.5, 0.00048828125  # 1/2, 1/2**11
        eps = 16 * np.finfo(self.loss_value.dtype).eps
        t = 1  # step size

        # gradient_times_newton = self.gradient @ self.coef_newton
        # was computed in inner_solve.
        armijo_term = sigma * self.gradient_times_newton
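        # The sufficient decrease (Armijo) condition checked in the loop below is
        #     loss(coef_old + t * coef_newton)
        #         <= loss(coef_old) + t * sigma * gradient @ coef_newton,
        # i.e. loss_improvement <= t * armijo_term, with armijo_term < 0 whenever
        # coef_newton is a descent direction.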
        _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw(
            self.coef_newton, X
        )

        self.coef_old = self.coef
        self.loss_value_old = self.loss_value
        self.gradient_old = self.gradient

        # np.sum(np.abs(self.gradient_old))
        sum_abs_grad_old = -1

        is_verbose = self.verbose >= 2
        if is_verbose:
            print("  Backtracking Line Search")
            print(f"    eps=16 * finfo.eps={eps}")

        for i in range(21):  # until and including t = beta**20 ~ 1e-6
            self.coef = self.coef_old + t * self.coef_newton
            raw = self.raw_prediction + t * raw_prediction_newton
            self.loss_value, self.gradient = self.linear_loss.loss_gradient(
                coef=self.coef,
                X=X,
                y=y,
                sample_weight=sample_weight,
                l2_reg_strength=self.l2_reg_strength,
                n_threads=self.n_threads,
                raw_prediction=raw,
            )
            # Note: If coef_newton is too large, loss_gradient may produce inf values,
            # potentially accompanied by a RuntimeWarning.
            # This case will be captured by the Armijo condition.

            # 1. Check Armijo / sufficient decrease condition.
            # The smaller (more negative) the better.
            loss_improvement = self.loss_value - self.loss_value_old
            check = loss_improvement <= t * armijo_term
            if is_verbose:
                print(
                    f"    line search iteration={i+1}, step size={t}\n"
                    f"      check loss improvement <= armijo term: {loss_improvement} "
                    f"<= {t * armijo_term} {check}"
                )
            if check:
                break
            # 2. Deal with relative loss differences around machine precision.
            tiny_loss = np.abs(self.loss_value_old * eps)
            check = np.abs(loss_improvement) <= tiny_loss
            if is_verbose:
                print(
                    "      check loss |improvement| <= eps * |loss_old|:"
                    f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
                )
            if check:
                if sum_abs_grad_old < 0:
                    sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1)
                # 2.1 Check sum of absolute gradients as alternative condition.
                sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1)
                check = sum_abs_grad < sum_abs_grad_old
                if is_verbose:
                    print(
                        "      check sum(|gradient|) < sum(|gradient_old|): "
                        f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
                    )
                if check:
                    break

            t *= beta
        else:
            warnings.warn(
                f"Line search of Newton solver {self.__class__.__name__} at iteration "
                f"#{self.iteration} did not converge after 21 line search refinement "
                "iterations. It will now resort to lbfgs instead.",
                ConvergenceWarning,
            )
            if self.verbose:
                print("  Line search did not converge and resorts to lbfgs instead.")
            self.use_fallback_lbfgs_solve = True
            return

        self.raw_prediction = raw

    def check_convergence(self, X, y, sample_weight):
        """Check for convergence.

        Sets self.converged.
        """
        if self.verbose:
            print("  Check Convergence")
        # Note: Checking maximum relative change of coefficient <= tol is a bad
        # convergence criterion because even a large step could have brought us close
        # to the true minimum.
        # coef_step = self.coef - self.coef_old
        # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old)))

        # 1. Criterion: maximum |gradient| <= tol
        #    The gradient was already updated in line_search()
        check = np.max(np.abs(self.gradient))
        if self.verbose:
            print(f"  1. max |gradient| {check} <= {self.tol}")
        if check > self.tol:
            return

        # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol
        #       d = sqrt(grad @ hessian^-1 @ grad)
        #         = sqrt(coef_newton @ hessian @ coef_newton)
        #    See Boyd, Vandenberghe (2004) "Convex Optimization" Chapter 9.5.1.
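        #    The two expressions for d coincide because the Newton step satisfies
        #    hessian @ coef_newton = -grad, hence
        #    grad @ hessian^-1 @ grad = coef_newton @ hessian @ coef_newton.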
        d2 = self.coef_newton @ self.hessian @ self.coef_newton
        if self.verbose:
            print(f"  2. Newton decrement {0.5 * d2} <= {self.tol}")
        if 0.5 * d2 > self.tol:
            return

        if self.verbose:
            loss_value = self.linear_loss.loss(
                coef=self.coef,
                X=X,
                y=y,
                sample_weight=sample_weight,
                l2_reg_strength=self.l2_reg_strength,
                n_threads=self.n_threads,
            )
            print(f"  Solver did converge at loss = {loss_value}.")
        self.converged = True

    def finalize(self, X, y, sample_weight):
        """Finalize the solver's results.

        Some solvers may need this, others not.
        """
        pass

    def solve(self, X, y, sample_weight):
        """Solve the optimization problem.

        This is the main routine.

        Order of calls:
            self.setup()
            while iteration:
                self.update_gradient_hessian()
                self.inner_solve()
                self.line_search()
                self.check_convergence()
            self.finalize()

        Returns
        -------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Solution of the optimization problem.
        """
        # setup usually:
        #   - initializes self.coef if needed
        #   - initializes and calculates self.raw_prediction, self.loss_value
        self.setup(X=X, y=y, sample_weight=sample_weight)

        self.iteration = 1
        self.converged = False

        while self.iteration <= self.max_iter and not self.converged:
            if self.verbose:
                print(f"Newton iter={self.iteration}")

            self.use_fallback_lbfgs_solve = False  # Fallback solver.

            # 1. Update Hessian and gradient
            self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight)

            # TODO:
            # if iteration == 1:
            # We might stop early, e.g. we already are close to the optimum,
            # usually detected by zero gradients at this stage.

            # 2. Inner solver
            #    Calculate Newton step/direction
            #    This usually sets self.coef_newton and self.gradient_times_newton.
            self.inner_solve(X=X, y=y, sample_weight=sample_weight)
            if self.use_fallback_lbfgs_solve:
                break

            # 3. Backtracking line search
            #    This usually sets self.coef_old, self.coef, self.loss_value_old,
            #    self.loss_value, self.gradient_old, self.gradient,
            #    self.raw_prediction.
            self.line_search(X=X, y=y, sample_weight=sample_weight)
            if self.use_fallback_lbfgs_solve:
                break

            # 4. Check convergence
            #    Sets self.converged.
            self.check_convergence(X=X, y=y, sample_weight=sample_weight)

            # 5. Next iteration
            self.iteration += 1

        if not self.converged:
            if self.use_fallback_lbfgs_solve:
                # Note: The fallback solver circumvents check_convergence and relies on
                # the convergence checks of lbfgs instead. Enough warnings have been
                # raised on the way.
                self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight)
            else:
                warnings.warn(
                    f"Newton solver did not converge after {self.iteration - 1} "
                    "iterations.",
                    ConvergenceWarning,
                )

        self.iteration -= 1
        self.finalize(X=X, y=y, sample_weight=sample_weight)
        return self.coef


class NewtonCholeskySolver(NewtonSolver):
    """Cholesky based Newton solver.

    Inner solver for finding the Newton step H w_newton = -g uses a Cholesky based
    linear solver.
    """

    def setup(self, X, y, sample_weight):
        super().setup(X=X, y=y, sample_weight=sample_weight)
        n_dof = X.shape[1]
        if self.linear_loss.fit_intercept:
            n_dof += 1
        self.gradient = np.empty_like(self.coef)
        self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof))

    def update_gradient_hessian(self, X, y, sample_weight):
        _, _, self.hessian_warning = self.linear_loss.gradient_hessian(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            gradient_out=self.gradient,
            hessian_out=self.hessian,
            raw_prediction=self.raw_prediction,  # this was updated in line_search
        )

    def inner_solve(self, X, y, sample_weight):
        if self.hessian_warning:
            warnings.warn(
                f"The inner solver of {self.__class__.__name__} detected a "
                "pointwise hessian with many negative values at iteration "
                f"#{self.iteration}. It will now resort to lbfgs instead.",
                ConvergenceWarning,
            )
            if self.verbose:
                print(
                    "  The inner solver detected a pointwise Hessian with many "
                    "negative values and resorts to lbfgs instead."
                )
            self.use_fallback_lbfgs_solve = True
            return

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
                self.coef_newton = scipy.linalg.solve(
                    self.hessian, -self.gradient, check_finite=False, assume_a="sym"
                )
                self.gradient_times_newton = self.gradient @ self.coef_newton
                if self.gradient_times_newton > 0:
                    if self.verbose:
                        print(
                            "  The inner solver found a Newton step that is not a "
                            "descent direction and resorts to LBFGS steps instead."
                        )
                    self.use_fallback_lbfgs_solve = True
                    return
        except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e:
            warnings.warn(
                f"The inner solver of {self.__class__.__name__} stumbled upon a "
                "singular or very ill-conditioned Hessian matrix at iteration "
                f"#{self.iteration}. It will now resort to lbfgs instead.\n"
                "Further options are to use another solver or to avoid such a "
                "situation in the first place. Possible remedies are removing "
                "collinear features of X or increasing the penalization strengths.\n"
                "The original Linear Algebra message was:\n"
                + str(e),
                scipy.linalg.LinAlgWarning,
            )
            # Possible causes:
            # 1. hess_pointwise is negative. But this is already taken care of in
            #    LinearModelLoss.gradient_hessian.
            # 2. X is singular or ill-conditioned
            #    This might be the most probable cause.
            #
            # There are many possible ways to deal with this situation. Most of them
            # add, explicitly or implicitly, a matrix to the hessian to make it
            # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed.
            # Instead, we resort to lbfgs.
            if self.verbose:
                print(
                    "  The inner solver stumbled upon a singular or ill-conditioned "
                    "Hessian matrix and resorts to LBFGS instead."
                )
            self.use_fallback_lbfgs_solve = True
            return