{{py:

"""
Template file to easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: _sgd_fast.pyx

Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted in setup.py.

Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
         Mathieu Blondel (partial_fit support)
         Rob Zinkov (passive-aggressive)
         Lars Buitinck

License: BSD 3 clause
"""

# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
    ("64", "double", "np.float64"),
    ("32", "float", "np.float32"),
]

}}
"""SGD implementation"""
|
|||
|
|
|||
|
import numpy as np
|
|||
|
from time import time
|
|||
|
|
|||
|
from cython cimport floating
|
|||
|
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
|
|||
|
|
|||
|
from ..utils._typedefs cimport uint32_t
|
|||
|
from ..utils._weight_vector cimport WeightVector32, WeightVector64
|
|||
|
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
|
|||
|
|
|||
|
|
|||
|
cdef extern from *:
|
|||
|
"""
|
|||
|
/* Penalty constants */
|
|||
|
#define NO_PENALTY 0
|
|||
|
#define L1 1
|
|||
|
#define L2 2
|
|||
|
#define ELASTICNET 3
|
|||
|
|
|||
|
/* Learning rate constants */
|
|||
|
#define CONSTANT 1
|
|||
|
#define OPTIMAL 2
|
|||
|
#define INVSCALING 3
|
|||
|
#define ADAPTIVE 4
|
|||
|
#define PA1 5
|
|||
|
#define PA2 6
|
|||
|
"""
|
|||
|
int NO_PENALTY = 0
|
|||
|
int L1 = 1
|
|||
|
int L2 = 2
|
|||
|
int ELASTICNET = 3
|
|||
|
|
|||
|
int CONSTANT = 1
|
|||
|
int OPTIMAL = 2
|
|||
|
int INVSCALING = 3
|
|||
|
int ADAPTIVE = 4
|
|||
|
int PA1 = 5
|
|||
|
int PA2 = 6
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------

cdef class LossFunction:
    """Base class for convex loss functions"""

    cdef double loss(self, double y, double p) noexcept nogil:
        """Evaluate the loss function.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x + intercept`.

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return 0.

    def py_dloss(self, double p, double y):
        """Python version of `dloss` for testing.

        Pytest needs a python function and can't use cdef functions.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The derivative of the loss function with respect to `p`.
        """
        return self.dloss(y, p)

    def py_loss(self, double p, double y):
        """Python version of `loss` for testing.

        Pytest needs a python function and can't use cdef functions.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x + intercept`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return self.loss(y, p)

    cdef double dloss(self, double y, double p) noexcept nogil:
        """Evaluate the derivative of the loss function with respect to
        the prediction `p`.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x`.

        Returns
        -------
        double
            The derivative of the loss function with respect to `p`.
        """
        return 0.


cdef class Regression(LossFunction):
    """Base class for loss functions for regression"""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.

    cdef double dloss(self, double y, double p) noexcept nogil:
        return 0.


cdef class Classification(LossFunction):
    """Base class for loss functions for classification"""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.

    cdef double dloss(self, double y, double p) noexcept nogil:
        return 0.


cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """
    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return (1.0 - z) * (1.0 - z)
        else:
            return -4.0 * z

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return 2.0 * (1.0 - z) * -y
        else:
            return -4.0 * y

    def __reduce__(self):
        return ModifiedHuber, ()


cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------

    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return self.threshold - z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return -y
        return 0.0

    def __reduce__(self):
        return Hinge, (self.threshold,)


cdef class SquaredHinge(Classification):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------

    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return z * z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return -2 * y * z
        return 0.0

    def __reduce__(self):
        return SquaredHinge, (self.threshold,)


cdef class Log(Classification):
    """Logistic regression loss for binary classification with y in {-1, 1}"""

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18:
            return exp(-z)
        if z < -18:
            return -z
        return log(1.0 + exp(-z))

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18.0:
            return exp(-z) * -y
        if z < -18.0:
            return -y
        return -y / (exp(z) + 1.0)

    def __reduce__(self):
        return Log, ()


cdef class SquaredLoss(Regression):
    """Squared loss traditionally used in linear regression."""
    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.5 * (p - y) * (p - y)

    cdef double dloss(self, double y, double p) noexcept nogil:
        return p - y

    def __reduce__(self):
        return SquaredLoss, ()


cdef class Huber(Regression):
    """Huber regression loss

    Variant of the SquaredLoss that is robust to outliers (quadratic near zero,
    linear for large errors).

    https://en.wikipedia.org/wiki/Huber_Loss_Function
    """

    cdef double c

    def __init__(self, double c):
        self.c = c

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return 0.5 * r * r
        else:
            return self.c * abs_r - (0.5 * self.c * self.c)

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return r
        elif r > 0.0:
            return self.c
        else:
            return -self.c

    def __reduce__(self):
        return Huber, (self.c,)


cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        if y - p > self.epsilon:
            return -1
        elif p - y > self.epsilon:
            return 1
        else:
            return 0

    def __reduce__(self):
        return EpsilonInsensitive, (self.epsilon,)


cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret * ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z
        z = y - p
        if z > self.epsilon:
            return -2 * (z - self.epsilon)
        elif z < -self.epsilon:
            return 2 * (-z - self.epsilon)
        else:
            return 0

    def __reduce__(self):
        return SquaredEpsilonInsensitive, (self.epsilon,)

{{for name_suffix, c_type, np_type in dtypes}}

def _plain_sgd{{name_suffix}}(
    const {{c_type}}[::1] weights,
    double intercept,
    const {{c_type}}[::1] average_weights,
    double average_intercept,
    LossFunction loss,
    int penalty_type,
    double alpha,
    double C,
    double l1_ratio,
    SequentialDataset{{name_suffix}} dataset,
    const unsigned char[::1] validation_mask,
    bint early_stopping,
    validation_score_cb,
    int n_iter_no_change,
    unsigned int max_iter,
    double tol,
    int fit_intercept,
    int verbose,
    bint shuffle,
    uint32_t seed,
    double weight_pos,
    double weight_neg,
    int learning_rate,
    double eta0,
    double power_t,
    bint one_class,
    double t=1.0,
    double intercept_decay=1.0,
    int average=0,
):
"""SGD for generic loss functions and penalties with optional averaging
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
weights : ndarray[{{c_type}}, ndim=1]
|
|||
|
The allocated vector of weights.
|
|||
|
intercept : double
|
|||
|
The initial intercept.
|
|||
|
average_weights : ndarray[{{c_type}}, ndim=1]
|
|||
|
The average weights as computed for ASGD. Should be None if average
|
|||
|
is 0.
|
|||
|
average_intercept : double
|
|||
|
The average intercept for ASGD. Should be 0 if average is 0.
|
|||
|
loss : LossFunction
|
|||
|
A concrete ``LossFunction`` object.
|
|||
|
penalty_type : int
|
|||
|
The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net.
|
|||
|
alpha : float
|
|||
|
The regularization parameter.
|
|||
|
C : float
|
|||
|
Maximum step size for passive aggressive.
|
|||
|
l1_ratio : float
|
|||
|
The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
|
|||
|
l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
|
|||
|
dataset : SequentialDataset
|
|||
|
A concrete ``SequentialDataset`` object.
|
|||
|
validation_mask : ndarray[unsigned char, ndim=1]
|
|||
|
Equal to True on the validation set.
|
|||
|
early_stopping : boolean
|
|||
|
Whether to use a stopping criterion based on the validation set.
|
|||
|
validation_score_cb : callable
|
|||
|
A callable to compute a validation score given the current
|
|||
|
coefficients and intercept values.
|
|||
|
Used only if early_stopping is True.
|
|||
|
n_iter_no_change : int
|
|||
|
Number of iteration with no improvement to wait before stopping.
|
|||
|
max_iter : int
|
|||
|
The maximum number of iterations (epochs).
|
|||
|
tol: double
|
|||
|
The tolerance for the stopping criterion.
|
|||
|
fit_intercept : int
|
|||
|
Whether or not to fit the intercept (1 or 0).
|
|||
|
verbose : int
|
|||
|
Print verbose output; 0 for quite.
|
|||
|
shuffle : boolean
|
|||
|
Whether to shuffle the training data before each epoch.
|
|||
|
weight_pos : float
|
|||
|
The weight of the positive class.
|
|||
|
weight_neg : float
|
|||
|
The weight of the negative class.
|
|||
|
seed : uint32_t
|
|||
|
Seed of the pseudorandom number generator used to shuffle the data.
|
|||
|
learning_rate : int
        The learning rate:
        (1) constant, eta = eta0
        (2) optimal, eta = 1.0/(alpha * t).
        (3) inverse scaling, eta = eta0 / pow(t, power_t)
        (4) adaptive decrease
        (5) Passive Aggressive-I, eta = min(C, loss / norm(x)^2)
        (6) Passive Aggressive-II, eta = loss / (norm(x)^2 + 0.5 / C)
    eta0 : double
        The initial learning rate.
    power_t : double
        The exponent for inverse scaling learning rate.
    one_class : boolean
        Whether to solve the One-Class SVM optimization problem.
    t : double
        Initial state of the learning rate. This value is equal to the
        iteration count except when the learning rate is set to `optimal`.
        Default: 1.0.
    average : int
        The number of iterations before averaging starts. average=1 is
        equivalent to averaging for all iterations.


    Returns
    -------
    weights : array, shape=[n_features]
        The fitted weight vector.
    intercept : float
        The fitted intercept term.
    average_weights : array, shape=[n_features]
        The averaged weights across iterations. Values are valid only if
        average > 0.
    average_intercept : float
        The averaged intercept across iterations.
        Values are valid only if average > 0.
    n_iter_ : int
        The actual number of iterations (epochs).
    """

    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
    cdef {{c_type}} *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef int no_improvement_count = 0
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double intercept_update = 0.0
    cdef double sumloss = 0.0
    cdef double score = 0.0
    cdef double best_loss = INFINITY
    cdef double best_score = -INFINITY
    cdef {{c_type}} y = 0.0
    cdef {{c_type}} sample_weight
    cdef {{c_type}} class_weight = 1.0
    cdef unsigned int count = 0
    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    cdef int is_hinge = isinstance(loss, Hinge)
    cdef double optimal_init = 0.0
    cdef double dloss = 0.0
    cdef double MAX_DLOSS = 1e12

    cdef long long sample_index

    # q vector is only used for L1 regularization
    cdef {{c_type}}[::1] q = None
    cdef {{c_type}} * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
        q_data_ptr = &q[0]
    cdef double u = 0.0

    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    if learning_rate == OPTIMAL:
        typw = np.sqrt(1.0 / np.sqrt(alpha))
        # computing eta0, the initial learning rate
        initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
        # initialize t such that eta at first sample equals eta0
        optimal_init = 1.0 / (initial_eta0 * alpha)
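        # with this choice, eta = 1.0 / (alpha * (optimal_init + t - 1))
        # equals initial_eta0 at the first sample (t == 1) and then decays
        # roughly like 1 / t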

    t_start = time()
    with nogil:
        for epoch in range(max_iter):
            sumloss = 0
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
                             &y, &sample_weight)

                sample_index = dataset.index_data_ptr[dataset.current_index]
                if validation_mask[sample_index]:
                    # do not learn on the validation set
                    continue

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * (optimal_init + t - 1))
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                if verbose or not early_stopping:
                    sumloss += loss.loss(y, p)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
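                    # Passive-Aggressive step sizes (cf. Crammer et al. 2006,
                    # "Online Passive-Aggressive Algorithms"):
                    #   PA-I:  min(C, loss / ||x||^2)
                    #   PA-II: loss / (||x||^2 + 1 / (2 * C))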
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    if update == 0:
                        continue
                    update = min(C, loss.loss(y, p) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.loss(y, p) / (update + 0.5 / C)
                else:
                    dloss = loss.dloss(y, p)
                    # clip dloss with large values to avoid numerical
                    # instabilities
                    if dloss < -MAX_DLOSS:
                        dloss = -MAX_DLOSS
                    elif dloss > MAX_DLOSS:
                        dloss = MAX_DLOSS
                    update = -eta * dloss

                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    # do not scale to negative values when eta or alpha are too
                    # big: instead set the weights to zero
                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))
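                    # this multiplicative shrink applies the L2 component of
                    # the penalty; for L1/Elastic-Net the L1 component is
                    # applied further down via the cumulative penalty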

                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
                if fit_intercept == 1:
                    intercept_update = update
                    if one_class:  # specific for One-Class SVM
                        intercept_update -= 2. * eta * alpha
                    if intercept_update != 0:
                        intercept += intercept_update * intercept_decay

                if 0 < average <= t:
                    # compute the average for the intercept and update the
                    # average weights, this is done regardless of whether
                    # the update is 0

                    w.add_average(x_data_ptr, x_ind_ptr, xnnz,
                                  update, (t - average + 1))
                    average_intercept += ((intercept - average_intercept) /
                                          (t - average + 1))
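                    # (t - average + 1) is the number of updates performed
                    # since averaging started; the intercept update above is
                    # an incremental running mean over those updates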

                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

                t += 1
                count += 1

            # report epoch information
            if verbose > 0:
                with gil:
                    print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                          "Avg. loss: %f"
                          % (w.norm(), np.nonzero(weights)[0].shape[0],
                             intercept, count, sumloss / train_count))
                    print("Total training time: %.2f seconds."
                          % (time() - t_start))

            # floating-point under-/overflow check.
            if (not isfinite(intercept) or any_nonfinite(weights)):
                infinity = True
                break

            # evaluate the score on the validation set
            if early_stopping:
                with gil:
                    score = validation_score_cb(weights.base, intercept)
                if tol > -INFINITY and score < best_score + tol:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if score > best_score:
                    best_score = score
            # or evaluate the loss on the training set
            else:
                if tol > -INFINITY and sumloss > best_loss - tol * train_count:
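                    # the summed training loss must decrease by at least
                    # tol * train_count (i.e. tol per training sample) to
                    # count as an improvement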
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if sumloss < best_loss:
                    best_loss = sumloss

            # if there is no improvement several times in a row
            if no_improvement_count >= n_iter_no_change:
                if learning_rate == ADAPTIVE and eta > 1e-6:
                    eta = eta / 5
                    no_improvement_count = 0
                else:
                    if verbose:
                        with gil:
                            print("Convergence after %d epochs took %.2f "
                                  "seconds" % (epoch + 1, time() - t_start))
                    break

    if infinity:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % (epoch + 1))

    w.reset_wscale()

    return (
        weights.base,
        intercept,
        None if average_weights is None else average_weights.base,
        average_intercept,
        epoch + 1
    )

{{endfor}}


cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
    for i in range(w.shape[0]):
        if not isfinite(w[i]):
            return True
    return False


cdef inline double sqnorm(
    floating * x_data_ptr,
    int * x_ind_ptr,
    int xnnz,
) noexcept nogil:
    cdef double x_norm = 0.0
    cdef int j
    cdef double z
    for j in range(xnnz):
        z = x_data_ptr[j]
        x_norm += z * z
    return x_norm


{{for name_suffix, c_type, np_type in dtypes}}

cdef void l1penalty{{name_suffix}}(
    WeightVector{{name_suffix}} w,
    {{c_type}} * q_data_ptr,
    int *x_ind_ptr,
    int xnnz,
    double u,
) noexcept nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
    """
    cdef double z = 0.0
    cdef int j = 0
    cdef int idx = 0
    cdef double wscale = w.wscale
    cdef {{c_type}} *w_data_ptr = w.w_data_ptr
    for j in range(xnnz):
        idx = x_ind_ptr[j]
        z = w_data_ptr[idx]
        if wscale * z > 0.0:
            w_data_ptr[idx] = max(
                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))

        elif wscale * z < 0.0:
            w_data_ptr[idx] = min(
                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))

        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)

{{endfor}}