{{py: """ Template file to easily generate fused types consistent code using Tempita (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). Generated file: _sgd_fast.pyx Each relevant function is duplicated for the dtypes float and double. The keywords between double braces are substituted in setup.py. Authors: Peter Prettenhofer Mathieu Blondel (partial_fit support) Rob Zinkov (passive-aggressive) Lars Buitinck License: BSD 3 clause """ # The dtypes are defined as follows (name_suffix, c_type, np_type) dtypes = [ ("64", "double", "np.float64"), ("32", "float", "np.float32"), ] }} """SGD implementation""" import numpy as np from time import time from cython cimport floating from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY from ..utils._typedefs cimport uint32_t from ..utils._weight_vector cimport WeightVector32, WeightVector64 from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 cdef extern from *: """ /* Penalty constants */ #define NO_PENALTY 0 #define L1 1 #define L2 2 #define ELASTICNET 3 /* Learning rate constants */ #define CONSTANT 1 #define OPTIMAL 2 #define INVSCALING 3 #define ADAPTIVE 4 #define PA1 5 #define PA2 6 """ int NO_PENALTY = 0 int L1 = 1 int L2 = 2 int ELASTICNET = 3 int CONSTANT = 1 int OPTIMAL = 2 int INVSCALING = 3 int ADAPTIVE = 4 int PA1 = 5 int PA2 = 6 # ---------------------------------------- # Extension Types for Loss Functions # ---------------------------------------- cdef class LossFunction: """Base class for convex loss functions""" cdef double loss(self, double y, double p) noexcept nogil: """Evaluate the loss function. Parameters ---------- y : double The true value (aka target). p : double The prediction, `p = w^T x + intercept`. Returns ------- double The loss evaluated at `p` and `y`. """ return 0. def py_dloss(self, double p, double y): """Python version of `dloss` for testing. Pytest needs a python function and can't use cdef functions. Parameters ---------- p : double The prediction, `p = w^T x`. y : double The true value (aka target). Returns ------- double The derivative of the loss function with regards to `p`. """ return self.dloss(y, p) def py_loss(self, double p, double y): """Python version of `loss` for testing. Pytest needs a python function and can't use cdef functions. Parameters ---------- p : double The prediction, `p = w^T x + intercept`. y : double The true value (aka target). Returns ------- double The loss evaluated at `p` and `y`. """ return self.loss(y, p) cdef double dloss(self, double y, double p) noexcept nogil: """Evaluate the derivative of the loss function with respect to the prediction `p`. Parameters ---------- y : double The true value (aka target). p : double The prediction, `p = w^T x`. Returns ------- double The derivative of the loss function with regards to `p`. """ return 0. cdef class Regression(LossFunction): """Base class for loss functions for regression""" cdef double loss(self, double y, double p) noexcept nogil: return 0. cdef double dloss(self, double y, double p) noexcept nogil: return 0. cdef class Classification(LossFunction): """Base class for loss functions for classification""" cdef double loss(self, double y, double p) noexcept nogil: return 0. cdef double dloss(self, double y, double p) noexcept nogil: return 0. cdef class ModifiedHuber(Classification): """Modified Huber loss for binary classification with y in {-1, 1} This is equivalent to quadratically smoothed SVM with gamma = 2. See T. 
cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """
    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return (1.0 - z) * (1.0 - z)
        else:
            return -4.0 * z

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return 2.0 * (1.0 - z) * -y
        else:
            return -4.0 * y

    def __reduce__(self):
        return ModifiedHuber, ()


cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------
    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return self.threshold - z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return -y
        return 0.0

    def __reduce__(self):
        return Hinge, (self.threshold,)


cdef class SquaredHinge(Classification):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------
    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return z * z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return -2 * y * z
        return 0.0

    def __reduce__(self):
        return SquaredHinge, (self.threshold,)


cdef class Log(Classification):
    """Logistic regression loss for binary classification with y in {-1, 1}"""

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18:
            return exp(-z)
        if z < -18:
            return -z
        return log(1.0 + exp(-z))

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18.0:
            return exp(-z) * -y
        if z < -18.0:
            return -y
        return -y / (exp(z) + 1.0)

    def __reduce__(self):
        return Log, ()


cdef class SquaredLoss(Regression):
    """Squared loss traditionally used in linear regression."""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.5 * (p - y) * (p - y)

    cdef double dloss(self, double y, double p) noexcept nogil:
        return p - y

    def __reduce__(self):
        return SquaredLoss, ()


cdef class Huber(Regression):
    """Huber regression loss

    Variant of the SquaredLoss that is robust to outliers (quadratic near
    zero, linear for large errors).

    https://en.wikipedia.org/wiki/Huber_Loss_Function
    """

    cdef double c

    def __init__(self, double c):
        self.c = c

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return 0.5 * r * r
        else:
            return self.c * abs_r - (0.5 * self.c * self.c)

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return r
        elif r > 0.0:
            return self.c
        else:
            return -self.c

    def __reduce__(self):
        return Huber, (self.c,)
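
# Illustrative, hypothetical sketch (not part of the module): the Huber loss
# above switches from quadratic to linear at |p - y| = c, so outliers
# contribute linearly rather than quadratically (assuming the compiled module
# has been imported):
#
#     huber = Huber(c=1.0)
#     assert huber.py_loss(0.5, 0.0) == 0.125   # 0.5 * r**2 for |r| <= c
#     assert huber.py_loss(3.0, 0.0) == 2.5     # c * |r| - 0.5 * c**2 for |r| > c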
cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        if y - p > self.epsilon:
            return -1
        elif p - y > self.epsilon:
            return 1
        else:
            return 0

    def __reduce__(self):
        return EpsilonInsensitive, (self.epsilon,)


cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret * ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z
        z = y - p
        if z > self.epsilon:
            return -2 * (z - self.epsilon)
        elif z < -self.epsilon:
            return 2 * (-z - self.epsilon)
        else:
            return 0

    def __reduce__(self):
        return SquaredEpsilonInsensitive, (self.epsilon,)
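
# Illustrative, hypothetical sketch (not part of the module): the two
# epsilon-insensitive losses above ignore residuals inside the epsilon tube,
# e.g. with epsilon=0.1 an absolute error of 0.05 costs nothing while an
# error of 0.5 costs 0.4 (or 0.4 ** 2 for the squared variant):
#
#     loss = EpsilonInsensitive(epsilon=0.1)
#     assert loss.py_loss(1.05, 1.0) == 0.0
#     assert abs(loss.py_loss(1.5, 1.0) - 0.4) < 1e-12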

{{for name_suffix, c_type, np_type in dtypes}}

def _plain_sgd{{name_suffix}}(
    const {{c_type}}[::1] weights,
    double intercept,
    const {{c_type}}[::1] average_weights,
    double average_intercept,
    LossFunction loss,
    int penalty_type,
    double alpha,
    double C,
    double l1_ratio,
    SequentialDataset{{name_suffix}} dataset,
    const unsigned char[::1] validation_mask,
    bint early_stopping,
    validation_score_cb,
    int n_iter_no_change,
    unsigned int max_iter,
    double tol,
    int fit_intercept,
    int verbose,
    bint shuffle,
    uint32_t seed,
    double weight_pos,
    double weight_neg,
    int learning_rate,
    double eta0,
    double power_t,
    bint one_class,
    double t=1.0,
    double intercept_decay=1.0,
    int average=0,
):
    """SGD for generic loss functions and penalties with optional averaging

    Parameters
    ----------
    weights : ndarray[{{c_type}}, ndim=1]
        The allocated vector of weights.
    intercept : double
        The initial intercept.
    average_weights : ndarray[{{c_type}}, ndim=1]
        The average weights as computed for ASGD. Should be None if average
        is 0.
    average_intercept : double
        The average intercept for ASGD. Should be 0 if average is 0.
    loss : LossFunction
        A concrete ``LossFunction`` object.
    penalty_type : int
        The penalty: 2 for L2, 1 for L1, and 3 for Elastic-Net.
    alpha : float
        The regularization parameter.
    C : float
        Maximum step size for passive aggressive.
    l1_ratio : float
        The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
    dataset : SequentialDataset
        A concrete ``SequentialDataset`` object.
    validation_mask : ndarray[unsigned char, ndim=1]
        Equal to True on the validation set.
    early_stopping : boolean
        Whether to use a stopping criterion based on the validation set.
    validation_score_cb : callable
        A callable to compute a validation score given the current
        coefficients and intercept values. Used only if early_stopping is
        True.
    n_iter_no_change : int
        Number of iterations with no improvement to wait before stopping.
    max_iter : int
        The maximum number of iterations (epochs).
    tol : double
        The tolerance for the stopping criterion.
    fit_intercept : int
        Whether or not to fit the intercept (1 or 0).
    verbose : int
        Print verbose output; 0 for quiet.
    shuffle : boolean
        Whether to shuffle the training data before each epoch.
    weight_pos : float
        The weight of the positive class.
    weight_neg : float
        The weight of the negative class.
    seed : uint32_t
        Seed of the pseudorandom number generator used to shuffle the data.
    learning_rate : int
        The learning rate:
        (1) constant, eta = eta0
        (2) optimal, eta = 1.0 / (alpha * t)
        (3) inverse scaling, eta = eta0 / pow(t, power_t)
        (4) adaptive decrease
        (5) Passive Aggressive-I, eta = min(C, loss / norm(x))
        (6) Passive Aggressive-II, eta = loss / (norm(x) + 0.5 / C)
    eta0 : double
        The initial learning rate.
    power_t : double
        The exponent for inverse scaling learning rate.
    one_class : boolean
        Whether to solve the One-Class SVM optimization problem.
    t : double
        Initial state of the learning rate. This value is equal to the
        iteration count except when the learning rate is set to `optimal`.
        Default: 1.0.
    average : int
        The number of iterations before averaging starts. average=1 is
        equivalent to averaging for all iterations.

    Returns
    -------
    weights : array, shape=[n_features]
        The fitted weight vector.
    intercept : float
        The fitted intercept term.
    average_weights : array, shape=[n_features]
        The averaged weights across iterations. Values are valid only if
        average > 0.
    average_intercept : float
        The averaged intercept across iterations. Values are valid only if
        average > 0.
    n_iter_ : int
        The actual number of iterations (epochs).
    """

    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(
        weights, average_weights
    )
    cdef {{c_type}} *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef int no_improvement_count = 0
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double intercept_update = 0.0
    cdef double sumloss = 0.0
    cdef double score = 0.0
    cdef double best_loss = INFINITY
    cdef double best_score = -INFINITY
    cdef {{c_type}} y = 0.0
    cdef {{c_type}} sample_weight
    cdef {{c_type}} class_weight = 1.0
    cdef unsigned int count = 0
    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    cdef int is_hinge = isinstance(loss, Hinge)
    cdef double optimal_init = 0.0
    cdef double dloss = 0.0
    cdef double MAX_DLOSS = 1e12

    cdef long long sample_index

    # q vector is only used for L1 regularization
    cdef {{c_type}}[::1] q = None
    cdef {{c_type}} * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
        q_data_ptr = &q[0]
    cdef double u = 0.0

    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    if learning_rate == OPTIMAL:
        typw = np.sqrt(1.0 / np.sqrt(alpha))
        # computing eta0, the initial learning rate
        initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
        # initialize t such that eta at first sample equals eta0
        optimal_init = 1.0 / (initial_eta0 * alpha)

    t_start = time()
    with nogil:
        for epoch in range(max_iter):
            sumloss = 0
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, &y,
                             &sample_weight)

                sample_index = dataset.index_data_ptr[dataset.current_index]
                if validation_mask[sample_index]:
                    # do not learn on the validation set
                    continue

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * (optimal_init + t - 1))
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                if verbose or not early_stopping:
                    sumloss += loss.loss(y, p)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    if update == 0:
                        continue
                    update = min(C, loss.loss(y, p) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.loss(y, p) / (update + 0.5 / C)
                else:
                    dloss = loss.dloss(y, p)
                    # clip dloss with large values to avoid numerical
                    # instabilities
                    if dloss < -MAX_DLOSS:
                        dloss = -MAX_DLOSS
                    elif dloss > MAX_DLOSS:
                        dloss = MAX_DLOSS
                    update = -eta * dloss

                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    # do not scale to negative values when eta or alpha are
                    # too big: instead set the weights to zero
                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))
                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
                if fit_intercept == 1:
                    intercept_update = update
                    if one_class:  # specific for One-Class SVM
                        intercept_update -= 2. * eta * alpha
                    if intercept_update != 0:
                        intercept += intercept_update * intercept_decay

                if 0 < average <= t:
                    # compute the average for the intercept and update the
                    # average weights, this is done regardless of whether
                    # the update is 0
                    w.add_average(x_data_ptr, x_ind_ptr, xnnz, update,
                                  (t - average + 1))
                    average_intercept += ((intercept - average_intercept) /
                                          (t - average + 1))

                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

                t += 1
                count += 1

            # report epoch information
            if verbose > 0:
                with gil:
                    print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                          "Avg. loss: %f"
                          % (w.norm(), np.nonzero(weights)[0].shape[0],
                             intercept, count, sumloss / train_count))
                    print("Total training time: %.2f seconds."
                          % (time() - t_start))

            # floating-point under-/overflow check.
            if (not isfinite(intercept) or any_nonfinite(weights)):
                infinity = True
                break

            # evaluate the score on the validation set
            if early_stopping:
                with gil:
                    score = validation_score_cb(weights.base, intercept)
                if tol > -INFINITY and score < best_score + tol:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if score > best_score:
                    best_score = score
            # or evaluate the loss on the training set
            else:
                if tol > -INFINITY and sumloss > best_loss - tol * train_count:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if sumloss < best_loss:
                    best_loss = sumloss

            # if there is no improvement several times in a row
            if no_improvement_count >= n_iter_no_change:
                if learning_rate == ADAPTIVE and eta > 1e-6:
                    eta = eta / 5
                    no_improvement_count = 0
                else:
                    if verbose:
                        with gil:
                            print("Convergence after %d epochs took %.2f "
                                  "seconds" % (epoch + 1, time() - t_start))
                    break

    if infinity:
        raise ValueError(
            ("Floating-point under-/overflow occurred at epoch #%d. "
             "Scaling input data with StandardScaler or MinMaxScaler "
             "might help.") % (epoch + 1)
        )
    w.reset_wscale()

    return (
        weights.base,
        intercept,
        None if average_weights is None else average_weights.base,
        average_intercept,
        epoch + 1
    )

{{endfor}}


cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
    for i in range(w.shape[0]):
        if not isfinite(w[i]):
            return True
    return 0


cdef inline double sqnorm(
    floating * x_data_ptr,
    int * x_ind_ptr,
    int xnnz,
) noexcept nogil:
    cdef double x_norm = 0.0
    cdef int j
    cdef double z
    for j in range(xnnz):
        z = x_data_ptr[j]
        x_norm += z * z
    return x_norm


{{for name_suffix, c_type, np_type in dtypes}}

cdef void l1penalty{{name_suffix}}(
    WeightVector{{name_suffix}} w,
    {{c_type}} * q_data_ptr,
    int *x_ind_ptr,
    int xnnz,
    double u,
) noexcept nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
    """
    cdef double z = 0.0
    cdef int j = 0
    cdef int idx = 0
    cdef double wscale = w.wscale
    cdef {{c_type}} *w_data_ptr = w.w_data_ptr
    for j in range(xnnz):
        idx = x_ind_ptr[j]
        z = w_data_ptr[idx]
        if wscale * z > 0.0:
            w_data_ptr[idx] = max(
                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))
        elif wscale * z < 0.0:
            w_data_ptr[idx] = min(
                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))

        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)

{{endfor}}
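
# Illustrative, hypothetical sketch (not part of the module): a dense,
# pure-Python version of the cumulative (truncated-gradient) L1 penalty
# applied by l1penalty above, following Tsuruoka et al. (2009). `u` is the
# total L1 penalty that could have been applied so far and `q[j]` the penalty
# actually applied to feature j; each weight is clipped at zero rather than
# allowed to change sign (wscale is assumed to be 1 for simplicity):
#
#     def l1_penalty_dense(w, q, u):
#         for j in range(w.shape[0]):
#             z = w[j]
#             if w[j] > 0.0:
#                 w[j] = max(0.0, w[j] - (u + q[j]))
#             elif w[j] < 0.0:
#                 w[j] = min(0.0, w[j] + (u - q[j]))
#             q[j] += w[j] - z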