{{py:

"""
Template file to easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: _sgd_fast.pyx

Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted in setup.py.

Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
         Mathieu Blondel (partial_fit support)
         Rob Zinkov (passive-aggressive)
         Lars Buitinck

License: BSD 3 clause
"""

# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
    ("64", "double", "np.float64"),
    ("32", "float", "np.float32"),
]

}}
"""SGD implementation"""
|
||
|
||
import numpy as np
|
||
from time import time
|
||
|
||
from cython cimport floating
|
||
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
|
||
|
||
from ..utils._typedefs cimport uint32_t
|
||
from ..utils._weight_vector cimport WeightVector32, WeightVector64
|
||
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
|
||
|
||
|
||
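
# WeightVector32/WeightVector64 (sklearn.utils._weight_vector) wrap the
# coefficient array together with a scale factor ``wscale`` (used directly in
# l1penalty below); presumably this is what makes the per-sample L2 shrinkage
# via ``w.scale(...)`` cheap, with ``w.reset_wscale()`` folding the scale back
# into the weights once at the end of training.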

cdef extern from *:
    """
    /* Penalty constants */
    #define NO_PENALTY 0
    #define L1 1
    #define L2 2
    #define ELASTICNET 3

    /* Learning rate constants */
    #define CONSTANT 1
    #define OPTIMAL 2
    #define INVSCALING 3
    #define ADAPTIVE 4
    #define PA1 5
    #define PA2 6
    """
    int NO_PENALTY = 0
    int L1 = 1
    int L2 = 2
    int ELASTICNET = 3

    int CONSTANT = 1
    int OPTIMAL = 2
    int INVSCALING = 3
    int ADAPTIVE = 4
    int PA1 = 5
    int PA2 = 6


# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------

cdef class LossFunction:
    """Base class for convex loss functions"""

    cdef double loss(self, double y, double p) noexcept nogil:
        """Evaluate the loss function.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x + intercept`.

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return 0.

    def py_dloss(self, double p, double y):
        """Python version of `dloss` for testing.

        Pytest needs a python function and can't use cdef functions.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The derivative of the loss function with regard to `p`.
        """
        return self.dloss(y, p)

    def py_loss(self, double p, double y):
        """Python version of `loss` for testing.

        Pytest needs a python function and can't use cdef functions.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x + intercept`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return self.loss(y, p)

    cdef double dloss(self, double y, double p) noexcept nogil:
        """Evaluate the derivative of the loss function with respect to
        the prediction `p`.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x`.

        Returns
        -------
        double
            The derivative of the loss function with regard to `p`.
        """
        return 0.
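
# Illustrative usage of the Python test wrappers above (not executed here):
#
#     loss = Hinge(1.0)
#     loss.py_loss(0.3, 1.0)    # hinge loss at p=0.3, y=1  -> 1.0 - 0.3 = 0.7
#     loss.py_dloss(0.3, 1.0)   # derivative w.r.t. p       -> -y = -1.0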

cdef class Regression(LossFunction):
    """Base class for loss functions for regression"""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.

    cdef double dloss(self, double y, double p) noexcept nogil:
        return 0.


cdef class Classification(LossFunction):
    """Base class for loss functions for classification"""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.

    cdef double dloss(self, double y, double p) noexcept nogil:
        return 0.

cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """
    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return (1.0 - z) * (1.0 - z)
        else:
            return -4.0 * z

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return 2.0 * (1.0 - z) * -y
        else:
            return -4.0 * y

    def __reduce__(self):
        return ModifiedHuber, ()
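
# Sanity check for ModifiedHuber at the branch point z = p * y = -1: the
# quadratic branch gives (1 - (-1))**2 = 4 and the linear branch -4 * (-1) = 4,
# while both dloss branches give 2 * (1 - (-1)) * -y = -4 * y, so the loss and
# its gradient are continuous there.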

cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------

    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return self.threshold - z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return -y
        return 0.0

    def __reduce__(self):
        return Hinge, (self.threshold,)

cdef class SquaredHinge(Classification):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------

    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return z * z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return -2 * y * z
        return 0.0

    def __reduce__(self):
        return SquaredHinge, (self.threshold,)

cdef class Log(Classification):
    """Logistic regression loss for binary classification with y in {-1, 1}"""

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18:
            return exp(-z)
        if z < -18:
            return -z
        return log(1.0 + exp(-z))

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18.0:
            return exp(-z) * -y
        if z < -18.0:
            return -y
        return -y / (exp(z) + 1.0)

    def __reduce__(self):
        return Log, ()
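
# On the cutoffs used in Log above: for z > 18, exp(-z) < 2e-8 and
# log(1 + exp(-z)) ~= exp(-z) to within double rounding error; for z < -18,
# log(1 + exp(-z)) = -z + log(1 + exp(z)) ~= -z. The branches therefore skip
# the log/exp work where the approximation is already exact enough.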

cdef class SquaredLoss(Regression):
    """Squared loss traditionally used in linear regression."""
    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.5 * (p - y) * (p - y)

    cdef double dloss(self, double y, double p) noexcept nogil:
        return p - y

    def __reduce__(self):
        return SquaredLoss, ()

cdef class Huber(Regression):
    """Huber regression loss

    Variant of the SquaredLoss that is robust to outliers (quadratic near zero,
    linear for large errors).

    https://en.wikipedia.org/wiki/Huber_Loss_Function
    """

    cdef double c

    def __init__(self, double c):
        self.c = c

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return 0.5 * r * r
        else:
            return self.c * abs_r - (0.5 * self.c * self.c)

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return r
        elif r > 0.0:
            return self.c
        else:
            return -self.c

    def __reduce__(self):
        return Huber, (self.c,)
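
# At the boundary |r| = c the two Huber branches agree: 0.5 * c * c versus
# c * c - 0.5 * c * c = 0.5 * c * c, and dloss is +/-c on either side, so the
# loss is continuously differentiable.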

cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        if y - p > self.epsilon:
            return -1
        elif p - y > self.epsilon:
            return 1
        else:
            return 0

    def __reduce__(self):
        return EpsilonInsensitive, (self.epsilon,)

cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret * ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z
        z = y - p
        if z > self.epsilon:
            return -2 * (z - self.epsilon)
        elif z < -self.epsilon:
            return 2 * (-z - self.epsilon)
        else:
            return 0

    def __reduce__(self):
        return SquaredEpsilonInsensitive, (self.epsilon,)
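
# For every (name_suffix, c_type, np_type) entry in ``dtypes`` (defined in the
# template header), the Tempita loop below emits one specialization of the
# training routine; with the current entries that should be _plain_sgd64
# (double / np.float64) and _plain_sgd32 (float / np.float32).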

{{for name_suffix, c_type, np_type in dtypes}}

def _plain_sgd{{name_suffix}}(
    const {{c_type}}[::1] weights,
    double intercept,
    const {{c_type}}[::1] average_weights,
    double average_intercept,
    LossFunction loss,
    int penalty_type,
    double alpha,
    double C,
    double l1_ratio,
    SequentialDataset{{name_suffix}} dataset,
    const unsigned char[::1] validation_mask,
    bint early_stopping,
    validation_score_cb,
    int n_iter_no_change,
    unsigned int max_iter,
    double tol,
    int fit_intercept,
    int verbose,
    bint shuffle,
    uint32_t seed,
    double weight_pos,
    double weight_neg,
    int learning_rate,
    double eta0,
    double power_t,
    bint one_class,
    double t=1.0,
    double intercept_decay=1.0,
    int average=0,
):
"""SGD for generic loss functions and penalties with optional averaging
|
||
|
||
Parameters
|
||
----------
|
||
weights : ndarray[{{c_type}}, ndim=1]
|
||
The allocated vector of weights.
|
||
intercept : double
|
||
The initial intercept.
|
||
average_weights : ndarray[{{c_type}}, ndim=1]
|
||
The average weights as computed for ASGD. Should be None if average
|
||
is 0.
|
||
average_intercept : double
|
||
The average intercept for ASGD. Should be 0 if average is 0.
|
||
loss : LossFunction
|
||
A concrete ``LossFunction`` object.
|
||
penalty_type : int
|
||
The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net.
|
||
alpha : float
|
||
The regularization parameter.
|
||
C : float
|
||
Maximum step size for passive aggressive.
|
||
l1_ratio : float
|
||
The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
|
||
l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
|
||
dataset : SequentialDataset
|
||
A concrete ``SequentialDataset`` object.
|
||
validation_mask : ndarray[unsigned char, ndim=1]
|
||
Equal to True on the validation set.
|
||
early_stopping : boolean
|
||
Whether to use a stopping criterion based on the validation set.
|
||
validation_score_cb : callable
|
||
A callable to compute a validation score given the current
|
||
coefficients and intercept values.
|
||
Used only if early_stopping is True.
|
||
n_iter_no_change : int
|
||
Number of iteration with no improvement to wait before stopping.
|
||
max_iter : int
|
||
The maximum number of iterations (epochs).
|
||
tol: double
|
||
The tolerance for the stopping criterion.
|
||
fit_intercept : int
|
||
Whether or not to fit the intercept (1 or 0).
|
||
verbose : int
|
||
Print verbose output; 0 for quite.
|
||
shuffle : boolean
|
||
Whether to shuffle the training data before each epoch.
|
||
weight_pos : float
|
||
The weight of the positive class.
|
||
weight_neg : float
|
||
The weight of the negative class.
|
||
seed : uint32_t
|
||
Seed of the pseudorandom number generator used to shuffle the data.
|
||
learning_rate : int
|
||
The learning rate:
|
||
(1) constant, eta = eta0
|
||
(2) optimal, eta = 1.0/(alpha * t).
|
||
(3) inverse scaling, eta = eta0 / pow(t, power_t)
|
||
(4) adaptive decrease
|
||
(5) Passive Aggressive-I, eta = min(alpha, loss/norm(x))
|
||
(6) Passive Aggressive-II, eta = 1.0 / (norm(x) + 0.5*alpha)
|
||
eta0 : double
|
||
The initial learning rate.
|
||
power_t : double
|
||
The exponent for inverse scaling learning rate.
|
||
one_class : boolean
|
||
Whether to solve the One-Class SVM optimization problem.
|
||
t : double
|
||
Initial state of the learning rate. This value is equal to the
|
||
iteration count except when the learning rate is set to `optimal`.
|
||
Default: 1.0.
|
||
average : int
|
||
The number of iterations before averaging starts. average=1 is
|
||
equivalent to averaging for all iterations.
|
||
|
||
|
||
Returns
|
||
-------
|
||
weights : array, shape=[n_features]
|
||
The fitted weight vector.
|
||
intercept : float
|
||
The fitted intercept term.
|
||
average_weights : array shape=[n_features]
|
||
The averaged weights across iterations. Values are valid only if
|
||
average > 0.
|
||
average_intercept : float
|
||
The averaged intercept across iterations.
|
||
Values are valid only if average > 0.
|
||
n_iter_ : int
|
||
The actual number of iter (epochs).
|
||
"""
|
||
|
||

    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
    cdef {{c_type}} *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef int no_improvement_count = 0
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double intercept_update = 0.0
    cdef double sumloss = 0.0
    cdef double score = 0.0
    cdef double best_loss = INFINITY
    cdef double best_score = -INFINITY
    cdef {{c_type}} y = 0.0
    cdef {{c_type}} sample_weight
    cdef {{c_type}} class_weight = 1.0
    cdef unsigned int count = 0
    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    cdef int is_hinge = isinstance(loss, Hinge)
    cdef double optimal_init = 0.0
    cdef double dloss = 0.0
    cdef double MAX_DLOSS = 1e12

    cdef long long sample_index

    # q vector is only used for L1 regularization
    cdef {{c_type}}[::1] q = None
    cdef {{c_type}} * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
        q_data_ptr = &q[0]
    cdef double u = 0.0

    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    if learning_rate == OPTIMAL:
        typw = np.sqrt(1.0 / np.sqrt(alpha))
        # computing eta0, the initial learning rate
        initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
        # initialize t such that eta at first sample equals eta0
        optimal_init = 1.0 / (initial_eta0 * alpha)
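        # With this initialization the step size used in the main loop,
        # eta = 1.0 / (alpha * (optimal_init + t - 1)), equals initial_eta0
        # at the first sample (t = 1).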

    t_start = time()
    with nogil:
        for epoch in range(max_iter):
            sumloss = 0
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
                             &y, &sample_weight)

                sample_index = dataset.index_data_ptr[dataset.current_index]
                if validation_mask[sample_index]:
                    # do not learn on the validation set
                    continue

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * (optimal_init + t - 1))
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                if verbose or not early_stopping:
                    sumloss += loss.loss(y, p)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    if update == 0:
                        continue
                    update = min(C, loss.loss(y, p) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.loss(y, p) / (update + 0.5 / C)
                else:
                    dloss = loss.dloss(y, p)
                    # clip dloss with large values to avoid numerical
                    # instabilities
                    if dloss < -MAX_DLOSS:
                        dloss = -MAX_DLOSS
                    elif dloss > MAX_DLOSS:
                        dloss = MAX_DLOSS
                    update = -eta * dloss
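                # Note: PA-I caps the step at C, update = min(C, loss / ||x||^2),
                # while PA-II shrinks it smoothly, update = loss / (||x||^2 + 1/(2*C)).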

                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    # do not scale to negative values when eta or alpha are too
                    # big: instead set the weights to zero
                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))

                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
                    if fit_intercept == 1:
                        intercept_update = update
                        if one_class:  # specific for One-Class SVM
                            intercept_update -= 2. * eta * alpha
                        if intercept_update != 0:
                            intercept += intercept_update * intercept_decay

                if 0 < average <= t:
                    # compute the average for the intercept and update the
                    # average weights, this is done regardless as to whether
                    # the update is 0

                    w.add_average(x_data_ptr, x_ind_ptr, xnnz,
                                  update, (t - average + 1))
                    average_intercept += ((intercept - average_intercept) /
                                          (t - average + 1))

                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

                t += 1
                count += 1

            # report epoch information
            if verbose > 0:
                with gil:
                    print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                          "Avg. loss: %f"
                          % (w.norm(), np.nonzero(weights)[0].shape[0],
                             intercept, count, sumloss / train_count))
                    print("Total training time: %.2f seconds."
                          % (time() - t_start))

            # floating-point under-/overflow check.
            if (not isfinite(intercept) or any_nonfinite(weights)):
                infinity = True
                break

            # evaluate the score on the validation set
            if early_stopping:
                with gil:
                    score = validation_score_cb(weights.base, intercept)
                if tol > -INFINITY and score < best_score + tol:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if score > best_score:
                    best_score = score
            # or evaluate the loss on the training set
            else:
                if tol > -INFINITY and sumloss > best_loss - tol * train_count:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if sumloss < best_loss:
                    best_loss = sumloss

            # if there is no improvement several times in a row
            if no_improvement_count >= n_iter_no_change:
                if learning_rate == ADAPTIVE and eta > 1e-6:
                    eta = eta / 5
                    no_improvement_count = 0
                else:
                    if verbose:
                        with gil:
                            print("Convergence after %d epochs took %.2f "
                                  "seconds" % (epoch + 1, time() - t_start))
                    break
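
    # Stopping rule above: with early_stopping, the validation score must
    # improve by at least tol; otherwise the summed training loss must drop
    # below best_loss - tol * train_count (i.e. the mean loss must improve by
    # at least tol). With the ADAPTIVE schedule, eta is divided by 5 (down to
    # 1e-6) and the counter reset instead of stopping immediately.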
    if infinity:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % (epoch + 1))

    w.reset_wscale()

    return (
        weights.base,
        intercept,
        None if average_weights is None else average_weights.base,
        average_intercept,
        epoch + 1
    )

{{endfor}}


cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
    for i in range(w.shape[0]):
        if not isfinite(w[i]):
            return True
    return False


cdef inline double sqnorm(
    floating * x_data_ptr,
    int * x_ind_ptr,
    int xnnz,
) noexcept nogil:
    cdef double x_norm = 0.0
    cdef int j
    cdef double z
    for j in range(xnnz):
        z = x_data_ptr[j]
        x_norm += z * z
    return x_norm


{{for name_suffix, c_type, np_type in dtypes}}

cdef void l1penalty{{name_suffix}}(
    WeightVector{{name_suffix}} w,
    {{c_type}} * q_data_ptr,
    int *x_ind_ptr,
    int xnnz,
    double u,
) noexcept nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
    """
    cdef double z = 0.0
    cdef int j = 0
    cdef int idx = 0
    cdef double wscale = w.wscale
    cdef {{c_type}} *w_data_ptr = w.w_data_ptr
    for j in range(xnnz):
        idx = x_ind_ptr[j]
        z = w_data_ptr[idx]
        if wscale * z > 0.0:
            w_data_ptr[idx] = max(
                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))

        elif wscale * z < 0.0:
            w_data_ptr[idx] = min(
                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))

        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)
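
    # Bookkeeping above (our reading of the truncated-gradient scheme): ``u``
    # is the cumulative L1 penalty that could have been applied so far, and
    # ``q_data_ptr[idx]`` the amount actually applied to feature ``idx``; the
    # max/min clipping keeps a weight from crossing zero because of the
    # penalty.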

{{endfor}}