{{py:

"""
Template file to easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: _sgd_fast.pyx

Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted in setup.py.

Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
         Mathieu Blondel (partial_fit support)
         Rob Zinkov (passive-aggressive)
         Lars Buitinck

License: BSD 3 clause
"""

# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
    ("64", "double", "np.float64"),
    ("32", "float", "np.float32"),
]

}}
"""SGD implementation"""
|
|||
|
|
|||
|
import numpy as np
|
|||
|
from time import time
|
|||
|
|
|||
|
from cython cimport floating
|
|||
|
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
|
|||
|
|
|||
|
from ..utils._typedefs cimport uint32_t
|
|||
|
from ..utils._weight_vector cimport WeightVector32, WeightVector64
|
|||
|
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
|
|||
|
|
|||
|
|
|||
|
cdef extern from *:
|
|||
|
"""
|
|||
|
/* Penalty constants */
|
|||
|
#define NO_PENALTY 0
|
|||
|
#define L1 1
|
|||
|
#define L2 2
|
|||
|
#define ELASTICNET 3
|
|||
|
|
|||
|
/* Learning rate constants */
|
|||
|
#define CONSTANT 1
|
|||
|
#define OPTIMAL 2
|
|||
|
#define INVSCALING 3
|
|||
|
#define ADAPTIVE 4
|
|||
|
#define PA1 5
|
|||
|
#define PA2 6
|
|||
|
"""
|
|||
|
int NO_PENALTY = 0
|
|||
|
int L1 = 1
|
|||
|
int L2 = 2
|
|||
|
int ELASTICNET = 3
|
|||
|
|
|||
|
int CONSTANT = 1
|
|||
|
int OPTIMAL = 2
|
|||
|
int INVSCALING = 3
|
|||
|
int ADAPTIVE = 4
|
|||
|
int PA1 = 5
|
|||
|
int PA2 = 6
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------

cdef class LossFunction:
    """Base class for convex loss functions"""

    cdef double loss(self, double y, double p) noexcept nogil:
        """Evaluate the loss function.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x + intercept`.

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return 0.

    def py_dloss(self, double p, double y):
        """Python version of `dloss` for testing.

        Pytest needs a python function and can't use cdef functions.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The derivative of the loss function with respect to `p`.
        """
        return self.dloss(y, p)

    def py_loss(self, double p, double y):
        """Python version of `loss` for testing.

        Pytest needs a python function and can't use cdef functions.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x + intercept`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return self.loss(y, p)

    cdef double dloss(self, double y, double p) noexcept nogil:
        """Evaluate the derivative of the loss function with respect to
        the prediction `p`.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x`.

        Returns
        -------
        double
            The derivative of the loss function with respect to `p`.
        """
        return 0.


cdef class Regression(LossFunction):
    """Base class for loss functions for regression"""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.

    cdef double dloss(self, double y, double p) noexcept nogil:
        return 0.


cdef class Classification(LossFunction):
    """Base class for loss functions for classification"""

    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.

    cdef double dloss(self, double y, double p) noexcept nogil:
        return 0.


cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """
    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return (1.0 - z) * (1.0 - z)
        else:
            return -4.0 * z

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z >= 1.0:
            return 0.0
        elif z >= -1.0:
            return 2.0 * (1.0 - z) * -y
        else:
            return -4.0 * y

    def __reduce__(self):
        return ModifiedHuber, ()


cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------

    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return self.threshold - z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        if z <= self.threshold:
            return -y
        return 0.0

    def __reduce__(self):
        return Hinge, (self.threshold,)


cdef class SquaredHinge(Classification):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------

    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return z * z
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = self.threshold - p * y
        if z > 0:
            return -2 * y * z
        return 0.0

    def __reduce__(self):
        return SquaredHinge, (self.threshold,)


cdef class Log(Classification):
    """Logistic regression loss for binary classification with y in {-1, 1}"""

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18:
            return exp(-z)
        if z < -18:
            return -z
        return log(1.0 + exp(-z))

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z = p * y
        # approximately equal and saves the computation of the log
        if z > 18.0:
            return exp(-z) * -y
        if z < -18.0:
            return -y
        return -y / (exp(z) + 1.0)

    def __reduce__(self):
        return Log, ()


cdef class SquaredLoss(Regression):
    """Squared loss traditionally used in linear regression."""
    cdef double loss(self, double y, double p) noexcept nogil:
        return 0.5 * (p - y) * (p - y)

    cdef double dloss(self, double y, double p) noexcept nogil:
        return p - y

    def __reduce__(self):
        return SquaredLoss, ()


cdef class Huber(Regression):
    """Huber regression loss

    Variant of the SquaredLoss that is robust to outliers (quadratic near zero,
    linear for large errors).

    https://en.wikipedia.org/wiki/Huber_Loss_Function
    """

    cdef double c

    def __init__(self, double c):
        self.c = c

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return 0.5 * r * r
        else:
            return self.c * abs_r - (0.5 * self.c * self.c)

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double r = p - y
        cdef double abs_r = fabs(r)
        if abs_r <= self.c:
            return r
        elif r > 0.0:
            return self.c
        else:
            return -self.c

    def __reduce__(self):
        return Huber, (self.c,)


cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        if y - p > self.epsilon:
            return -1
        elif p - y > self.epsilon:
            return 1
        else:
            return 0

    def __reduce__(self):
        return EpsilonInsensitive, (self.epsilon,)


cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double ret = fabs(y - p) - self.epsilon
        return ret * ret if ret > 0 else 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double z
        z = y - p
        if z > self.epsilon:
            return -2 * (z - self.epsilon)
        elif z < -self.epsilon:
            return 2 * (-z - self.epsilon)
        else:
            return 0

    def __reduce__(self):
        return SquaredEpsilonInsensitive, (self.epsilon,)

{{for name_suffix, c_type, np_type in dtypes}}

def _plain_sgd{{name_suffix}}(
    const {{c_type}}[::1] weights,
    double intercept,
    const {{c_type}}[::1] average_weights,
    double average_intercept,
    LossFunction loss,
    int penalty_type,
    double alpha,
    double C,
    double l1_ratio,
    SequentialDataset{{name_suffix}} dataset,
    const unsigned char[::1] validation_mask,
    bint early_stopping,
    validation_score_cb,
    int n_iter_no_change,
    unsigned int max_iter,
    double tol,
    int fit_intercept,
    int verbose,
    bint shuffle,
    uint32_t seed,
    double weight_pos,
    double weight_neg,
    int learning_rate,
    double eta0,
    double power_t,
    bint one_class,
    double t=1.0,
    double intercept_decay=1.0,
    int average=0,
):
"""SGD for generic loss functions and penalties with optional averaging
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
weights : ndarray[{{c_type}}, ndim=1]
|
|||
|
The allocated vector of weights.
|
|||
|
intercept : double
|
|||
|
The initial intercept.
|
|||
|
average_weights : ndarray[{{c_type}}, ndim=1]
|
|||
|
The average weights as computed for ASGD. Should be None if average
|
|||
|
is 0.
|
|||
|
average_intercept : double
|
|||
|
The average intercept for ASGD. Should be 0 if average is 0.
|
|||
|
loss : LossFunction
|
|||
|
A concrete ``LossFunction`` object.
|
|||
|
penalty_type : int
|
|||
|
The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net.
|
|||
|
alpha : float
|
|||
|
The regularization parameter.
|
|||
|
C : float
|
|||
|
Maximum step size for passive aggressive.
|
|||
|
l1_ratio : float
|
|||
|
The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
|
|||
|
l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
|
|||
|
dataset : SequentialDataset
|
|||
|
A concrete ``SequentialDataset`` object.
|
|||
|
validation_mask : ndarray[unsigned char, ndim=1]
|
|||
|
Equal to True on the validation set.
|
|||
|
early_stopping : boolean
|
|||
|
Whether to use a stopping criterion based on the validation set.
|
|||
|
validation_score_cb : callable
|
|||
|
A callable to compute a validation score given the current
|
|||
|
coefficients and intercept values.
|
|||
|
Used only if early_stopping is True.
|
|||
|
n_iter_no_change : int
|
|||
|
Number of iteration with no improvement to wait before stopping.
|
|||
|
max_iter : int
|
|||
|
The maximum number of iterations (epochs).
|
|||
|
tol: double
|
|||
|
The tolerance for the stopping criterion.
|
|||
|
fit_intercept : int
|
|||
|
Whether or not to fit the intercept (1 or 0).
|
|||
|
verbose : int
|
|||
|
Print verbose output; 0 for quite.
|
|||
|
shuffle : boolean
|
|||
|
Whether to shuffle the training data before each epoch.
|
|||
|
weight_pos : float
|
|||
|
The weight of the positive class.
|
|||
|
weight_neg : float
|
|||
|
The weight of the negative class.
|
|||
|
seed : uint32_t
|
|||
|
Seed of the pseudorandom number generator used to shuffle the data.
|
|||
|
learning_rate : int
        The learning rate:
        (1) constant, eta = eta0
        (2) optimal, eta = 1.0/(alpha * t).
        (3) inverse scaling, eta = eta0 / pow(t, power_t)
        (4) adaptive decrease
        (5) Passive Aggressive-I, eta = min(C, loss / norm(x)^2)
        (6) Passive Aggressive-II, eta = loss / (norm(x)^2 + 0.5 / C)
    eta0 : double
        The initial learning rate.
    power_t : double
        The exponent for inverse scaling learning rate.
    one_class : boolean
        Whether to solve the One-Class SVM optimization problem.
    t : double
        Initial state of the learning rate. This value is equal to the
        iteration count except when the learning rate is set to `optimal`.
        Default: 1.0.
    average : int
        The number of iterations before averaging starts. average=1 is
        equivalent to averaging for all iterations.


    Returns
    -------
    weights : array, shape=[n_features]
        The fitted weight vector.
    intercept : float
        The fitted intercept term.
    average_weights : array, shape=[n_features]
        The averaged weights across iterations. Values are valid only if
        average > 0.
    average_intercept : float
        The averaged intercept across iterations.
        Values are valid only if average > 0.
    n_iter_ : int
        The actual number of iterations (epochs).
    """

    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
    cdef {{c_type}} *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef int no_improvement_count = 0
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double intercept_update = 0.0
    cdef double sumloss = 0.0
    cdef double score = 0.0
    cdef double best_loss = INFINITY
    cdef double best_score = -INFINITY
    cdef {{c_type}} y = 0.0
    cdef {{c_type}} sample_weight
    cdef {{c_type}} class_weight = 1.0
    cdef unsigned int count = 0
    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    cdef int is_hinge = isinstance(loss, Hinge)
    cdef double optimal_init = 0.0
    cdef double dloss = 0.0
    cdef double MAX_DLOSS = 1e12

    cdef long long sample_index

    # q vector is only used for L1 regularization
    cdef {{c_type}}[::1] q = None
    cdef {{c_type}} * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
        q_data_ptr = &q[0]
    cdef double u = 0.0

    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    if learning_rate == OPTIMAL:
        typw = np.sqrt(1.0 / np.sqrt(alpha))
        # computing eta0, the initial learning rate
        initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
        # initialize t such that eta at first sample equals eta0
        optimal_init = 1.0 / (initial_eta0 * alpha)
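        # with this choice, eta = 1.0 / (alpha * (optimal_init + t - 1))
        # equals initial_eta0 at the first sample (t == 1) and then decays
        # roughly like 1 / t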

    t_start = time()
    with nogil:
        for epoch in range(max_iter):
            sumloss = 0
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
                             &y, &sample_weight)

                sample_index = dataset.index_data_ptr[dataset.current_index]
                if validation_mask[sample_index]:
                    # do not learn on the validation set
                    continue

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * (optimal_init + t - 1))
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                if verbose or not early_stopping:
                    sumloss += loss.loss(y, p)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
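                    # Passive-Aggressive step sizes (cf. Crammer et al. 2006,
                    # "Online Passive-Aggressive Algorithms"):
                    #   PA-I:  min(C, loss / ||x||^2)
                    #   PA-II: loss / (||x||^2 + 1 / (2 * C))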
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    if update == 0:
                        continue
                    update = min(C, loss.loss(y, p) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.loss(y, p) / (update + 0.5 / C)
                else:
                    dloss = loss.dloss(y, p)
                    # clip dloss with large values to avoid numerical
                    # instabilities
                    if dloss < -MAX_DLOSS:
                        dloss = -MAX_DLOSS
                    elif dloss > MAX_DLOSS:
                        dloss = MAX_DLOSS
                    update = -eta * dloss

                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    # do not scale to negative values when eta or alpha are too
                    # big: instead set the weights to zero
                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))
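                    # this multiplicative shrink applies the L2 component of
                    # the penalty; for L1/Elastic-Net the L1 component is
                    # applied further down via the cumulative penalty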

                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
                if fit_intercept == 1:
                    intercept_update = update
                    if one_class:  # specific for One-Class SVM
                        intercept_update -= 2. * eta * alpha
                    if intercept_update != 0:
                        intercept += intercept_update * intercept_decay

                if 0 < average <= t:
                    # compute the average for the intercept and update the
                    # average weights, this is done regardless of whether
                    # the update is 0

                    w.add_average(x_data_ptr, x_ind_ptr, xnnz,
                                  update, (t - average + 1))
                    average_intercept += ((intercept - average_intercept) /
                                          (t - average + 1))
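                    # (t - average + 1) is the number of updates performed
                    # since averaging started; the intercept update above is
                    # an incremental running mean over those updates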

                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

                t += 1
                count += 1

            # report epoch information
            if verbose > 0:
                with gil:
                    print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                          "Avg. loss: %f"
                          % (w.norm(), np.nonzero(weights)[0].shape[0],
                             intercept, count, sumloss / train_count))
                    print("Total training time: %.2f seconds."
                          % (time() - t_start))

            # floating-point under-/overflow check.
            if (not isfinite(intercept) or any_nonfinite(weights)):
                infinity = True
                break

            # evaluate the score on the validation set
            if early_stopping:
                with gil:
                    score = validation_score_cb(weights.base, intercept)
                if tol > -INFINITY and score < best_score + tol:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if score > best_score:
                    best_score = score
            # or evaluate the loss on the training set
            else:
                if tol > -INFINITY and sumloss > best_loss - tol * train_count:
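                    # the summed training loss must decrease by at least
                    # tol * train_count (i.e. tol per training sample) to
                    # count as an improvement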
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if sumloss < best_loss:
                    best_loss = sumloss

            # if there is no improvement several times in a row
            if no_improvement_count >= n_iter_no_change:
                if learning_rate == ADAPTIVE and eta > 1e-6:
                    eta = eta / 5
                    no_improvement_count = 0
                else:
                    if verbose:
                        with gil:
                            print("Convergence after %d epochs took %.2f "
                                  "seconds" % (epoch + 1, time() - t_start))
                    break

    if infinity:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % (epoch + 1))

    w.reset_wscale()

    return (
        weights.base,
        intercept,
        None if average_weights is None else average_weights.base,
        average_intercept,
        epoch + 1
    )

{{endfor}}


cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
    for i in range(w.shape[0]):
        if not isfinite(w[i]):
            return True
    return False


cdef inline double sqnorm(
    floating * x_data_ptr,
    int * x_ind_ptr,
    int xnnz,
) noexcept nogil:
    cdef double x_norm = 0.0
    cdef int j
    cdef double z
    for j in range(xnnz):
        z = x_data_ptr[j]
        x_norm += z * z
    return x_norm


{{for name_suffix, c_type, np_type in dtypes}}

cdef void l1penalty{{name_suffix}}(
    WeightVector{{name_suffix}} w,
    {{c_type}} * q_data_ptr,
    int *x_ind_ptr,
    int xnnz,
    double u,
) noexcept nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
    """
    cdef double z = 0.0
    cdef int j = 0
    cdef int idx = 0
    cdef double wscale = w.wscale
    cdef {{c_type}} *w_data_ptr = w.w_data_ptr
    for j in range(xnnz):
        idx = x_ind_ptr[j]
        z = w_data_ptr[idx]
        if wscale * z > 0.0:
            w_data_ptr[idx] = max(
                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))

        elif wscale * z < 0.0:
            w_data_ptr[idx] = min(
                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))

        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)

{{endfor}}