3RNN/Lib/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp

{{py:
"""
Template file to easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _sgd_fast.pyx
Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted in setup.py.
Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
Mathieu Blondel (partial_fit support)
Rob Zinkov (passive-aggressive)
Lars Buitinck
License: BSD 3 clause
"""
# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
("64", "double", "np.float64"),
("32", "float", "np.float32"),
]
}}
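# A note on the templating (illustrative, not part of the original template):
# the dtypes list above drives the "for name_suffix, c_type, np_type in dtypes"
# loops further down, so every templated definition is emitted twice in the
# generated _sgd_fast.pyx, e.g. _plain_sgd64 (double / np.float64) and
# _plain_sgd32 (float / np.float32).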
"""SGD implementation"""
import numpy as np
from time import time
from cython cimport floating
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
from ..utils._typedefs cimport uint32_t
from ..utils._weight_vector cimport WeightVector32, WeightVector64
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
cdef extern from *:
"""
/* Penalty constants */
#define NO_PENALTY 0
#define L1 1
#define L2 2
#define ELASTICNET 3
/* Learning rate constants */
#define CONSTANT 1
#define OPTIMAL 2
#define INVSCALING 3
#define ADAPTIVE 4
#define PA1 5
#define PA2 6
"""
int NO_PENALTY = 0
int L1 = 1
int L2 = 2
int ELASTICNET = 3
int CONSTANT = 1
int OPTIMAL = 2
int INVSCALING = 3
int ADAPTIVE = 4
int PA1 = 5
int PA2 = 6
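# For orientation (an informal mapping, not used by the code): the penalty
# strings "l2" / "l1" / "elasticnet" on the public estimators correspond to
# L2 / L1 / ELASTICNET, the learning_rate strings "constant" / "optimal" /
# "invscaling" / "adaptive" correspond to CONSTANT / OPTIMAL / INVSCALING /
# ADAPTIVE, and PA1 / PA2 are the Passive-Aggressive I / II update rules.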
# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------
cdef class LossFunction:
"""Base class for convex loss functions"""
cdef double loss(self, double y, double p) noexcept nogil:
"""Evaluate the loss function.
Parameters
----------
y : double
The true value (aka target).
p : double
The prediction, `p = w^T x + intercept`.
Returns
-------
double
The loss evaluated at `p` and `y`.
"""
return 0.
def py_dloss(self, double p, double y):
"""Python version of `dloss` for testing.
Pytest needs a Python function and can't use cdef functions.
Parameters
----------
p : double
The prediction, `p = w^T x`.
y : double
The true value (aka target).
Returns
-------
double
The derivative of the loss function with regards to `p`.
"""
return self.dloss(y, p)
def py_loss(self, double p, double y):
"""Python version of `loss` for testing.
Pytest needs a Python function and can't use cdef functions.
Parameters
----------
p : double
The prediction, `p = w^T x + intercept`.
y : double
The true value (aka target).
Returns
-------
double
The loss evaluated at `p` and `y`.
"""
return self.loss(y, p)
cdef double dloss(self, double y, double p) noexcept nogil:
"""Evaluate the derivative of the loss function with respect to
the prediction `p`.
Parameters
----------
y : double
The true value (aka target).
p : double
The prediction, `p = w^T x`.
Returns
-------
double
The derivative of the loss function with regards to `p`.
"""
return 0.
cdef class Regression(LossFunction):
"""Base class for loss functions for regression"""
cdef double loss(self, double y, double p) noexcept nogil:
return 0.
cdef double dloss(self, double y, double p) noexcept nogil:
return 0.
cdef class Classification(LossFunction):
"""Base class for loss functions for classification"""
cdef double loss(self, double y, double p) noexcept nogil:
return 0.
cdef double dloss(self, double y, double p) noexcept nogil:
return 0.
cdef class ModifiedHuber(Classification):
"""Modified Huber loss for binary classification with y in {-1, 1}
This is equivalent to quadratically smoothed SVM with gamma = 2.
See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
Stochastic Gradient Descent', ICML'04.
"""
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z >= 1.0:
return 0.0
elif z >= -1.0:
return (1.0 - z) * (1.0 - z)
else:
return -4.0 * z
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z >= 1.0:
return 0.0
elif z >= -1.0:
return 2.0 * (1.0 - z) * -y
else:
return -4.0 * y
def __reduce__(self):
return ModifiedHuber, ()
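# A quick numeric check of the three branches (a sketch, assuming the compiled
# module is importable as sklearn.linear_model._sgd_fast):
#
#   from sklearn.linear_model._sgd_fast import ModifiedHuber
#   mh = ModifiedHuber()
#   mh.py_loss(2.0, 1.0)   # z = 2.0  >= 1        -> 0.0
#   mh.py_loss(0.5, 1.0)   # z = 0.5  in [-1, 1)  -> (1 - 0.5) ** 2 = 0.25
#   mh.py_loss(-2.0, 1.0)  # z = -2.0 < -1        -> -4 * (-2.0) = 8.0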
cdef class Hinge(Classification):
"""Hinge loss for binary classification tasks with y in {-1,1}
Parameters
----------
threshold : float > 0.0
Margin threshold. When threshold=1.0, one gets the loss used by SVM.
When threshold=0.0, one gets the loss used by the Perceptron.
"""
cdef double threshold
def __init__(self, double threshold=1.0):
self.threshold = threshold
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z <= self.threshold:
return self.threshold - z
return 0.0
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z <= self.threshold:
return -y
return 0.0
def __reduce__(self):
return Hinge, (self.threshold,)
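# Threshold examples (illustrative): Hinge(1.0) is the SVM hinge loss, so
# Hinge(1.0).py_loss(0.25, 1.0) == 0.75, while Hinge(0.0) is the perceptron
# criterion, for which Hinge(0.0).py_loss(0.25, 1.0) == 0.0.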
cdef class SquaredHinge(Classification):
"""Squared Hinge loss for binary classification tasks with y in {-1,1}
Parameters
----------
threshold : float > 0.0
Margin threshold. When threshold=1.0, one gets the loss used by
(quadratically penalized) SVM.
"""
cdef double threshold
def __init__(self, double threshold=1.0):
self.threshold = threshold
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = self.threshold - p * y
if z > 0:
return z * z
return 0.0
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = self.threshold - p * y
if z > 0:
return -2 * y * z
return 0.0
def __reduce__(self):
return SquaredHinge, (self.threshold,)
cdef class Log(Classification):
"""Logistic regression loss for binary classification with y in {-1, 1}"""
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = p * y
# for large |z| these branches are approximately equal to log(1.0 + exp(-z))
# and save the computation of the log
if z > 18:
return exp(-z)
if z < -18:
return -z
return log(1.0 + exp(-z))
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = p * y
# for large |z| these branches are approximately equal to -y / (exp(z) + 1.0)
# and save the computation of the exp
if z > 18.0:
return exp(-z) * -y
if z < -18.0:
return -y
return -y / (exp(z) + 1.0)
def __reduce__(self):
return Log, ()
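# Why the cutoffs at |z| = 18 are safe (a numerical aside, not from the
# original source): log(1 + exp(-z)) = exp(-z) - exp(-2 * z) / 2 + ..., so for
# z > 18 returning exp(-z) is accurate to about exp(-18) ~ 1.5e-8, and
# symmetrically log(1 + exp(-z)) ~ -z for z < -18.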
cdef class SquaredLoss(Regression):
"""Squared loss traditional used in linear regression."""
cdef double loss(self, double y, double p) noexcept nogil:
return 0.5 * (p - y) * (p - y)
cdef double dloss(self, double y, double p) noexcept nogil:
return p - y
def __reduce__(self):
return SquaredLoss, ()
cdef class Huber(Regression):
"""Huber regression loss
Variant of the SquaredLoss that is robust to outliers (quadratic near zero,
linear for large errors).
https://en.wikipedia.org/wiki/Huber_Loss_Function
"""
cdef double c
def __init__(self, double c):
self.c = c
cdef double loss(self, double y, double p) noexcept nogil:
cdef double r = p - y
cdef double abs_r = fabs(r)
if abs_r <= self.c:
return 0.5 * r * r
else:
return self.c * abs_r - (0.5 * self.c * self.c)
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double r = p - y
cdef double abs_r = fabs(r)
if abs_r <= self.c:
return r
elif r > 0.0:
return self.c
else:
return -self.c
def __reduce__(self):
return Huber, (self.c,)
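# Sanity check of the two regimes (illustrative, with a hypothetical c = 1.0):
# at the joint |r| = c both branches give 0.5 * c * c, so the loss is
# continuous, e.g. Huber(1.0).py_loss(2.0, 1.0) == 0.5 and
# Huber(1.0).py_loss(3.0, 1.0) == 1.0 * 2.0 - 0.5 == 1.5.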
cdef class EpsilonInsensitive(Regression):
"""Epsilon-Insensitive loss (used by SVR).
loss = max(0, |y - p| - epsilon)
"""
cdef double epsilon
def __init__(self, double epsilon):
self.epsilon = epsilon
cdef double loss(self, double y, double p) noexcept nogil:
cdef double ret = fabs(y - p) - self.epsilon
return ret if ret > 0 else 0
cdef double dloss(self, double y, double p) noexcept nogil:
if y - p > self.epsilon:
return -1
elif p - y > self.epsilon:
return 1
else:
return 0
def __reduce__(self):
return EpsilonInsensitive, (self.epsilon,)
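# Sign convention check (illustrative): dloss is the derivative with respect
# to the prediction p, so EpsilonInsensitive(0.1).py_dloss(0.0, 1.0) == -1.0;
# the SGD update -eta * dloss then pushes the too-small prediction upwards.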
cdef class SquaredEpsilonInsensitive(Regression):
"""Epsilon-Insensitive loss.
loss = max(0, |y - p| - epsilon)^2
"""
cdef double epsilon
def __init__(self, double epsilon):
self.epsilon = epsilon
cdef double loss(self, double y, double p) noexcept nogil:
cdef double ret = fabs(y - p) - self.epsilon
return ret * ret if ret > 0 else 0
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z
z = y - p
if z > self.epsilon:
return -2 * (z - self.epsilon)
elif z < -self.epsilon:
return 2 * (-z - self.epsilon)
else:
return 0
def __reduce__(self):
return SquaredEpsilonInsensitive, (self.epsilon,)
{{for name_suffix, c_type, np_type in dtypes}}
def _plain_sgd{{name_suffix}}(
const {{c_type}}[::1] weights,
double intercept,
const {{c_type}}[::1] average_weights,
double average_intercept,
LossFunction loss,
int penalty_type,
double alpha,
double C,
double l1_ratio,
SequentialDataset{{name_suffix}} dataset,
const unsigned char[::1] validation_mask,
bint early_stopping,
validation_score_cb,
int n_iter_no_change,
unsigned int max_iter,
double tol,
int fit_intercept,
int verbose,
bint shuffle,
uint32_t seed,
double weight_pos,
double weight_neg,
int learning_rate,
double eta0,
double power_t,
bint one_class,
double t=1.0,
double intercept_decay=1.0,
int average=0,
):
"""SGD for generic loss functions and penalties with optional averaging
Parameters
----------
weights : ndarray[{{c_type}}, ndim=1]
The allocated vector of weights.
intercept : double
The initial intercept.
average_weights : ndarray[{{c_type}}, ndim=1]
The average weights as computed for ASGD. Should be None if average
is 0.
average_intercept : double
The average intercept for ASGD. Should be 0 if average is 0.
loss : LossFunction
A concrete ``LossFunction`` object.
penalty_type : int
The penalty: 2 for L2, 1 for L1, and 3 for Elastic-Net.
alpha : float
The regularization parameter.
C : float
Maximum step size for passive aggressive.
l1_ratio : float
The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
dataset : SequentialDataset
A concrete ``SequentialDataset`` object.
validation_mask : ndarray[unsigned char, ndim=1]
True for samples that belong to the validation set (these samples are not
used for training).
early_stopping : boolean
Whether to use a stopping criterion based on the validation set.
validation_score_cb : callable
A callable to compute a validation score given the current
coefficients and intercept values.
Used only if early_stopping is True.
n_iter_no_change : int
Number of iterations with no improvement to wait before stopping.
max_iter : int
The maximum number of iterations (epochs).
tol : double
The tolerance for the stopping criterion.
fit_intercept : int
Whether or not to fit the intercept (1 or 0).
verbose : int
Print verbose output; 0 for quiet.
shuffle : boolean
Whether to shuffle the training data before each epoch.
weight_pos : float
The weight of the positive class.
weight_neg : float
The weight of the negative class.
seed : uint32_t
Seed of the pseudorandom number generator used to shuffle the data.
learning_rate : int
The learning rate:
(1) constant, eta = eta0
(2) optimal, eta = 1.0/(alpha * t).
(3) inverse scaling, eta = eta0 / pow(t, power_t)
(4) adaptive decrease
(5) Passive Aggressive-I, eta = min(alpha, loss/norm(x))
(6) Passive Aggressive-II, eta = 1.0 / (norm(x) + 0.5*alpha)
eta0 : double
The initial learning rate.
power_t : double
The exponent for inverse scaling learning rate.
one_class : boolean
Whether to solve the One-Class SVM optimization problem.
t : double
Initial state of the learning rate. This value is equal to the
iteration count except when the learning rate is set to `optimal`.
Default: 1.0.
average : int
The number of iterations before averaging starts. average=1 is
equivalent to averaging for all iterations.
Returns
-------
weights : array, shape=[n_features]
The fitted weight vector.
intercept : float
The fitted intercept term.
average_weights : array, shape=[n_features]
The averaged weights across iterations. Values are valid only if
average > 0.
average_intercept : float
The averaged intercept across iterations.
Values are valid only if average > 0.
n_iter_ : int
The actual number of iterations (epochs).
"""
# get the data information into easy vars
cdef Py_ssize_t n_samples = dataset.n_samples
cdef Py_ssize_t n_features = weights.shape[0]
cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
cdef {{c_type}} *x_data_ptr = NULL
cdef int *x_ind_ptr = NULL
# helper variables
cdef int no_improvement_count = 0
cdef bint infinity = False
cdef int xnnz
cdef double eta = 0.0
cdef double p = 0.0
cdef double update = 0.0
cdef double intercept_update = 0.0
cdef double sumloss = 0.0
cdef double score = 0.0
cdef double best_loss = INFINITY
cdef double best_score = -INFINITY
cdef {{c_type}} y = 0.0
cdef {{c_type}} sample_weight
cdef {{c_type}} class_weight = 1.0
cdef unsigned int count = 0
cdef unsigned int train_count = n_samples - np.sum(validation_mask)
cdef unsigned int epoch = 0
cdef unsigned int i = 0
cdef int is_hinge = isinstance(loss, Hinge)
cdef double optimal_init = 0.0
cdef double dloss = 0.0
cdef double MAX_DLOSS = 1e12
cdef long long sample_index
# q vector is only used for L1 regularization
cdef {{c_type}}[::1] q = None
cdef {{c_type}} * q_data_ptr = NULL
if penalty_type == L1 or penalty_type == ELASTICNET:
q = np.zeros((n_features,), dtype={{np_type}}, order="c")
q_data_ptr = &q[0]
cdef double u = 0.0
if penalty_type == L2:
l1_ratio = 0.0
elif penalty_type == L1:
l1_ratio = 1.0
eta = eta0
if learning_rate == OPTIMAL:
typw = np.sqrt(1.0 / np.sqrt(alpha))
# computing eta0, the initial learning rate
initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
# initialize t such that eta at first sample equals eta0
optimal_init = 1.0 / (initial_eta0 * alpha)
t_start = time()
with nogil:
for epoch in range(max_iter):
sumloss = 0
if verbose > 0:
with gil:
print("-- Epoch %d" % (epoch + 1))
if shuffle:
dataset.shuffle(seed)
for i in range(n_samples):
dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
&y, &sample_weight)
sample_index = dataset.index_data_ptr[dataset.current_index]
if validation_mask[sample_index]:
# do not learn on the validation set
continue
p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
if learning_rate == OPTIMAL:
eta = 1.0 / (alpha * (optimal_init + t - 1))
elif learning_rate == INVSCALING:
eta = eta0 / pow(t, power_t)
if verbose or not early_stopping:
sumloss += loss.loss(y, p)
if y > 0.0:
class_weight = weight_pos
else:
class_weight = weight_neg
if learning_rate == PA1:
update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
if update == 0:
continue
update = min(C, loss.loss(y, p) / update)
elif learning_rate == PA2:
update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
update = loss.loss(y, p) / (update + 0.5 / C)
else:
dloss = loss.dloss(y, p)
# clip very large values of dloss to avoid numerical
# instabilities
if dloss < -MAX_DLOSS:
dloss = -MAX_DLOSS
elif dloss > MAX_DLOSS:
dloss = MAX_DLOSS
update = -eta * dloss
if learning_rate >= PA1:
if is_hinge:
# classification
update *= y
elif y - p < 0:
# regression
update *= -1
update *= class_weight * sample_weight
if penalty_type >= L2:
# do not scale to negative values when eta or alpha are too
# big: instead set the weights to zero
w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))
if update != 0.0:
w.add(x_data_ptr, x_ind_ptr, xnnz, update)
if fit_intercept == 1:
intercept_update = update
if one_class: # specific for One-Class SVM
intercept_update -= 2. * eta * alpha
if intercept_update != 0:
intercept += intercept_update * intercept_decay
if 0 < average <= t:
# compute the average for the intercept and update the
# average weights, this is done regardless of whether
# the update is 0
w.add_average(x_data_ptr, x_ind_ptr, xnnz,
update, (t - average + 1))
average_intercept += ((intercept - average_intercept) /
(t - average + 1))
if penalty_type == L1 or penalty_type == ELASTICNET:
u += (l1_ratio * eta * alpha)
l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)
t += 1
count += 1
# report epoch information
if verbose > 0:
with gil:
print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
"Avg. loss: %f"
% (w.norm(), np.nonzero(weights)[0].shape[0],
intercept, count, sumloss / train_count))
print("Total training time: %.2f seconds."
% (time() - t_start))
# floating-point under-/overflow check.
if (not isfinite(intercept) or any_nonfinite(weights)):
infinity = True
break
# evaluate the score on the validation set
if early_stopping:
with gil:
score = validation_score_cb(weights.base, intercept)
if tol > -INFINITY and score < best_score + tol:
no_improvement_count += 1
else:
no_improvement_count = 0
if score > best_score:
best_score = score
# or evaluate the loss on the training set
else:
if tol > -INFINITY and sumloss > best_loss - tol * train_count:
no_improvement_count += 1
else:
no_improvement_count = 0
if sumloss < best_loss:
best_loss = sumloss
# if there is no improvement several times in a row
if no_improvement_count >= n_iter_no_change:
if learning_rate == ADAPTIVE and eta > 1e-6:
eta = eta / 5
no_improvement_count = 0
else:
if verbose:
with gil:
print("Convergence after %d epochs took %.2f "
"seconds" % (epoch + 1, time() - t_start))
break
if infinity:
raise ValueError(("Floating-point under-/overflow occurred at epoch"
" #%d. Scaling input data with StandardScaler or"
" MinMaxScaler might help.") % (epoch + 1))
w.reset_wscale()
return (
weights.base,
intercept,
None if average_weights is None else average_weights.base,
average_intercept,
epoch + 1
)
{{endfor}}
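# How the OPTIMAL schedule used above fits together (a recap in plain Python;
# the names typw, initial_eta0 and optimal_init mirror the locals in the
# _plain_sgd* functions, this is not code used by the solver):
#
#   import numpy as np
#   def optimal_eta(t, alpha, loss):
#       typw = np.sqrt(1.0 / np.sqrt(alpha))
#       initial_eta0 = typw / max(1.0, loss.py_dloss(-typw, 1.0))
#       optimal_init = 1.0 / (initial_eta0 * alpha)
#       return 1.0 / (alpha * (optimal_init + t - 1))
#
# so that optimal_eta(1, alpha, loss) == initial_eta0 by construction.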
cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
for i in range(w.shape[0]):
if not isfinite(w[i]):
return True
return False
cdef inline double sqnorm(
floating * x_data_ptr,
int * x_ind_ptr,
int xnnz,
) noexcept nogil:
cdef double x_norm = 0.0
cdef int j
cdef double z
for j in range(xnnz):
z = x_data_ptr[j]
x_norm += z * z
return x_norm
{{for name_suffix, c_type, np_type in dtypes}}
cdef void l1penalty{{name_suffix}}(
WeightVector{{name_suffix}} w,
{{c_type}} * q_data_ptr,
int *x_ind_ptr,
int xnnz,
double u,
) noexcept nogil:
"""Apply the L1 penalty to each updated feature
This implements the truncated gradient approach by
[Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
"""
cdef double z = 0.0
cdef int j = 0
cdef int idx = 0
cdef double wscale = w.wscale
cdef {{c_type}} *w_data_ptr = w.w_data_ptr
for j in range(xnnz):
idx = x_ind_ptr[j]
z = w_data_ptr[idx]
if wscale * z > 0.0:
w_data_ptr[idx] = max(
0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))
elif wscale * z < 0.0:
w_data_ptr[idx] = min(
0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))
q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)
{{endfor}}
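# For reference, a NumPy sketch of the cumulative L1 penalty applied by the
# l1penalty* helpers above (same quantities w, q, u, but ignoring the wscale
# bookkeeping of WeightVector; an illustration of the truncated-gradient idea,
# not code used by the solver):
#
#   import numpy as np
#   def l1_truncate(w, q, u):
#       z = w.copy()
#       pos, neg = w > 0, w < 0
#       w[pos] = np.maximum(0.0, w[pos] - (u + q[pos]))
#       w[neg] = np.minimum(0.0, w[neg] + (u - q[neg]))
#       q += w - z  # q accumulates the penalty actually applied so far
#       return w, q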