3RNN/Lib/site-packages/sklearn/linear_model/_sgd_fast.pyx.tp

{{py:
"""
Template file to generate dtype-consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _sgd_fast.pyx
Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted in setup.py.
Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
Mathieu Blondel (partial_fit support)
Rob Zinkov (passive-aggressive)
Lars Buitinck
License: BSD 3 clause
"""
# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
("64", "double", "np.float64"),
("32", "float", "np.float32"),
]
}}
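# For instance, with the dtypes listed above, Tempita expands each templated
# "for name_suffix, c_type, np_type in dtypes" block below into a
# double / np.float64 variant (suffix "64") and a float / np.float32 variant
# (suffix "32"), e.g. _plain_sgd64 and _plain_sgd32 in the generated
# _sgd_fast.pyx.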
"""SGD implementation"""
import numpy as np
from time import time
from cython cimport floating
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
from ..utils._typedefs cimport uint32_t
from ..utils._weight_vector cimport WeightVector32, WeightVector64
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
cdef extern from *:
"""
/* Penalty constants */
#define NO_PENALTY 0
#define L1 1
#define L2 2
#define ELASTICNET 3
/* Learning rate constants */
#define CONSTANT 1
#define OPTIMAL 2
#define INVSCALING 3
#define ADAPTIVE 4
#define PA1 5
#define PA2 6
"""
int NO_PENALTY = 0
int L1 = 1
int L2 = 2
int ELASTICNET = 3
int CONSTANT = 1
int OPTIMAL = 2
int INVSCALING = 3
int ADAPTIVE = 4
int PA1 = 5
int PA2 = 6
# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------
cdef class LossFunction:
"""Base class for convex loss functions"""
cdef double loss(self, double y, double p) noexcept nogil:
"""Evaluate the loss function.
Parameters
----------
y : double
The true value (aka target).
p : double
The prediction, `p = w^T x + intercept`.
Returns
-------
double
The loss evaluated at `p` and `y`.
"""
return 0.
def py_dloss(self, double p, double y):
"""Python version of `dloss` for testing.
Pytest needs a python function and can't use cdef functions.
Parameters
----------
p : double
The prediction, `p = w^T x`.
y : double
The true value (aka target).
Returns
-------
double
The derivative of the loss function with regards to `p`.
"""
return self.dloss(y, p)
def py_loss(self, double p, double y):
"""Python version of `loss` for testing.
Pytest needs a python function and can't use cdef functions.
Parameters
----------
p : double
The prediction, `p = w^T x + intercept`.
y : double
The true value (aka target).
Returns
-------
double
The loss evaluated at `p` and `y`.
"""
return self.loss(y, p)
cdef double dloss(self, double y, double p) noexcept nogil:
"""Evaluate the derivative of the loss function with respect to
the prediction `p`.
Parameters
----------
y : double
The true value (aka target).
p : double
The prediction, `p = w^T x`.
Returns
-------
double
The derivative of the loss function with regards to `p`.
"""
return 0.
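# A quick usage sketch (assuming the compiled extension is importable): the
# py_loss/py_dloss wrappers above expose the cdef loss pair to Python-level
# tests, e.g. with the Hinge loss defined below:
#
#     loss = Hinge(1.0)
#     loss.py_loss(0.3, 1.0)   # hinge loss at p=0.3, y=1  ->  0.7
#     loss.py_dloss(0.3, 1.0)  # derivative w.r.t. p       -> -1.0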
cdef class Regression(LossFunction):
"""Base class for loss functions for regression"""
cdef double loss(self, double y, double p) noexcept nogil:
return 0.
cdef double dloss(self, double y, double p) noexcept nogil:
return 0.
cdef class Classification(LossFunction):
"""Base class for loss functions for classification"""
cdef double loss(self, double y, double p) noexcept nogil:
return 0.
cdef double dloss(self, double y, double p) noexcept nogil:
return 0.
cdef class ModifiedHuber(Classification):
"""Modified Huber loss for binary classification with y in {-1, 1}
This is equivalent to quadratically smoothed SVM with gamma = 2.
See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
Stochastic Gradient Descent', ICML'04.
"""
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z >= 1.0:
return 0.0
elif z >= -1.0:
return (1.0 - z) * (1.0 - z)
else:
return -4.0 * z
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z >= 1.0:
return 0.0
elif z >= -1.0:
return 2.0 * (1.0 - z) * -y
else:
return -4.0 * y
def __reduce__(self):
return ModifiedHuber, ()
cdef class Hinge(Classification):
"""Hinge loss for binary classification tasks with y in {-1,1}
Parameters
----------
threshold : float > 0.0
Margin threshold. When threshold=1.0, one gets the loss used by SVM.
When threshold=0.0, one gets the loss used by the Perceptron.
"""
cdef double threshold
def __init__(self, double threshold=1.0):
self.threshold = threshold
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z <= self.threshold:
return self.threshold - z
return 0.0
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = p * y
if z <= self.threshold:
return -y
return 0.0
def __reduce__(self):
return Hinge, (self.threshold,)
cdef class SquaredHinge(Classification):
"""Squared Hinge loss for binary classification tasks with y in {-1,1}
Parameters
----------
threshold : float > 0.0
Margin threshold. When threshold=1.0, one gets the loss used by
(quadratically penalized) SVM.
"""
cdef double threshold
def __init__(self, double threshold=1.0):
self.threshold = threshold
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = self.threshold - p * y
if z > 0:
return z * z
return 0.0
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = self.threshold - p * y
if z > 0:
return -2 * y * z
return 0.0
def __reduce__(self):
return SquaredHinge, (self.threshold,)
cdef class Log(Classification):
"""Logistic regression loss for binary classification with y in {-1, 1}"""
cdef double loss(self, double y, double p) noexcept nogil:
cdef double z = p * y
# approximately equal and saves the computation of the log
if z > 18:
return exp(-z)
if z < -18:
return -z
return log(1.0 + exp(-z))
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z = p * y
# approximately equal and saves the computation of the log
if z > 18.0:
return exp(-z) * -y
if z < -18.0:
return -y
return -y / (exp(z) + 1.0)
def __reduce__(self):
return Log, ()
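# Note on the |z| > 18 branches above: for z > 18, exp(-z) < 2e-8, so
# log(1 + exp(-z)) ~= exp(-z) to about eight significant digits; for z < -18,
# log(1 + exp(-z)) ~= -z, which also avoids overflow of exp(-z) for very
# negative z.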
cdef class SquaredLoss(Regression):
"""Squared loss traditional used in linear regression."""
cdef double loss(self, double y, double p) noexcept nogil:
return 0.5 * (p - y) * (p - y)
cdef double dloss(self, double y, double p) noexcept nogil:
return p - y
def __reduce__(self):
return SquaredLoss, ()
cdef class Huber(Regression):
"""Huber regression loss
Variant of the SquaredLoss that is robust to outliers (quadratic near zero,
linear for large errors).
https://en.wikipedia.org/wiki/Huber_Loss_Function
"""
cdef double c
def __init__(self, double c):
self.c = c
cdef double loss(self, double y, double p) noexcept nogil:
cdef double r = p - y
cdef double abs_r = fabs(r)
if abs_r <= self.c:
return 0.5 * r * r
else:
return self.c * abs_r - (0.5 * self.c * self.c)
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double r = p - y
cdef double abs_r = fabs(r)
if abs_r <= self.c:
return r
elif r > 0.0:
return self.c
else:
return -self.c
def __reduce__(self):
return Huber, (self.c,)
cdef class EpsilonInsensitive(Regression):
"""Epsilon-Insensitive loss (used by SVR).
loss = max(0, |y - p| - epsilon)
"""
cdef double epsilon
def __init__(self, double epsilon):
self.epsilon = epsilon
cdef double loss(self, double y, double p) noexcept nogil:
cdef double ret = fabs(y - p) - self.epsilon
return ret if ret > 0 else 0
cdef double dloss(self, double y, double p) noexcept nogil:
if y - p > self.epsilon:
return -1
elif p - y > self.epsilon:
return 1
else:
return 0
def __reduce__(self):
return EpsilonInsensitive, (self.epsilon,)
cdef class SquaredEpsilonInsensitive(Regression):
"""Epsilon-Insensitive loss.
loss = max(0, |y - p| - epsilon)^2
"""
cdef double epsilon
def __init__(self, double epsilon):
self.epsilon = epsilon
cdef double loss(self, double y, double p) noexcept nogil:
cdef double ret = fabs(y - p) - self.epsilon
return ret * ret if ret > 0 else 0
cdef double dloss(self, double y, double p) noexcept nogil:
cdef double z
z = y - p
if z > self.epsilon:
return -2 * (z - self.epsilon)
elif z < -self.epsilon:
return 2 * (-z - self.epsilon)
else:
return 0
def __reduce__(self):
return SquaredEpsilonInsensitive, (self.epsilon,)
{{for name_suffix, c_type, np_type in dtypes}}
def _plain_sgd{{name_suffix}}(
const {{c_type}}[::1] weights,
double intercept,
const {{c_type}}[::1] average_weights,
double average_intercept,
LossFunction loss,
int penalty_type,
double alpha,
double C,
double l1_ratio,
SequentialDataset{{name_suffix}} dataset,
const unsigned char[::1] validation_mask,
bint early_stopping,
validation_score_cb,
int n_iter_no_change,
unsigned int max_iter,
double tol,
int fit_intercept,
int verbose,
bint shuffle,
uint32_t seed,
double weight_pos,
double weight_neg,
int learning_rate,
double eta0,
double power_t,
bint one_class,
double t=1.0,
double intercept_decay=1.0,
int average=0,
):
"""SGD for generic loss functions and penalties with optional averaging
Parameters
----------
weights : ndarray[{{c_type}}, ndim=1]
The allocated vector of weights.
intercept : double
The initial intercept.
average_weights : ndarray[{{c_type}}, ndim=1]
The average weights as computed for ASGD. Should be None if average
is 0.
average_intercept : double
The average intercept for ASGD. Should be 0 if average is 0.
loss : LossFunction
A concrete ``LossFunction`` object.
penalty_type : int
The penalty: 2 for L2, 1 for L1, and 3 for Elastic-Net.
alpha : float
The regularization parameter.
C : float
Maximum step size for passive aggressive.
l1_ratio : float
The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
dataset : SequentialDataset
A concrete ``SequentialDataset`` object.
validation_mask : ndarray[unsigned char, ndim=1]
Entries equal to True mark samples in the validation set.
early_stopping : boolean
Whether to use a stopping criterion based on the validation set.
validation_score_cb : callable
A callable to compute a validation score given the current
coefficients and intercept values.
Used only if early_stopping is True.
n_iter_no_change : int
Number of iterations with no improvement to wait before stopping.
max_iter : int
The maximum number of iterations (epochs).
tol : double
The tolerance for the stopping criterion.
fit_intercept : int
Whether or not to fit the intercept (1 or 0).
verbose : int
Print verbose output; 0 for quiet.
shuffle : boolean
Whether to shuffle the training data before each epoch.
weight_pos : float
The weight of the positive class.
weight_neg : float
The weight of the negative class.
seed : uint32_t
Seed of the pseudorandom number generator used to shuffle the data.
learning_rate : int
The learning rate:
(1) constant, eta = eta0
(2) optimal, eta = 1.0/(alpha * t).
(3) inverse scaling, eta = eta0 / pow(t, power_t)
(4) adaptive decrease
(5) Passive Aggressive-I, eta = min(C, loss / norm(x)^2)
(6) Passive Aggressive-II, eta = loss / (norm(x)^2 + 0.5 / C)
eta0 : double
The initial learning rate.
power_t : double
The exponent for inverse scaling learning rate.
one_class : boolean
Whether to solve the One-Class SVM optimization problem.
t : double
Initial state of the learning rate. This value is equal to the
iteration count except when the learning rate is set to `optimal`.
Default: 1.0.
average : int
The number of iterations before averaging starts. average=1 is
equivalent to averaging for all iterations.
Returns
-------
weights : array, shape=[n_features]
The fitted weight vector.
intercept : float
The fitted intercept term.
average_weights : array, shape=[n_features]
The averaged weights across iterations. Values are valid only if
average > 0.
average_intercept : float
The averaged intercept across iterations.
Values are valid only if average > 0.
n_iter_ : int
The actual number of iterations (epochs).
"""
# get the data information into easy vars
cdef Py_ssize_t n_samples = dataset.n_samples
cdef Py_ssize_t n_features = weights.shape[0]
cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
cdef {{c_type}} *x_data_ptr = NULL
cdef int *x_ind_ptr = NULL
# helper variables
cdef int no_improvement_count = 0
cdef bint infinity = False
cdef int xnnz
cdef double eta = 0.0
cdef double p = 0.0
cdef double update = 0.0
cdef double intercept_update = 0.0
cdef double sumloss = 0.0
cdef double score = 0.0
cdef double best_loss = INFINITY
cdef double best_score = -INFINITY
cdef {{c_type}} y = 0.0
cdef {{c_type}} sample_weight
cdef {{c_type}} class_weight = 1.0
cdef unsigned int count = 0
cdef unsigned int train_count = n_samples - np.sum(validation_mask)
cdef unsigned int epoch = 0
cdef unsigned int i = 0
cdef int is_hinge = isinstance(loss, Hinge)
cdef double optimal_init = 0.0
cdef double dloss = 0.0
cdef double MAX_DLOSS = 1e12
cdef long long sample_index
# q vector is only used for L1 regularization
cdef {{c_type}}[::1] q = None
cdef {{c_type}} * q_data_ptr = NULL
if penalty_type == L1 or penalty_type == ELASTICNET:
q = np.zeros((n_features,), dtype={{np_type}}, order="c")
q_data_ptr = &q[0]
cdef double u = 0.0
if penalty_type == L2:
l1_ratio = 0.0
elif penalty_type == L1:
l1_ratio = 1.0
eta = eta0
if learning_rate == OPTIMAL:
typw = np.sqrt(1.0 / np.sqrt(alpha))
# computing eta0, the initial learning rate
initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
# initialize t such that eta at first sample equals eta0
optimal_init = 1.0 / (initial_eta0 * alpha)
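# A short sketch of the resulting schedule: since t starts at 1.0, the update
# rule used below, eta = 1.0 / (alpha * (optimal_init + t - 1)), yields
# eta == initial_eta0 on the first sample and then decays roughly as
# 1 / (alpha * t) (the "optimal" heuristic attributed to Leon Bottou in the
# scikit-learn SGD documentation).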
t_start = time()
with nogil:
for epoch in range(max_iter):
sumloss = 0
if verbose > 0:
with gil:
print("-- Epoch %d" % (epoch + 1))
if shuffle:
dataset.shuffle(seed)
for i in range(n_samples):
dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
&y, &sample_weight)
sample_index = dataset.index_data_ptr[dataset.current_index]
if validation_mask[sample_index]:
# do not learn on the validation set
continue
p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
if learning_rate == OPTIMAL:
eta = 1.0 / (alpha * (optimal_init + t - 1))
elif learning_rate == INVSCALING:
eta = eta0 / pow(t, power_t)
if verbose or not early_stopping:
sumloss += loss.loss(y, p)
if y > 0.0:
class_weight = weight_pos
else:
class_weight = weight_neg
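# The Passive-Aggressive step sizes below follow Crammer et al. (2006),
# "Online Passive-Aggressive Algorithms": PA-I clips loss / ||x||^2 at C,
# while PA-II divides the loss by ||x||^2 + 0.5 / C.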
if learning_rate == PA1:
update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
if update == 0:
continue
update = min(C, loss.loss(y, p) / update)
elif learning_rate == PA2:
update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
update = loss.loss(y, p) / (update + 0.5 / C)
else:
dloss = loss.dloss(y, p)
# clip dloss with large values to avoid numerical
# instabilities
if dloss < -MAX_DLOSS:
dloss = -MAX_DLOSS
elif dloss > MAX_DLOSS:
dloss = MAX_DLOSS
update = -eta * dloss
if learning_rate >= PA1:
if is_hinge:
# classification
update *= y
elif y - p < 0:
# regression
update *= -1
update *= class_weight * sample_weight
if penalty_type >= L2:
# do not scale to negative values when eta or alpha are too
# big: instead set the weights to zero
w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))
if update != 0.0:
w.add(x_data_ptr, x_ind_ptr, xnnz, update)
if fit_intercept == 1:
intercept_update = update
if one_class: # specific for One-Class SVM
intercept_update -= 2. * eta * alpha
if intercept_update != 0:
intercept += intercept_update * intercept_decay
if 0 < average <= t:
# compute the average for the intercept and update the
# average weights; this is done regardless of whether
# the update is 0
w.add_average(x_data_ptr, x_ind_ptr, xnnz,
update, (t - average + 1))
average_intercept += ((intercept - average_intercept) /
(t - average + 1))
if penalty_type == L1 or penalty_type == ELASTICNET:
u += (l1_ratio * eta * alpha)
l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)
t += 1
count += 1
# report epoch information
if verbose > 0:
with gil:
print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
"Avg. loss: %f"
% (w.norm(), np.nonzero(weights)[0].shape[0],
intercept, count, sumloss / train_count))
print("Total training time: %.2f seconds."
% (time() - t_start))
# floating-point under-/overflow check.
if (not isfinite(intercept) or any_nonfinite(weights)):
infinity = True
break
# evaluate the score on the validation set
if early_stopping:
with gil:
score = validation_score_cb(weights.base, intercept)
if tol > -INFINITY and score < best_score + tol:
no_improvement_count += 1
else:
no_improvement_count = 0
if score > best_score:
best_score = score
# or evaluate the loss on the training set
else:
if tol > -INFINITY and sumloss > best_loss - tol * train_count:
no_improvement_count += 1
else:
no_improvement_count = 0
if sumloss < best_loss:
best_loss = sumloss
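# Note that sumloss is a sum over this epoch's training samples, not a mean,
# so the criterion above only counts an epoch as an improvement when the
# total loss drops by at least tol * train_count (i.e. tol per sample).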
# if there is no improvement several times in a row
if no_improvement_count >= n_iter_no_change:
if learning_rate == ADAPTIVE and eta > 1e-6:
eta = eta / 5
no_improvement_count = 0
else:
if verbose:
with gil:
print("Convergence after %d epochs took %.2f "
"seconds" % (epoch + 1, time() - t_start))
break
if infinity:
raise ValueError(("Floating-point under-/overflow occurred at epoch"
" #%d. Scaling input data with StandardScaler or"
" MinMaxScaler might help.") % (epoch + 1))
w.reset_wscale()
return (
weights.base,
intercept,
None if average_weights is None else average_weights.base,
average_intercept,
epoch + 1
)
{{endfor}}
cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
for i in range(w.shape[0]):
if not isfinite(w[i]):
return True
return False
cdef inline double sqnorm(
floating * x_data_ptr,
int * x_ind_ptr,
int xnnz,
) noexcept nogil:
cdef double x_norm = 0.0
cdef int j
cdef double z
for j in range(xnnz):
z = x_data_ptr[j]
x_norm += z * z
return x_norm
{{for name_suffix, c_type, np_type in dtypes}}
cdef void l1penalty{{name_suffix}}(
WeightVector{{name_suffix}} w,
{{c_type}} * q_data_ptr,
int *x_ind_ptr,
int xnnz,
double u,
) noexcept nogil:
"""Apply the L1 penalty to each updated feature
This implements the truncated gradient approach by
[Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].
"""
cdef double z = 0.0
cdef int j = 0
cdef int idx = 0
cdef double wscale = w.wscale
cdef {{c_type}} *w_data_ptr = w.w_data_ptr
for j in range(xnnz):
idx = x_ind_ptr[j]
z = w_data_ptr[idx]
if wscale * z > 0.0:
w_data_ptr[idx] = max(
0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))
elif wscale * z < 0.0:
w_data_ptr[idx] = min(
0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))
q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)
{{endfor}}