{{py:

"""
Template file to easily generate loops over samples using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).

Generated file: _loss.pyx

Each loss class is generated from cdef functions operating on single samples.
The keywords between double braces are substituted in setup.py.
"""

doc_HalfSquaredError = (
    """Half Squared Error with identity link.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction
    """
)

doc_AbsoluteError = (
    """Absolute Error with identity link.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction
    """
)

doc_PinballLoss = (
    """Quantile Loss aka Pinball Loss with identity link.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError()
    """
)

doc_HuberLoss = (
    """Huber Loss with identity link.

    Domain:
    y_true and y_pred all real numbers
    delta in positive real numbers

    Link:
    y_pred = raw_prediction
    """
)

doc_HalfPoissonLoss = (
    """Half Poisson deviance loss with log-link.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    Half Poisson deviance with log-link is
        y_true * log(y_true/y_pred) + y_pred - y_true
        = y_true * log(y_true) - y_true * raw_prediction
          + exp(raw_prediction) - y_true

    Dropping constant terms, this gives:
        exp(raw_prediction) - y_true * raw_prediction
    """
)

doc_HalfGammaLoss = (
    """Half Gamma deviance loss with log-link.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    Half Gamma deviance with log-link is
        log(y_pred/y_true) + y_true/y_pred - 1
        = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1

    Dropping constant terms, this gives:
        raw_prediction + y_true * exp(-raw_prediction)
    """
)

doc_HalfTweedieLoss = (
    """Half Tweedie deviance loss with log-link.

    Domain:
    y_true in real numbers if p <= 0
    y_true in non-negative real numbers if 0 < p < 2
    y_true in positive real numbers if p >= 2
    y_pred and power in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    Half Tweedie deviance with log-link and p=power is
        max(y_true, 0)**(2-p) / (1-p) / (2-p)
        - y_true * y_pred**(1-p) / (1-p)
        + y_pred**(2-p) / (2-p)
        = max(y_true, 0)**(2-p) / (1-p) / (2-p)
        - y_true * exp((1-p) * raw_prediction) / (1-p)
        + exp((2-p) * raw_prediction) / (2-p)

    Dropping constant terms, this gives:
        exp((2-p) * raw_prediction) / (2-p)
        - y_true * exp((1-p) * raw_prediction) / (1-p)

    Notes:
    - Poisson with p=1 and Gamma with p=2 have different terms dropped such
      that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2.
    - While the Tweedie distribution only exists for p<=0 or p>=1, the range
      0<p<1 still gives a strictly consistent scoring function for the
      expectation.
    """
)

doc_HalfTweedieLossIdentity = (
    """Half Tweedie deviance loss with identity link.

    Domain:
    y_true in real numbers if p <= 0
    y_true in non-negative real numbers if 0 < p < 2
    y_true in positive real numbers if p >= 2
    y_pred and power in positive real numbers, y_pred may be negative for p=0.

    Link:
    y_pred = raw_prediction

    Half Tweedie deviance with identity link and p=power is
        max(y_true, 0)**(2-p) / (1-p) / (2-p)
        - y_true * y_pred**(1-p) / (1-p)
        + y_pred**(2-p) / (2-p)

    Notes:
    - Here, we do not drop constant terms in contrast to the version with log-link.
    """
)

doc_HalfBinomialLoss = (
    """Half Binomial deviance loss with logit link.

    Domain:
    y_true in [0, 1]
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)
    """
)

doc_ExponentialLoss = (
    """Exponential loss with (half) logit link.

    Domain:
    y_true in [0, 1]
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)
    """
)

# loss class name, docstring, param,
# cy_loss, cy_loss_grad,
# cy_grad, cy_grad_hess,
class_list = [
    ("CyHalfSquaredError", doc_HalfSquaredError, None,
     "closs_half_squared_error", None,
     "cgradient_half_squared_error", "cgrad_hess_half_squared_error"),
    ("CyAbsoluteError", doc_AbsoluteError, None,
     "closs_absolute_error", None,
     "cgradient_absolute_error", "cgrad_hess_absolute_error"),
    ("CyPinballLoss", doc_PinballLoss, "quantile",
     "closs_pinball_loss", None,
     "cgradient_pinball_loss", "cgrad_hess_pinball_loss"),
    ("CyHuberLoss", doc_HuberLoss, "delta",
     "closs_huber_loss", None,
     "cgradient_huber_loss", "cgrad_hess_huber_loss"),
    ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None,
     "closs_half_poisson", "closs_grad_half_poisson",
     "cgradient_half_poisson", "cgrad_hess_half_poisson"),
    ("CyHalfGammaLoss", doc_HalfGammaLoss, None,
     "closs_half_gamma", "closs_grad_half_gamma",
     "cgradient_half_gamma", "cgrad_hess_half_gamma"),
    ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power",
     "closs_half_tweedie", "closs_grad_half_tweedie",
     "cgradient_half_tweedie", "cgrad_hess_half_tweedie"),
    ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power",
     "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity",
     "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"),
    ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None,
     "closs_half_binomial", "closs_grad_half_binomial",
     "cgradient_half_binomial", "cgrad_hess_half_binomial"),
    ("CyExponentialLoss", doc_ExponentialLoss, None,
     "closs_exponential", "closs_grad_exponential",
     "cgradient_exponential", "cgrad_hess_exponential"),
]
}}
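
# Note on code generation: each tuple in class_list is expanded by the Tempita
# for-loop further below into one Cython extension type. For example, the
# CyHalfPoissonLoss entry yields a class whose cy_loss calls closs_half_poisson,
# whose cy_gradient calls cgradient_half_poisson and whose loss_gradient uses
# closs_grad_half_poisson; entries with None in the cy_loss_grad slot fall back
# to the generic CyLossFunction.loss_gradient.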

# Design:
# See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons.
# a) Merge link functions into loss functions for speed and numerical
#    stability, i.e. use raw_prediction instead of y_pred in signature.
# b) Pure C functions (nogil) calculate single points (single sample)
# c) Wrap C functions in a loop to get Python functions operating on ndarrays.
#    - Write loops manually, use Tempita for this.
#      Reason: There is still some performance overhead when using a wrapper
#      function "wrap" that carries out the loop and gets as argument a function
#      pointer to one of the C functions from b), e.g.
#      wrap(closs_half_poisson, y_true, ...)
#    - Pass n_threads as argument to prange and propagate option to all callers.
# d) Provide classes (Cython extension types) per loss (names start with Cy) in
#    order to have semantically structured objects.
#    - Member functions for single points just call the C function from b).
#      These are used e.g. in SGD `_plain_sgd`.
#    - Member functions operating on ndarrays, see c), looping over calls to C
#      functions from b).
# e) Provide convenience Python classes that compose from these extension types
#    elsewhere (see loss.py)
#    - Example: loss.gradient calls CyLoss.gradient but does some input
#      checking like None -> np.empty().
#
# Note: We require 1-dim ndarrays to be contiguous.
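#
# Illustrative call chain for the layering above (a sketch, assuming the
# Python-level convenience classes defined in loss.py):
#
#     loss = HalfPoissonLoss()                     # convenience class, see e)
#     values = loss.loss(y_true, raw_prediction)   # validates input, allocates output
#     # ... which calls CyHalfPoissonLoss.loss(...), the prange loop from c)/d),
#     # which in turn calls closs_half_poisson(y_true[i], raw_prediction[i]) from b).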

from cython.parallel import parallel, prange
import numpy as np

from libc.math cimport exp, fabs, log, log1p, pow
from libc.stdlib cimport malloc, free


# -------------------------------------
# Helper functions
# -------------------------------------
# Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of
# https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf
# Note: The only important cutoff is at x = 18. All others are to save computation
# time. Compared to the reference, we add the additional case distinction x <= -2 in
# order to use log instead of log1p for improved performance. As with the other
# cutoffs, this is accurate within machine precision of double.
cdef inline double log1pexp(double x) noexcept nogil:
    if x <= -37:
        return exp(x)
    elif x <= -2:
        return log1p(exp(x))
    elif x <= 18:
        return log(1. + exp(x))
    elif x <= 33.3:
        return x + exp(-x)
    else:
        return x
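
# Quick sanity check of the cutoffs (illustrative): for x = -40, exp(x) is about
# 4e-18, so log1p(exp(x)) equals exp(x) within double precision; for x = 40,
# exp(-x) is far below the double precision spacing around 40, so returning x
# alone is already exact to machine precision.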


cdef inline void sum_exp_minus_max(
    const int i,
    const floating_in[:, :] raw_prediction,  # IN
    floating_in *p                           # OUT
) noexcept nogil:
    # Thread local buffers are used to store results of this function via p.
    # The results are stored as follows:
    #     p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1
    #     p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1)
    #     p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials
    # len(p) must be n_classes + 2
    # Notes:
    # - Using "by reference" arguments doesn't work well, therefore we use a
    #   longer p, see https://github.com/cython/cython/issues/1863
    # - i needs to be passed (and stays constant) because otherwise Cython does
    #   not generate optimal code, see
    #   https://github.com/scikit-learn/scikit-learn/issues/17299
    # - We do not normalize p by calculating p[k] = p[k] / sum_exps.
    #   This helps to save one loop over k.
    cdef:
        int k
        int n_classes = raw_prediction.shape[1]
        double max_value = raw_prediction[i, 0]
        double sum_exps = 0
    for k in range(1, n_classes):
        # Compute max value of array for numerical stability
        if max_value < raw_prediction[i, k]:
            max_value = raw_prediction[i, k]

    for k in range(n_classes):
        p[k] = exp(raw_prediction[i, k] - max_value)
        sum_exps += p[k]

    p[n_classes] = max_value      # same as p[-2]
    p[n_classes + 1] = sum_exps   # same as p[-1]
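
# Example of the buffer layout (illustrative): for n_classes = 3 and
# raw_prediction[i, :] = [1.0, 3.0, 2.0], p becomes
# [exp(-2), exp(0), exp(-1), 3.0, exp(-2) + exp(0) + exp(-1)], i.e. the shifted
# exponentials, followed by the row maximum and their (unnormalized) sum.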


# -------------------------------------
# Single point inline C functions
# -------------------------------------
# Half Squared Error
cdef inline double closs_half_squared_error(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true)


cdef inline double cgradient_half_squared_error(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return raw_prediction - y_true


cdef inline double_pair cgrad_hess_half_squared_error(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair gh
    gh.val1 = raw_prediction - y_true  # gradient
    gh.val2 = 1.                       # hessian
    return gh


# Absolute Error
cdef inline double closs_absolute_error(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return fabs(raw_prediction - y_true)


cdef inline double cgradient_absolute_error(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return 1. if raw_prediction > y_true else -1.


cdef inline double_pair cgrad_hess_absolute_error(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair gh
    # Note that exact hessian = 0 almost everywhere. Optimization routines like
    # in HGBT, however, need a hessian > 0. Therefore, we assign 1.
    gh.val1 = 1. if raw_prediction > y_true else -1.  # gradient
    gh.val2 = 1.                                      # hessian
    return gh


# Quantile Loss / Pinball Loss
cdef inline double closs_pinball_loss(
    double y_true,
    double raw_prediction,
    double quantile
) noexcept nogil:
    return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction
            else (1. - quantile) * (raw_prediction - y_true))


cdef inline double cgradient_pinball_loss(
    double y_true,
    double raw_prediction,
    double quantile
) noexcept nogil:
    return -quantile if y_true >= raw_prediction else 1. - quantile


cdef inline double_pair cgrad_hess_pinball_loss(
    double y_true,
    double raw_prediction,
    double quantile
) noexcept nogil:
    cdef double_pair gh
    # Note that exact hessian = 0 almost everywhere. Optimization routines like
    # in HGBT, however, need a hessian > 0. Therefore, we assign 1.
    gh.val1 = -quantile if y_true >= raw_prediction else 1. - quantile  # gradient
    gh.val2 = 1.                                                        # hessian
    return gh


# Huber Loss
cdef inline double closs_huber_loss(
    double y_true,
    double raw_prediction,
    double delta,
) noexcept nogil:
    cdef double abserr = fabs(y_true - raw_prediction)
    if abserr <= delta:
        return 0.5 * abserr**2
    else:
        return delta * (abserr - 0.5 * delta)


cdef inline double cgradient_huber_loss(
    double y_true,
    double raw_prediction,
    double delta,
) noexcept nogil:
    cdef double res = raw_prediction - y_true
    if fabs(res) <= delta:
        return res
    else:
        return delta if res >= 0 else -delta


cdef inline double_pair cgrad_hess_huber_loss(
    double y_true,
    double raw_prediction,
    double delta,
) noexcept nogil:
    cdef double_pair gh
    gh.val2 = raw_prediction - y_true  # used as temporary
    if fabs(gh.val2) <= delta:
        gh.val1 = gh.val2  # gradient
        gh.val2 = 1        # hessian
    else:
        gh.val1 = delta if gh.val2 >= 0 else -delta  # gradient
        gh.val2 = 0                                  # hessian
    return gh


# Half Poisson Deviance with Log-Link, dropping constant terms
cdef inline double closs_half_poisson(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return exp(raw_prediction) - y_true * raw_prediction


cdef inline double cgradient_half_poisson(
    double y_true,
    double raw_prediction
) noexcept nogil:
    # y_pred - y_true
    return exp(raw_prediction) - y_true


cdef inline double_pair closs_grad_half_poisson(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair lg
    lg.val2 = exp(raw_prediction)                # used as temporary
    lg.val1 = lg.val2 - y_true * raw_prediction  # loss
    lg.val2 -= y_true                            # gradient
    return lg


cdef inline double_pair cgrad_hess_half_poisson(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair gh
    gh.val2 = exp(raw_prediction)  # hessian
    gh.val1 = gh.val2 - y_true     # gradient
    return gh


# Half Gamma Deviance with Log-Link, dropping constant terms
cdef inline double closs_half_gamma(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return raw_prediction + y_true * exp(-raw_prediction)


cdef inline double cgradient_half_gamma(
    double y_true,
    double raw_prediction
) noexcept nogil:
    return 1. - y_true * exp(-raw_prediction)


cdef inline double_pair closs_grad_half_gamma(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair lg
    lg.val2 = exp(-raw_prediction)               # used as temporary
    lg.val1 = raw_prediction + y_true * lg.val2  # loss
    lg.val2 = 1. - y_true * lg.val2              # gradient
    return lg


cdef inline double_pair cgrad_hess_half_gamma(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair gh
    gh.val2 = exp(-raw_prediction)   # used as temporary
    gh.val1 = 1. - y_true * gh.val2  # gradient
    gh.val2 *= y_true                # hessian
    return gh


# Half Tweedie Deviance with Log-Link, dropping constant terms
# Note that by dropping constants this is no longer continuous in parameter power.
cdef inline double closs_half_tweedie(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    if power == 0.:
        return closs_half_squared_error(y_true, exp(raw_prediction))
    elif power == 1.:
        return closs_half_poisson(y_true, raw_prediction)
    elif power == 2.:
        return closs_half_gamma(y_true, raw_prediction)
    else:
        return (exp((2. - power) * raw_prediction) / (2. - power)
                - y_true * exp((1. - power) * raw_prediction) / (1. - power))


cdef inline double cgradient_half_tweedie(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    cdef double exp1
    if power == 0.:
        exp1 = exp(raw_prediction)
        return exp1 * (exp1 - y_true)
    elif power == 1.:
        return cgradient_half_poisson(y_true, raw_prediction)
    elif power == 2.:
        return cgradient_half_gamma(y_true, raw_prediction)
    else:
        return (exp((2. - power) * raw_prediction)
                - y_true * exp((1. - power) * raw_prediction))


cdef inline double_pair closs_grad_half_tweedie(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    cdef double_pair lg
    cdef double exp1, exp2
    if power == 0.:
        exp1 = exp(raw_prediction)
        lg.val1 = closs_half_squared_error(y_true, exp1)  # loss
        lg.val2 = exp1 * (exp1 - y_true)                  # gradient
    elif power == 1.:
        return closs_grad_half_poisson(y_true, raw_prediction)
    elif power == 2.:
        return closs_grad_half_gamma(y_true, raw_prediction)
    else:
        exp1 = exp((1. - power) * raw_prediction)
        exp2 = exp((2. - power) * raw_prediction)
        lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power)  # loss
        lg.val2 = exp2 - y_true * exp1                                # gradient
    return lg


cdef inline double_pair cgrad_hess_half_tweedie(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    cdef double_pair gh
    cdef double exp1, exp2
    if power == 0.:
        exp1 = exp(raw_prediction)
        gh.val1 = exp1 * (exp1 - y_true)      # gradient
        gh.val2 = exp1 * (2 * exp1 - y_true)  # hessian
    elif power == 1.:
        return cgrad_hess_half_poisson(y_true, raw_prediction)
    elif power == 2.:
        return cgrad_hess_half_gamma(y_true, raw_prediction)
    else:
        exp1 = exp((1. - power) * raw_prediction)
        exp2 = exp((2. - power) * raw_prediction)
        gh.val1 = exp2 - y_true * exp1                                # gradient
        gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1  # hessian
    return gh
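
# Derivation of the generic branch above (for the record): with
#     loss = exp((2-p) * raw) / (2-p) - y_true * exp((1-p) * raw) / (1-p),
#     d loss / d raw     = exp((2-p) * raw) - y_true * exp((1-p) * raw)
#     d^2 loss / d raw^2 = (2-p) * exp((2-p) * raw) - (1-p) * y_true * exp((1-p) * raw),
# which are exactly gh.val1 and gh.val2 in the else branch.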


# Half Tweedie Deviance with identity link, without dropping constant terms!
# Therefore, best loss value is zero.
cdef inline double closs_half_tweedie_identity(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    cdef double tmp
    if power == 0.:
        return closs_half_squared_error(y_true, raw_prediction)
    elif power == 1.:
        if y_true == 0:
            return raw_prediction
        else:
            return y_true * log(y_true/raw_prediction) + raw_prediction - y_true
    elif power == 2.:
        return log(raw_prediction/y_true) + y_true/raw_prediction - 1.
    else:
        tmp = pow(raw_prediction, 1. - power)
        tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power)
        if y_true > 0:
            tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power))
        return tmp


cdef inline double cgradient_half_tweedie_identity(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    if power == 0.:
        return raw_prediction - y_true
    elif power == 1.:
        return 1. - y_true / raw_prediction
    elif power == 2.:
        return (raw_prediction - y_true) / (raw_prediction * raw_prediction)
    else:
        return pow(raw_prediction, -power) * (raw_prediction - y_true)


cdef inline double_pair closs_grad_half_tweedie_identity(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    cdef double_pair lg
    cdef double tmp
    if power == 0.:
        lg.val2 = raw_prediction - y_true  # gradient
        lg.val1 = 0.5 * lg.val2 * lg.val2  # loss
    elif power == 1.:
        if y_true == 0:
            lg.val1 = raw_prediction
        else:
            lg.val1 = (y_true * log(y_true/raw_prediction)  # loss
                       + raw_prediction - y_true)
        lg.val2 = 1. - y_true / raw_prediction  # gradient
    elif power == 2.:
        lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1.  # loss
        tmp = raw_prediction * raw_prediction
        lg.val2 = (raw_prediction - y_true) / tmp  # gradient
    else:
        tmp = pow(raw_prediction, 1. - power)
        lg.val1 = (raw_prediction * tmp / (2. - power)  # loss
                   - y_true * tmp / (1. - power))
        if y_true > 0:
            lg.val1 += (pow(y_true, 2. - power)
                        / ((1. - power) * (2. - power)))
        lg.val2 = tmp * (1. - y_true / raw_prediction)  # gradient
    return lg


cdef inline double_pair cgrad_hess_half_tweedie_identity(
    double y_true,
    double raw_prediction,
    double power
) noexcept nogil:
    cdef double_pair gh
    cdef double tmp
    if power == 0.:
        gh.val1 = raw_prediction - y_true  # gradient
        gh.val2 = 1.                       # hessian
    elif power == 1.:
        gh.val1 = 1. - y_true / raw_prediction                # gradient
        gh.val2 = y_true / (raw_prediction * raw_prediction)  # hessian
    elif power == 2.:
        tmp = raw_prediction * raw_prediction
        gh.val1 = (raw_prediction - y_true) / tmp             # gradient
        gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp  # hessian
    else:
        tmp = pow(raw_prediction, -power)
        gh.val1 = tmp * (raw_prediction - y_true)                         # gradient
        gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction)  # hessian
    return gh


# Half Binomial deviance with logit-link, aka log-loss or binary cross entropy
cdef inline double closs_half_binomial(
    double y_true,
    double raw_prediction
) noexcept nogil:
    # log1p(exp(raw_prediction)) - y_true * raw_prediction
    return log1pexp(raw_prediction) - y_true * raw_prediction


cdef inline double cgradient_half_binomial(
    double y_true,
    double raw_prediction
) noexcept nogil:
    # gradient = y_pred - y_true = expit(raw_prediction) - y_true
    # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
    #   if raw_prediction < 0:
    #       exp_tmp = exp(raw_prediction)
    #       return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp)
    #   else:
    #       exp_tmp = exp(-raw_prediction)
    #       return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
    # Note that optimal speed would be achieved, at the cost of precision, by
    #   return expit(raw_prediction) - y_true
    # i.e. no "if else" and an own inline implementation of expit instead of
    #   from scipy.special.cython_special cimport expit
    # The case distinction raw_prediction < 0 in the stable implementation does not
    # provide significantly better precision apart from protecting overflow of exp(..).
    # The branch (if else), however, can incur runtime costs of up to 30%.
    # Instead, we help branch prediction by almost always ending in the first if clause
    # and making the second branch (else) a bit simpler. This has the exact same
    # precision but is faster than the stable implementation.
    # As branching criterion, we use the same cutoff as in log1pexp. Note that the
    # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731
    # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
    cdef double exp_tmp
    if raw_prediction > -37:
        exp_tmp = exp(-raw_prediction)
        return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
    else:
        # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37
        return exp(raw_prediction) - y_true


cdef inline double_pair closs_grad_half_binomial(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair lg
    # Same if else conditions as in log1pexp.
    if raw_prediction <= -37:
        lg.val2 = exp(raw_prediction)                # used as temporary
        lg.val1 = lg.val2 - y_true * raw_prediction  # loss
        lg.val2 -= y_true                            # gradient
    elif raw_prediction <= -2:
        lg.val2 = exp(raw_prediction)                                 # used as temporary
        lg.val1 = log1p(lg.val2) - y_true * raw_prediction            # loss
        lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2)   # gradient
    elif raw_prediction <= 18:
        lg.val2 = exp(-raw_prediction)  # used as temporary
        # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
        lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction     # loss
        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
    else:
        lg.val2 = exp(-raw_prediction)  # used as temporary
        lg.val1 = lg.val2 + (1 - y_true) * raw_prediction            # loss
        lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2)  # gradient
    return lg


cdef inline double_pair cgrad_hess_half_binomial(
    double y_true,
    double raw_prediction
) noexcept nogil:
    # with y_pred = expit(raw)
    # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
    #                                 = exp(-raw) / (1 + exp(-raw))**2
    cdef double_pair gh
    # See comment in cgradient_half_binomial.
    if raw_prediction > -37:
        gh.val2 = exp(-raw_prediction)  # used as temporary
        gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2)  # gradient
        gh.val2 = gh.val2 / (1 + gh.val2)**2                         # hessian
    else:
        gh.val2 = exp(raw_prediction)  # = 1st order Taylor in exp(raw_prediction)
        gh.val1 = gh.val2 - y_true
    return gh


# Exponential loss with (half) logit-link, aka boosting loss
cdef inline double closs_exponential(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double tmp = exp(raw_prediction)
    return y_true / tmp + (1 - y_true) * tmp


cdef inline double cgradient_exponential(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double tmp = exp(raw_prediction)
    return -y_true / tmp + (1 - y_true) * tmp


cdef inline double_pair closs_grad_exponential(
    double y_true,
    double raw_prediction
) noexcept nogil:
    cdef double_pair lg
    lg.val2 = exp(raw_prediction)  # used as temporary

    lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2   # loss
    lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2  # gradient
    return lg


cdef inline double_pair cgrad_hess_exponential(
    double y_true,
    double raw_prediction
) noexcept nogil:
    # Note that hessian = loss
    cdef double_pair gh
    gh.val2 = exp(raw_prediction)  # used as temporary

    gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2  # gradient
    gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2   # hessian
    return gh


# ---------------------------------------------------
# Extension Types for Loss Functions of 1-dim targets
# ---------------------------------------------------
cdef class CyLossFunction:
    """Base class for convex loss functions."""

    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil:
        """Compute the loss for a single sample.

        Parameters
        ----------
        y_true : double
            Observed, true target value.
        raw_prediction : double
            Raw prediction value (in link space).

        Returns
        -------
        double
            The loss evaluated at `y_true` and `raw_prediction`.
        """
        pass

    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil:
        """Compute gradient of loss w.r.t. raw_prediction for a single sample.

        Parameters
        ----------
        y_true : double
            Observed, true target value.
        raw_prediction : double
            Raw prediction value (in link space).

        Returns
        -------
        double
            The derivative of the loss function w.r.t. `raw_prediction`.
        """
        pass

    cdef double_pair cy_grad_hess(
        self, double y_true, double raw_prediction
    ) noexcept nogil:
        """Compute gradient and hessian.

        Gradient and hessian of loss w.r.t. raw_prediction for a single sample.

        This is usually diagonal in raw_prediction_i and raw_prediction_j.
        Therefore, we return the diagonal element i=j.

        For a loss with a non-canonical link, this might implement the diagonal
        of the Fisher matrix (=expected hessian) instead of the hessian.

        Parameters
        ----------
        y_true : double
            Observed, true target value.
        raw_prediction : double
            Raw prediction value (in link space).

        Returns
        -------
        double_pair
            Gradient and hessian of the loss function w.r.t. `raw_prediction`.
        """
        pass

    def loss(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] loss_out,             # OUT
        int n_threads=1
    ):
        """Compute the point-wise loss value for each input.

        The point-wise loss is written to `loss_out` and no array is returned.

        Parameters
        ----------
        y_true : array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples,)
            Raw prediction values (in link space).
        sample_weight : array of shape (n_samples,) or None
            Sample weights.
        loss_out : array of shape (n_samples,)
            A location into which the result is stored.
        n_threads : int
            Number of threads used by OpenMP (if any).
        """
        pass

    def gradient(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] gradient_out,         # OUT
        int n_threads=1
    ):
        """Compute gradient of loss w.r.t raw_prediction for each input.

        The gradient is written to `gradient_out` and no array is returned.

        Parameters
        ----------
        y_true : array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples,)
            Raw prediction values (in link space).
        sample_weight : array of shape (n_samples,) or None
            Sample weights.
        gradient_out : array of shape (n_samples,)
            A location into which the result is stored.
        n_threads : int
            Number of threads used by OpenMP (if any).
        """
        pass

    def loss_gradient(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] loss_out,             # OUT
        floating_out[::1] gradient_out,         # OUT
        int n_threads=1
    ):
        """Compute loss and gradient of loss w.r.t raw_prediction.

        The loss and gradient are written to `loss_out` and `gradient_out` and
        no arrays are returned.

        Parameters
        ----------
        y_true : array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples,)
            Raw prediction values (in link space).
        sample_weight : array of shape (n_samples,) or None
            Sample weights.
        loss_out : array of shape (n_samples,) or None
            A location into which the element-wise loss is stored.
        gradient_out : array of shape (n_samples,)
            A location into which the gradient is stored.
        n_threads : int
            Number of threads used by OpenMP (if any).
        """
        self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads)
        self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads)

    def gradient_hessian(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] gradient_out,         # OUT
        floating_out[::1] hessian_out,          # OUT
        int n_threads=1
    ):
        """Compute gradient and hessian of loss w.r.t raw_prediction.

        The gradient and hessian are written to `gradient_out` and `hessian_out`
        and no arrays are returned.

        Parameters
        ----------
        y_true : array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples,)
            Raw prediction values (in link space).
        sample_weight : array of shape (n_samples,) or None
            Sample weights.
        gradient_out : array of shape (n_samples,)
            A location into which the gradient is stored.
        hessian_out : array of shape (n_samples,)
            A location into which the hessian is stored.
        n_threads : int
            Number of threads used by OpenMP (if any).
        """
        pass


{{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, in class_list}}
{{py:
if param is None:
    with_param = ""
else:
    with_param = ", self." + param
}}

cdef class {{name}}(CyLossFunction):
    """{{docstring}}"""

    {{if param is not None}}
    def __init__(self, {{param}}):
        self.{{param}} = {{param}}
    {{endif}}

    cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil:
        return {{closs}}(y_true, raw_prediction{{with_param}})

    cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil:
        return {{cgrad}}(y_true, raw_prediction{{with_param}})

    cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil:
        return {{cgrad_hess}}(y_true, raw_prediction{{with_param}})

    def loss(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] loss_out,             # OUT
        int n_threads=1
    ):
        cdef:
            int i
            int n_samples = y_true.shape[0]

        if sample_weight is None:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}})
        else:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}})

    {{if closs_grad is not None}}
    def loss_gradient(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] loss_out,             # OUT
        floating_out[::1] gradient_out,         # OUT
        int n_threads=1
    ):
        cdef:
            int i
            int n_samples = y_true.shape[0]
            double_pair dbl2

        if sample_weight is None:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}})
                loss_out[i] = dbl2.val1
                gradient_out[i] = dbl2.val2
        else:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}})
                loss_out[i] = sample_weight[i] * dbl2.val1
                gradient_out[i] = sample_weight[i] * dbl2.val2

    {{endif}}

    def gradient(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] gradient_out,         # OUT
        int n_threads=1
    ):
        cdef:
            int i
            int n_samples = y_true.shape[0]

        if sample_weight is None:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})
        else:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})

    def gradient_hessian(
        self,
        const floating_in[::1] y_true,          # IN
        const floating_in[::1] raw_prediction,  # IN
        const floating_in[::1] sample_weight,   # IN
        floating_out[::1] gradient_out,         # OUT
        floating_out[::1] hessian_out,          # OUT
        int n_threads=1
    ):
        cdef:
            int i
            int n_samples = y_true.shape[0]
            double_pair dbl2

        if sample_weight is None:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}})
                gradient_out[i] = dbl2.val1
                hessian_out[i] = dbl2.val2
        else:
            for i in prange(
                n_samples, schedule='static', nogil=True, num_threads=n_threads
            ):
                dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}})
                gradient_out[i] = sample_weight[i] * dbl2.val1
                hessian_out[i] = sample_weight[i] * dbl2.val2

{{endfor}}


# The multinomial deviance loss is also known as categorical cross-entropy or
# multinomial log-likelihood
cdef class CyHalfMultinomialLoss(CyLossFunction):
    """Half Multinomial deviance loss with multinomial logit link.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded

    Link:
    y_pred = softmax(raw_prediction)

    Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is
    mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1.
    """

    # Note that we do not assume memory alignment/contiguity of 2d arrays.
    # There seems to be little benefit in doing so. Benchmarks proving the
    # opposite are welcome.
    def loss(
        self,
        const floating_in[::1] y_true,           # IN
        const floating_in[:, :] raw_prediction,  # IN
        const floating_in[::1] sample_weight,    # IN
        floating_out[::1] loss_out,              # OUT
        int n_threads=1
    ):
        cdef:
            int i, k
            int n_samples = y_true.shape[0]
            int n_classes = raw_prediction.shape[1]
            floating_in max_value, sum_exps
            floating_in* p  # temporary buffer

        # We assume n_samples > n_classes. In this case having the inner loop
        # over n_classes is a good default.
        # TODO: If every memoryview is contiguous and raw_prediction is
        #       f-contiguous, can we write a better algo (loops) to improve
        #       performance?
        if sample_weight is None:
            # inner loop over n_classes
            with nogil, parallel(num_threads=n_threads):
                # Define private buffer variables as each thread might use its
                # own.
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    max_value = p[n_classes]     # p[-2]
                    sum_exps = p[n_classes + 1]  # p[-1]
                    loss_out[i] = log(sum_exps) + max_value

                    # label encoded y_true
                    k = int(y_true[i])
                    loss_out[i] -= raw_prediction[i, k]

                free(p)
        else:
            with nogil, parallel(num_threads=n_threads):
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    max_value = p[n_classes]     # p[-2]
                    sum_exps = p[n_classes + 1]  # p[-1]
                    loss_out[i] = log(sum_exps) + max_value

                    # label encoded y_true
                    k = int(y_true[i])
                    loss_out[i] -= raw_prediction[i, k]

                    loss_out[i] *= sample_weight[i]

                free(p)

    def loss_gradient(
        self,
        const floating_in[::1] y_true,           # IN
        const floating_in[:, :] raw_prediction,  # IN
        const floating_in[::1] sample_weight,    # IN
        floating_out[::1] loss_out,              # OUT
        floating_out[:, :] gradient_out,         # OUT
        int n_threads=1
    ):
        cdef:
            int i, k
            int n_samples = y_true.shape[0]
            int n_classes = raw_prediction.shape[1]
            floating_in max_value, sum_exps
            floating_in* p  # temporary buffer

        if sample_weight is None:
            # inner loop over n_classes
            with nogil, parallel(num_threads=n_threads):
                # Define private buffer variables as each thread might use its
                # own.
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    max_value = p[n_classes]     # p[-2]
                    sum_exps = p[n_classes + 1]  # p[-1]
                    loss_out[i] = log(sum_exps) + max_value

                    for k in range(n_classes):
                        # label decode y_true
                        if y_true[i] == k:
                            loss_out[i] -= raw_prediction[i, k]
                        p[k] /= sum_exps  # p_k = y_pred_k = prob of class k
                        # gradient_k = p_k - (y_true == k)
                        gradient_out[i, k] = p[k] - (y_true[i] == k)

                free(p)
        else:
            with nogil, parallel(num_threads=n_threads):
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    max_value = p[n_classes]     # p[-2]
                    sum_exps = p[n_classes + 1]  # p[-1]
                    loss_out[i] = log(sum_exps) + max_value

                    for k in range(n_classes):
                        # label decode y_true
                        if y_true[i] == k:
                            loss_out[i] -= raw_prediction[i, k]
                        p[k] /= sum_exps  # p_k = y_pred_k = prob of class k
                        # gradient_k = (p_k - (y_true == k)) * sw
                        gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]

                    loss_out[i] *= sample_weight[i]

                free(p)

    def gradient(
        self,
        const floating_in[::1] y_true,           # IN
        const floating_in[:, :] raw_prediction,  # IN
        const floating_in[::1] sample_weight,    # IN
        floating_out[:, :] gradient_out,         # OUT
        int n_threads=1
    ):
        cdef:
            int i, k
            int n_samples = y_true.shape[0]
            int n_classes = raw_prediction.shape[1]
            floating_in sum_exps
            floating_in* p  # temporary buffer

        if sample_weight is None:
            # inner loop over n_classes
            with nogil, parallel(num_threads=n_threads):
                # Define private buffer variables as each thread might use its
                # own.
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    sum_exps = p[n_classes + 1]  # p[-1]

                    for k in range(n_classes):
                        p[k] /= sum_exps  # p_k = y_pred_k = prob of class k
                        # gradient_k = y_pred_k - (y_true == k)
                        gradient_out[i, k] = p[k] - (y_true[i] == k)

                free(p)
        else:
            with nogil, parallel(num_threads=n_threads):
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    sum_exps = p[n_classes + 1]  # p[-1]

                    for k in range(n_classes):
                        p[k] /= sum_exps  # p_k = y_pred_k = prob of class k
                        # gradient_k = (p_k - (y_true == k)) * sw
                        gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]

                free(p)

    def gradient_hessian(
        self,
        const floating_in[::1] y_true,           # IN
        const floating_in[:, :] raw_prediction,  # IN
        const floating_in[::1] sample_weight,    # IN
        floating_out[:, :] gradient_out,         # OUT
        floating_out[:, :] hessian_out,          # OUT
        int n_threads=1
    ):
        cdef:
            int i, k
            int n_samples = y_true.shape[0]
            int n_classes = raw_prediction.shape[1]
            floating_in sum_exps
            floating_in* p  # temporary buffer

        if sample_weight is None:
            # inner loop over n_classes
            with nogil, parallel(num_threads=n_threads):
                # Define private buffer variables as each thread might use its
                # own.
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    sum_exps = p[n_classes + 1]  # p[-1]

                    for k in range(n_classes):
                        p[k] /= sum_exps  # p_k = y_pred_k = prob of class k
                        # hessian_k = p_k * (1 - p_k)
                        # gradient_k = p_k - (y_true == k)
                        gradient_out[i, k] = p[k] - (y_true[i] == k)
                        hessian_out[i, k] = p[k] * (1. - p[k])

                free(p)
        else:
            with nogil, parallel(num_threads=n_threads):
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    sum_exps = p[n_classes + 1]  # p[-1]

                    for k in range(n_classes):
                        p[k] /= sum_exps  # p_k = y_pred_k = prob of class k
                        # gradient_k = (p_k - (y_true == k)) * sw
                        # hessian_k = p_k * (1 - p_k) * sw
                        gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
                        hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i]

                free(p)

    # This method simplifies the implementation of hessp in linear models,
    # i.e. the matrix-vector product of the full hessian, not only of the
    # diagonal (in the classes) approximation as implemented above.
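    # Illustrative note: per sample, the full hessian in the classes is
    # diag(p) - p p^T with p the softmax probabilities, so a hessian-vector
    # product can be formed from proba_out alone as p * v - p * sum(p * v);
    # gradient_out and proba_out together are therefore sufficient for hessp.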
    def gradient_proba(
        self,
        const floating_in[::1] y_true,           # IN
        const floating_in[:, :] raw_prediction,  # IN
        const floating_in[::1] sample_weight,    # IN
        floating_out[:, :] gradient_out,         # OUT
        floating_out[:, :] proba_out,            # OUT
        int n_threads=1
    ):
        cdef:
            int i, k
            int n_samples = y_true.shape[0]
            int n_classes = raw_prediction.shape[1]
            floating_in sum_exps
            floating_in* p  # temporary buffer

        if sample_weight is None:
            # inner loop over n_classes
            with nogil, parallel(num_threads=n_threads):
                # Define private buffer variables as each thread might use its
                # own.
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    sum_exps = p[n_classes + 1]  # p[-1]

                    for k in range(n_classes):
                        proba_out[i, k] = p[k] / sum_exps  # y_pred_k = prob of class k
                        # gradient_k = y_pred_k - (y_true == k)
                        gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k)

                free(p)
        else:
            with nogil, parallel(num_threads=n_threads):
                p = <floating_in *> malloc(sizeof(floating_in) * (n_classes + 2))

                for i in prange(n_samples, schedule='static'):
                    sum_exp_minus_max(i, raw_prediction, p)
                    sum_exps = p[n_classes + 1]  # p[-1]

                    for k in range(n_classes):
                        proba_out[i, k] = p[k] / sum_exps  # y_pred_k = prob of class k
                        # gradient_k = (p_k - (y_true == k)) * sw
                        gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i]

                free(p)