2391 lines
83 KiB
Python
2391 lines
83 KiB
Python
"""Kernels for Gaussian process regression and classification.
|
|
|
|
The kernels in this module allow kernel-engineering, i.e., they can be
|
|
combined via the "+" and "*" operators or be exponentiated with a scalar
|
|
via "**". These sum and product expressions can also contain scalar values,
|
|
which are automatically converted to a constant kernel.
|
|
|
|
All kernels allow (analytic) gradient-based hyperparameter optimization.
|
|
The space of hyperparameters can be specified by giving lower und upper
|
|
boundaries for the value of each hyperparameter (the search space is thus
|
|
rectangular). Instead of specifying bounds, hyperparameters can also be
|
|
declared to be "fixed", which causes these hyperparameters to be excluded from
|
|
optimization.
|
|
"""
|
|
|
|
# Author: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
|
# License: BSD 3 clause
|
|
|
|
# Note: this module is strongly inspired by the kernel module of the george
|
|
# package.
|
|
|
|
from abc import ABCMeta, abstractmethod
|
|
from collections import namedtuple
|
|
import math
|
|
from inspect import signature
|
|
|
|
import numpy as np
|
|
from scipy.special import kv, gamma
|
|
from scipy.spatial.distance import pdist, cdist, squareform
|
|
|
|
from ..metrics.pairwise import pairwise_kernels
|
|
from ..base import clone
|
|
from ..utils.validation import _num_samples
|
|
from ..exceptions import ConvergenceWarning
|
|
|
|
import warnings
|
|
|
|
|
|
def _check_length_scale(X, length_scale):
|
|
length_scale = np.squeeze(length_scale).astype(float)
|
|
if np.ndim(length_scale) > 1:
|
|
raise ValueError("length_scale cannot be of dimension greater than 1")
|
|
if np.ndim(length_scale) == 1 and X.shape[1] != length_scale.shape[0]:
|
|
raise ValueError(
|
|
"Anisotropic kernel must have the same number of "
|
|
"dimensions as data (%d!=%d)" % (length_scale.shape[0], X.shape[1])
|
|
)
|
|
return length_scale
|
|
|
|
|
|
class Hyperparameter(
|
|
namedtuple(
|
|
"Hyperparameter", ("name", "value_type", "bounds", "n_elements", "fixed")
|
|
)
|
|
):
|
|
"""A kernel hyperparameter's specification in form of a namedtuple.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Attributes
|
|
----------
|
|
name : str
|
|
The name of the hyperparameter. Note that a kernel using a
|
|
hyperparameter with name "x" must have the attributes self.x and
|
|
self.x_bounds
|
|
|
|
value_type : str
|
|
The type of the hyperparameter. Currently, only "numeric"
|
|
hyperparameters are supported.
|
|
|
|
bounds : pair of floats >= 0 or "fixed"
|
|
The lower and upper bound on the parameter. If n_elements>1, a pair
|
|
of 1d array with n_elements each may be given alternatively. If
|
|
the string "fixed" is passed as bounds, the hyperparameter's value
|
|
cannot be changed.
|
|
|
|
n_elements : int, default=1
|
|
The number of elements of the hyperparameter value. Defaults to 1,
|
|
which corresponds to a scalar hyperparameter. n_elements > 1
|
|
corresponds to a hyperparameter which is vector-valued,
|
|
such as, e.g., anisotropic length-scales.
|
|
|
|
fixed : bool, default=None
|
|
Whether the value of this hyperparameter is fixed, i.e., cannot be
|
|
changed during hyperparameter tuning. If None is passed, the "fixed" is
|
|
derived based on the given bounds.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.gaussian_process.kernels import ConstantKernel
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import Hyperparameter
|
|
>>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0)
|
|
>>> kernel = ConstantKernel(constant_value=1.0,
|
|
... constant_value_bounds=(0.0, 10.0))
|
|
|
|
We can access each hyperparameter:
|
|
|
|
>>> for hyperparameter in kernel.hyperparameters:
|
|
... print(hyperparameter)
|
|
Hyperparameter(name='constant_value', value_type='numeric',
|
|
bounds=array([[ 0., 10.]]), n_elements=1, fixed=False)
|
|
|
|
>>> params = kernel.get_params()
|
|
>>> for key in sorted(params): print(f"{key} : {params[key]}")
|
|
constant_value : 1.0
|
|
constant_value_bounds : (0.0, 10.0)
|
|
"""
|
|
|
|
# A raw namedtuple is very memory efficient as it packs the attributes
|
|
# in a struct to get rid of the __dict__ of attributes in particular it
|
|
# does not copy the string for the keys on each instance.
|
|
# By deriving a namedtuple class just to introduce the __init__ method we
|
|
# would also reintroduce the __dict__ on the instance. By telling the
|
|
# Python interpreter that this subclass uses static __slots__ instead of
|
|
# dynamic attributes. Furthermore we don't need any additional slot in the
|
|
# subclass so we set __slots__ to the empty tuple.
|
|
__slots__ = ()
|
|
|
|
def __new__(cls, name, value_type, bounds, n_elements=1, fixed=None):
|
|
if not isinstance(bounds, str) or bounds != "fixed":
|
|
bounds = np.atleast_2d(bounds)
|
|
if n_elements > 1: # vector-valued parameter
|
|
if bounds.shape[0] == 1:
|
|
bounds = np.repeat(bounds, n_elements, 0)
|
|
elif bounds.shape[0] != n_elements:
|
|
raise ValueError(
|
|
"Bounds on %s should have either 1 or "
|
|
"%d dimensions. Given are %d"
|
|
% (name, n_elements, bounds.shape[0])
|
|
)
|
|
|
|
if fixed is None:
|
|
fixed = isinstance(bounds, str) and bounds == "fixed"
|
|
return super(Hyperparameter, cls).__new__(
|
|
cls, name, value_type, bounds, n_elements, fixed
|
|
)
|
|
|
|
# This is mainly a testing utility to check that two hyperparameters
|
|
# are equal.
|
|
def __eq__(self, other):
|
|
return (
|
|
self.name == other.name
|
|
and self.value_type == other.value_type
|
|
and np.all(self.bounds == other.bounds)
|
|
and self.n_elements == other.n_elements
|
|
and self.fixed == other.fixed
|
|
)
|
|
|
|
|
|
class Kernel(metaclass=ABCMeta):
|
|
"""Base class for all kernels.
|
|
|
|
.. versionadded:: 0.18
|
|
"""
|
|
|
|
def get_params(self, deep=True):
|
|
"""Get parameters of this kernel.
|
|
|
|
Parameters
|
|
----------
|
|
deep : bool, default=True
|
|
If True, will return the parameters for this estimator and
|
|
contained subobjects that are estimators.
|
|
|
|
Returns
|
|
-------
|
|
params : dict
|
|
Parameter names mapped to their values.
|
|
"""
|
|
params = dict()
|
|
|
|
# introspect the constructor arguments to find the model parameters
|
|
# to represent
|
|
cls = self.__class__
|
|
init = getattr(cls.__init__, "deprecated_original", cls.__init__)
|
|
init_sign = signature(init)
|
|
args, varargs = [], []
|
|
for parameter in init_sign.parameters.values():
|
|
if parameter.kind != parameter.VAR_KEYWORD and parameter.name != "self":
|
|
args.append(parameter.name)
|
|
if parameter.kind == parameter.VAR_POSITIONAL:
|
|
varargs.append(parameter.name)
|
|
|
|
if len(varargs) != 0:
|
|
raise RuntimeError(
|
|
"scikit-learn kernels should always "
|
|
"specify their parameters in the signature"
|
|
" of their __init__ (no varargs)."
|
|
" %s doesn't follow this convention." % (cls,)
|
|
)
|
|
for arg in args:
|
|
params[arg] = getattr(self, arg)
|
|
|
|
return params
|
|
|
|
def set_params(self, **params):
|
|
"""Set the parameters of this kernel.
|
|
|
|
The method works on simple kernels as well as on nested kernels.
|
|
The latter have parameters of the form ``<component>__<parameter>``
|
|
so that it's possible to update each component of a nested object.
|
|
|
|
Returns
|
|
-------
|
|
self
|
|
"""
|
|
if not params:
|
|
# Simple optimisation to gain speed (inspect is slow)
|
|
return self
|
|
valid_params = self.get_params(deep=True)
|
|
for key, value in params.items():
|
|
split = key.split("__", 1)
|
|
if len(split) > 1:
|
|
# nested objects case
|
|
name, sub_name = split
|
|
if name not in valid_params:
|
|
raise ValueError(
|
|
"Invalid parameter %s for kernel %s. "
|
|
"Check the list of available parameters "
|
|
"with `kernel.get_params().keys()`." % (name, self)
|
|
)
|
|
sub_object = valid_params[name]
|
|
sub_object.set_params(**{sub_name: value})
|
|
else:
|
|
# simple objects case
|
|
if key not in valid_params:
|
|
raise ValueError(
|
|
"Invalid parameter %s for kernel %s. "
|
|
"Check the list of available parameters "
|
|
"with `kernel.get_params().keys()`."
|
|
% (key, self.__class__.__name__)
|
|
)
|
|
setattr(self, key, value)
|
|
return self
|
|
|
|
def clone_with_theta(self, theta):
|
|
"""Returns a clone of self with given hyperparameters theta.
|
|
|
|
Parameters
|
|
----------
|
|
theta : ndarray of shape (n_dims,)
|
|
The hyperparameters
|
|
"""
|
|
cloned = clone(self)
|
|
cloned.theta = theta
|
|
return cloned
|
|
|
|
@property
|
|
def n_dims(self):
|
|
"""Returns the number of non-fixed hyperparameters of the kernel."""
|
|
return self.theta.shape[0]
|
|
|
|
@property
|
|
def hyperparameters(self):
|
|
"""Returns a list of all hyperparameter specifications."""
|
|
r = [
|
|
getattr(self, attr)
|
|
for attr in dir(self)
|
|
if attr.startswith("hyperparameter_")
|
|
]
|
|
return r
|
|
|
|
@property
|
|
def theta(self):
|
|
"""Returns the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Note that theta are typically the log-transformed values of the
|
|
kernel's hyperparameters as this representation of the search space
|
|
is more amenable for hyperparameter search, as hyperparameters like
|
|
length-scales naturally live on a log-scale.
|
|
|
|
Returns
|
|
-------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
theta = []
|
|
params = self.get_params()
|
|
for hyperparameter in self.hyperparameters:
|
|
if not hyperparameter.fixed:
|
|
theta.append(params[hyperparameter.name])
|
|
if len(theta) > 0:
|
|
return np.log(np.hstack(theta))
|
|
else:
|
|
return np.array([])
|
|
|
|
@theta.setter
|
|
def theta(self, theta):
|
|
"""Sets the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Parameters
|
|
----------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
params = self.get_params()
|
|
i = 0
|
|
for hyperparameter in self.hyperparameters:
|
|
if hyperparameter.fixed:
|
|
continue
|
|
if hyperparameter.n_elements > 1:
|
|
# vector-valued parameter
|
|
params[hyperparameter.name] = np.exp(
|
|
theta[i : i + hyperparameter.n_elements]
|
|
)
|
|
i += hyperparameter.n_elements
|
|
else:
|
|
params[hyperparameter.name] = np.exp(theta[i])
|
|
i += 1
|
|
|
|
if i != len(theta):
|
|
raise ValueError(
|
|
"theta has not the correct number of entries."
|
|
" Should be %d; given are %d" % (i, len(theta))
|
|
)
|
|
self.set_params(**params)
|
|
|
|
@property
|
|
def bounds(self):
|
|
"""Returns the log-transformed bounds on the theta.
|
|
|
|
Returns
|
|
-------
|
|
bounds : ndarray of shape (n_dims, 2)
|
|
The log-transformed bounds on the kernel's hyperparameters theta
|
|
"""
|
|
bounds = [
|
|
hyperparameter.bounds
|
|
for hyperparameter in self.hyperparameters
|
|
if not hyperparameter.fixed
|
|
]
|
|
if len(bounds) > 0:
|
|
return np.log(np.vstack(bounds))
|
|
else:
|
|
return np.array([])
|
|
|
|
def __add__(self, b):
|
|
if not isinstance(b, Kernel):
|
|
return Sum(self, ConstantKernel(b))
|
|
return Sum(self, b)
|
|
|
|
def __radd__(self, b):
|
|
if not isinstance(b, Kernel):
|
|
return Sum(ConstantKernel(b), self)
|
|
return Sum(b, self)
|
|
|
|
def __mul__(self, b):
|
|
if not isinstance(b, Kernel):
|
|
return Product(self, ConstantKernel(b))
|
|
return Product(self, b)
|
|
|
|
def __rmul__(self, b):
|
|
if not isinstance(b, Kernel):
|
|
return Product(ConstantKernel(b), self)
|
|
return Product(b, self)
|
|
|
|
def __pow__(self, b):
|
|
return Exponentiation(self, b)
|
|
|
|
def __eq__(self, b):
|
|
if type(self) != type(b):
|
|
return False
|
|
params_a = self.get_params()
|
|
params_b = b.get_params()
|
|
for key in set(list(params_a.keys()) + list(params_b.keys())):
|
|
if np.any(params_a.get(key, None) != params_b.get(key, None)):
|
|
return False
|
|
return True
|
|
|
|
def __repr__(self):
|
|
return "{0}({1})".format(
|
|
self.__class__.__name__, ", ".join(map("{0:.3g}".format, self.theta))
|
|
)
|
|
|
|
@abstractmethod
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Evaluate the kernel."""
|
|
|
|
@abstractmethod
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples,)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
|
|
@abstractmethod
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
|
|
@property
|
|
def requires_vector_input(self):
|
|
"""Returns whether the kernel is defined on fixed-length feature
|
|
vectors or generic objects. Defaults to True for backward
|
|
compatibility."""
|
|
return True
|
|
|
|
def _check_bounds_params(self):
|
|
"""Called after fitting to warn if bounds may have been too tight."""
|
|
list_close = np.isclose(self.bounds, np.atleast_2d(self.theta).T)
|
|
idx = 0
|
|
for hyp in self.hyperparameters:
|
|
if hyp.fixed:
|
|
continue
|
|
for dim in range(hyp.n_elements):
|
|
if list_close[idx, 0]:
|
|
warnings.warn(
|
|
"The optimal value found for "
|
|
"dimension %s of parameter %s is "
|
|
"close to the specified lower "
|
|
"bound %s. Decreasing the bound and"
|
|
" calling fit again may find a "
|
|
"better value." % (dim, hyp.name, hyp.bounds[dim][0]),
|
|
ConvergenceWarning,
|
|
)
|
|
elif list_close[idx, 1]:
|
|
warnings.warn(
|
|
"The optimal value found for "
|
|
"dimension %s of parameter %s is "
|
|
"close to the specified upper "
|
|
"bound %s. Increasing the bound and"
|
|
" calling fit again may find a "
|
|
"better value." % (dim, hyp.name, hyp.bounds[dim][1]),
|
|
ConvergenceWarning,
|
|
)
|
|
idx += 1
|
|
|
|
|
|
class NormalizedKernelMixin:
|
|
"""Mixin for kernels which are normalized: k(X, X)=1.
|
|
|
|
.. versionadded:: 0.18
|
|
"""
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return np.ones(X.shape[0])
|
|
|
|
|
|
class StationaryKernelMixin:
|
|
"""Mixin for kernels which are stationary: k(X, Y)= f(X-Y).
|
|
|
|
.. versionadded:: 0.18
|
|
"""
|
|
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return True
|
|
|
|
|
|
class GenericKernelMixin:
|
|
"""Mixin for kernels which operate on generic objects such as variable-
|
|
length sequences, trees, and graphs.
|
|
|
|
.. versionadded:: 0.22
|
|
"""
|
|
|
|
@property
|
|
def requires_vector_input(self):
|
|
"""Whether the kernel works only on fixed-length feature vectors."""
|
|
return False
|
|
|
|
|
|
class CompoundKernel(Kernel):
|
|
"""Kernel which is composed of a set of other kernels.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
kernels : list of Kernels
|
|
The other kernels
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.gaussian_process.kernels import WhiteKernel
|
|
>>> from sklearn.gaussian_process.kernels import RBF
|
|
>>> from sklearn.gaussian_process.kernels import CompoundKernel
|
|
>>> kernel = CompoundKernel(
|
|
... [WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)])
|
|
>>> print(kernel.bounds)
|
|
[[-11.51292546 11.51292546]
|
|
[-11.51292546 11.51292546]]
|
|
>>> print(kernel.n_dims)
|
|
2
|
|
>>> print(kernel.theta)
|
|
[1.09861229 0.69314718]
|
|
"""
|
|
|
|
def __init__(self, kernels):
|
|
self.kernels = kernels
|
|
|
|
def get_params(self, deep=True):
|
|
"""Get parameters of this kernel.
|
|
|
|
Parameters
|
|
----------
|
|
deep : bool, default=True
|
|
If True, will return the parameters for this estimator and
|
|
contained subobjects that are estimators.
|
|
|
|
Returns
|
|
-------
|
|
params : dict
|
|
Parameter names mapped to their values.
|
|
"""
|
|
return dict(kernels=self.kernels)
|
|
|
|
@property
|
|
def theta(self):
|
|
"""Returns the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Note that theta are typically the log-transformed values of the
|
|
kernel's hyperparameters as this representation of the search space
|
|
is more amenable for hyperparameter search, as hyperparameters like
|
|
length-scales naturally live on a log-scale.
|
|
|
|
Returns
|
|
-------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
return np.hstack([kernel.theta for kernel in self.kernels])
|
|
|
|
@theta.setter
|
|
def theta(self, theta):
|
|
"""Sets the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Parameters
|
|
----------
|
|
theta : array of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
k_dims = self.k1.n_dims
|
|
for i, kernel in enumerate(self.kernels):
|
|
kernel.theta = theta[i * k_dims : (i + 1) * k_dims]
|
|
|
|
@property
|
|
def bounds(self):
|
|
"""Returns the log-transformed bounds on the theta.
|
|
|
|
Returns
|
|
-------
|
|
bounds : array of shape (n_dims, 2)
|
|
The log-transformed bounds on the kernel's hyperparameters theta
|
|
"""
|
|
return np.vstack([kernel.bounds for kernel in self.kernels])
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Note that this compound kernel returns the results of all simple kernel
|
|
stacked along an additional axis.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object, \
|
|
default=None
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : array-like of shape (n_samples_X, n_features) or list of object, \
|
|
default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
is evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of the
|
|
kernel hyperparameter is computed.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y, n_kernels)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape \
|
|
(n_samples_X, n_samples_X, n_dims, n_kernels), optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
if eval_gradient:
|
|
K = []
|
|
K_grad = []
|
|
for kernel in self.kernels:
|
|
K_single, K_grad_single = kernel(X, Y, eval_gradient)
|
|
K.append(K_single)
|
|
K_grad.append(K_grad_single[..., np.newaxis])
|
|
return np.dstack(K), np.concatenate(K_grad, 3)
|
|
else:
|
|
return np.dstack([kernel(X, Y, eval_gradient) for kernel in self.kernels])
|
|
|
|
def __eq__(self, b):
|
|
if type(self) != type(b) or len(self.kernels) != len(b.kernels):
|
|
return False
|
|
return np.all(
|
|
[self.kernels[i] == b.kernels[i] for i in range(len(self.kernels))]
|
|
)
|
|
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return np.all([kernel.is_stationary() for kernel in self.kernels])
|
|
|
|
@property
|
|
def requires_vector_input(self):
|
|
"""Returns whether the kernel is defined on discrete structures."""
|
|
return np.any([kernel.requires_vector_input for kernel in self.kernels])
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to `np.diag(self(X))`; however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Argument to the kernel.
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X, n_kernels)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return np.vstack([kernel.diag(X) for kernel in self.kernels]).T
|
|
|
|
|
|
class KernelOperator(Kernel):
|
|
"""Base class for all kernel operators.
|
|
|
|
.. versionadded:: 0.18
|
|
"""
|
|
|
|
def __init__(self, k1, k2):
|
|
self.k1 = k1
|
|
self.k2 = k2
|
|
|
|
def get_params(self, deep=True):
|
|
"""Get parameters of this kernel.
|
|
|
|
Parameters
|
|
----------
|
|
deep : bool, default=True
|
|
If True, will return the parameters for this estimator and
|
|
contained subobjects that are estimators.
|
|
|
|
Returns
|
|
-------
|
|
params : dict
|
|
Parameter names mapped to their values.
|
|
"""
|
|
params = dict(k1=self.k1, k2=self.k2)
|
|
if deep:
|
|
deep_items = self.k1.get_params().items()
|
|
params.update(("k1__" + k, val) for k, val in deep_items)
|
|
deep_items = self.k2.get_params().items()
|
|
params.update(("k2__" + k, val) for k, val in deep_items)
|
|
|
|
return params
|
|
|
|
@property
|
|
def hyperparameters(self):
|
|
"""Returns a list of all hyperparameter."""
|
|
r = [
|
|
Hyperparameter(
|
|
"k1__" + hyperparameter.name,
|
|
hyperparameter.value_type,
|
|
hyperparameter.bounds,
|
|
hyperparameter.n_elements,
|
|
)
|
|
for hyperparameter in self.k1.hyperparameters
|
|
]
|
|
|
|
for hyperparameter in self.k2.hyperparameters:
|
|
r.append(
|
|
Hyperparameter(
|
|
"k2__" + hyperparameter.name,
|
|
hyperparameter.value_type,
|
|
hyperparameter.bounds,
|
|
hyperparameter.n_elements,
|
|
)
|
|
)
|
|
return r
|
|
|
|
@property
|
|
def theta(self):
|
|
"""Returns the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Note that theta are typically the log-transformed values of the
|
|
kernel's hyperparameters as this representation of the search space
|
|
is more amenable for hyperparameter search, as hyperparameters like
|
|
length-scales naturally live on a log-scale.
|
|
|
|
Returns
|
|
-------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
return np.append(self.k1.theta, self.k2.theta)
|
|
|
|
@theta.setter
|
|
def theta(self, theta):
|
|
"""Sets the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Parameters
|
|
----------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
k1_dims = self.k1.n_dims
|
|
self.k1.theta = theta[:k1_dims]
|
|
self.k2.theta = theta[k1_dims:]
|
|
|
|
@property
|
|
def bounds(self):
|
|
"""Returns the log-transformed bounds on the theta.
|
|
|
|
Returns
|
|
-------
|
|
bounds : ndarray of shape (n_dims, 2)
|
|
The log-transformed bounds on the kernel's hyperparameters theta
|
|
"""
|
|
if self.k1.bounds.size == 0:
|
|
return self.k2.bounds
|
|
if self.k2.bounds.size == 0:
|
|
return self.k1.bounds
|
|
return np.vstack((self.k1.bounds, self.k2.bounds))
|
|
|
|
def __eq__(self, b):
|
|
if type(self) != type(b):
|
|
return False
|
|
return (self.k1 == b.k1 and self.k2 == b.k2) or (
|
|
self.k1 == b.k2 and self.k2 == b.k1
|
|
)
|
|
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return self.k1.is_stationary() and self.k2.is_stationary()
|
|
|
|
@property
|
|
def requires_vector_input(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return self.k1.requires_vector_input or self.k2.requires_vector_input
|
|
|
|
|
|
class Sum(KernelOperator):
|
|
"""The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2`
|
|
and combines them via
|
|
|
|
.. math::
|
|
k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y)
|
|
|
|
Note that the `__add__` magic method is overridden, so
|
|
`Sum(RBF(), RBF())` is equivalent to using the + operator
|
|
with `RBF() + RBF()`.
|
|
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
k1 : Kernel
|
|
The first base-kernel of the sum-kernel
|
|
|
|
k2 : Kernel
|
|
The second base-kernel of the sum-kernel
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import RBF, Sum, ConstantKernel
|
|
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
|
>>> kernel = Sum(ConstantKernel(2), RBF())
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
1.0
|
|
>>> kernel
|
|
1.41**2 + RBF(length_scale=1)
|
|
"""
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : array-like of shape (n_samples_X, n_features) or list of object,\
|
|
default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
is evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
if eval_gradient:
|
|
K1, K1_gradient = self.k1(X, Y, eval_gradient=True)
|
|
K2, K2_gradient = self.k2(X, Y, eval_gradient=True)
|
|
return K1 + K2, np.dstack((K1_gradient, K2_gradient))
|
|
else:
|
|
return self.k1(X, Y) + self.k2(X, Y)
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to `np.diag(self(X))`; however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Argument to the kernel.
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return self.k1.diag(X) + self.k2.diag(X)
|
|
|
|
def __repr__(self):
|
|
return "{0} + {1}".format(self.k1, self.k2)
|
|
|
|
|
|
class Product(KernelOperator):
|
|
"""The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2`
|
|
and combines them via
|
|
|
|
.. math::
|
|
k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y)
|
|
|
|
Note that the `__mul__` magic method is overridden, so
|
|
`Product(RBF(), RBF())` is equivalent to using the * operator
|
|
with `RBF() * RBF()`.
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
k1 : Kernel
|
|
The first base-kernel of the product-kernel
|
|
|
|
k2 : Kernel
|
|
The second base-kernel of the product-kernel
|
|
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import (RBF, Product,
|
|
... ConstantKernel)
|
|
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
|
>>> kernel = Product(ConstantKernel(2), RBF())
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
1.0
|
|
>>> kernel
|
|
1.41**2 * RBF(length_scale=1)
|
|
"""
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : array-like of shape (n_samples_Y, n_features) or list of object,\
|
|
default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
is evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
if eval_gradient:
|
|
K1, K1_gradient = self.k1(X, Y, eval_gradient=True)
|
|
K2, K2_gradient = self.k2(X, Y, eval_gradient=True)
|
|
return K1 * K2, np.dstack(
|
|
(K1_gradient * K2[:, :, np.newaxis], K2_gradient * K1[:, :, np.newaxis])
|
|
)
|
|
else:
|
|
return self.k1(X, Y) * self.k2(X, Y)
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Argument to the kernel.
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return self.k1.diag(X) * self.k2.diag(X)
|
|
|
|
def __repr__(self):
|
|
return "{0} * {1}".format(self.k1, self.k2)
|
|
|
|
|
|
class Exponentiation(Kernel):
|
|
"""The Exponentiation kernel takes one base kernel and a scalar parameter
|
|
:math:`p` and combines them via
|
|
|
|
.. math::
|
|
k_{exp}(X, Y) = k(X, Y) ^p
|
|
|
|
Note that the `__pow__` magic method is overridden, so
|
|
`Exponentiation(RBF(), 2)` is equivalent to using the ** operator
|
|
with `RBF() ** 2`.
|
|
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
kernel : Kernel
|
|
The base kernel
|
|
|
|
exponent : float
|
|
The exponent for the base kernel
|
|
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import (RationalQuadratic,
|
|
... Exponentiation)
|
|
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
|
>>> kernel = Exponentiation(RationalQuadratic(), exponent=2)
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
0.419...
|
|
>>> gpr.predict(X[:1,:], return_std=True)
|
|
(array([635.5...]), array([0.559...]))
|
|
"""
|
|
|
|
def __init__(self, kernel, exponent):
|
|
self.kernel = kernel
|
|
self.exponent = exponent
|
|
|
|
def get_params(self, deep=True):
|
|
"""Get parameters of this kernel.
|
|
|
|
Parameters
|
|
----------
|
|
deep : bool, default=True
|
|
If True, will return the parameters for this estimator and
|
|
contained subobjects that are estimators.
|
|
|
|
Returns
|
|
-------
|
|
params : dict
|
|
Parameter names mapped to their values.
|
|
"""
|
|
params = dict(kernel=self.kernel, exponent=self.exponent)
|
|
if deep:
|
|
deep_items = self.kernel.get_params().items()
|
|
params.update(("kernel__" + k, val) for k, val in deep_items)
|
|
return params
|
|
|
|
@property
|
|
def hyperparameters(self):
|
|
"""Returns a list of all hyperparameter."""
|
|
r = []
|
|
for hyperparameter in self.kernel.hyperparameters:
|
|
r.append(
|
|
Hyperparameter(
|
|
"kernel__" + hyperparameter.name,
|
|
hyperparameter.value_type,
|
|
hyperparameter.bounds,
|
|
hyperparameter.n_elements,
|
|
)
|
|
)
|
|
return r
|
|
|
|
@property
|
|
def theta(self):
|
|
"""Returns the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Note that theta are typically the log-transformed values of the
|
|
kernel's hyperparameters as this representation of the search space
|
|
is more amenable for hyperparameter search, as hyperparameters like
|
|
length-scales naturally live on a log-scale.
|
|
|
|
Returns
|
|
-------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
return self.kernel.theta
|
|
|
|
@theta.setter
|
|
def theta(self, theta):
|
|
"""Sets the (flattened, log-transformed) non-fixed hyperparameters.
|
|
|
|
Parameters
|
|
----------
|
|
theta : ndarray of shape (n_dims,)
|
|
The non-fixed, log-transformed hyperparameters of the kernel
|
|
"""
|
|
self.kernel.theta = theta
|
|
|
|
@property
|
|
def bounds(self):
|
|
"""Returns the log-transformed bounds on the theta.
|
|
|
|
Returns
|
|
-------
|
|
bounds : ndarray of shape (n_dims, 2)
|
|
The log-transformed bounds on the kernel's hyperparameters theta
|
|
"""
|
|
return self.kernel.bounds
|
|
|
|
def __eq__(self, b):
|
|
if type(self) != type(b):
|
|
return False
|
|
return self.kernel == b.kernel and self.exponent == b.exponent
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : array-like of shape (n_samples_Y, n_features) or list of object,\
|
|
default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
is evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
if eval_gradient:
|
|
K, K_gradient = self.kernel(X, Y, eval_gradient=True)
|
|
K_gradient *= self.exponent * K[:, :, np.newaxis] ** (self.exponent - 1)
|
|
return K**self.exponent, K_gradient
|
|
else:
|
|
K = self.kernel(X, Y, eval_gradient=False)
|
|
return K**self.exponent
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Argument to the kernel.
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return self.kernel.diag(X) ** self.exponent
|
|
|
|
def __repr__(self):
|
|
return "{0} ** {1}".format(self.kernel, self.exponent)
|
|
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return self.kernel.is_stationary()
|
|
|
|
@property
|
|
def requires_vector_input(self):
|
|
"""Returns whether the kernel is defined on discrete structures."""
|
|
return self.kernel.requires_vector_input
|
|
|
|
|
|
class ConstantKernel(StationaryKernelMixin, GenericKernelMixin, Kernel):
|
|
"""Constant kernel.
|
|
|
|
Can be used as part of a product-kernel where it scales the magnitude of
|
|
the other factor (kernel) or as part of a sum-kernel, where it modifies
|
|
the mean of the Gaussian process.
|
|
|
|
.. math::
|
|
k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2
|
|
|
|
Adding a constant kernel is equivalent to adding a constant::
|
|
|
|
kernel = RBF() + ConstantKernel(constant_value=2)
|
|
|
|
is the same as::
|
|
|
|
kernel = RBF() + 2
|
|
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
constant_value : float, default=1.0
|
|
The constant value which defines the covariance:
|
|
k(x_1, x_2) = constant_value
|
|
|
|
constant_value_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on `constant_value`.
|
|
If set to "fixed", `constant_value` cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import RBF, ConstantKernel
|
|
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
|
>>> kernel = RBF() + ConstantKernel(constant_value=2)
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
0.3696...
|
|
>>> gpr.predict(X[:1,:], return_std=True)
|
|
(array([606.1...]), array([0.24...]))
|
|
"""
|
|
|
|
def __init__(self, constant_value=1.0, constant_value_bounds=(1e-5, 1e5)):
|
|
self.constant_value = constant_value
|
|
self.constant_value_bounds = constant_value_bounds
|
|
|
|
@property
|
|
def hyperparameter_constant_value(self):
|
|
return Hyperparameter("constant_value", "numeric", self.constant_value_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : array-like of shape (n_samples_X, n_features) or list of object, \
|
|
default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
is evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when eval_gradient
|
|
is True.
|
|
"""
|
|
if Y is None:
|
|
Y = X
|
|
elif eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
|
|
K = np.full(
|
|
(_num_samples(X), _num_samples(Y)),
|
|
self.constant_value,
|
|
dtype=np.array(self.constant_value).dtype,
|
|
)
|
|
if eval_gradient:
|
|
if not self.hyperparameter_constant_value.fixed:
|
|
return (
|
|
K,
|
|
np.full(
|
|
(_num_samples(X), _num_samples(X), 1),
|
|
self.constant_value,
|
|
dtype=np.array(self.constant_value).dtype,
|
|
),
|
|
)
|
|
else:
|
|
return K, np.empty((_num_samples(X), _num_samples(X), 0))
|
|
else:
|
|
return K
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Argument to the kernel.
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return np.full(
|
|
_num_samples(X),
|
|
self.constant_value,
|
|
dtype=np.array(self.constant_value).dtype,
|
|
)
|
|
|
|
def __repr__(self):
|
|
return "{0:.3g}**2".format(np.sqrt(self.constant_value))
|
|
|
|
|
|
class WhiteKernel(StationaryKernelMixin, GenericKernelMixin, Kernel):
|
|
"""White kernel.
|
|
|
|
The main use-case of this kernel is as part of a sum-kernel where it
|
|
explains the noise of the signal as independently and identically
|
|
normally-distributed. The parameter noise_level equals the variance of this
|
|
noise.
|
|
|
|
.. math::
|
|
k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0
|
|
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
noise_level : float, default=1.0
|
|
Parameter controlling the noise level (variance)
|
|
|
|
noise_level_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'noise_level'.
|
|
If set to "fixed", 'noise_level' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
|
|
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
|
>>> kernel = DotProduct() + WhiteKernel(noise_level=0.5)
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
0.3680...
|
|
>>> gpr.predict(X[:2,:], return_std=True)
|
|
(array([653.0..., 592.1... ]), array([316.6..., 316.6...]))
|
|
"""
|
|
|
|
def __init__(self, noise_level=1.0, noise_level_bounds=(1e-5, 1e5)):
|
|
self.noise_level = noise_level
|
|
self.noise_level_bounds = noise_level_bounds
|
|
|
|
@property
|
|
def hyperparameter_noise_level(self):
|
|
return Hyperparameter("noise_level", "numeric", self.noise_level_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : array-like of shape (n_samples_X, n_features) or list of object,\
|
|
default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
is evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when eval_gradient
|
|
is True.
|
|
"""
|
|
if Y is not None and eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
|
|
if Y is None:
|
|
K = self.noise_level * np.eye(_num_samples(X))
|
|
if eval_gradient:
|
|
if not self.hyperparameter_noise_level.fixed:
|
|
return (
|
|
K,
|
|
self.noise_level * np.eye(_num_samples(X))[:, :, np.newaxis],
|
|
)
|
|
else:
|
|
return K, np.empty((_num_samples(X), _num_samples(X), 0))
|
|
else:
|
|
return K
|
|
else:
|
|
return np.zeros((_num_samples(X), _num_samples(Y)))
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples_X, n_features) or list of object
|
|
Argument to the kernel.
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
return np.full(
|
|
_num_samples(X), self.noise_level, dtype=np.array(self.noise_level).dtype
|
|
)
|
|
|
|
def __repr__(self):
|
|
return "{0}(noise_level={1:.3g})".format(
|
|
self.__class__.__name__, self.noise_level
|
|
)
|
|
|
|
|
|
class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
|
|
"""Radial basis function kernel (aka squared-exponential kernel).
|
|
|
|
The RBF kernel is a stationary kernel. It is also known as the
|
|
"squared exponential" kernel. It is parameterized by a length scale
|
|
parameter :math:`l>0`, which can either be a scalar (isotropic variant
|
|
of the kernel) or a vector with the same number of dimensions as the inputs
|
|
X (anisotropic variant of the kernel). The kernel is given by:
|
|
|
|
.. math::
|
|
k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right)
|
|
|
|
where :math:`l` is the length scale of the kernel and
|
|
:math:`d(\\cdot,\\cdot)` is the Euclidean distance.
|
|
For advice on how to set the length scale parameter, see e.g. [1]_.
|
|
|
|
This kernel is infinitely differentiable, which implies that GPs with this
|
|
kernel as covariance function have mean square derivatives of all orders,
|
|
and are thus very smooth.
|
|
See [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel.
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
length_scale : float or ndarray of shape (n_features,), default=1.0
|
|
The length scale of the kernel. If a float, an isotropic kernel is
|
|
used. If an array, an anisotropic kernel is used where each dimension
|
|
of l defines the length-scale of the respective feature dimension.
|
|
|
|
length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'length_scale'.
|
|
If set to "fixed", 'length_scale' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
References
|
|
----------
|
|
.. [1] `David Duvenaud (2014). "The Kernel Cookbook:
|
|
Advice on Covariance functions".
|
|
<https://www.cs.toronto.edu/~duvenaud/cookbook/>`_
|
|
|
|
.. [2] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).
|
|
"Gaussian Processes for Machine Learning". The MIT Press.
|
|
<http://www.gaussianprocess.org/gpml/>`_
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_iris
|
|
>>> from sklearn.gaussian_process import GaussianProcessClassifier
|
|
>>> from sklearn.gaussian_process.kernels import RBF
|
|
>>> X, y = load_iris(return_X_y=True)
|
|
>>> kernel = 1.0 * RBF(1.0)
|
|
>>> gpc = GaussianProcessClassifier(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpc.score(X, y)
|
|
0.9866...
|
|
>>> gpc.predict_proba(X[:2,:])
|
|
array([[0.8354..., 0.03228..., 0.1322...],
|
|
[0.7906..., 0.0652..., 0.1441...]])
|
|
"""
|
|
|
|
def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5)):
|
|
self.length_scale = length_scale
|
|
self.length_scale_bounds = length_scale_bounds
|
|
|
|
@property
|
|
def anisotropic(self):
|
|
return np.iterable(self.length_scale) and len(self.length_scale) > 1
|
|
|
|
@property
|
|
def hyperparameter_length_scale(self):
|
|
if self.anisotropic:
|
|
return Hyperparameter(
|
|
"length_scale",
|
|
"numeric",
|
|
self.length_scale_bounds,
|
|
len(self.length_scale),
|
|
)
|
|
return Hyperparameter("length_scale", "numeric", self.length_scale_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
if evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
X = np.atleast_2d(X)
|
|
length_scale = _check_length_scale(X, self.length_scale)
|
|
if Y is None:
|
|
dists = pdist(X / length_scale, metric="sqeuclidean")
|
|
K = np.exp(-0.5 * dists)
|
|
# convert from upper-triangular matrix to square matrix
|
|
K = squareform(K)
|
|
np.fill_diagonal(K, 1)
|
|
else:
|
|
if eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
dists = cdist(X / length_scale, Y / length_scale, metric="sqeuclidean")
|
|
K = np.exp(-0.5 * dists)
|
|
|
|
if eval_gradient:
|
|
if self.hyperparameter_length_scale.fixed:
|
|
# Hyperparameter l kept fixed
|
|
return K, np.empty((X.shape[0], X.shape[0], 0))
|
|
elif not self.anisotropic or length_scale.shape[0] == 1:
|
|
K_gradient = (K * squareform(dists))[:, :, np.newaxis]
|
|
return K, K_gradient
|
|
elif self.anisotropic:
|
|
# We need to recompute the pairwise dimension-wise distances
|
|
K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / (
|
|
length_scale**2
|
|
)
|
|
K_gradient *= K[..., np.newaxis]
|
|
return K, K_gradient
|
|
else:
|
|
return K
|
|
|
|
def __repr__(self):
|
|
if self.anisotropic:
|
|
return "{0}(length_scale=[{1}])".format(
|
|
self.__class__.__name__,
|
|
", ".join(map("{0:.3g}".format, self.length_scale)),
|
|
)
|
|
else: # isotropic
|
|
return "{0}(length_scale={1:.3g})".format(
|
|
self.__class__.__name__, np.ravel(self.length_scale)[0]
|
|
)
|
|
|
|
|
|
class Matern(RBF):
|
|
"""Matern kernel.
|
|
|
|
The class of Matern kernels is a generalization of the :class:`RBF`.
|
|
It has an additional parameter :math:`\\nu` which controls the
|
|
smoothness of the resulting function. The smaller :math:`\\nu`,
|
|
the less smooth the approximated function is.
|
|
As :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to
|
|
the :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Matérn kernel
|
|
becomes identical to the absolute exponential kernel.
|
|
Important intermediate values are
|
|
:math:`\\nu=1.5` (once differentiable functions)
|
|
and :math:`\\nu=2.5` (twice differentiable functions).
|
|
|
|
The kernel is given by:
|
|
|
|
.. math::
|
|
k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg(
|
|
\\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )
|
|
\\Bigg)^\\nu K_\\nu\\Bigg(
|
|
\\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg)
|
|
|
|
|
|
|
|
where :math:`d(\\cdot,\\cdot)` is the Euclidean distance,
|
|
:math:`K_{\\nu}(\\cdot)` is a modified Bessel function and
|
|
:math:`\\Gamma(\\cdot)` is the gamma function.
|
|
See [1]_, Chapter 4, Section 4.2, for details regarding the different
|
|
variants of the Matern kernel.
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
length_scale : float or ndarray of shape (n_features,), default=1.0
|
|
The length scale of the kernel. If a float, an isotropic kernel is
|
|
used. If an array, an anisotropic kernel is used where each dimension
|
|
of l defines the length-scale of the respective feature dimension.
|
|
|
|
length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'length_scale'.
|
|
If set to "fixed", 'length_scale' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
nu : float, default=1.5
|
|
The parameter nu controlling the smoothness of the learned function.
|
|
The smaller nu, the less smooth the approximated function is.
|
|
For nu=inf, the kernel becomes equivalent to the RBF kernel and for
|
|
nu=0.5 to the absolute exponential kernel. Important intermediate
|
|
values are nu=1.5 (once differentiable functions) and nu=2.5
|
|
(twice differentiable functions). Note that values of nu not in
|
|
[0.5, 1.5, 2.5, inf] incur a considerably higher computational cost
|
|
(appr. 10 times higher) since they require to evaluate the modified
|
|
Bessel function. Furthermore, in contrast to l, nu is kept fixed to
|
|
its initial value and not optimized.
|
|
|
|
References
|
|
----------
|
|
.. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).
|
|
"Gaussian Processes for Machine Learning". The MIT Press.
|
|
<http://www.gaussianprocess.org/gpml/>`_
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_iris
|
|
>>> from sklearn.gaussian_process import GaussianProcessClassifier
|
|
>>> from sklearn.gaussian_process.kernels import Matern
|
|
>>> X, y = load_iris(return_X_y=True)
|
|
>>> kernel = 1.0 * Matern(length_scale=1.0, nu=1.5)
|
|
>>> gpc = GaussianProcessClassifier(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpc.score(X, y)
|
|
0.9866...
|
|
>>> gpc.predict_proba(X[:2,:])
|
|
array([[0.8513..., 0.0368..., 0.1117...],
|
|
[0.8086..., 0.0693..., 0.1220...]])
|
|
"""
|
|
|
|
def __init__(self, length_scale=1.0, length_scale_bounds=(1e-5, 1e5), nu=1.5):
|
|
super().__init__(length_scale, length_scale_bounds)
|
|
self.nu = nu
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
if evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
X = np.atleast_2d(X)
|
|
length_scale = _check_length_scale(X, self.length_scale)
|
|
if Y is None:
|
|
dists = pdist(X / length_scale, metric="euclidean")
|
|
else:
|
|
if eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
dists = cdist(X / length_scale, Y / length_scale, metric="euclidean")
|
|
|
|
if self.nu == 0.5:
|
|
K = np.exp(-dists)
|
|
elif self.nu == 1.5:
|
|
K = dists * math.sqrt(3)
|
|
K = (1.0 + K) * np.exp(-K)
|
|
elif self.nu == 2.5:
|
|
K = dists * math.sqrt(5)
|
|
K = (1.0 + K + K**2 / 3.0) * np.exp(-K)
|
|
elif self.nu == np.inf:
|
|
K = np.exp(-(dists**2) / 2.0)
|
|
else: # general case; expensive to evaluate
|
|
K = dists
|
|
K[K == 0.0] += np.finfo(float).eps # strict zeros result in nan
|
|
tmp = math.sqrt(2 * self.nu) * K
|
|
K.fill((2 ** (1.0 - self.nu)) / gamma(self.nu))
|
|
K *= tmp**self.nu
|
|
K *= kv(self.nu, tmp)
|
|
|
|
if Y is None:
|
|
# convert from upper-triangular matrix to square matrix
|
|
K = squareform(K)
|
|
np.fill_diagonal(K, 1)
|
|
|
|
if eval_gradient:
|
|
if self.hyperparameter_length_scale.fixed:
|
|
# Hyperparameter l kept fixed
|
|
K_gradient = np.empty((X.shape[0], X.shape[0], 0))
|
|
return K, K_gradient
|
|
|
|
# We need to recompute the pairwise dimension-wise distances
|
|
if self.anisotropic:
|
|
D = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2 / (
|
|
length_scale**2
|
|
)
|
|
else:
|
|
D = squareform(dists**2)[:, :, np.newaxis]
|
|
|
|
if self.nu == 0.5:
|
|
denominator = np.sqrt(D.sum(axis=2))[:, :, np.newaxis]
|
|
divide_result = np.zeros_like(D)
|
|
np.divide(
|
|
D,
|
|
denominator,
|
|
out=divide_result,
|
|
where=denominator != 0,
|
|
)
|
|
K_gradient = K[..., np.newaxis] * divide_result
|
|
elif self.nu == 1.5:
|
|
K_gradient = 3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]
|
|
elif self.nu == 2.5:
|
|
tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]
|
|
K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)
|
|
elif self.nu == np.inf:
|
|
K_gradient = D * K[..., np.newaxis]
|
|
else:
|
|
# approximate gradient numerically
|
|
def f(theta): # helper function
|
|
return self.clone_with_theta(theta)(X, Y)
|
|
|
|
return K, _approx_fprime(self.theta, f, 1e-10)
|
|
|
|
if not self.anisotropic:
|
|
return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]
|
|
else:
|
|
return K, K_gradient
|
|
else:
|
|
return K
|
|
|
|
def __repr__(self):
|
|
if self.anisotropic:
|
|
return "{0}(length_scale=[{1}], nu={2:.3g})".format(
|
|
self.__class__.__name__,
|
|
", ".join(map("{0:.3g}".format, self.length_scale)),
|
|
self.nu,
|
|
)
|
|
else:
|
|
return "{0}(length_scale={1:.3g}, nu={2:.3g})".format(
|
|
self.__class__.__name__, np.ravel(self.length_scale)[0], self.nu
|
|
)
|
|
|
|
|
|
class RationalQuadratic(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
|
|
"""Rational Quadratic kernel.
|
|
|
|
The RationalQuadratic kernel can be seen as a scale mixture (an infinite
|
|
sum) of RBF kernels with different characteristic length scales. It is
|
|
parameterized by a length scale parameter :math:`l>0` and a scale
|
|
mixture parameter :math:`\\alpha>0`. Only the isotropic variant
|
|
where length_scale :math:`l` is a scalar is supported at the moment.
|
|
The kernel is given by:
|
|
|
|
.. math::
|
|
k(x_i, x_j) = \\left(
|
|
1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha}
|
|
|
|
where :math:`\\alpha` is the scale mixture parameter, :math:`l` is
|
|
the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the
|
|
Euclidean distance.
|
|
For advice on how to set the parameters, see e.g. [1]_.
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
length_scale : float > 0, default=1.0
|
|
The length scale of the kernel.
|
|
|
|
alpha : float > 0, default=1.0
|
|
Scale mixture parameter
|
|
|
|
length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'length_scale'.
|
|
If set to "fixed", 'length_scale' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
alpha_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'alpha'.
|
|
If set to "fixed", 'alpha' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
References
|
|
----------
|
|
.. [1] `David Duvenaud (2014). "The Kernel Cookbook:
|
|
Advice on Covariance functions".
|
|
<https://www.cs.toronto.edu/~duvenaud/cookbook/>`_
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_iris
|
|
>>> from sklearn.gaussian_process import GaussianProcessClassifier
|
|
>>> from sklearn.gaussian_process.kernels import RationalQuadratic
|
|
>>> X, y = load_iris(return_X_y=True)
|
|
>>> kernel = RationalQuadratic(length_scale=1.0, alpha=1.5)
|
|
>>> gpc = GaussianProcessClassifier(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpc.score(X, y)
|
|
0.9733...
|
|
>>> gpc.predict_proba(X[:2,:])
|
|
array([[0.8881..., 0.0566..., 0.05518...],
|
|
[0.8678..., 0.0707... , 0.0614...]])
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
length_scale=1.0,
|
|
alpha=1.0,
|
|
length_scale_bounds=(1e-5, 1e5),
|
|
alpha_bounds=(1e-5, 1e5),
|
|
):
|
|
self.length_scale = length_scale
|
|
self.alpha = alpha
|
|
self.length_scale_bounds = length_scale_bounds
|
|
self.alpha_bounds = alpha_bounds
|
|
|
|
@property
|
|
def hyperparameter_length_scale(self):
|
|
return Hyperparameter("length_scale", "numeric", self.length_scale_bounds)
|
|
|
|
@property
|
|
def hyperparameter_alpha(self):
|
|
return Hyperparameter("alpha", "numeric", self.alpha_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
if evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims)
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when eval_gradient
|
|
is True.
|
|
"""
|
|
if len(np.atleast_1d(self.length_scale)) > 1:
|
|
raise AttributeError(
|
|
"RationalQuadratic kernel only supports isotropic version, "
|
|
"please use a single scalar for length_scale"
|
|
)
|
|
X = np.atleast_2d(X)
|
|
if Y is None:
|
|
dists = squareform(pdist(X, metric="sqeuclidean"))
|
|
tmp = dists / (2 * self.alpha * self.length_scale**2)
|
|
base = 1 + tmp
|
|
K = base**-self.alpha
|
|
np.fill_diagonal(K, 1)
|
|
else:
|
|
if eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
dists = cdist(X, Y, metric="sqeuclidean")
|
|
K = (1 + dists / (2 * self.alpha * self.length_scale**2)) ** -self.alpha
|
|
|
|
if eval_gradient:
|
|
# gradient with respect to length_scale
|
|
if not self.hyperparameter_length_scale.fixed:
|
|
length_scale_gradient = dists * K / (self.length_scale**2 * base)
|
|
length_scale_gradient = length_scale_gradient[:, :, np.newaxis]
|
|
else: # l is kept fixed
|
|
length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))
|
|
|
|
# gradient with respect to alpha
|
|
if not self.hyperparameter_alpha.fixed:
|
|
alpha_gradient = K * (
|
|
-self.alpha * np.log(base)
|
|
+ dists / (2 * self.length_scale**2 * base)
|
|
)
|
|
alpha_gradient = alpha_gradient[:, :, np.newaxis]
|
|
else: # alpha is kept fixed
|
|
alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))
|
|
|
|
return K, np.dstack((alpha_gradient, length_scale_gradient))
|
|
else:
|
|
return K
|
|
|
|
def __repr__(self):
|
|
return "{0}(alpha={1:.3g}, length_scale={2:.3g})".format(
|
|
self.__class__.__name__, self.alpha, self.length_scale
|
|
)
|
|
|
|
|
|
class ExpSineSquared(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
|
|
r"""Exp-Sine-Squared kernel (aka periodic kernel).
|
|
|
|
The ExpSineSquared kernel allows one to model functions which repeat
|
|
themselves exactly. It is parameterized by a length scale
|
|
parameter :math:`l>0` and a periodicity parameter :math:`p>0`.
|
|
Only the isotropic variant where :math:`l` is a scalar is
|
|
supported at the moment. The kernel is given by:
|
|
|
|
.. math::
|
|
k(x_i, x_j) = \text{exp}\left(-
|
|
\frac{ 2\sin^2(\pi d(x_i, x_j)/p) }{ l^ 2} \right)
|
|
|
|
where :math:`l` is the length scale of the kernel, :math:`p` the
|
|
periodicity of the kernel and :math:`d(\\cdot,\\cdot)` is the
|
|
Euclidean distance.
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
|
|
length_scale : float > 0, default=1.0
|
|
The length scale of the kernel.
|
|
|
|
periodicity : float > 0, default=1.0
|
|
The periodicity of the kernel.
|
|
|
|
length_scale_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'length_scale'.
|
|
If set to "fixed", 'length_scale' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
periodicity_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'periodicity'.
|
|
If set to "fixed", 'periodicity' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import ExpSineSquared
|
|
>>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0)
|
|
>>> kernel = ExpSineSquared(length_scale=1, periodicity=1)
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
0.0144...
|
|
>>> gpr.predict(X[:2,:], return_std=True)
|
|
(array([425.6..., 457.5...]), array([0.3894..., 0.3467...]))
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
length_scale=1.0,
|
|
periodicity=1.0,
|
|
length_scale_bounds=(1e-5, 1e5),
|
|
periodicity_bounds=(1e-5, 1e5),
|
|
):
|
|
self.length_scale = length_scale
|
|
self.periodicity = periodicity
|
|
self.length_scale_bounds = length_scale_bounds
|
|
self.periodicity_bounds = periodicity_bounds
|
|
|
|
@property
|
|
def hyperparameter_length_scale(self):
|
|
"""Returns the length scale"""
|
|
return Hyperparameter("length_scale", "numeric", self.length_scale_bounds)
|
|
|
|
@property
|
|
def hyperparameter_periodicity(self):
|
|
return Hyperparameter("periodicity", "numeric", self.periodicity_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
if evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
X = np.atleast_2d(X)
|
|
if Y is None:
|
|
dists = squareform(pdist(X, metric="euclidean"))
|
|
arg = np.pi * dists / self.periodicity
|
|
sin_of_arg = np.sin(arg)
|
|
K = np.exp(-2 * (sin_of_arg / self.length_scale) ** 2)
|
|
else:
|
|
if eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
dists = cdist(X, Y, metric="euclidean")
|
|
K = np.exp(
|
|
-2 * (np.sin(np.pi / self.periodicity * dists) / self.length_scale) ** 2
|
|
)
|
|
|
|
if eval_gradient:
|
|
cos_of_arg = np.cos(arg)
|
|
# gradient with respect to length_scale
|
|
if not self.hyperparameter_length_scale.fixed:
|
|
length_scale_gradient = 4 / self.length_scale**2 * sin_of_arg**2 * K
|
|
length_scale_gradient = length_scale_gradient[:, :, np.newaxis]
|
|
else: # length_scale is kept fixed
|
|
length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))
|
|
# gradient with respect to p
|
|
if not self.hyperparameter_periodicity.fixed:
|
|
periodicity_gradient = (
|
|
4 * arg / self.length_scale**2 * cos_of_arg * sin_of_arg * K
|
|
)
|
|
periodicity_gradient = periodicity_gradient[:, :, np.newaxis]
|
|
else: # p is kept fixed
|
|
periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0))
|
|
|
|
return K, np.dstack((length_scale_gradient, periodicity_gradient))
|
|
else:
|
|
return K
|
|
|
|
def __repr__(self):
|
|
return "{0}(length_scale={1:.3g}, periodicity={2:.3g})".format(
|
|
self.__class__.__name__, self.length_scale, self.periodicity
|
|
)
|
|
|
|
|
|
class DotProduct(Kernel):
|
|
r"""Dot-Product kernel.
|
|
|
|
The DotProduct kernel is non-stationary and can be obtained from linear
|
|
regression by putting :math:`N(0, 1)` priors on the coefficients
|
|
of :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \sigma_0^2)`
|
|
on the bias. The DotProduct kernel is invariant to a rotation of
|
|
the coordinates about the origin, but not translations.
|
|
It is parameterized by a parameter sigma_0 :math:`\sigma`
|
|
which controls the inhomogenity of the kernel. For :math:`\sigma_0^2 =0`,
|
|
the kernel is called the homogeneous linear kernel, otherwise
|
|
it is inhomogeneous. The kernel is given by
|
|
|
|
.. math::
|
|
k(x_i, x_j) = \sigma_0 ^ 2 + x_i \cdot x_j
|
|
|
|
The DotProduct kernel is commonly combined with exponentiation.
|
|
|
|
See [1]_, Chapter 4, Section 4.2, for further details regarding the
|
|
DotProduct kernel.
|
|
|
|
Read more in the :ref:`User Guide <gp_kernels>`.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
sigma_0 : float >= 0, default=1.0
|
|
Parameter controlling the inhomogenity of the kernel. If sigma_0=0,
|
|
the kernel is homogeneous.
|
|
|
|
sigma_0_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'sigma_0'.
|
|
If set to "fixed", 'sigma_0' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
References
|
|
----------
|
|
.. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).
|
|
"Gaussian Processes for Machine Learning". The MIT Press.
|
|
<http://www.gaussianprocess.org/gpml/>`_
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import make_friedman2
|
|
>>> from sklearn.gaussian_process import GaussianProcessRegressor
|
|
>>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
|
|
>>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
|
|
>>> kernel = DotProduct() + WhiteKernel()
|
|
>>> gpr = GaussianProcessRegressor(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpr.score(X, y)
|
|
0.3680...
|
|
>>> gpr.predict(X[:2,:], return_std=True)
|
|
(array([653.0..., 592.1...]), array([316.6..., 316.6...]))
|
|
"""
|
|
|
|
def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-5, 1e5)):
|
|
self.sigma_0 = sigma_0
|
|
self.sigma_0_bounds = sigma_0_bounds
|
|
|
|
@property
|
|
def hyperparameter_sigma_0(self):
|
|
return Hyperparameter("sigma_0", "numeric", self.sigma_0_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
if evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
X = np.atleast_2d(X)
|
|
if Y is None:
|
|
K = np.inner(X, X) + self.sigma_0**2
|
|
else:
|
|
if eval_gradient:
|
|
raise ValueError("Gradient can only be evaluated when Y is None.")
|
|
K = np.inner(X, Y) + self.sigma_0**2
|
|
|
|
if eval_gradient:
|
|
if not self.hyperparameter_sigma_0.fixed:
|
|
K_gradient = np.empty((K.shape[0], K.shape[1], 1))
|
|
K_gradient[..., 0] = 2 * self.sigma_0**2
|
|
return K, K_gradient
|
|
else:
|
|
return K, np.empty((X.shape[0], X.shape[0], 0))
|
|
else:
|
|
return K
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y).
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X).
|
|
"""
|
|
return np.einsum("ij,ij->i", X, X) + self.sigma_0**2
|
|
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return False
|
|
|
|
def __repr__(self):
|
|
return "{0}(sigma_0={1:.3g})".format(self.__class__.__name__, self.sigma_0)
|
|
|
|
|
|
# adapted from scipy/optimize/optimize.py for functions with 2d output
|
|
def _approx_fprime(xk, f, epsilon, args=()):
|
|
f0 = f(*((xk,) + args))
|
|
grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float)
|
|
ei = np.zeros((len(xk),), float)
|
|
for k in range(len(xk)):
|
|
ei[k] = 1.0
|
|
d = epsilon * ei
|
|
grad[:, :, k] = (f(*((xk + d,) + args)) - f0) / d[k]
|
|
ei[k] = 0.0
|
|
return grad
|
|
|
|
|
|
class PairwiseKernel(Kernel):
|
|
"""Wrapper for kernels in sklearn.metrics.pairwise.
|
|
|
|
A thin wrapper around the functionality of the kernels in
|
|
sklearn.metrics.pairwise.
|
|
|
|
Note: Evaluation of eval_gradient is not analytic but numeric and all
|
|
kernels support only isotropic distances. The parameter gamma is
|
|
considered to be a hyperparameter and may be optimized. The other
|
|
kernel parameters are set directly at initialization and are kept
|
|
fixed.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
gamma : float, default=1.0
|
|
Parameter gamma of the pairwise kernel specified by metric. It should
|
|
be positive.
|
|
|
|
gamma_bounds : pair of floats >= 0 or "fixed", default=(1e-5, 1e5)
|
|
The lower and upper bound on 'gamma'.
|
|
If set to "fixed", 'gamma' cannot be changed during
|
|
hyperparameter tuning.
|
|
|
|
metric : {"linear", "additive_chi2", "chi2", "poly", "polynomial", \
|
|
"rbf", "laplacian", "sigmoid", "cosine"} or callable, \
|
|
default="linear"
|
|
The metric to use when calculating kernel between instances in a
|
|
feature array. If metric is a string, it must be one of the metrics
|
|
in pairwise.PAIRWISE_KERNEL_FUNCTIONS.
|
|
If metric is "precomputed", X is assumed to be a kernel matrix.
|
|
Alternatively, if metric is a callable function, it is called on each
|
|
pair of instances (rows) and the resulting value recorded. The callable
|
|
should take two arrays from X as input and return a value indicating
|
|
the distance between them.
|
|
|
|
pairwise_kernels_kwargs : dict, default=None
|
|
All entries of this dict (if any) are passed as keyword arguments to
|
|
the pairwise kernel function.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_iris
|
|
>>> from sklearn.gaussian_process import GaussianProcessClassifier
|
|
>>> from sklearn.gaussian_process.kernels import PairwiseKernel
|
|
>>> X, y = load_iris(return_X_y=True)
|
|
>>> kernel = PairwiseKernel(metric='rbf')
|
|
>>> gpc = GaussianProcessClassifier(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpc.score(X, y)
|
|
0.9733...
|
|
>>> gpc.predict_proba(X[:2,:])
|
|
array([[0.8880..., 0.05663..., 0.05532...],
|
|
[0.8676..., 0.07073..., 0.06165...]])
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
gamma=1.0,
|
|
gamma_bounds=(1e-5, 1e5),
|
|
metric="linear",
|
|
pairwise_kernels_kwargs=None,
|
|
):
|
|
self.gamma = gamma
|
|
self.gamma_bounds = gamma_bounds
|
|
self.metric = metric
|
|
self.pairwise_kernels_kwargs = pairwise_kernels_kwargs
|
|
|
|
@property
|
|
def hyperparameter_gamma(self):
|
|
return Hyperparameter("gamma", "numeric", self.gamma_bounds)
|
|
|
|
def __call__(self, X, Y=None, eval_gradient=False):
|
|
"""Return the kernel k(X, Y) and optionally its gradient.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
|
Right argument of the returned kernel k(X, Y). If None, k(X, X)
|
|
if evaluated instead.
|
|
|
|
eval_gradient : bool, default=False
|
|
Determines whether the gradient with respect to the log of
|
|
the kernel hyperparameter is computed.
|
|
Only supported when Y is None.
|
|
|
|
Returns
|
|
-------
|
|
K : ndarray of shape (n_samples_X, n_samples_Y)
|
|
Kernel k(X, Y)
|
|
|
|
K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
|
|
optional
|
|
The gradient of the kernel k(X, X) with respect to the log of the
|
|
hyperparameter of the kernel. Only returned when `eval_gradient`
|
|
is True.
|
|
"""
|
|
pairwise_kernels_kwargs = self.pairwise_kernels_kwargs
|
|
if self.pairwise_kernels_kwargs is None:
|
|
pairwise_kernels_kwargs = {}
|
|
|
|
X = np.atleast_2d(X)
|
|
K = pairwise_kernels(
|
|
X,
|
|
Y,
|
|
metric=self.metric,
|
|
gamma=self.gamma,
|
|
filter_params=True,
|
|
**pairwise_kernels_kwargs,
|
|
)
|
|
if eval_gradient:
|
|
if self.hyperparameter_gamma.fixed:
|
|
return K, np.empty((X.shape[0], X.shape[0], 0))
|
|
else:
|
|
# approximate gradient numerically
|
|
def f(gamma): # helper function
|
|
return pairwise_kernels(
|
|
X,
|
|
Y,
|
|
metric=self.metric,
|
|
gamma=np.exp(gamma),
|
|
filter_params=True,
|
|
**pairwise_kernels_kwargs,
|
|
)
|
|
|
|
return K, _approx_fprime(self.theta, f, 1e-10)
|
|
else:
|
|
return K
|
|
|
|
def diag(self, X):
|
|
"""Returns the diagonal of the kernel k(X, X).
|
|
|
|
The result of this method is identical to np.diag(self(X)); however,
|
|
it can be evaluated more efficiently since only the diagonal is
|
|
evaluated.
|
|
|
|
Parameters
|
|
----------
|
|
X : ndarray of shape (n_samples_X, n_features)
|
|
Left argument of the returned kernel k(X, Y)
|
|
|
|
Returns
|
|
-------
|
|
K_diag : ndarray of shape (n_samples_X,)
|
|
Diagonal of kernel k(X, X)
|
|
"""
|
|
# We have to fall back to slow way of computing diagonal
|
|
return np.apply_along_axis(self, 1, X).ravel()
|
|
|
|
def is_stationary(self):
|
|
"""Returns whether the kernel is stationary."""
|
|
return self.metric in ["rbf"]
|
|
|
|
def __repr__(self):
|
|
return "{0}(gamma={1}, metric={2})".format(
|
|
self.__class__.__name__, self.gamma, self.metric
|
|
)
|