Inzynierka/Lib/site-packages/sklearn/neural_network/_stochastic_optimizers.py

289 lines
8.6 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
"""Stochastic optimization methods for MLP
"""
# Authors: Jiyuan Qian <jq401@nyu.edu>
# License: BSD 3 clause
import numpy as np
class BaseOptimizer:
    """Base (Stochastic) gradient descent optimizer.

    This class holds the learning-rate state and the generic "apply updates
    to the parameters" loop. The update rule itself is supplied by
    subclasses through ``_get_updates``.

    Parameters
    ----------
    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights.

    Attributes
    ----------
    learning_rate_init : float
        The initial learning rate, stored as passed in.

    learning_rate : float
        The current learning rate (``float(learning_rate_init)`` at
        construction time).
    """

    def __init__(self, learning_rate_init=0.1):
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients.

        Hook that concrete optimizers must override; ``update_params`` calls
        it once per invocation.

        Parameters
        ----------
        grads : list of length = len(params)
            Gradients aligned with the params passed to ``update_params``.

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params.

        Raises
        ------
        NotImplementedError
            Always, because the base class defines no update rule.
        """
        raise NotImplementedError(
            "%s must implement _get_updates" % type(self).__name__
        )

    def update_params(self, params, grads):
        """Update parameters with given gradients.

        Parameters
        ----------
        params : list of length = len(coefs_) + len(intercepts_)
            The concatenated list containing coefs_ and intercepts_ in MLP
            model. Each entry is updated through ``+=``.

        grads : list of length = len(params)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params.
        """
        updates = self._get_updates(grads)
        # Relies on ``+=`` mutating each entry in place (as NumPy arrays do);
        # rebinding the loop variable alone would not change ``params``.
        for param, update in zip(params, updates):
            param += update

    def iteration_ends(self, time_step):
        """Perform update to learning rate and potentially other states at the
        end of an iteration.

        The base implementation does nothing.

        Parameters
        ----------
        time_step : int
            Unused by the base implementation.
        """
        pass

    def trigger_stopping(self, msg, verbose):
        """Decide whether it is time to stop training.

        The base implementation always asks to stop.

        Parameters
        ----------
        msg : str
            Message passed in for verbose output.

        verbose : bool
            Print message (followed by " Stopping.") to stdout if True.

        Returns
        -------
        is_stopping : bool
            True if training needs to stop.
        """
        if verbose:
            print(msg + " Stopping.")
        return True
class SGDOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with momentum.

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used to create one zero-filled velocity per entry.

    learning_rate_init : float, default=0.1
        The initial learning rate used. It controls the step-size in updating
        the weights.

    lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant'
        Learning rate schedule for weight updates.

        - 'constant' keeps the learning rate at 'learning_rate_init'.

        - 'invscaling' recomputes the learning rate in ``iteration_ends``
          from the time step 't' passed in, using the exponent 'power_t':
          learning_rate = learning_rate_init / pow(t + 1, power_t)

        - 'adaptive' keeps the learning rate at 'learning_rate_init' until
          ``trigger_stopping`` is called; each such call divides the current
          learning rate by 5 instead of stopping, unless the rate is already
          at or below 1e-6.

    momentum : float, default=0.9
        Value of momentum used, must be larger than or equal to 0.

    nesterov : bool, default=True
        Whether to use nesterov's momentum or not. Use nesterov's if True.

    power_t : float, default=0.5
        Power of time step 't' in inverse scaling. See `lr_schedule` for
        more details.

    Attributes
    ----------
    learning_rate : float
        The current learning rate.

    velocities : list, length = len(params)
        Velocities that are used to update params.
    """

    def __init__(
        self,
        params,
        learning_rate_init=0.1,
        lr_schedule="constant",
        momentum=0.9,
        nesterov=True,
        power_t=0.5,
    ):
        super().__init__(learning_rate_init)

        # Schedule configuration.
        self.lr_schedule = lr_schedule
        self.power_t = power_t

        # Momentum configuration and state (one zero velocity per param).
        self.momentum = momentum
        self.nesterov = nesterov
        self.velocities = list(map(np.zeros_like, params))

    def iteration_ends(self, time_step):
        """Refresh the learning rate at the end of an iteration.

        Only the 'invscaling' schedule changes anything here.

        Parameters
        ----------
        time_step : int
            Number of training samples trained on so far, used to update
            learning rate for 'invscaling'.
        """
        if self.lr_schedule != "invscaling":
            return

        base_rate = float(self.learning_rate_init)
        self.learning_rate = base_rate / (time_step + 1) ** self.power_t

    def trigger_stopping(self, msg, verbose):
        """Decide whether training should stop, shrinking the rate if adaptive.

        Parameters
        ----------
        msg : str
            Message passed in for verbose output.

        verbose : bool
            Print the message with a status suffix if True.

        Returns
        -------
        is_stopping : bool
            False when the 'adaptive' schedule lowered the learning rate
            instead of stopping, True otherwise.
        """
        if self.lr_schedule != "adaptive":
            suffix = " Stopping."
        elif self.learning_rate <= 1e-6:
            suffix = " Learning rate too small. Stopping."
        else:
            # Adaptive schedule with headroom left: lower the rate and go on.
            self.learning_rate /= 5.0
            if verbose:
                print(msg + " Setting learning rate to %f" % self.learning_rate)
            return False

        if verbose:
            print(msg + suffix)
        return True

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients.

        Also replaces ``self.velocities`` with the freshly computed ones.

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params.

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params.
        """
        mu = self.momentum
        lr = self.learning_rate

        fresh_velocities = [mu * v - lr * g for v, g in zip(self.velocities, grads)]
        self.velocities = fresh_velocities

        if not self.nesterov:
            return fresh_velocities

        # Nesterov: apply the momentum step once more on the new velocities.
        return [mu * v - lr * g for v, g in zip(fresh_velocities, grads)]
class AdamOptimizer(BaseOptimizer):
    """Stochastic gradient descent optimizer with Adam.

    Note: All default values are from the original Adam paper.

    Parameters
    ----------
    params : list, length = len(coefs_) + len(intercepts_)
        The concatenated list containing coefs_ and intercepts_ in MLP model.
        Used to create the zero-filled first and second moment vectors.

    learning_rate_init : float, default=0.001
        The initial learning rate used. It controls the step-size in updating
        the weights.

    beta_1 : float, default=0.9
        Exponential decay rate for estimates of first moment vector, should be
        in [0, 1).

    beta_2 : float, default=0.999
        Exponential decay rate for estimates of second moment vector, should be
        in [0, 1).

    epsilon : float, default=1e-8
        Value for numerical stability.

    Attributes
    ----------
    learning_rate : float
        The current learning rate. Overwritten on every ``_get_updates`` call
        with the bias-corrected value derived from ``learning_rate_init``.

    t : int
        Timestep.

    ms : list, length = len(params)
        First moment vectors.

    vs : list, length = len(params)
        Second moment vectors.

    References
    ----------
    :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for
    stochastic optimization." <1412.6980>`
    """

    def __init__(
        self,
        params,
        learning_rate_init=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
    ):
        super().__init__(learning_rate_init)

        # Hyper-parameters.
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon

        # Running state: step counter plus zeroed moment estimates.
        self.t = 0
        self.ms = [np.zeros_like(p) for p in params]
        self.vs = [np.zeros_like(p) for p in params]

    def _get_updates(self, grads):
        """Get the values used to update params with given gradients.

        Advances the timestep, refreshes both moment estimates and the
        bias-corrected learning rate, then derives the updates from them.

        Parameters
        ----------
        grads : list, length = len(coefs_) + len(intercepts_)
            Containing gradients with respect to coefs_ and intercepts_ in MLP
            model. So length should be aligned with params.

        Returns
        -------
        updates : list, length = len(grads)
            The values to add to params.
        """
        self.t += 1
        beta_1 = self.beta_1
        beta_2 = self.beta_2

        # Exponential moving averages of the gradient and its square.
        self.ms = [beta_1 * m + (1 - beta_1) * g for m, g in zip(self.ms, grads)]
        self.vs = [
            beta_2 * v + (1 - beta_2) * (g**2) for v, g in zip(self.vs, grads)
        ]

        # Bias correction folded into the step size; multiply first, then
        # divide, to keep the arithmetic order unchanged.
        numerator = self.learning_rate_init * np.sqrt(1 - beta_2**self.t)
        self.learning_rate = numerator / (1 - beta_1**self.t)

        neg_rate = -self.learning_rate
        eps = self.epsilon
        return [neg_rate * m / (np.sqrt(v) + eps) for m, v in zip(self.ms, self.vs)]