# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Decorator to override the gradient for a function."""

from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.eager import record
from tensorflow.python.framework import composite_tensor_gradient
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import handle_data_util
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import op_selector
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import nest
from tensorflow.python.util import tf_decorator
from tensorflow.python.util import tf_inspect
from tensorflow.python.util import variable_utils
from tensorflow.python.util.tf_export import tf_export


VAR_OP_TYPES = [
    "VariableV2",
    "VarHandleOp",
]


@tf_export("custom_gradient")
|
||
|
def custom_gradient(f=None):
|
||
|
"""Decorator to define a function with a custom gradient.
|
||
|
|
||
|
This decorator allows fine grained control over the gradients of a sequence
|
||
|
for operations. This may be useful for multiple reasons, including providing
|
||
|
a more efficient or numerically stable gradient for a sequence of operations.
|
||
|
|
||
|
For example, consider the following function that commonly occurs in the
|
||
|
computation of cross entropy and log likelihoods:
|
||
|
|
||
|
```python
|
||
|
def log1pexp(x):
|
||
|
return tf.math.log(1 + tf.exp(x))
|
||
|
```
|
||
|
|
||
|
Due to numerical instability, the gradient of this function evaluated at x=100
|
||
|
is NaN. For example:
|
||
|
|
||
|
```python
|
||
|
with tf.GradientTape() as tape:
|
||
|
tape.watch(x)
|
||
|
y=log1pexp(x)
|
||
|
dy_dx = tape.gradient(y, x) # Will be NaN when evaluated.
|
||
|
```
|
||
|
|
||
|
The gradient expression can be analytically simplified to provide numerical
|
||
|
stability:
|
||
|
|
||
|
```python
|
||
|
@tf.custom_gradient
|
||
|
def log1pexp(x):
|
||
|
e = tf.exp(x)
|
||
|
def grad(upstream):
|
||
|
return upstream * (1 - 1 / (1 + e))
|
||
|
return tf.math.log(1 + e), grad
|
||
|
```
|
||
|
|
||
|
With this definition, the gradient `dy_dx` at `x = 100` will be correctly
|
||
|
evaluated as 1.0.
|
||
|
|
||
|
The variable `upstream` is defined as the upstream gradient. i.e. the gradient
|
||
|
from all the layers or functions originating from this layer. The above
|
||
|
example has no upstream functions, therefore `upstream = dy/dy = 1.0`.
|
||
|
|
||
|
Assume that `x_i` is `log1pexp` in the forward pass `x_1 = x_1(x_0)`,
|
||
|
`x_2 = x_2(x_1)`, ..., `x_i = x_i(x_i-1)`, ..., `x_n = x_n(x_n-1)`. By
|
||
|
chain rule we know that `dx_n/dx_0 = dx_n/dx_n-1 * dx_n-1/dx_n-2 * ... *
|
||
|
dx_i/dx_i-1 * ... * dx_1/dx_0`.
|
||
|
|
||
|
In this case the gradient of our current function defined as
|
||
|
`dx_i/dx_i-1 = (exp(x_i) / (1 + exp(x_i))) = (1 - 1 / (1 + exp(x_i)))`. The
|
||
|
upstream gradient `upstream` would be `dx_n/dx_n-1 * dx_n-1/dx_n-2 * ... *
|
||
|
dx_i+1/dx_i`. The upstream gradient multiplied by the current gradient is
|
||
|
then passed downstream.
|
||
|
|
||
|
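
  For instance, if a downstream operation consumes the output of the decorated
  `log1pexp` above, `grad` receives that operation's gradient as `upstream`.
  A minimal illustrative sketch (not additional API):

  ```python
  x = tf.constant(100.)
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = log1pexp(x)
    z = 2.0 * y  # A downstream operation.
  # When computing dz/dx, `grad` is called with upstream = dz/dy = 2.0, so
  # dz/dx evaluates to 2.0 * (1 - 1 / (1 + e)), which is ~2.0 at x = 100.
  dz_dx = tape.gradient(z, x)
  ```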

  If the function takes multiple variables as input, the `grad` function must
  also return the same number of gradients, one for each input.
  We take the function `z = x * y` as an example.

  >>> @tf.custom_gradient
  ... def bar(x, y):
  ...   def grad(upstream):
  ...     dz_dx = y
  ...     dz_dy = x
  ...     return upstream * dz_dx, upstream * dz_dy
  ...   z = x * y
  ...   return z, grad
  >>> x = tf.constant(2.0, dtype=tf.float32)
  >>> y = tf.constant(3.0, dtype=tf.float32)
  >>> with tf.GradientTape(persistent=True) as tape:
  ...   tape.watch(x)
  ...   tape.watch(y)
  ...   z = bar(x, y)
  >>> z
  <tf.Tensor: shape=(), dtype=float32, numpy=6.0>
  >>> tape.gradient(z, x)
  <tf.Tensor: shape=(), dtype=float32, numpy=3.0>
  >>> tape.gradient(z, y)
  <tf.Tensor: shape=(), dtype=float32, numpy=2.0>

  Nesting custom gradients can lead to unintuitive results. The default
  behavior does not correspond to n-th order derivatives. For example

  ```python
  @tf.custom_gradient
  def op(x):
    y = op1(x)
    @tf.custom_gradient
    def grad_fn(dy):
      gdy = op2(x, y, dy)
      def grad_grad_fn(ddy):  # Not the 2nd order gradient of op w.r.t. x.
        return op3(x, y, dy, ddy)
      return gdy, grad_grad_fn
    return y, grad_fn
  ```

  The function `grad_grad_fn` will be calculating the first order gradient
  of `grad_fn` with respect to `dy`, which is used to generate forward-mode
  gradient graphs from backward-mode gradient graphs, but is not the same as
  the second order gradient of `op` with respect to `x`.

  Instead, wrap nested `@tf.custom_gradients` in another function:

  ```python
  @tf.custom_gradient
  def op_with_fused_backprop(x):
    y, x_grad = fused_op(x)
    def first_order_gradient(dy):
      @tf.custom_gradient
      def first_order_custom(unused_x):
        def second_order_and_transpose(ddy):
          return second_order_for_x(...), gradient_wrt_dy(...)
        return x_grad, second_order_and_transpose
      return dy * first_order_custom(x)
    return y, first_order_gradient
  ```

  Additional arguments to the inner `@tf.custom_gradient`-decorated function
  control the expected return values of the innermost function.

  The examples above illustrate how to specify custom gradients for functions
  which do not read from variables. The following example uses variables, which
  require special handling because they are effectively inputs of the forward
  function.

  >>> weights = tf.Variable(tf.ones([2]))  # Trainable variable weights
  >>> @tf.custom_gradient
  ... def linear_poly(x):
  ...   # Creating polynomial
  ...   poly = weights[1] * x + weights[0]
  ...
  ...   def grad_fn(dpoly, variables):
  ...     # dy/dx = weights[1] and we need to left multiply dpoly
  ...     grad_xs = dpoly * weights[1]  # Scalar gradient
  ...
  ...     grad_vars = []  # To store gradients of passed variables
  ...     assert variables is not None
  ...     assert len(variables) == 1
  ...     assert variables[0] is weights
  ...     # Manually computing dy/dweights
  ...     dy_dw = dpoly * tf.stack([x ** 1, x ** 0])
  ...     grad_vars.append(
  ...         tf.reduce_sum(tf.reshape(dy_dw, [2, -1]), axis=1)
  ...     )
  ...     return grad_xs, grad_vars
  ...   return poly, grad_fn
  >>> x = tf.constant([1., 2., 3.])
  >>> with tf.GradientTape(persistent=True) as tape:
  ...   tape.watch(x)
  ...   poly = linear_poly(x)
  >>> poly  # poly = x + 1
  <tf.Tensor: shape=(3,),
    dtype=float32,
    numpy=array([2., 3., 4.], dtype=float32)>
  >>> tape.gradient(poly, x)  # conventional scalar gradient dy/dx
  <tf.Tensor: shape=(3,),
    dtype=float32,
    numpy=array([1., 1., 1.], dtype=float32)>
  >>> tape.gradient(poly, weights)
  <tf.Tensor: shape=(2,), dtype=float32, numpy=array([6., 3.], dtype=float32)>

  The above example illustrates the usage of the trainable variable `weights`.
  In the example, the inner `grad_fn` accepts an extra `variables` input
  parameter and also returns an extra `grad_vars` output. That extra argument
  is passed if the forward function reads any variables. You need to
  compute the gradient w.r.t. each of those `variables` and output it as a list
  of `grad_vars`. Note here that the default value of `variables` is set to
  `None` when no variables are used in the forward function.

  It should be noted that `tf.GradientTape` is still watching the forward pass
  of a `tf.custom_gradient`, and will use the ops it watches. As a consequence,
  calling `tf.function` while the tape is still watching leads to a gradient
  graph being built. If an op is used in `tf.function` without a registered
  gradient, a `LookupError` will be raised.

  Users can insert `tf.stop_gradient` to customize this behavior. This
  is demonstrated in the example below. `tf.random.shuffle` does not have a
  registered gradient. As a result `tf.stop_gradient` is used to avoid the
  `LookupError`.

  ```python
  x = tf.constant([0.3, 0.5], dtype=tf.float32)

  @tf.custom_gradient
  def test_func_with_stop_grad(x):
    @tf.function
    def _inner_func():
      # Avoid exception during the forward pass
      return tf.stop_gradient(tf.random.shuffle(x))
      # return tf.random.shuffle(x)  # This will raise

    res = _inner_func()
    def grad(upstream):
      return upstream  # Arbitrarily defined custom gradient
    return res, grad

  with tf.GradientTape() as g:
    g.watch(x)
    res = test_func_with_stop_grad(x)

  g.gradient(res, x)
  ```

  See also `tf.RegisterGradient` which registers a gradient function for a
  primitive TensorFlow operation. `tf.custom_gradient` on the other hand allows
  for fine-grained control over the gradient computation of a sequence of
  operations.

  Note that if the decorated function uses `Variable`s, the enclosing variable
  scope must be using
  [ResourceVariables](https://www.tensorflow.org/guide/migrate/tf1_vs_tf2#resourcevariables_instead_of_referencevariables).

  Args:
    f: function `f(*x)` that returns a tuple `(y, grad_fn)` where: - `x` is a
      sequence of (nested structures of) `Tensor` inputs to the function. - `y`
      is a (nested structure of) `Tensor` outputs of applying TensorFlow
      operations in `f` to `x`. - `grad_fn` is a function with the signature
      `g(*grad_ys)` which returns a list of `Tensor`s the same size as
      (flattened) `x` - the derivatives of `Tensor`s in `y` with respect to the
      `Tensor`s in `x`. `grad_ys` is a sequence of `Tensor`s the same size as
      (flattened) `y` holding the initial value gradients for each `Tensor` in
      `y`. In a pure mathematical sense, a vector-argument vector-valued
      function `f`'s derivatives should be its Jacobian matrix `J`. Here we are
      expressing the Jacobian `J` as a function `grad_fn` which defines how `J`
      will transform a vector `grad_ys` when left-multiplied with it (`grad_ys *
      J`, the vector-Jacobian product, or VJP). This functional representation
      of a matrix is convenient to use for chain-rule calculation (in e.g. the
      back-propagation algorithm). If `f` uses `Variable`s (that are not part
      of the inputs), i.e. through `get_variable`, then `grad_fn` should have
      signature `g(*grad_ys, variables=None)`, where `variables` is a list of
      the `Variable`s, and return a 2-tuple `(grad_xs, grad_vars)`, where
      `grad_xs` is the same as above, and `grad_vars` is a `list<Tensor>` with
      the derivatives of `Tensor`s in `y` with respect to the variables (that
      is, grad_vars has one Tensor per variable in variables).

  Returns:
    A function `h(x)` which returns the same value as `f(x)[0]` and whose
    gradient (as calculated by `tf.gradients`) is determined by `f(x)[1]`.
  """

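  # `custom_gradient` can be applied directly (`@tf.custom_gradient`) or
  # called with no positional argument (`@tf.custom_gradient()`); in the
  # latter case `f` is None here, so return a decorator that re-invokes
  # `custom_gradient` with the wrapped function.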
  if f is None:
    return lambda f: custom_gradient(f=f)

  @Bind.decorator
  def decorated(wrapped, args, kwargs):
    """Decorated function with custom gradient."""
    if context.executing_eagerly():
      return _eager_mode_decorator(wrapped, args, kwargs)
    else:
      return _graph_mode_decorator(wrapped, args, kwargs)

  return tf_decorator.make_decorator(f, decorated(f))  # pylint: disable=no-value-for-parameter


class Bind:
  """When called evaluates `d(f, args, kwargs)` but supports binding `f`.

  >>> @Bind.decorator
  ... def my_decorator(f, args, kwargs):
  ...   print("my_decorator called with", args, kwargs)
  ...   return f(*args, **kwargs)

  >>> class Foo:
  ...   @my_decorator
  ...   def bar(self, a, b, c):
  ...     return a * b * c

  >>> Foo.bar(None, 1, 2, c=3)
  my_decorator called with (None, 1, 2) {'c': 3}
  6

  >>> foo = Foo()
  >>> foo.bar(1, 2, c=3)
  my_decorator called with (1, 2) {'c': 3}
  6
  """

  @classmethod
  def decorator(cls, d):
    return lambda f: Bind(f, d)

  def __init__(self, f, d):
    self._f = f
    self._d = d

  def __get__(self, instance, owner):
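    # Descriptor protocol hook: when the decorated function is accessed on an
    # instance, bind the wrapped function to that instance so `self` is passed
    # through to it on the eventual call.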
    if instance is not None:
      f = self._f.__get__(instance, owner)
      return tf_decorator.make_decorator(f, Bind(f, self._d))
    else:
      return self

  def __call__(self, *a, **k):
    return self._d(self._f, a, k)


def get_variable_by_name(var_name):
  """Given a variable name, retrieves a handle on the TensorFlow Variable."""
  global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)

  def _filter_fn(item):
    try:
      return var_name == item.op.name
    except AttributeError:
      # Collection items without operation are ignored.
      return False

  candidate_vars = list(filter(_filter_fn, global_vars))

  if len(candidate_vars) >= 1:
    # Filter out non-trainable variables.
    candidate_vars = [v for v in candidate_vars if v.trainable]
  else:
    raise ValueError("Unsuccessful at finding variable {}.".format(var_name))

  if len(candidate_vars) == 1:
    return candidate_vars[0]
  elif len(candidate_vars) > 1:
    raise ValueError(
        "Unsuccessful at finding trainable variable {}. "
        "Number of candidates: {}. "
        "Candidates: {}".format(var_name, len(candidate_vars), candidate_vars))
  else:
    # The variable is not trainable.
    return None


def _get_dependent_variables(input_ops, output_ops):
  """Finds variables involved in the subgraph between input_ops and output_ops.

  Args:
    input_ops: Flattened list of input ops.
    output_ops: Flattened list of output ops.

  Returns:
    A list of variables.
  """

  # Avoids the edge case when input_ops == output_ops.
  output_ops = nest.map_structure(gen_array_ops.identity, output_ops)
  inbetween_ops = op_selector.get_backward_walk_ops(
      seed_ops=output_ops,
      stop_at_ts=input_ops,
      inclusive=False,
      only_differentiable=True)
  var_ops = (op for op in inbetween_ops if op.type in VAR_OP_TYPES)
  var_names = (op.name for op in var_ops)
  tf_vars = (get_variable_by_name(var_name) for var_name in var_names)
  tf_vars = [v for v in tf_vars if v is not None]
  return tf_vars


def generate_name():
  return "CustomGradient-%s" % ops.uid()


def _graph_mode_decorator(f, args, kwargs):
  """Implement custom gradient decorator for graph mode."""
  # TODO(rsepassi): Add support for kwargs
  if kwargs:
    raise ValueError(
        "The custom_gradient decorator currently supports keyword "
        "arguments only when eager execution is enabled.")
  name = generate_name()
  args = variable_utils.convert_variables_to_tensors(args)
  args = nest.map_structure(ops.convert_to_tensor, args, expand_composites=True)

  # Checking global and local variables attempts to ensure that no non-resource
  # Variables are added to the graph.
  current_var_scope = variable_scope.get_variable_scope()
  before_vars = set([
      v.ref() for v in current_var_scope.global_variables() +
      current_var_scope.local_variables()
  ])
  with record.VariableWatcher() as variable_watcher:
    result, grad_fn = f(*args)

  flat_args = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(args))
  flat_result = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(result))
  flat_result_len = len(flat_result)

  after_vars = set([
      v.ref() for v in current_var_scope.global_variables() +
      current_var_scope.local_variables()
  ])
  new_vars = after_vars - before_vars
  new_vars_list = [v.deref() for v in new_vars]
  for v in new_vars_list:
    if not resource_variable_ops.is_resource_variable(v):
      raise TypeError(
          "All variables used by a function wrapped with @custom_gradient must "
          "be `ResourceVariable`s. Ensure that no `variable_scope` is created "
          "with `use_resource=False`.")

  # The variables that grad_fn needs to return gradients for are the set of
  # variables used that are *not* part of the inputs.
  variables_in_tape = frozenset([
      v.ref() for v in variable_watcher.watched_variables()
  ])

  graphs = {getattr(o, "graph", None) for o in flat_result}
  # Not all results may be tensors. However, we want to ensure all tensor
  # outputs are from the same graph and get a list of captured inputs for
  # variable search
  graphs.discard(None)  # Discard non-graph outputs
  if graphs:
    if len(graphs) > 1:
      raise ValueError(
          "All custom_gradient outputs should be from the same graph")
    output_graph = graphs.pop()
    filtered_input_tensors = []
    for i in flat_args:
      if i.graph == output_graph:
        filtered_input_tensors.append(i)
  else:
    filtered_input_tensors = flat_args

  variables_in_subgraph = frozenset([
      v.ref() for v in _get_dependent_variables(
          input_ops=filtered_input_tensors, output_ops=flat_result)
  ])
  variables = sorted(
      [v.deref() for v in variables_in_subgraph.union(variables_in_tape)],
      key=lambda v: v.name)

  grad_argspec = tf_inspect.getfullargspec(grad_fn)
  variables_in_signature = ("variables" in grad_argspec.args or
                            "variables" in grad_argspec.kwonlyargs or
                            grad_argspec.varkw)
  if variables and not variables_in_signature:
    raise TypeError(
        "@tf.custom_gradient grad_fn must accept keyword argument 'variables', "
        "since function uses variables: {}".format(variables))
  if variables_in_signature and not variables:
    # User seems to intend to use variables but none were captured.
    logging.vlog(
        1, "@custom_gradient grad_fn has 'variables' in signature, "
        "but no ResourceVariables were used on the forward pass.")

  all_tensors = flat_result + flat_args + variables

  def tape_grad_fn(*result_grad_components):
    """Custom grad fn wrapper."""
    result_grads = composite_tensor_gradient.replace_flat_tensors_for_gradients(
        nest.flatten(result), result_grad_components[:flat_result_len])
    if not isinstance(result_grads, (list, tuple)):
      result_grads = [result_grads]

    if variables:
      input_grads, variable_grads = grad_fn(*result_grads, variables=variables)
      if len(variable_grads) != len(variables):
        raise ValueError("Must return gradient for each variable from "
                         "@custom_gradient grad_fn.")
    else:
      input_grads = grad_fn(*result_grads)
      variable_grads = []

    # Need to return one value per input to the IdentityN, so pad the
    # gradients of the inputs of the custom_gradient function with the
    # gradients of the outputs as well.
    input_grads = composite_tensor_gradient.get_flat_tensors_for_gradients(
        nest.flatten(input_grads))
    return ([None] * flat_result_len) + input_grads + variable_grads

  @ops.RegisterGradient(name)
  def internal_grad_fn(unused_op, *result_grads):  # pylint: disable=unused-variable
    """Custom grad fn wrapper."""
    return tape_grad_fn(*result_grads)

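  # Graph mode relies on a gradient override for `IdentityN`: the outputs,
  # inputs, and captured variables are all routed through one `IdentityN` op
  # whose gradient is mapped (via the generated `name`) to `internal_grad_fn`,
  # so `tape_grad_fn` receives the upstream gradients for every tensor at once.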
  original_tensors = all_tensors
  with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
    all_tensors = array_ops.identity_n(all_tensors)

  original_tensors = [ops.convert_to_tensor(x) for x in original_tensors]

  # Propagate handle data for happier shape inference for resource variables.
  for i, t in enumerate(original_tensors):
    if t.dtype == dtypes.resource and hasattr(t, "_handle_data"):
      all_tensors[i]._handle_data = t._handle_data  # pylint: disable=protected-access
  record.record_operation(
      f.__name__, all_tensors, original_tensors, tape_grad_fn)
  for ot, t in zip(original_tensors, all_tensors):
    handle_data_util.copy_handle_data(ot, t)
  flat_result = composite_tensor_gradient.replace_flat_tensors_for_gradients(
      nest.flatten(result), all_tensors[:flat_result_len])
  return nest.pack_sequence_as(result, flat_result)


def _eager_mode_decorator(f, args, kwargs):
  """Implement custom gradient decorator for eager mode."""
  with record.VariableWatcher() as variable_watcher:
    result, grad_fn = f(*args, **kwargs)
  flat_args = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(args))
  flat_kwargs = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(kwargs))
  all_inputs = flat_args + flat_kwargs
  # The variables that grad_fn needs to return gradients for are the set of
  # variables used that are *not* part of the inputs.
  variables = [
      v.deref()  # pylint: disable=g-complex-comprehension
      for v in set(v.ref() for v in variable_watcher.watched_variables())
      if all(v.deref() is not i for i in all_inputs)
  ]
  grad_argspec = tf_inspect.getfullargspec(grad_fn)
  if (variables and ("variables" not in grad_argspec.args) and
      ("variables" not in grad_argspec.kwonlyargs) and
      not grad_argspec.varkw):
    raise TypeError(
        "@tf.custom_gradient grad_fn must accept keyword argument 'variables', "
        "since function uses variables: {}".format(variables))
  flat_result = composite_tensor_gradient.get_flat_tensors_for_gradients(
      nest.flatten(result))
  # TODO(apassos) consider removing the identity below.
  flat_result = [gen_array_ops.identity(x) for x in flat_result]

  input_tensors = [
      ops.convert_to_tensor(x) for x in flat_args + list(variables)]

  recorded_inputs = input_tensors
  arg_count = len(flat_args)

  def actual_grad_fn(*result_grad_components):
    """Custom grad fn wrapper."""
    result_grads = composite_tensor_gradient.replace_flat_tensors_for_gradients(
        nest.flatten(result), result_grad_components)
    if not isinstance(result_grads, (list, tuple)):
      result_grads = [result_grads]

    if variables:
      input_grads, variable_grads = grad_fn(*result_grads, variables=variables)
      if len(variable_grads) != len(variables):
        raise ValueError("Must return gradient for each variable from "
                         "@custom_gradient grad_fn.")
    else:
      input_grads = grad_fn(*result_grads)
      variable_grads = []
    flat_grads = composite_tensor_gradient.get_flat_tensors_for_gradients(
        nest.flatten(input_grads))
    if len(flat_grads) != arg_count:
      raise ValueError(
          f"custom_gradient function expected to return {arg_count} "
          f"gradients, but returned {len(flat_grads)} instead.")
    return flat_grads + variable_grads

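  # Record the forward computation on the tape so that backprop calls
  # `actual_grad_fn` instead of the default gradients of the recorded ops.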
  record.record_operation(f.__name__, flat_result, recorded_inputs,
                          actual_grad_fn)
  flat_result = composite_tensor_gradient.replace_flat_tensors_for_gradients(
      nest.flatten(result), flat_result)
  return nest.pack_sequence_as(result, flat_result)


@tf_export("recompute_grad")
|
||
|
def recompute_grad(f):
|
||
|
"""Defines a function as a recompute-checkpoint for the tape auto-diff.
|
||
|
|
||
|
Tape checkpointing is a technique to reduce the memory consumption of the
|
||
|
auto-diff tape:
|
||
|
|
||
|
- Without tape checkpointing operations and intermediate values are
|
||
|
recorded to the tape for use in the backward pass.
|
||
|
|
||
|
- With tape checkpointing, only the function call and its inputs are
|
||
|
recorded. During back-propagation the `recompute_grad` custom gradient
|
||
|
(`tf.custom_gradient`) recomputes the function under a localized Tape object.
|
||
|
This recomputation of the function during backpropagation performs redundant
|
||
|
calculation, but reduces the overall memory usage of the Tape.
|
||
|
|
||
|
>>> y = tf.Variable(1.0)
|
||
|
|
||
|
>>> def my_function(x):
|
||
|
... tf.print('running')
|
||
|
... z = x*y
|
||
|
... return z
|
||
|
|
||
|
>>> my_function_recompute = tf.recompute_grad(my_function)
|
||
|
|
||
|
>>> with tf.GradientTape() as tape:
|
||
|
... r = tf.constant(1.0)
|
||
|
... for i in range(4):
|
||
|
... r = my_function_recompute(r)
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
|
||
|
>>> grad = tape.gradient(r, [y])
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
|
||
|
Without `recompute_grad`, the tape contains all intermitate steps, and no
|
||
|
recomputation is performed.
|
||
|
|
||
|
>>> with tf.GradientTape() as tape:
|
||
|
... r = tf.constant(1.0)
|
||
|
... for i in range(4):
|
||
|
... r = my_function(r)
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
|
||
|
>>> grad = tape.gradient(r, [y])
|
||
|
|
||
|
|
||
|
If `f` was a `tf.keras` `Model` or `Layer` object, methods and attributes
|
||
|
such as `f.variables` are not available on the returned function `g`.
|
||
|
Either keep a reference of `f` , or use `g.__wrapped__` for accessing
|
||
|
these variables and methods.
|
||
|
|
||
|
|
||
|
>>> def print_running_and_return(x):
|
||
|
... tf.print("running")
|
||
|
... return x
|
||
|
|
||
|
>>> model = tf.keras.Sequential([
|
||
|
... tf.keras.layers.Lambda(print_running_and_return),
|
||
|
... tf.keras.layers.Dense(2)
|
||
|
... ])
|
||
|
|
||
|
>>> model_recompute = tf.recompute_grad(model)
|
||
|
|
||
|
>>> with tf.GradientTape(persistent=True) as tape:
|
||
|
... r = tf.constant([[1,2]])
|
||
|
... for i in range(4):
|
||
|
... r = model_recompute(r)
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
|
||
|
>>> grad = tape.gradient(r, model.variables)
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
|
||
|
Alternatively, use the `__wrapped__` attribute to access the original
|
||
|
model object.
|
||
|
|
||
|
>>> grad = tape.gradient(r, model_recompute.__wrapped__.variables)
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
running
|
||
|
|
||
|
|
||
|
Args:
|
||
|
f: function `f(*x)` that returns a `Tensor` or sequence of `Tensor` outputs.
|
||
|
|
||
|
Returns:
|
||
|
A function `g` wrapping `f` that defines a custom gradient, which recomputes
|
||
|
`f` on the backwards pass of a gradient call.
|
||
|
"""
|
||
|
# TODO(cdfreeman) Add is_recomputing functionality from graph mode version
|
||
|
|
||
|
  @custom_gradient
  def inner(*args, **kwargs):
    """Inner function closure for calculating gradients."""
    current_var_scope = variable_scope.get_variable_scope()
    with record.stop_recording():
      result = f(*args, **kwargs)

    def grad_wrapper(*wrapper_args, variables=None):
      """Wrapper function to accommodate lack of kwargs in graph mode custom_gradient."""

      @custom_gradient
      def inner_recompute_grad(*dresult):
        """Nested custom gradient function for computing grads in reverse and forward mode autodiff."""
        # Gradient calculation for reverse mode autodiff.
        with backprop.GradientTape() as t:
          id_args = nest.map_structure(gen_array_ops.identity, args)
          # Tuple `dresult` should contain at least one tensor.
          assert len(dresult) >= 1

          if not context.executing_eagerly():
            # XLA doesn't respect `tf.control_dependencies`. The code block
            # below manually adds a data dependency to `dresult` to ensure
            # recomputation of `f(*args, **kwargs)` happens after `dresult`.

            # This works even if `dresult[0]` is a size 0 tensor as reduce_max
            # of a size 0 tensor returns -inf. Use reshape here to avoid reading
            # the entire `dresult[0]`.
            elem = math_ops.reduce_max(array_ops.reshape(dresult[0], [-1])[:1])
            # Cast elem to bool in case elem is NaN.
            elem_bool = math_ops.cast(elem, dtypes.bool)
            dresult_dep = array_ops.where_v2(
                elem_bool == elem_bool, 0., float("nan"))  # pylint: disable=comparison-with-itself
            id_args = nest.map_structure(
                lambda x: x + math_ops.cast(dresult_dep, x.dtype), id_args)

          t.watch(id_args)
          if variables is not None:
            t.watch(variables)
          with variable_scope.variable_scope(current_var_scope):
            recomputed_result = f(*id_args, **kwargs)
        kw_vars = []
        if variables is not None:
          kw_vars = list(variables)
        grads = t.gradient(
            recomputed_result,
            list(id_args) + kw_vars,
            output_gradients=dresult,
            unconnected_gradients=UnconnectedGradients.ZERO)

        def transpose(*t_args, **t_kwargs):
          """Gradient function calculation for forward mode autodiff."""
          # Just throw an error since gradients / activations are not stored on
          # tape for recompute.
          raise NotImplementedError(
              "recompute_grad tried to transpose grad of {}. "
              "Consider not using recompute_grad in forward mode "
              "autodiff".format(f.__name__))

        return (grads[:len(id_args)], grads[len(id_args):]), transpose

      return inner_recompute_grad(*wrapper_args)

    return result, grad_wrapper

  return tf_decorator.make_decorator(f, inner)


@tf_export("grad_pass_through")
|
||
|
def grad_pass_through(f):
|
||
|
"""Creates a grad-pass-through op with the forward behavior provided in f.
|
||
|
|
||
|
Use this function to wrap any op, maintaining its behavior in the forward
|
||
|
pass, but replacing the original op in the backward graph with an identity.
|
||
|
For example:
|
||
|
|
||
|
```python
|
||
|
x = tf.Variable(1.0, name="x")
|
||
|
z = tf.Variable(3.0, name="z")
|
||
|
|
||
|
with tf.GradientTape() as tape:
|
||
|
# y will evaluate to 9.0
|
||
|
y = tf.grad_pass_through(x.assign)(z**2)
|
||
|
# grads will evaluate to 6.0
|
||
|
grads = tape.gradient(y, z)
|
||
|
```
|
||
|
|
||
|
Another example is a 'differentiable' moving average approximation, where
|
||
|
gradients are allowed to flow into the last value fed to the moving average,
|
||
|
but the moving average is still used for the forward pass:
|
||
|
|
||
|
```python
|
||
|
x = ... # Some scalar value
|
||
|
# A moving average object, we don't need to know how this is implemented
|
||
|
moving_average = MovingAverage()
|
||
|
with backprop.GradientTape() as tape:
|
||
|
# mavg_x will evaluate to the current running average value
|
||
|
mavg_x = tf.grad_pass_through(moving_average)(x)
|
||
|
grads = tape.gradient(mavg_x, x) # grads will evaluate to 1.0
|
||
|
```
|
||
|
|
||
|
Args:
|
||
|
f: function `f(*x)` that returns a `Tensor` or nested structure of `Tensor`
|
||
|
outputs.
|
||
|
|
||
|
Returns:
|
||
|
A function `h(x)` which returns the same values as `f(x)` and whose
|
||
|
gradients are the same as those of an identity function.
|
||
|
"""
|
||
|
@custom_gradient
|
||
|
def _grad_pass_through_op(*args, **kwargs):
|
||
|
def grad(*args, **kwargs):
|
||
|
variables = kwargs.get("variables")
|
||
|
if variables is not None:
|
||
|
# Variables involved in the wrapped op will not receive gradients.
|
||
|
return args, [None] * len(variables)
|
||
|
return args
|
||
|
return f(*args, **kwargs), grad
|
||
|
return tf_decorator.make_decorator(f, _grad_pass_through_op)
|